# Predicting Housing prices using SVM

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import fetch_california_housing

In [7]:
housing = fetch_california_housing()

In [8]:
housing.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])

In [9]:
print(housing.DESCR)

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [10]:
data = housing.data
targets = housing.target

In [11]:
data.shape

(20640, 8)

In [12]:
targets.shape

(20640,)

In [13]:
from sklearn.model_selection import train_test_split

In [16]:
train_X, test_X, train_y, test_y = train_test_split(data, targets, test_size=0.2, random_state = 42)

In [17]:
train_X.shape

(16512, 8)

In [18]:
from sklearn.svm import LinearSVR

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [65]:
svm_reg = make_pipeline(StandardScaler(),
                      LinearSVR(random_state = 0, max_iter=10000))

In [66]:
svm_reg.fit(data,targets)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvr', LinearSVR(max_iter=10000, random_state=0))])

In [67]:
from sklearn.metrics import mean_squared_error

In [102]:
pred_y = svm_reg.predict(train_X)

In [103]:
mse = mean_squared_error(train_y, pred_y)
mse

1.4437235671256883

In [104]:
np.sqrt(mse)

1.2015504846346192

## using gaussian kernel

In [71]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import reciprocal, uniform

In [79]:
nonlinear_svr = make_pipeline(StandardScaler(),SVR())

In [84]:
param_dis = {
    "svr__gamma": reciprocal(0.001,0.1),
    "svr__C": uniform(1,10)
}

In [85]:
rnd_search_cv = RandomizedSearchCV(nonlinear_svr, param_dis, n_iter = 1000,
                                  cv = 3, random_state=42)

In [86]:
rnd_search_cv.fit(train_X[:2000], train_y[:2000])

RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('standardscaler',
                                              StandardScaler()),
                                             ('svr', SVR())]),
                   n_iter=1000,
                   param_distributions={'svr__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa1802faf20>,
                                        'svr__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fa172a53ca0>},
                   random_state=42)

In [87]:
rnd_search_cv.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svr', SVR(C=7.169269183757422, gamma=0.09170063774757266))])

In [88]:
from sklearn.model_selection import cross_val_score

In [90]:
cross_val_score(rnd_search_cv.best_estimator_, train_X,train_y, scoring="neg_root_mean_squared_error")

array([-0.58293768, -0.56778609, -0.57446822, -0.56512889, -0.59171333])

In [105]:
pred_y = rnd_search_cv.best_estimator_.predict(test_X)

In [106]:
mean_squared_error(test_y, pred_y)

0.33903484808790574