In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

In [4]:
train = train.drop(['Fence','Alley','PoolQC','MiscFeature', 'FireplaceQu'], axis=1)
test = test.drop(['Fence','Alley','PoolQC','MiscFeature', 'FireplaceQu'], axis=1)

In [5]:
train.shape

(1460, 76)

In [6]:
test.shape

(1459, 75)

In [7]:
def preprocess(dataset):
    check = dataset.isnull().sum(axis=0)
    nan = [train.columns[i] for i in range(len(check)) if check[i] != 0]
    for column in nan:
        if dataset[column].dtype == object:
                dataset[column].fillna(dataset[column].value_counts()[0], inplace=True)
        else:
                dataset[column].fillna(dataset[column].mean(), inplace=True)

In [8]:
preprocess(train)

In [9]:
preprocess(test)

In [10]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1460 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non

In [11]:
X_train = train[train.columns].drop(['SalePrice'], axis=1)
y_train = train['SalePrice']

In [12]:
X_test = test[test.columns]

**Vectorize features**

In [13]:
from sklearn.feature_extraction import DictVectorizer

In [14]:
dictVec = DictVectorizer()

In [15]:
X_train = dictVec.fit_transform(X_train.to_dict(orient='record'))
X_test = dictVec.transform(X_test.to_dict(orient='record'))

**Training**

In [16]:
from sklearn.svm import SVR

In [17]:
svrLinear = SVR(kernel='linear')
svrRbf = SVR(kernel='rbf')

In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
print(cross_val_score(svrLinear, X_train, y_train, cv=4).mean())

0.734756572565032


In [20]:
svrLinear.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='linear', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)

In [21]:
from sklearn.model_selection import GridSearchCV

In [22]:
params = {'C':[1, 10, 100],'gamma':[1e-4, 1e-3, 0.01]}

In [23]:
grid = GridSearchCV(estimator=svrLinear, param_grid=params, cv=4, verbose=10)

In [24]:
grid.fit(X_train, y_train)

Fitting 4 folds for each of 9 candidates, totalling 36 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................... C=1, gamma=0.0001, score=0.803, total=  10.4s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.3s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.756, total=  12.1s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   22.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.782, total=  21.0s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   43.4s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.598, total=  12.1s
[CV] C=1, gamma=0.001 ................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   55.5s remaining:    0.0s


[CV] .................... C=1, gamma=0.001, score=0.803, total=  11.3s
[CV] C=1, gamma=0.001 ................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.1min remaining:    0.0s


[CV] .................... C=1, gamma=0.001, score=0.756, total=  12.9s
[CV] C=1, gamma=0.001 ................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.3min remaining:    0.0s


[CV] .................... C=1, gamma=0.001, score=0.782, total=  23.2s
[CV] C=1, gamma=0.001 ................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  1.7min remaining:    0.0s


[CV] .................... C=1, gamma=0.001, score=0.598, total=  11.7s
[CV] C=1, gamma=0.01 .................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  1.9min remaining:    0.0s


[CV] ..................... C=1, gamma=0.01, score=0.803, total=  10.5s
[CV] C=1, gamma=0.01 .................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  2.1min remaining:    0.0s


[CV] ..................... C=1, gamma=0.01, score=0.756, total=  11.9s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.782, total=  21.1s
[CV] C=1, gamma=0.01 .................................................
[CV] ..................... C=1, gamma=0.01, score=0.598, total=  11.8s
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.832, total= 3.1min
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.776, total=11.3min
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.804, total= 7.0min
[CV] C=10, gamma=0.0001 ..............................................
[CV] .................. C=10, gamma=0.0001, score=0.628, total= 6.3min
[CV] C=10, gamma=0.001 ...............................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 256.9min finished


GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='auto_deprecated',
                           kernel='linear', max_iter=-1, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 10, 100], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=10)

In [25]:
print(grid.best_score_)

0.8004322195697848


In [26]:
print(grid.best_params_)

{'C': 100, 'gamma': 0.0001}


In [27]:
y_pred_best_svr = grid.predict(X_test)

In [28]:
best_svr_submission = pd.DataFrame({'Id': test['Id'], 'SalePrice':y_pred_best_svr})

In [29]:
best_svr_submission.to_csv('best_svr_submission.csv', index=False)