## score
    By the time I submitted my predictions, the first-place score was 0.06629 and the second-place score was 0.10567.

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, LassoCV
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
import time

In [2]:
# Read both competition files with 'Id' as the row index.
# na_filter=False switches off pandas' missing-value detection, so markers such
# as 'NA' or empty fields stay literal strings instead of becoming NaN.
train, test = (
    pd.read_csv(csv_path, index_col='Id', na_filter=False)
    for csv_path in ('trainC.csv', 'testC.csv')
)

In [3]:
# Calling pd.get_dummies on train and test separately would give the two frames
# different columns, which makes it impossible to predict on the test set.
# So: stack them, encode once, then split back at the original train length.
# NOTE(review): the printed column names (e.g. 'BsmtUnfSF_1411.0') suggest that
# numeric-looking columns are being one-hot encoded as well -- presumably because
# they were read as strings; confirm this is intended.
ntrain = len(train)
temp = pd.get_dummies(pd.concat((train, test)))
train = temp.iloc[:ntrain]
test = temp.iloc[ntrain:]

# Target column for training; every remaining column is a model input.
labels = train['SalePrice']
features = train.loc[:, train.columns != 'SalePrice']
test = test.loc[:, test.columns != 'SalePrice']

In [16]:
# Size of the combined encoded frame, then the five columns with the most
# nulls in each split (all zeros means nothing is missing).
print(temp.shape)
for frame in (train, test):
    null_counts = frame.isnull().sum()
    print(null_counts.sort_values(ascending=False).head(5))

(2919, 5979)
Utilities_NoSeWa    0
BsmtUnfSF_1411.0    0
BsmtUnfSF_1400.0    0
BsmtUnfSF_1402.0    0
BsmtUnfSF_1404.0    0
dtype: int64
Utilities_NoSeWa    0
BsmtUnfSF_1413      0
BsmtUnfSF_1402.0    0
BsmtUnfSF_1404.0    0
BsmtUnfSF_1405      0
dtype: int64


In [5]:
def ScoreOnTrain(pred, features, labels, model=None):
    """Print a residual summary and the model's score on the training data.

    Parameters
    ----------
    pred : array-like
        Predictions for ``features``.
    features : DataFrame
        Inputs handed to ``model.score``.
    labels : Series
        True target values; ``pred - labels`` is summarised with ``describe()``.
    model : object with a ``score(features, labels)`` method, optional
        Estimator to score. When omitted, the notebook-level ``reg`` is used,
        which keeps existing three-argument calls working. Passing it explicitly
        avoids scoring whichever estimator happens to be bound to ``reg``.

    Returns
    -------
    None
        Everything is printed.
    """
    if model is None:
        # Backward-compatible fallback to the estimator defined in the notebook.
        model = reg

    residuals = pd.DataFrame(pred - labels)
    print('describe of (predictions - actual) on training dataset\n')
    print(residuals.describe())
    # '%s %s' reproduces the space the Python 2 print statement puts between items.
    print('%s %s' % ('\nThe score of the model: ', model.score(features, labels)))

### simple linear regression

In [37]:
# Ordinary least squares on every encoded feature; the timer covers both
# construction and fitting of the estimator.
start = time.time()
reg = LinearRegression().fit(features, labels)
end = time.time()

print('%s %s' % ('training time: ', end - start))

training time:  2.04299998283


In [40]:
# On the training data the predictions look perfect.
pred = reg.predict(features)
ScoreOnTrain(pred=pred, features=features, labels=labels)

describe of (predictions - actual) on training dataset

          SalePrice
count  1.460000e+03
mean  -1.208997e-08
std    2.942974e-06
min   -6.360002e-06
25%   -1.477310e-06
50%   -1.625158e-07
75%    1.005828e-06
max    4.293397e-05

The score of the model:  1.0


In [84]:
# Some predicted prices for the test set come out negative; set those to zero.
pred = reg.predict(test)
pred[pred < 0] = 0

In [83]:
# Two-column submission file: Id, SalePrice (no index column written).
result = pd.DataFrame(
    {'SalePrice': pred, 'Id': test.index},
    columns=['Id', 'SalePrice'],
)
result.to_csv('Simple_linear_regression.csv', index=False)

### The score of simple linear regression
    RMSLE: 4.04180 (1586/1600= 0.99125)

### ridge regression

In [42]:
# Ridge regression with alpha picked from the integers 20..29 by 10-fold
# cross-validation, scored with R^2.
start = time.time()
candidate_alphas = list(range(20, 30))
reg = RidgeCV(alphas=candidate_alphas, cv=10, scoring='r2').fit(features, labels)
end = time.time()

print('%s %s' % ('training time: ', end - start))

training time:  40.4800000191


In [43]:
# Regularization strength that RidgeCV selected from the candidate alphas.
reg.alpha_

24

In [44]:
# Residual summary and score of the ridge model on the training data.
pred = reg.predict(features)
ScoreOnTrain(pred=pred, features=features, labels=labels)

describe of (predictions - actual) on training dataset

          SalePrice
count  1.460000e+03
mean  -3.500832e-10
std    2.226661e+04
min   -1.986505e+05
25%   -8.188226e+03
50%    4.181573e+02
75%    9.175404e+03
max    2.884474e+05

The score of the model:  0.9214398035


In [34]:
# Predict the test set with the ridge model and write the submission file.
pred = reg.predict(test)
result = pd.DataFrame(
    {'SalePrice': pred, 'Id': test.index},
    columns=['Id', 'SalePrice'],
)
result.to_csv('ridge_regression_01.csv', index=False)

### The score of ridge regression
    RMSLE: 0.14251 (956/1601= 0.597)

### lasso regression

In [6]:
start = time.time()
reg= LassoCV(max_iter= 50000, n_jobs= -1, cv= 10)
reg.fit(features, labels)
end = time.time()
print 'training time: ', end - start

training time:  67.2750000954


In [15]:
# Selected alpha, solver iteration count, then the number of non-zero and
# zero coefficients in the fitted lasso model.
print(reg.alpha_)
print(reg.n_iter_)
is_zero = reg.coef_ == 0
print((~is_zero).sum())
print(is_zero.sum())

209067.774683
144
11
5967


In [8]:
# Residual summary and score of the lasso model on the training data.
pred = reg.predict(features)
ScoreOnTrain(pred=pred, features=features, labels=labels)

describe of (predictions - actual) on training dataset

          SalePrice
count  1.460000e+03
mean   9.185647e-11
std    4.334759e+04
min   -2.934230e+05
25%   -1.527180e+04
50%    9.009689e+02
75%    1.855845e+04
max    5.399607e+05

The score of the model:  0.702268996184


In [53]:
# Predict the test set with the lasso model and write the submission file.
pred = reg.predict(test)
result = pd.DataFrame(
    {'SalePrice': pred, 'Id': test.index},
    columns=['Id', 'SalePrice'],
)
result.to_csv('lasso_regression_01.csv', index=False)

### The score of lasso regression
    RMSLE: 0.21158 (1371/1601= 0.856)

### Kernel Ridge

In [None]:
start = time.time()

# Regularization strengths 1e-10 ... 1 (one per power of ten).
alphas = [10**i for i in range(-10, 1)]

# `degree` is only used by the polynomial kernel, so it is searched for that
# kernel alone. Crossing it with 'linear' and 'chi2' would refit identical
# models four times each (132 candidates instead of 66).
# NOTE(review): the chi2 kernel expects non-negative inputs -- confirm that
# every column of `features` satisfies this before running.
parameters = [
    {'kernel': ['linear', 'chi2'], 'alpha': alphas},
    {'kernel': ['polynomial'], 'alpha': alphas, 'degree': list(range(1, 5))},
]

reg = GridSearchCV(KernelRidge(), param_grid=parameters, n_jobs=-1)
reg.fit(features, labels)
end = time.time()
print('%s %s' % ('training time: ', end - start))

In [None]:
# Best configuration found by the grid search and its cross-validated score,
# followed by the residual summary on the training data.
print(reg.best_estimator_)
print(reg.best_score_)
pred = reg.predict(features)
ScoreOnTrain(pred=pred, features=features, labels=labels)