In [25]:
#load traning data
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [26]:
df_train = pd.read_csv('./data/train.csv')
df_test =  pd.read_csv('./data/test.csv')
total = df_train.isnull().sum().sort_values(ascending=False)
print(total.head(10))

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageCond        81
GarageType        81
GarageYrBlt       81
GarageFinish      81
dtype: int64


In [27]:
def datapreprocessing(df):
    df.loc[df.MasVnrType.isnull(), 'MasVnrType'] = 'None' # no good
    df.loc[df.MasVnrType == 'None', 'MasVnrArea'] = 0
    df.loc[df.LotFrontage.isnull(), 'LotFrontage'] = df.LotFrontage.median()
    df.loc[df.LotArea.isnull(), 'MasVnrType'] = 0
    df.loc[df.BsmtQual.isnull(), 'BsmtQual'] = 'NoBsmt'
    df.loc[df.BsmtCond.isnull(), 'BsmtCond'] = 'NoBsmt'
    df.loc[df.BsmtExposure.isnull(), 'BsmtExposure'] = 'NoBsmt'
    df.loc[df.BsmtFinType1.isnull(), 'BsmtFinType1'] = 'NoBsmt'
    df.loc[df.BsmtFinType2.isnull(), 'BsmtFinType2'] = 'NoBsmt'
    df.loc[df.BsmtFinType1=='NoBsmt', 'BsmtFinSF1'] = 0
    df.loc[df.BsmtFinType2=='NoBsmt', 'BsmtFinSF2'] = 0
    df.loc[df.BsmtFinSF1.isnull(), 'BsmtFinSF1'] = df.BsmtFinSF1.median()
    df.loc[df.BsmtQual=='NoBsmt', 'BsmtUnfSF'] = 0
    df.loc[df.BsmtUnfSF.isnull(), 'BsmtUnfSF'] = df.BsmtUnfSF.median()
    df.loc[df.BsmtQual=='NoBsmt', 'TotalBsmtSF'] = 0
    df.loc[df.FireplaceQu.isnull(), 'FireplaceQu'] = 'NoFireplace'
    df.loc[df.GarageType.isnull(), 'GarageType'] = 'NoGarage'
    df.loc[df.GarageFinish.isnull(), 'GarageFinish'] = 'NoGarage'
    df.loc[df.GarageQual.isnull(), 'GarageQual'] = 'NoGarage'
    df.loc[df.GarageCond.isnull(), 'GarageCond'] = 'NoGarage'
    df.loc[df.BsmtFullBath.isnull(), 'BsmtFullBath'] = 0
    df.loc[df.BsmtHalfBath.isnull(), 'BsmtHalfBath'] = 0
    df.loc[df.KitchenQual.isnull(), 'KitchenQual'] = 'TA'
    df.loc[df.MSZoning.isnull(), 'MSZoning'] = 'RL'
    df.loc[df.Utilities.isnull(), 'Utilities'] = 'AllPub'
    df.loc[df.Exterior1st.isnull(), 'Exterior1st'] = 'VinylSd'
    df.loc[df.Exterior2nd.isnull(), 'Exterior2nd'] = 'VinylSd'
    df.loc[df.Functional.isnull(), 'Functional'] = 'Typ'
    df.loc[df.SaleCondition.isnull(), 'SaleCondition'] = 'Normal'
    df.loc[df.SaleCondition.isnull(), 'SaleType'] = 'WD'
    df.loc[df['Electrical'].isnull(), 'Electrical'] = 'SBrkr'
    df.loc[df['SaleType'].isnull(), 'SaleType'] = 'NoSale'
    #GarageYrBlt
    df.loc[df.GarageYrBlt.isnull(), 'GarageYrBlt'] = df.GarageYrBlt.median()
    # only one is null and it has type Detchd
    df.loc[df['GarageArea'].isnull(), 'GarageArea'] = df.loc[df['GarageType']=='Detchd', 'GarageArea'].mean()
    df.loc[df['GarageCars'].isnull(), 'GarageCars'] = df.loc[df['GarageType']=='Detchd', 'GarageCars'].median()
    
#datapreprocessing(df_train)
#datapreprocessing(df_test)

In [28]:
def convertasCategoty(df,col):
    df[col] = df[col].astype('category')
    df[col+'_code'] = df[col].cat.codes
    df[col] = df[col +'_code']
    df.drop(labels=col+'_code', axis="columns", inplace=True)
    #return df
    
cols = df_train.columns
for col in cols:
    #print(df_train[col].dtype)
    if(df_train[col].dtype == 'object'):
        #print("----Ste at category")
        convertasCategoty(df_train,col)
        
cols = df_test.columns
for col in cols:
    #print(df_train[col].dtype)
    if(df_test[col].dtype == 'object'):
        #print("----Ste at category")
        convertasCategoty(df_test,col)
        


In [29]:
df_test_id = df_test['Id']
#print(df_test_id)

df_train.drop(['Id','PoolQC','MiscFeature','Alley'],axis=1)
df_test.drop(['Id','PoolQC','MiscFeature','Alley'],axis=1)

print(df_train.shape)
print(df_test.shape)

(1460, 81)
(1459, 80)


In [30]:
   

#print(df_train.head())
#print(df_train.dtypes)
# Handle remaining missing values for numerical features by using median as replacement
print("NAs for numerical features in train : " + str(df_train.isnull().values.sum()))
df_train = df_train.fillna(df_train.median())
df_test = df_test.fillna(df_train.median())
print("Remaining NAs for numerical features in train : " + str(df_train.isnull().values.sum()))
print("Remaining NAs for numerical features in test : " + str(df_test.isnull().values.sum()))

NAs for numerical features in train : 348
Remaining NAs for numerical features in train : 0
Remaining NAs for numerical features in test : 0


In [31]:
from sklearn.model_selection import train_test_split


X = df_train.iloc[:, 1:-2].values  
print(X[1]) 
print(X.shape)
y = df_train.iloc[:, -1].values
print(y)

X_test = df_test.iloc[:, 1:-1].values  
print(X_test[1]) 
print(X_test.shape)
y_test = df_test.iloc[:, -1].values
print(y_test)


X_train, X_train_test, y_train, y_train_test = train_test_split(X, y, test_size=0.1, random_state=101)

[ 2.000e+01  3.000e+00  8.000e+01  9.600e+03  1.000e+00 -1.000e+00
  3.000e+00  3.000e+00  0.000e+00  2.000e+00  0.000e+00  2.400e+01
  1.000e+00  2.000e+00  0.000e+00  2.000e+00  6.000e+00  8.000e+00
  1.976e+03  1.976e+03  1.000e+00  1.000e+00  8.000e+00  8.000e+00
  2.000e+00  0.000e+00  3.000e+00  4.000e+00  1.000e+00  2.000e+00
  3.000e+00  1.000e+00  0.000e+00  9.780e+02  5.000e+00  0.000e+00
  2.840e+02  1.262e+03  1.000e+00  0.000e+00  1.000e+00  4.000e+00
  1.262e+03  0.000e+00  0.000e+00  1.262e+03  0.000e+00  1.000e+00
  2.000e+00  0.000e+00  3.000e+00  1.000e+00  3.000e+00  6.000e+00
  6.000e+00  1.000e+00  4.000e+00  1.000e+00  1.976e+03  1.000e+00
  2.000e+00  4.600e+02  4.000e+00  4.000e+00  2.000e+00  2.980e+02
  0.000e+00  0.000e+00  0.000e+00  0.000e+00  0.000e+00 -1.000e+00
 -1.000e+00 -1.000e+00  0.000e+00  5.000e+00  2.007e+03  8.000e+00]
(1460, 78)
[208500 181500 223500 ... 266500 142125 147500]
[ 2.0000e+01  3.0000e+00  8.1000e+01  1.4267e+04  1.0000e+00 -1.0000e

In [32]:
# we are going to scale to data

y_train= y_train.reshape(-1,1)
y_train_test= y_train_test.reshape(-1,1)
y_test= y_train_test.reshape(-1,1)

from sklearn.preprocessing import StandardScaler
sc_X_train = StandardScaler()
sc_y_train = StandardScaler()

sc_X_train_test = StandardScaler()
sc_y_train_test = StandardScaler()

sc_X_test = StandardScaler()
sc_y_test = StandardScaler()


X_train = sc_X_train.fit_transform(X_train)
y_train = sc_y_train.fit_transform(y_train)

X_train_test = sc_X_train_test.fit_transform(X_train_test)
y_train_test = sc_y_train_test.fit_transform(y_train_test)

X_test = sc_X_test.fit_transform(X_test)
y_test = sc_y_test.fit_transform(y_test)



In [33]:
#X, y = make_regression(n_features=4, n_informative=2,random_state=0, shuffle=False)
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X, y)
RandomForestRegressor(max_depth=2, random_state=0)
print(regr.feature_importances_)



[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.8562382  0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.00882001 0.         0.         0.         0.
 0.         0.02116058 0.         0.09284664 0.         0.
 0.         0.         0.         0.         0.         0.0109722
 0.         0.         0.         0.         0.         0.
 0.00996235 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.        ]


In [34]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics
import numpy as np

svr = SVR(kernel = 'rbf',gamma='auto')
svr.fit(X_train, y_train)

svr_pred = svr.predict(X_train_test)
svr_pred= svr_pred.reshape(-1,1)

print('MAE:', metrics.mean_absolute_error(y_train_test, svr_pred))
print('MSE:', metrics.mean_squared_error(y_train_test, svr_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_train_test, svr_pred)))

  y = column_or_1d(y, warn=True)


MAE: 0.236584609155939
MSE: 0.13527806728032782
RMSE: 0.36780166840340434


In [35]:
from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score

params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
params1={'n_estimators':3000, 'learning_rate':0.05, 'max_depth':4, 'max_features':'sqrt',
                                   'min_samples_leaf':15, 'min_samples_split':2, 
                                   'loss':'huber', 'random_state':5}
clf = ensemble.GradientBoostingRegressor(**params1)

clf.fit(X_train, y_train)

clf_pred=clf.predict(X_train_test)
clf_pred= clf_pred.reshape(-1,1)

print('MAE:', metrics.mean_absolute_error(y_train_test, clf_pred))
print('MSE:', metrics.mean_squared_error(y_train_test, clf_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_train_test, clf_pred)))



  y = column_or_1d(y, warn=True)


MAE: 0.21022910696011998
MSE: 0.10793411607512507
RMSE: 0.3285332800115158


In [36]:
clf_pred =sc_y_train_test.inverse_transform(clf_pred)
#print(clf_pred)
df_clf_pred = pd.DataFrame(clf_pred, columns=['SalePrice'])
df_clf_pred.head()

Unnamed: 0,SalePrice
0,259869.944647
1,143644.03639
2,141419.507156
3,367951.784262
4,385665.853171


In [37]:
#try gbr on test data
clf_pred=clf.predict(X_test)
clf_pred= clf_pred.reshape(-1,1)
clf_pred =sc_y_test.inverse_transform(clf_pred)
#print(clf_pred)
df_clf_pred = pd.DataFrame(clf_pred, columns=['SalePrice'])
df_clf_pred.head()

Unnamed: 0,SalePrice
0,121576.18632
1,163791.546878
2,182342.716017
3,197793.900473
4,176751.063136


In [38]:
#submission data
df_sub = pd.concat([df_test_id,df_clf_pred],axis=1)
print(df_sub.head())

     Id      SalePrice
0  1461  121576.186320
1  1462  163791.546878
2  1463  182342.716017
3  1464  197793.900473
4  1465  176751.063136


In [39]:
df_sub.to_csv('./data/submission.csv',index=False)

Change Log
++++++++++++++++++++++++++++++++++=

Feartue Engineering
Replace missing values by median

Model
SVR

Score
MAE: 50138.725050989204
    
MSE: 5299712145.626122
    
RMSE: 72799.12187400424

+++++++++++++++++++++++++++++++++
Params1
MAE: 0.21022910696011998
MSE: 0.10793411607512507
RMSE: 0.3285332800115158
    
Params
MAE: 0.21493207760478975
MSE: 0.11624055545701495
RMSE: 0.3409406919935122

