In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDRegressor

In [7]:
# Load the training and test tables from CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [8]:
# Preview the first five rows of the training table.
df_train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [9]:
# Preview a single row of the test table.
df_test.head(n=1)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal


In [10]:
# Remove the 'Id' column from both tables before modelling.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# re-runnable: a second execution no longer raises KeyError on the missing column.
df_train = df_train.drop(columns='Id', errors='ignore')
df_test = df_test.drop(columns='Id', errors='ignore')

In [11]:
# Percentage of missing values per training column, restricted to columns that
# have at least one missing value and ordered from fewest to most.
train_na_counts = df_train.isna().sum()
train_na_counts[train_na_counts != 0].sort_values() * 100 / len(df_train)

Electrical       0.068493
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtFinType1     2.534247
BsmtExposure     2.602740
BsmtFinType2     2.602740
GarageCond       5.547945
GarageQual       5.547945
GarageFinish     5.547945
GarageYrBlt      5.547945
GarageType       5.547945
LotFrontage     17.739726
FireplaceQu     47.260274
MasVnrType      59.726027
Fence           80.753425
Alley           93.767123
MiscFeature     96.301370
PoolQC          99.520548
dtype: float64

In [12]:
# Handle missing values in the training table:
#   * columns with more than 40% missing values are dropped;
#   * every other column with gaps is filled with its most frequent value.
# The filled column is assigned back rather than calling
# `df_train[key].fillna(..., inplace=True)`: that chained in-place form is
# deprecated and leaves the frame unchanged when pandas Copy-on-Write is active.
# NOTE(review): numeric columns are also filled with their most frequent value —
# confirm this is intended rather than a median/mean fill.
train_na_pct = df_train.isna().sum() * 100 / len(df_train)
train_na_pct = train_na_pct[train_na_pct != 0]
train_too_sparse = train_na_pct > 40

df_train = df_train.drop(columns=train_na_pct.index[train_too_sparse])
for key in train_na_pct.index[~train_too_sparse]:
    most_frequent = df_train[key].value_counts().index[0]
    df_train[key] = df_train[key].fillna(most_frequent)

In [13]:
# Handle missing values in the test table with the same rule as the training
# table: drop columns with more than 40% missing values, fill the rest with the
# column's most frequent value.
# The filled column is assigned back rather than calling
# `df_test[key].fillna(..., inplace=True)`: that chained in-place form is
# deprecated and leaves the frame unchanged when pandas Copy-on-Write is active.
# NOTE(review): the drop decision and fill values come from the test table
# itself, so the surviving columns can differ from the training table — verify
# both tables end up with the same feature columns.
test_na_pct = df_test.isna().sum() * 100 / len(df_test)
test_na_pct = test_na_pct[test_na_pct != 0]
test_too_sparse = test_na_pct > 40

df_test = df_test.drop(columns=test_na_pct.index[test_too_sparse])
for key in test_na_pct.index[~test_too_sparse]:
    most_frequent = df_test[key].value_counts().index[0]
    df_test[key] = df_test[key].fillna(most_frequent)

In [14]:
# Sanity check: percentage of missing values still present per test column
# (an empty Series means nothing is missing).
test_na_counts = df_test.isna().sum()
test_na_counts[test_na_counts != 0].sort_values() * 100 / len(df_test)

Series([], dtype: float64)

In [15]:
# Integer-encode every text (object-dtype) column: one LabelEncoder per column,
# fitted on the training values and then applied to the test table.
for col in df_test.columns:
    if df_train[col].dtype != object:
        continue

    # LabelEncoder.transform raises on labels it was not fitted on, so test
    # categories that never occur in the training column are first mapped to
    # the most frequent training category. The comparison is done on the column
    # being encoded (not on a fixed column) and the unseen values are passed to
    # replace() as a list of scalars.
    unseen = list(set(df_test[col]).difference(df_train[col]))
    if unseen:
        fallback = df_train[col].value_counts().index[0]
        df_test[col] = df_test[col].replace(to_replace=unseen, value=fallback)

    encoder = LabelEncoder()
    df_train[col] = encoder.fit_transform(df_train[col])
    df_test[col] = encoder.transform(df_test[col])

In [16]:
# Linear regressor trained with stochastic gradient descent. The seed is fixed
# so that repeated runs of the notebook produce the same fit; without it the
# shuffling of the training data differs from run to run.
model = SGDRegressor(random_state=42)

In [17]:
# Feature scaler with default settings.
scale = StandardScaler()

In [18]:
# Chain the scaler and the regressor so they are fitted and applied together.
pipe = Pipeline(steps=[
    ('scale', scale),
    ('model', model),
])

In [19]:
# Display the assembled pipeline.
pipe

In [20]:
# Separate features from the 'SalePrice' target, and build the test feature
# matrix from exactly the training feature columns, in the same order.
# Train and test columns are dropped independently earlier in the notebook, so
# selecting by X_train.columns guards against extra or reordered test columns
# and raises a KeyError right here if the test table lacks a training feature.
X_train = df_train.drop(columns='SalePrice')
y_train = df_train['SalePrice']
X_test = df_test[X_train.columns].copy()

In [21]:
# Hold out 20% of the training rows for validation (fixed seed).
# The split is taken from df_train itself rather than from X_train/y_train, so
# re-running this cell does not split an already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
    df_train.drop(columns='SalePrice'),
    df_train['SalePrice'],
    test_size=0.2,
    random_state=42,
)

In [22]:
# Hyper-parameter grid for the grid search; the 'model__' prefix routes each
# entry to the pipeline step named 'model'.
penalty_grid = ['l1', 'l2', 'elasticnet']
alpha_grid = [0.001, 0.01, 0.1, 1]
eta0_grid = [0.001, 0.01, 0.1, 1]
max_iter_grid = [100, 500, 1000]

params = dict(
    model__penalty=penalty_grid,
    model__alpha=alpha_grid,
    model__eta0=eta0_grid,
    model__max_iter=max_iter_grid,
)

In [23]:
# Exhaustive search over `params` with 4-fold cross-validation.
# verbose=2 prints one log line per individual fit.
final_model = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=4,
    verbose=2,
)

In [24]:
# Run the grid search on the training split.
final_model.fit(X_train, y_train)

Fitting 4 folds for each of 144 candidates, totalling 576 fits
[CV] END model__alpha=0.001, model__eta0=0.001, model__max_iter=100, model__penalty=l1; total time=   0.0s
[CV] END model__alpha=0.001, model__eta0=0.001, model__max_iter=100, model__penalty=l1; total time=   0.0s
[CV] END model__alpha=0.001, model__eta0=0.001, model__max_iter=100, model__penalty=l1; total time=   0.0s
[CV] END model__alpha=0.001, model__eta0=0.001, model__max_iter=100, model__penalty=l1; total time=   0.0s
[CV] END model__alpha=0.001, model__eta0=0.001, model__max_iter=100, model__penalty=l2; total time=   0.0s
[CV] END model__alpha=0.001, model__eta0=0.001, model__max_iter=100, model__penalty=l2; total time=   0.0s
[CV] END model__alpha=0.001, model__eta0=0.001, model__max_iter=100, model__penalty=l2; total time=   0.0s
[CV] END model__alpha=0.001, model__eta0=0.001, model__max_iter=100, model__penalty=l2; total time=   0.0s
[CV] END model__alpha=0.001, model__eta0=0.001, model__max_iter=100, model__penal

In [21]:
# Predict the held-out validation rows with the fitted search object.
y_pred = final_model.predict(X_val)

In [22]:
# Root-mean-squared error on the validation split, in the units of SalePrice.
# Arguments follow scikit-learn's (y_true, y_pred) order; MSE is symmetric, so
# the reported value is unchanged.
np.sqrt(mean_squared_error(y_val, y_pred))

np.float64(35660.62844459016)

In [23]:
# Mean cross-validated score of the best parameter combination. No `scoring`
# was passed to GridSearchCV, so this is the estimator's default score
# (R^2 for regressors), not an error in SalePrice units.
final_model.best_score_

np.float64(0.7846052237429986)

In [24]:
# Predict the test feature matrix with the fitted search object.
final_y = final_model.predict(X_test)

In [25]:
# Load the sample submission file to use as the template for the output.
df_sub = pd.read_csv('sample_submission.csv')

In [26]:
# Overwrite the template's SalePrice column with the test predictions.
# NOTE(review): assumes the template rows are in the same order as test.csv — confirm.
df_sub['SalePrice'] = final_y

In [27]:
# Write the submission file without the row index (index=False alone already
# suppresses both the index column and its header label).
df_sub.to_csv('sub.csv', index=False)