In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [130]:
raw_train = pd.read_csv('/Kaggle/House Prices/train.csv')
raw_test = pd.read_csv('/Kaggle/House Prices/test.csv')

In [131]:
train = raw_train.copy()
test = raw_test.copy()

In [132]:
def getMissingValue(df, columns):
    missingValue = df[columns].isna().sum()
    missingValue = missingValue[missingValue > 0].sort_values()
    print(missingValue)

## Missing data handling for train data set

In [133]:
getMissingValue(train, train.columns)

Electrical         1
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
BsmtExposure      38
BsmtFinType2      38
GarageCond        81
GarageQual        81
GarageFinish      81
GarageType        81
GarageYrBlt       81
LotFrontage      259
FireplaceQu      690
Fence           1179
Alley           1369
MiscFeature     1406
PoolQC          1453
dtype: int64


In [150]:
categorical_cols = [feature for feature in train.columns if train[feature].dtype == 'O']
numeric_cols = [feature for feature in train.columns if train[feature].dtype != 'O']

In [153]:
numeric_cols.remove('SalePrice')

## Numeric Features

In [159]:
getMissingValue(test, numeric_cols)

Series([], dtype: int64)


In [160]:
getMissingValue(train, numeric_cols)

Series([], dtype: int64)


In [157]:
## replacing the numeric missing value
def fill_numeric_missing_values(df):
    for col in numeric_cols:
        median = df[col].median()
        df[col+'_nan'] = np.where(df[col].isnull(), 1, 0)
        df[col] = df[col].fillna(median)

In [158]:
fill_numeric_missing_values(train)
fill_numeric_missing_values(test)

In [76]:
getMissingValue(train, numeric_cols)

Series([], dtype: int64)


In [77]:
train[numeric_cols]

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0,40,0,0,0,0,0,8,2007,175000
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,349,0,0,0,0,0,0,2,2010,210000
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0,60,0,0,0,0,2500,5,2010,266500
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,366,0,112,0,0,0,0,4,2010,142125


## Categorical Features

In [161]:
## replacing the categorical missing value
def fill_categorical_missing_values(df):
    for col in categorical_cols:
        df[col] = df[col].fillna('Missing')

In [163]:
fill_categorical_missing_values(train)
fill_categorical_missing_values(test)

In [164]:
getMissingValue(train, categorical_cols)

Series([], dtype: int64)


In [166]:
getMissingValue(test, categorical_cols)

Series([], dtype: int64)


In [167]:
year_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']

In [168]:
train[year_features].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,2003,2003,2003.0
1,1976,1976,1976.0
2,2001,2002,2001.0
3,1915,1970,1998.0
4,2000,2000,2000.0


In [169]:
for feature in year_features:
    train[feature] = train['YrSold'] - train[feature]
    test[feature] = test['YrSold'] - test[feature]

In [170]:
train[year_features].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,5,5,5.0
1,31,31,31.0
2,7,6,7.0
3,91,36,8.0
4,8,8,8.0


## Feature Scaling

In [171]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
for col in categorical_cols:
    train[col] = le.fit_transform(train[col])
    test[col] = le.fit_transform(test[col])

In [172]:
from sklearn.preprocessing import MinMaxScaler

In [173]:
scaling_features = [feature for feature in train.columns if feature not in ['Id', 'SalePrice']]

scaler = MinMaxScaler()
scaler.fit(train[scaling_features])
scaler.fit(test[scaling_features])

MinMaxScaler()

In [174]:
scaler.transform(train[scaling_features])
scaler.transform(test[scaling_features])

array([[0.        , 0.6       , 0.32960894, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.8       , 0.33519553, ..., 0.        , 0.        ,
        0.        ],
       [0.23529412, 0.8       , 0.29608939, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.8       , 0.77653631, ..., 0.        , 0.        ,
        0.        ],
       [0.38235294, 0.8       , 0.22905028, ..., 0.        , 0.        ,
        0.        ],
       [0.23529412, 0.8       , 0.29608939, ..., 0.        , 0.        ,
        0.        ]])

In [175]:
train_scaled = pd.concat([train[['Id', 'SalePrice']].reset_index(drop=True), 
                        pd.DataFrame(scaler.transform(train[scaling_features]), columns=scaling_features)], axis = 1)

In [177]:
test_scaled = pd.concat([test['Id'].reset_index(drop=True), 
                        pd.DataFrame(scaler.transform(test[scaling_features]), columns=scaling_features)], axis = 1)

In [178]:
train_scaled.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,GarageArea_nan,WoodDeckSF_nan,OpenPorchSF_nan,EnclosedPorch_nan,3SsnPorch_nan,ScreenPorch_nan,PoolArea_nan,MiscVal_nan,MoSold_nan,YrSold_nan
0,1,208500,0.235294,0.6,0.24581,0.12661,1.0,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,181500,0.0,0.6,0.329609,0.14747,1.0,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,223500,0.235294,0.6,0.26257,0.177399,1.0,0.5,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,140000,0.294118,0.6,0.217877,0.146563,1.0,0.5,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,250000,0.235294,0.6,0.351955,0.231997,1.0,0.5,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
test_scaled.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,GarageArea_nan,WoodDeckSF_nan,OpenPorchSF_nan,EnclosedPorch_nan,3SsnPorch_nan,ScreenPorch_nan,PoolArea_nan,MiscVal_nan,MoSold_nan,YrSold_nan
0,1461,0.0,0.6,0.329609,0.184147,1.0,0.5,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1462,0.0,0.8,0.335196,0.232124,1.0,0.5,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1463,0.235294,0.8,0.296089,0.224197,1.0,0.5,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1464,0.235294,0.8,0.318436,0.154326,1.0,0.5,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1465,0.588235,0.8,0.122905,0.064121,1.0,0.5,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [181]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [182]:
X = train_scaled.drop(["SalePrice", "Id"], axis = 1)
y = train_scaled['SalePrice']

In [183]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [184]:
param_grid_lasso = {'alpha': [0.2, 0.8, 0.9, 1],
                    'max_iter': [i for i in range(1000, 8000, 500)],
                    'selection': ['cyclic', 'random']
                   }


lasso_model = Lasso()
grid_model_lasso = GridSearchCV(lasso_model, param_grid_lasso)

In [185]:
grid_model_lasso.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [0.2, 0.8, 0.9, 1],
                         'max_iter': [1000, 1500, 2000, 2500, 3000, 3500, 4000,
                                      4500, 5000, 5500, 6000, 6500, 7000,
                                      7500],
                         'selection': ['cyclic', 'random']})

In [186]:
grid_model_lasso.best_params_

{'alpha': 1, 'max_iter': 5000, 'selection': 'random'}

In [187]:
pred_lasso = grid_model_lasso.predict(X_test)

In [188]:
np.sqrt(mean_squared_error(pred_lasso, y_test))

37294.15882847916

In [190]:
submission_preds_lasso = grid_model_lasso.predict(test.drop('Id', axis=1))

In [120]:
feature_sel_model.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
       False,  True, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

In [191]:
df_final = pd.DataFrame({'Id': test.Id,
                        'SalePrice': submission_preds_lasso})
df_final.to_csv('E:\Kaggle\House Prices\Submission_new_Lasso.csv', index=False)