# Creating Dataset for training
## preprocess the data

In [1]:
import pandas as pd

In [2]:
def read_data():
    train_df = pd.read_csv('./data/train.csv')
    test_df = pd.read_csv('./data/test.csv')
    print("train df: {}, test_df: {}".format(train_df.shape, test_df.shape))
    dataset = pd.concat([train_df, test_df], axis=0)
#     print(dataset.iloc[1460])
    return dataset
dataset = read_data()
dataset.isna().sum()

train df: (1460, 81), test_df: (1459, 80)


Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       486
LotArea             0
                 ... 
MoSold              0
YrSold              0
SaleType            1
SaleCondition       0
SalePrice        1459
Length: 81, dtype: int64

In [3]:
dataset.shape
dataset.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0


In [4]:
def fill_NaNs(df):
    df.drop('Id', axis=1, inplace=True) # removing Id feature (will not give any info on the price just fifo).
    df['PoolQC'] = df['PoolQC'].fillna('NA') # No pool
    df['MiscFeature'] = df['MiscFeature'].fillna('NA') # no special element in the house.
    df['Alley'] = df['Alley'].fillna('NA') # not access to alley
    df['Fence'] = df['Fence'].fillna('NA') # no fence 
    # same thing we will do to FireplaceQu, LotFrontage
    df['FireplaceQu'] = df['FireplaceQu'].fillna('NA') # no fireplace in the house. 
    df['LotFrontage'] = df['LotFrontage'].fillna(0.) # there is no front area. 
    # and for the Garage missing houses and the Basement missing houses. 
    df.fillna({'GarageType':'NA', 'GarageFinish':'NA', 'GarageQual':'NA', 'GarageCond':'NA', 'GarageYrBlt':.0}, inplace=True)
    df.fillna({'BsmtExposure':'NA', 'BsmtQual':'NA', 'BsmtFinType2':'NA', 'BsmtCond':'NA', 'BsmtFinType1': 'NA'}, inplace=True)
    df['MasVnrType'] = df['MasVnrType'].fillna('NA') # No Masonry veneer type
    df['MasVnrArea'] = df['MasVnrArea'].fillna(0.) # No Masonry area. 
    df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0]) # we will replace the NaNs with the median=SBrkr
    return df

In [5]:
dataset = fill_NaNs(dataset)

In [6]:
def create_dummies(df):
    df = pd.get_dummies(df, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=True, dtype=None)
    return df

In [7]:
dataset = create_dummies(dataset)
dataset.columns.tolist()

['MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'SalePrice',
 'MSZoning_FV',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM',
 'Street_Pave',
 'Alley_NA',
 'Alley_Pave',
 'LotShape_IR2',
 'LotShape_IR3',
 'LotShape_Reg',
 'LandContour_HLS',
 'LandContour_Low',
 'LandContour_Lvl',
 'Utilities_NoSeWa',
 'LotConfig_CulDSac',
 'LotConfig_FR2',
 'LotConfig_FR3',
 'LotConfig_Inside',
 'LandSlope_Mod',
 'LandSlope_Sev',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearC

In [8]:
train_df, kaggle_test_df = dataset[~dataset['SalePrice'].isna()] ,  dataset[dataset['SalePrice'].isna()] # split back. 

In [9]:
print(train_df.shape, kaggle_test_df.shape)

(1460, 261) (1459, 261)


# Modeling

In [10]:
from sklearn.model_selection import train_test_split
X = train_df.drop('SalePrice', axis=1)
y = train_df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)

In [11]:
print(X_train.shape, X_test.shape,y_train.shape,y_test.shape)

(1022, 260) (438, 260) (1022,) (438,)


#### import dependencies

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [13]:
piplines = {
    'rf':make_pipeline(RandomForestRegressor(random_state=1234)),
    'gb':make_pipeline(GradientBoostingRegressor(random_state=1234)),
    'ridge':make_pipeline(Ridge(random_state=1234)),
    'lasso':make_pipeline(Lasso(random_state=1234)),
    'enet':make_pipeline(ElasticNet(random_state=1234)),
}

###### create hyperparameter grid.

In [14]:
RandomForestRegressor().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [15]:
hypergrid = {
    'rf':{
        'randomforestregressor__min_samples_split':[2,3,4,5,6],
        'randomforestregressor__min_samples_leaf':[1,2,3,4,5,6]        
    },
    
    'gb':{
        'gradientboostingregressor__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'ridge':{
        'ridge__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'lasso':{
        'lasso__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'enet':{
        'elasticnet__alpha':[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    }
}

In [16]:
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import GridSearchCV

In [17]:
fit_models = {}
for algo, pipline in piplines.items():
    model = GridSearchCV(pipline, hypergrid[algo], cv=10, n_jobs=-1)
    try:
        print("starting training for {}".format(algo))
        model.fit(X_train, y_train)
        fit_models[algo] = model
        print("{} trained succefully!".format(algo))
    except NotFittedError as e:
        print(repr(e))

starting training for rf
rf trained succefully!
starting training for gb
gb trained succefully!
starting training for ridge
ridge trained succefully!
starting training for lasso


  model = cd_fast.enet_coordinate_descent(


lasso trained succefully!
starting training for enet
enet trained succefully!


  model = cd_fast.enet_coordinate_descent(


# Evaluation 

In [18]:
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np

In [19]:
results = {
    'rf': None,
    'gb':None,
    'ridge':None,
    'lasso':None,
    'enet':None
}

In [20]:

for algo, model in fit_models.items():
    yhat = model.predict(X_test)
    results[algo] = np.round(yhat,2)
    print('{} scores - R2: {} MAE: {}'.format(algo, r2_score(y_test, yhat), mean_absolute_error(y_test, yhat)))

rf scores - R2: 0.8758115667080781 MAE: 16392.56047142121
gb scores - R2: 0.8974761496977999 MAE: 15184.196438731591
ridge scores - R2: 0.8333540111334629 MAE: 19235.165945707282
lasso scores - R2: 0.718995621634209 MAE: 19202.73671682635
enet scores - R2: 0.8677857926659323 MAE: 18086.383363902587


it seems that our winner is the grdientboosting regressor!

In [21]:
df_res = pd.DataFrame(results)

In [22]:
df_res.shape
df_res['GT'] = list(y_test)
df_res['diff_rf'] = df_res['GT'] - df_res['rf']
df_res['diff_gb'] = df_res['GT'] - df_res['gb']
df_res['diff_ridge'] = df_res['GT'] - df_res['ridge']
df_res['diff_lasso'] = df_res['GT'] - df_res['lasso']
df_res['diff_enet'] = df_res['GT'] - df_res['enet']

## Analysing results

In [23]:
df_rf_res = df_res[['diff_rf', 'rf', 'GT']]
df_rf_res.head(5)

Unnamed: 0,diff_rf,rf,GT
0,5427.36,199572.64,205000.0
1,-2479.71,347479.71,345000.0
2,-3565.05,177465.05,173900.0
3,3072.0,90428.0,93500.0
4,24612.36,241287.64,265900.0


In [24]:
df_gb_res = df_res[['diff_gb', 'gb', 'GT']]
df_gb_res.head(5)

Unnamed: 0,diff_gb,gb,GT
0,-6276.45,211276.45,205000.0
1,-4809.73,349809.73,345000.0
2,-803.96,174703.96,173900.0
3,10329.79,83170.21,93500.0
4,46335.4,219564.6,265900.0


In [25]:
df_ridge_res = df_res[['diff_ridge', 'ridge', 'GT']]
df_ridge_res.head(5)

Unnamed: 0,diff_ridge,ridge,GT
0,-3122.16,208122.16,205000.0
1,5422.22,339577.78,345000.0
2,-2609.99,176509.99,173900.0
3,27290.94,66209.06,93500.0
4,28423.35,237476.65,265900.0


In [26]:
df_lasso_res = df_res[['diff_lasso', 'lasso', 'GT']]
df_lasso_res.head(5)

Unnamed: 0,diff_lasso,lasso,GT
0,1085.75,203914.25,205000.0
1,-11692.59,356692.59,345000.0
2,-9286.6,183186.6,173900.0
3,42401.98,51098.02,93500.0
4,41848.68,224051.32,265900.0


In [27]:
df_enet_res = df_res[['diff_enet', 'enet', 'GT']]
df_enet_res.head(5)

Unnamed: 0,diff_enet,enet,GT
0,-10093.58,215093.58,205000.0
1,7040.92,337959.08,345000.0
2,217.51,173682.49,173900.0
3,17875.07,75624.93,93500.0
4,28148.54,237751.46,265900.0


##### Check time to predict:

In [43]:
import time

In [29]:
dict_avg = {
    'rf': [],
    'gb':[],
    'ridge':[],
    'lasso':[],
    'enet':[]
}

In [51]:
def time_decorator(orignal_function):
    def time_wrapper(*args, **kwargs):
        st = time.time()
        orignal_function(*args, **kwargs)
        dict_avg[args[0]]= time.time() - st
        return  
    return time_wrapper

In [52]:
@time_decorator
def run_performance_test(algo, model, x_test):
    yhat = model.predict(X_test)

In [53]:
for algo, model in fit_models.items():
    run_performance_test(algo, model, X_test)

In [54]:
for algo, val in dict_avg.items():
    print(algo, val)
# dict_avg['rf']/len(X_test) * 100


rf 0.01899886131286621
gb 0.003998279571533203
ridge 0.002023458480834961
lasso 0.0019762516021728516
enet 0.0029997825622558594
