In [1]:
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import mean_squared_error,mean_squared_log_error,r2_score
from sklearn_pandas import DataFrameMapper
from pandas.api.types import is_string_dtype, is_numeric_dtype

from scipy.stats import skew
from scipy.special import boxcox1p

%matplotlib inline

In [2]:
def print_feature_importances(model,X):
    important_features = pd.Series(data=rf_model.feature_importances_,index=X.columns)
    important_features.sort_values(ascending=False,inplace=True)
    print(important_features.head(50))
    
def get_cat_columns_by_type(df):
    out = []
    for colname,col_values in df.items():
        if is_string_dtype(col_values):
            out.append((colname,'string') )
        elif not is_numeric_dtype(col_values):
            out.append((colname,'categorical') )
    return out       

def get_numeric_columns(df):
    out = []
    for colname,col_values in df.items():
        if is_numeric_dtype(col_values):
            out.append(colname)
    return out       
    
def get_missing_values_percentage(df):
    missing_values_counts_list = df.isnull().sum()
    total_values = np.product(df.shape)
    total_missing = missing_values_counts_list.sum()
    # percent of data that is missing
    return (total_missing/total_values) * 100


def convert_to_str_type(df_in,columns,inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
        
    for col in columns:
        df[col] = df[col].astype(str)
    return df

    
def handle_missing_values(df_in,cat_cols=[], num_cols=[],na_dict=None,add_nan_col=True,inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
 
    if na_dict is None:
        na_dict = {}

    for colname, col_values in df.items():   
        if colname not in num_cols:
            continue
        if pd.isnull(col_values).sum():
            df[colname+'_na'] = pd.isnull(col_values)
            filler = na_dict[colname] if colname in na_dict else col_values.median()
            df[colname] = col_values.fillna(filler)
            na_dict[colname] = filler
    for colname in cat_cols:
        if colname not in df.columns:
            continue
        df[colname].fillna(df[colname].mode()[0], inplace=True)
        lbl = LabelEncoder() 
        lbl.fit(list(df[colname].values)) 
        df[colname] = lbl.transform(list(df[colname].values))
    
    return (df,na_dict)



def scale_num_cols(df_in, mapper, inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
        
    if mapper is None:
        map_f = [([c],StandardScaler()) for c in df.columns if is_numeric_dtype(df[c])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return (df,mapper)



def extract_and_drop_target_column(df_in, y_name, inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
    if not is_numeric_dtype(df[y_name]):
        df[y_name] = df[y_name].cat.codes
        y = df[y_name].values
    else:
        y = df[y_name]
    df.drop([y_name], axis=1, inplace=True)
    return (df,y)

def print_mse(m,X_train, X_valid, y_train, y_valid):
    res = [mean_squared_error(y_train,m.predict(X_train)),
                mean_squared_error(y_valid,m.predict(X_valid)),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    print('MSE Training set = {}, MSE Validation set = {}, score Training Set = {}, score on Validation Set = {}'.format(res[0],res[1],res[2], res[3]))
    if hasattr(m, 'oob_score_'):
          print('OOB Score = {}'.format(m.oob_score_))      

def get_iqr_min_max(df,cols):
    out = {}
    for colname, col_values in df.items():
        if colname not in cols:
            continue
        quartile75, quartile25 = np.percentile(col_values, [75 ,25])
        ## Inter Quartile Range ##
        IQR = quartile75 - quartile25
        min_value = quartile25 - (IQR*1.5)
        max_value = quartile75 + (IQR*1.5)
        out[colname] = (min_value,max_value)
    return out


def bin_numerical_columns(df_in,cols,inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
        
    for col in cols.keys():
        bins = cols[col]
        buckets_ = np.linspace(bins[0],bins[1],bins[2])
        df[col] = pd.cut(df[col],buckets_,include_lowest=True)
    return df


In [None]:
def preprocess_df(df_train,df_test=None,
                  log_y=True,
                  id_col= None,
                  drop_target=True,
                  convert_to_cat_cols=None,
                  remove_skewness=False,scale_mapper=None,
                  bin_columns_dict=None,
                  new_features_func=None):
    
    if drop_target:
        df,y = extract_and_drop_target_column(df_train,'SalePrice',inplace=True)
    if log_y:
        y = np.log1p(y)
    else:
        y = None
        
    combined = pd.concat((df, df_test)).reset_index(drop=True)
    
    
    if df_test is not None and id_col is not None:
        test_id = df_test['Id'].copy()
        combined.drop('Id', axis=1,inplace=True)
    else: test_id = None
   
    if new_features_func is not None:
        combined = new_features_func(combined)
    
    
    if convert_to_cat_cols is not None:
        combined = convert_to_str_type(combined,convert_to_cat_cols,inplace=True)
    
        
    if bin_columns_dict is not None:
        combined = bin_numerical_columns(combined,bin_columns_dict,inplace=True)
    
    
    cat_cols = get_cat_columns_by_type(combined)
    cat_cols = [cat_cols[i][0] for i in range(len(cat_cols))]
    num_cols = [col for col in combined.columns if col not in cat_cols]
    
    combined = pd.get_dummies(combined,columns=cat_cols, dummy_na=True)
    
    n_train = df.shape[0]
    n_test = df_test.shape[0]
      
    
    combined,d = handle_missing_values(combined,cat_cols=cat_cols,
                                       num_cols=num_cols,inplace=True)
    
    
    if remove_skewness:
        skewed_cols = combined[num_cols].apply(lambda x: skew(x)).sort_values(ascending=False)
        skewness = pd.DataFrame({'Skew' :skewed_cols})
        skewness_log = skewness[skewness > 4.0]
        skewness_other = skewness[skewness <= 4.0]
        skewed_features_log = skewness_log.index
        skewed_features_other = skewness_other.index
        lambda_ = 0.0
        for feature in skewed_features_log:
            combined[feature] = boxcox1p(combined[feature],lambda_)
        lambda_ = 0.15
        for feature in skewed_features_other:
            combined[feature] = boxcox1p(combined[feature],lambda_)
    
    if scale_mapper is not None:
        map_f = [([c],scale_mapper) for c in num_cols]
        mapper = DataFrameMapper(map_f).fit(combined)
    else:
        mapper = None
        
    combined,_ = scale_num_cols(combined,mapper,inplace=True) 
    
    print(get_missing_values_percentage(combined))
    
    return combined,df,y,cat_cols,num_cols,test_id,n_train,n_test


In [None]:
def add_new_features1(df):
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    return df
def add_new_features2(df):
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df["OverallGrade"] = df["OverallQual"] * df["OverallCond"]
    return df
def add_new_features3(df):
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df["OverallGrade"] = df["OverallQual"] * df["OverallCond"]
    df['TotalLivArea'] = df['GrLivArea'] + df['GarageArea'] + df['LotArea']
    return df

def add_new_features4(df):
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']
    df["OverallGrade"] = df["OverallQual"] * df["OverallCond"]
    df['TotalLivArea'] = df['GrLivArea'] + df['GarageArea'] + df['LotArea']
    
    df["GrLivArea-2"] = df["GrLivArea"] ** 2
    df["GrLivArea-3"] = df["GrLivArea"] ** 3
    df["GrLivArea-Sq"] = np.sqrt(df["GrLivArea"])
    df["GarageArea-2"] = df["GarageArea"] ** 2
    df["GarageArea-3"] = df["GarageArea"] ** 3
    df["GarageArea-Sq"] = np.sqrt(df["GarageArea"])
    return df

    

In [None]:
PATH = "data/iowa_housing/"

In [None]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
df = df_raw.copy()
stratify_col = df['OverallQual'].copy()

combined,df,y,cat_cols,num_cols,test_id,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       drop_target=True,
                                       id_col='Id',
                                       log_y=True,
                                       convert_to_cat_cols=['CentralAir','MoSold','YrSold','YearBuilt','GarageYrBlt','YearRemodAdd'],
                                       remove_skewness=True,
                                       scale_mapper=RobustScaler(),
                                       bin_columns_dict={'OverallQual':(1,11,10),'OverallCond':(1,11,10)}
                                       
                                       )

print(combined.shape,len(cat_cols),len(num_cols),n_train,n_test)

0.0
(2919, 659) 50 29 1460 1459


In [None]:
df = combined[:n_train]
df_test = combined[n_train:]
X_train,X_valid,y_train,y_valid = train_test_split(df,y,test_size=0.15,
                                  stratify=stratify_col,shuffle = True,random_state=20)

rf_model = RandomForestRegressor(n_estimators=1500,n_jobs=-1,oob_score=True).fit(X_train.values,
                                                                                y_train)
print_mse(rf_model,X_train,X_valid,y_train,y_valid)

MSE Training set = 0.0030193402814424793, MSE Validation set = 0.021656989952280544, score Training Set = 0.9810732273685645, score on Validation Set = 0.8637930078429806
OOB Score = 0.8600927383772609


In [None]:
print_feature_importances(rf_model,X_train)

GrLivArea                     0.320039
ExterQual_TA                  0.215576
TotalBsmtSF                   0.071321
GarageCars                    0.053315
GarageArea                    0.042996
1stFlrSF                      0.028447
BsmtFinSF1                    0.017132
LotArea                       0.016112
CentralAir_Y                  0.013451
CentralAir_N                  0.013165
2ndFlrSF                      0.011117
ExterQual_Fa                  0.009236
BsmtQual_Ex                   0.008515
FullBath                      0.008180
MSZoning_C (all)              0.006447
BsmtUnfSF                     0.005574
KitchenQual_TA                0.005081
LotFrontage                   0.004732
OpenPorchSF                   0.004719
OverallQual_(3.222, 4.333]    0.004031
KitchenQual_Gd                0.003830
BsmtQual_Gd                   0.003823
HalfBath                      0.003764
TotRmsAbvGrd                  0.003673
MSSubClass                    0.003300
GarageType_Attchd        

In [None]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
df = df_raw.copy()
stratify_col = df['OverallQual'].copy()
combined,df,y,cat_cols,num_cols,test_id,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       drop_target=True,
                                       new_features_func=add_new_features1,
                                       id_col='Id',
                                       log_y=True,
                                       convert_to_cat_cols=['CentralAir','MoSold','YrSold','YearBuilt','GarageYrBlt','YearRemodAdd'],
                                       remove_skewness=True,
                                       scale_mapper=RobustScaler(),
                                       bin_columns_dict={'OverallQual':(1,11,10),'OverallCond':(1,11,10)}
                                       )

print(combined.shape,len(cat_cols),len(num_cols),n_train,n_test)

0.0
(2919, 661) 50 30 1460 1459


In [None]:
df = combined[:n_train]
df_test = combined[n_train:]
X_train,X_valid,y_train,y_valid = train_test_split(df,y,test_size=0.15,
                                  stratify=stratify_col,shuffle = True,random_state=20)
rf_model = RandomForestRegressor(n_estimators=1500,n_jobs=-1,oob_score=True).fit(X_train.values,
                                                                                y_train)
print_mse(rf_model, X_train,X_valid,y_train,y_valid)

MSE Training set = 0.002922005989641421, MSE Validation set = 0.02130034464721873, score Training Set = 0.9816833686042122, score on Validation Set = 0.866036051976834
OOB Score = 0.8626689468435889


In [None]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
df = df_raw.copy()
stratify_col = df['OverallQual'].copy()
combined,df,y,cat_cols,num_cols,test_id,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       drop_target=True,
                                       new_features_func=add_new_features2,
                                       id_col='Id',
                                       log_y=True,
                                       convert_to_cat_cols=['CentralAir','MoSold','YrSold','YearBuilt','GarageYrBlt','YearRemodAdd'],
                                       remove_skewness=True,
                                       scale_mapper=RobustScaler(),
                                       bin_columns_dict={'OverallQual':(1,11,10),'OverallCond':(1,11,10)}
                                       )

print(combined.shape,len(cat_cols),len(num_cols),n_train,n_test)

0.0
(2919, 662) 50 31 1460 1459


In [None]:
df = combined[:n_train]
df_test = combined[n_train:]
X_train,X_valid,y_train,y_valid = train_test_split(df,y,test_size=0.15,
                                  stratify=stratify_col,shuffle = True,random_state=20)
rf_model = RandomForestRegressor(n_estimators=1500,n_jobs=-1,oob_score=True).fit(X_train.values,
                                                                                y_train)
print_mse(rf_model, X_train,X_valid,y_train,y_valid)

MSE Training set = 0.0027102876205698477, MSE Validation set = 0.018670977722303743, score Training Set = 0.9830105278707397, score on Validation Set = 0.8825728911640413
OOB Score = 0.875238558298763


In [None]:
print_feature_importances(rf_model,X_train)

TotalSF                       0.617051
OverallGrade                  0.069475
ExterQual_TA                  0.050137
GarageCars                    0.034046
GarageArea                    0.016317
KitchenQual_TA                0.013109
LotArea                       0.011433
GrLivArea                     0.010714
BsmtFinSF1                    0.008689
2ndFlrSF                      0.008448
BsmtUnfSF                     0.007096
CentralAir_Y                  0.006821
CentralAir_N                  0.006396
1stFlrSF                      0.006168
FullBath                      0.005686
TotalBsmtSF                   0.004562
MSZoning_C (all)              0.004285
LotFrontage                   0.003664
BsmtQual_Ex                   0.003468
BsmtQual_Gd                   0.003325
KitchenQual_Gd                0.003275
OpenPorchSF                   0.003244
BsmtQual_TA                   0.002815
MSSubClass                    0.002759
GarageFinish_Unf              0.002680
Neighborhood_OldTown     

In [None]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
df = df_raw.copy()
stratify_col = df['OverallQual'].copy()
combined,df,y,cat_cols,num_cols,test_id,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       drop_target=True,
                                       new_features_func=add_new_features2,
                                       id_col='Id',
                                       log_y=True,
                                       convert_to_cat_cols=['GarageCars','CentralAir','MoSold','YrSold','YearBuilt','GarageYrBlt','YearRemodAdd'],
                                       remove_skewness=True,
                                       scale_mapper=RobustScaler(),
                                       bin_columns_dict={'OverallQual':(1,11,10),'OverallCond':(1,11,10)}
                                       )


0.0


In [None]:
df = combined[:n_train]
df_test = combined[n_train:]
X_train,X_valid,y_train,y_valid = train_test_split(df,y,test_size=0.15,
                                  stratify=stratify_col,shuffle = True,random_state=20)

rf_model = RandomForestRegressor(n_estimators=1500,n_jobs=-1,oob_score=True).fit(X_train.values,
                                                                                y_train)
print_mse(rf_model, X_train,X_valid,y_train,y_valid)

MSE Training set = 0.0027217679334127984, MSE Validation set = 0.019178245152996945, score Training Set = 0.9829385633849044, score on Validation Set = 0.8793825414845062
OOB Score = 0.8744244489256909


In [None]:
print_feature_importances(rf_model,X_train)

TotalSF                       0.614101
OverallGrade                  0.067279
ExterQual_TA                  0.052831
GarageCars_2.0                0.023071
GarageArea                    0.016034
GarageCars_3.0                0.014750
KitchenQual_TA                0.012062
LotArea                       0.011667
GrLivArea                     0.010929
2ndFlrSF                      0.008749
BsmtFinSF1                    0.008395
BsmtUnfSF                     0.006885
CentralAir_Y                  0.006746
CentralAir_N                  0.006335
1stFlrSF                      0.006146
FullBath                      0.005193
TotalBsmtSF                   0.004762
MSZoning_C (all)              0.003853
LotFrontage                   0.003593
OpenPorchSF                   0.003330
BsmtQual_Ex                   0.003300
BsmtQual_Gd                   0.003255
KitchenQual_Gd                0.003248
BsmtQual_TA                   0.003039
GarageFinish_Unf              0.002897
MSSubClass               

In [None]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
df = df_raw.copy()
stratify_col = df['OverallQual'].copy()
combined,df,y,cat_cols,num_cols,test_id,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       drop_target=True,
                                       new_features_func=add_new_features3,
                                       id_col='Id',
                                       log_y=True,
                                       convert_to_cat_cols=['GarageCars','CentralAir','MoSold','YrSold','YearBuilt','GarageYrBlt','YearRemodAdd'],
                                       remove_skewness=True,
                                       scale_mapper=RobustScaler(),
                                       bin_columns_dict={'OverallQual':(1,11,10),'OverallCond':(1,11,10)}
                                       )



0.0


In [None]:
df = combined[:n_train]
df_test = combined[n_train:]
X_train,X_valid,y_train,y_valid = train_test_split(df,y,test_size=0.15,
                                  stratify=stratify_col,shuffle = True,random_state=20)

rf_model = RandomForestRegressor(n_estimators=1500,n_jobs=-1,oob_score=True).fit(X_train.values,
                                                                                y_train)
print_mse(rf_model, X_train,X_valid,y_train,y_valid)

MSE Training set = 0.002689138175527549, MSE Validation set = 0.019053090924821426, score Training Set = 0.9831431034337053, score on Validation Set = 0.8801696721528534
OOB Score = 0.8752126557396414


In [None]:
print_feature_importances(rf_model,X_train)

TotalSF                       0.612296
OverallGrade                  0.066018
ExterQual_TA                  0.050531
GarageCars_2.0                0.022494
GarageCars_3.0                0.017362
GarageArea                    0.016196
TotalLivArea                  0.012867
KitchenQual_TA                0.012278
GrLivArea                     0.010239
2ndFlrSF                      0.009158
BsmtFinSF1                    0.008240
CentralAir_Y                  0.006851
BsmtUnfSF                     0.006834
1stFlrSF                      0.006461
CentralAir_N                  0.006258
FullBath                      0.005246
TotalBsmtSF                   0.004673
LotArea                       0.004334
MSZoning_C (all)              0.004128
LotFrontage                   0.003531
BsmtQual_Gd                   0.003292
OpenPorchSF                   0.003120
GarageFinish_Unf              0.002983
BsmtQual_Ex                   0.002914
BsmtQual_TA                   0.002822
KitchenQual_Gd           

In [None]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
df = df_raw.copy()
stratify_col = df['OverallQual'].copy()
combined,df,y,cat_cols,num_cols,_,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       drop_target=True,
                                       new_features_func=add_new_features4,
                                       id_col='Id',
                                       log_y=True,
                                       convert_to_cat_cols=['GarageCars','CentralAir','MoSold','YrSold','YearBuilt','GarageYrBlt','YearRemodAdd'],
                                       remove_skewness=True,
                                       scale_mapper=RobustScaler(),
                                       bin_columns_dict={'OverallQual':(1,11,10),'OverallCond':(1,11,10)} 
                                       )


0.0


In [None]:
df = combined[:n_train]
df_test = combined[n_train:]
X_train,X_valid,y_train,y_valid = train_test_split(df,y,test_size=0.15,
                                  stratify=stratify_col,shuffle = True,random_state=20)

rf_model = RandomForestRegressor(n_estimators=1500,n_jobs=-1,oob_score=True).fit(X_train.values,
                                                                                y_train)
print_mse(rf_model, X_train,X_valid,y_train,y_valid)

In [None]:
print_feature_importances(rf_model,X_train)