<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/gradient-boosting/notebooks/House_prices_kaggle_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [376]:
#!pip install category_encoders
#!pip install shap

In [377]:
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import timeit

from category_encoders import TargetEncoder
from google.colab import files
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LassoLarsCV, LinearRegression, ElasticNet#, SGDRegressor,LogisticRegression, LogisticRegressionCV, 
from sklearn.model_selection import train_test_split,GridSearchCV#, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.tree import DecisionTreeRegressor


In [378]:
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 80)

raw_df_train = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/train.csv', index_col=0)
raw_df_test = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/test.csv', index_col=0)

#Work with categories

In [379]:
def is_cat(df, col):
  '''checks if a column is of object or category type'''
  return df[col].dtype in ['object', 'category']

def improve_cats(dataframe) -> pd.DataFrame:
  '''turns dtypes 64->32 and object->category'''
  df = dataframe.copy()
  for col in df.columns:
    if df[col].dtype == 'int64':
      df[col] = df[col].astype('int32')
    elif df[col].dtype == 'float64':
      df[col] = df[col].astype('float32')
    elif df[col].dtype == 'object':
      df[col] = df[col].astype('category')
    else:
      print('Unknown data type')
      return
  return df

#Show statistics

In [380]:
def adjusted_r2(yt,yp, colnum):
  '''computes adjusted r2 score'''
  return 1 - (1 - r2_score(yt, yp)) * ((yt.shape[0]-1) / (yt.shape[0] - colnum+(1e-12)))

def print_scores(y_train, y_test, y_pred_train, y_pred_test, colnum):
  '''outprints adjusted r2 and rmse results on train and validation'''
  train_r2 =  adjusted_r2(y_train, y_pred_train, colnum)
  test_r2 = adjusted_r2(y_test,y_pred_test, colnum)
  train_rmse = np.sqrt(mean_squared_error(np.log(y_pred_train), np.log(y_train)))
  test_rmse = np.sqrt(mean_squared_error(np.log(y_pred_test), np.log(y_test)))
  text = f'train r2: {train_r2}\ntrain rmse: {train_rmse}\n\ntest r2: {test_r2}\ntest rmse: {test_rmse}\n'
  print(text)
  return text

#Split dataframe to X,y

In [381]:
def split_data(df, target):
  '''splits data'''
  X = df.drop(target, axis=1)
  y = df[target]
  return train_test_split(X, y, test_size=0.3, random_state=42)

#Work with model

In [382]:
def train_model(X_train, y_train):
  '''subj'''
  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), 
                ('Tree',DecisionTreeRegressor(max_depth=5)), ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.05)),
                ('Elastic',ElasticNet(alpha=0.0005,random_state=42)), ]
  pipe = make_pipeline(RobustScaler(), StackingRegressor(estimators=estimators, final_estimator = RandomForestRegressor(max_depth=6)))
  pipe.fit(X_train, y_train)
  return pipe

In [383]:
def split_run_test(df, target = 'SalePrice'):
  '''splits data and runs model'''
  X_train, X_test, y_train, y_test = split_data(df, target)
  model = train_model(X_train, y_train)

  y_pred_train = model.predict(X_train)
  y_pred_test = model.predict(X_test)
  txt = print_scores(y_train, y_test, y_pred_train, y_pred_test, len(X_train.columns))
  return model, txt

In [384]:
def get_permutations(df):
  '''Pick best columns'''
  X_train, X_test, y_train, y_test = split_data(df, 'SalePrice')
  model = train_model(X_train, y_train)
  result = permutation_importance(model, X_train, y_train, n_repeats=10, random_state=42)
  importance_dict = dict(zip(X_train.columns, result.importances_mean))
  sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
  print(sorted_importance)
  #get_shap(X_train, X_test, model)
  #grid(X_train, y_train)
  return [x[0] for x in sorted_importance]  
  
'''def get_shap(X_train, X_test, model):
  explainer = shap.Explainer(model.predict, X_train)
  shap_values = explainer(X_test)
  shap.plots.waterfall(shap_values[0])

def grid(X_train, y_train):
  estimators = [('LinReg',LinearRegression()), 
                ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), 
                ('Tree',DecisionTreeRegressor(max_depth=5)), 
                ('LGBR', LGBMRegressor(max_depth=5, learning_rate=0.05)),
                ('Elastic',ElasticNet(alpha=0.0005,random_state=42))]
                
  pipe = make_pipeline(RobustScaler(), 
                      StackingRegressor(estimators=estimators, 
                                        final_estimator=RandomForestRegressor(max_depth=6)))

  param_grid = {
      'stackingregressor__final_estimator__n_estimators': [10, 50, 100],
      'stackingregressor__final_estimator__max_depth': [3, 5, 7],
      'stackingregressor__final_estimator__min_samples_split': [2, 4, 8],
      'stackingregressor__Elastic__l1_ratio': [0.1, 0.5, 0.9],      
  }

  grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
  grid.fit(X_train, y_train)
  print('grid best params: ', grid.best_params_)
  print('grid best score: ', grid.best_score_)'''

"def get_shap(X_train, X_test, model):\n  explainer = shap.Explainer(model.predict, X_train)\n  shap_values = explainer(X_test)\n  shap.plots.waterfall(shap_values[0])\n\ndef grid(X_train, y_train):\n  estimators = [('LinReg',LinearRegression()), \n                ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), \n                ('Tree',DecisionTreeRegressor(max_depth=5)), \n                ('LGBR', LGBMRegressor(max_depth=5, learning_rate=0.05)),\n                ('Elastic',ElasticNet(alpha=0.0005,random_state=42))]\n                \n  pipe = make_pipeline(RobustScaler(), \n                      StackingRegressor(estimators=estimators, \n                                        final_estimator=RandomForestRegressor(max_depth=6)))\n\n  param_grid = {\n      'stackingregressor__final_estimator__n_estimators': [10, 50, 100],\n      'stackingregressor__final_estimator__max_depth': [3, 5, 7],\n      'stackingregressor__final_estimator__min_samples_split': [2, 4, 8],\n      'stackingregressor_

#Work with columns

In [385]:
def create_cols(df):
  #create Total SF
  df["TotalSF"] = df["1stFlrSF"] + df["2ndFlrSF"] + df["TotalBsmtSF"]+df['LotArea']
  #create Porch
  df['PorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
  #Create green area
  df["OutsideArea"] = df["LotArea"] - df["GrLivArea"] - df["GarageArea"] 
  #Create month sold * year
  df['MonthSold'] = df['YrSold']*12 + df['MoSold'] #-df['YrSold'].min()
  #dates_frames  = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold','MoSold']
  #Create booleans
  df['Has2Floors'] = df['2ndFlrSF'].apply(lambda x: 1 if x>0 else 0)
  df['Has1Floors'] = df['1stFlrSF'].apply(lambda x: 1 if x>0 else 0)
  df['HasPorch'] = df['PorchSF'].apply(lambda x: 1 if x>0 else 0)
  df['Has2Wood'] = df['WoodDeckSF'].apply(lambda x: 1 if x>0 else 0)
  df['HasFireplace'] = df['Fireplaces'].apply(lambda x: 1 if x>0 else 0)
  #create bath
  fullbsmtb = df['BsmtFullBath'].apply(lambda x: x if x > 0 else 0)
  halfbsmtb = df['BsmtHalfBath'].apply(lambda x: x*0.5 if x > 0 else 0)
  fullb = df['FullBath'].apply(lambda x: x if x > 0 else 0)
  halfb = df['HalfBath'].apply(lambda x: x*0.5 if x > 0 else 0)
  df['Bath'] = fullbsmtb + halfbsmtb + fullb + halfb
  return df

def log_cols(df, log_cols):
  '''adds 1 to all numeric columns and np.logs handpicked columns'''
  for column in [col for col in df.columns if not is_cat(df, col)]:
    df[column] = df[column]+1
  for col in log_cols:
    df[col] = np.log(df[col])
  return df

def plotme(df, cols):
  for col in cols:
    if col != 'SalePrice':
      sns.scatterplot(y = df['SalePrice'], x = df[col])

def categorize_cols(df):
  """fill NaNs"""
  for col in df.columns:
    if df[col].isna().any():
      if is_cat(df, col):
        df[col] = df[col].cat.add_categories(['MISSING'])
        df[col] = df[col].fillna('MISSING')
        df[col] = df[col].cat.remove_unused_categories()
      else:
        if col not in ['GarageArea', 'KitchenAbvGr', 'TotRmsAbvGrd', 'LotArea',\
                       'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces', 'LotFrontage', \
                       'WoodDeckSF', 'MasVnrArea', '2ndFlrSF','GarageArea', 'WoodDeckSF',\
                       'BsmtFinSF1', 'BsmtFinSF2','BsmtUnfSF', 'LowQualFinSF', '1stFlrSF']:
          df[col] = df[col].fillna(df[col].mean())
        else:
          df[col] = df[col].fillna(0)
  return df

"""def encode_cols(df, cols):
  '''Use Label encoder'''
  for col in cols:
    encoder = LabelEncoder()
    df[col+'_e'] = encoder.fit_transform(df[col])
  return df"""

def drop_categories(df):
  '''drop cat values'''
  cats = [col for col in df.columns if is_cat(df, col)]
  return df.drop(columns = cats)

def iqr(df, columns, mult=3):
  '''cut outliers'''
  df = df.copy()
  for col in columns:
    d=df[col].describe()
    val =(d['50%'] + (d['75%']-d['25%'])) * mult
    df = df[df[col] <= val]
    
  return df

def work_df(df, to_log = ['LotFrontage', 'LotArea', 'GrLivArea', 'GarageArea'], target =[], to_drop = []):
  df = df.copy()
  df = improve_cats(df)
  df = create_cols(df)
  df = df.drop(columns = to_drop)
  df = categorize_cols(df)

  df['OutsideArea'] = df['OutsideArea'].apply(lambda x: x if x>0 else 1)

  df = log_cols(df, target+to_log)
  return df

#Creating variable

In [386]:
cols_to_log = ['LotFrontage', 'LotArea', 'GrLivArea','TotalSF','OutsideArea', 
               'MonthSold', 'TotalBsmtSF', 'YrSold', 'YearBuilt','YearRemodAdd', 
               'GarageYrBlt', '2ndFlrSF','GarageArea', 'WoodDeckSF', 'BsmtFinSF1', 
               'BsmtFinSF2','BsmtUnfSF', 'LowQualFinSF', '1stFlrSF']

cols_to_iqr = ['SalePrice','LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 
               'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 
               'GarageArea', 'PorchSF', 'OutsideArea', 'TotalSF', '2ndFlrSF', '1stFlrSF']

cols_to_drop = ['OpenPorchSF','EnclosedPorch', '3SsnPorch', 'ScreenPorch', 
                'PoolArea', 'MiscVal', 'GarageCars', 'BsmtFullBath', 'BsmtHalfBath', 
                'FullBath', 'HalfBath']

In [387]:
df = work_df(raw_df_train, to_log = cols_to_log, to_drop = cols_to_drop, target = ['SalePrice'])                                                             
df_test = work_df(raw_df_test, to_log = cols_to_log, to_drop = cols_to_drop)

df = iqr(df, cols_to_iqr)

cols = ['Neighborhood']#,'LotShape', 'FireplaceQu']
for col in cols:
  encoder = TargetEncoder(cols = col)
  df[col+'_te'] = encoder.fit_transform(df[col], df['SalePrice'])
  df_test[col+'_te'] = encoder.transform(df_test[col])

df = drop_categories(df)
df_test = drop_categories(df_test)

#Getting best columns 

In [388]:
to_drop = []#'Modern', 'HasBsmt', 'HasGarage', 'HasPool','HasVnr']
df1 = df.copy().drop(columns = to_drop)
df_test1 = df_test.copy().drop(columns = to_drop)
columns = get_permutations(df1) 

[('MonthSold', 0.624826464050994), ('MoSold', 0.5544934895149309), ('YrSold', 0.5192668908668093), ('GrLivArea', 0.1689135227238431), ('OverallQual', 0.14234752487062396), ('2ndFlrSF', 0.13790134494257722), ('Has2Floors', 0.1303712629448722), ('TotalSF', 0.09862657252127845), ('Neighborhood_te', 0.0809188108933271), ('YearBuilt', 0.05003195737822402), ('LotArea', 0.04134227061351835), ('OverallCond', 0.04127115693343856), ('1stFlrSF', 0.024273962692374716), ('Bath', 0.021192018987410743), ('BsmtFinSF1', 0.01637086033573142), ('WoodDeckSF', 0.014455591769265053), ('GarageArea', 0.013050323464272439), ('YearRemodAdd', 0.009575329250164721), ('TotalBsmtSF', 0.00840269307934841), ('PorchSF', 0.0074304044836176805), ('Fireplaces', 0.006867876068827194), ('Has2Wood', 0.004046691567348848), ('GarageYrBlt', 0.003982975468354089), ('OutsideArea', 0.0035879841726363495), ('BedroomAbvGr', 0.003174571298293827), ('MSSubClass', 0.0015118098297996264), ('LotFrontage', 0.001470047728530599), ('BsmtUn

In [394]:
cols = ['SalePrice']+columns
model, txt = split_run_test(df1[cols])

train r2: 0.9317305478548442
train rmse: 0.008200539704755993

test r2: 0.8858950380373605
test rmse: 0.010481121266173071



In [390]:
y_pred_test = model.predict(df_test[columns])

In [391]:
ids = np.arange(1461, 2920)
my_result = pd.DataFrame({'Id': ids, 'SalePrice': np.e**y_pred_test})
my_result

Unnamed: 0,Id,SalePrice
0,1461,128067.989206
1,1462,156100.531990
2,1463,185339.854394
3,1464,188502.539671
4,1465,187872.863089
...,...,...
1454,2915,86267.408548
1455,2916,86164.464308
1456,2917,167796.911823
1457,2918,125219.528032


In [392]:
#my_result.to_csv('submission.csv', index=False)
#files.download('submission.csv')