<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/gradient-boosting/notebooks/House_prices_kaggle_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install category_encoders
!pip install xgboost
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import timeit
import xgboost as xgb

from catboost import CatBoostRegressor
from category_encoders import TargetEncoder
from google.colab import files
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor, VotingRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LassoLarsCV, LinearRegression, ElasticNet#, SGDRegressor,LogisticRegression, LogisticRegressionCV, 
from sklearn.model_selection import train_test_split#, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.tree import DecisionTreeRegressor

In [4]:
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 80)

raw_df_train = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/train.csv', index_col=0)
raw_df_test = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/test.csv', index_col=0)

#Work with categories

In [5]:
def is_cat(df, col):
  '''checks if a column is of object or category type'''
  return df[col].dtype in ['object', 'category']

def improve_cats(dataframe) -> pd.DataFrame:
  '''turns dtypes 64->32 and object->category'''
  df = dataframe.copy()
  for col in df.columns:
    if df[col].dtype == 'int64':
      df[col] = df[col].astype('int32')
    elif df[col].dtype == 'float64':
      df[col] = df[col].astype('float32')
    elif df[col].dtype == 'object':
      df[col] = df[col].astype('category')
    else:
      print('Unknown data type')
      return
  return df

#Show statistics

In [6]:
def adjusted_r2(yt,yp, colnum):
  '''computes adjusted r2 score'''
  return 1 - (1 - r2_score(yt, yp)) * ((yt.shape[0]-1) / (yt.shape[0] - colnum+(1e-12)))

def print_scores(y_train, y_test, y_pred_train, y_pred_test, colnum):
  '''outprints adjusted r2 and rmse results on train and validation'''
  train_r2 =  adjusted_r2(y_train, y_pred_train, colnum)
  test_r2 = adjusted_r2(y_test,y_pred_test, colnum)
  train_rmse = np.sqrt(mean_squared_error(np.log(y_pred_train), np.log(y_train)))
  test_rmse = np.sqrt(mean_squared_error(np.log(y_pred_test), np.log(y_test)))
  text = f'train r2: {train_r2}\ntrain rmse: {train_rmse}\n\ntest r2: {test_r2}\ntest rmse: {test_rmse}\n'
  print(text)
  return text

#Split dataframe to X,y

In [7]:
def split_data(df, target):
  '''splits data'''
  X = df.drop(target, axis=1)
  y = df[target]
  return train_test_split(X, y, test_size=0.3, random_state=42)

#Work with model

In [55]:
def train_model(X_train, y_train):
  '''subj'''
  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), 
                ('Tree',DecisionTreeRegressor(max_depth=5)), ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.05)),
                ('Elastic',ElasticNet(alpha=0.0005,random_state=42))]
  pipe = make_pipeline(RobustScaler(), StackingRegressor(estimators=estimators, final_estimator = RandomForestRegressor(max_depth=6)))
  pipe.fit(X_train, y_train)
  y3_train = np.exp((np.log(y_train) + np.log(pipe.predict(X_train))) / 2)
  pipe.fit(X_train, y3_train)


  return pipe

In [9]:
def train_model2(X_train, y_train):
  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), ('Tree',DecisionTreeRegressor(max_depth=5))]

  pipe1 = make_pipeline(RobustScaler(), xgb.XGBRegressor(colsample_bytree=0.4,
                             gamma=0.045,
                             learning_rate=0.05,
                             max_depth=10,
                             min_child_weight=1.5,
                             n_estimators=300,
                             reg_alpha=0.65,
                             reg_lambda=0.45,
                             subsample=0.95))
  pipe2 = make_pipeline(RobustScaler(), LassoLarsCV(max_iter=15,eps=0.01))
  pipe3 = make_pipeline(RobustScaler(), DecisionTreeRegressor(max_depth=5))
  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), 
                ('Tree',DecisionTreeRegressor(max_depth=5)), ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.05)),
                ('Elastic',ElasticNet(alpha=0.0005,random_state=42))]
  pipe = make_pipeline(RobustScaler(), StackingRegressor(estimators=estimators, final_estimator = RandomForestRegressor(max_depth=6)))

  blend = VotingRegressor(estimators=[('pipe1', pipe1), ('pipe2', pipe2), ('pipe3', pipe3), ('pipe4',pipe)], weights=[0.2, 0.1, 0.1, 0.6])


  blend.fit(X_train, y_train)
  y3_train = np.exp((np.log(y_train) + np.log(blend.predict(X_train))) / 2)
  blend.fit(X_train, y3_train)
  

  return blend


In [54]:
def train_cat(df, target = 'SalePrice'):
  X_train, X_val, y_train, y_val = split_data(df, target)
  model = CatBoostRegressor(
      iterations=1500,
      learning_rate=0.05,
      depth=6,
      l2_leaf_reg=3,
      subsample=0.8,
      random_seed=42,
      random_strength=0.6,
      loss_function = 'RMSE',
      eval_metric='RMSE',
      verbose=False
  )
  model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)
  score = model.score(X_val, y_val)
  y3_train = np.exp((np.log(y_train) + np.log(model.predict(X_train))) / 2)
  model.fit(X_train, y3_train, eval_set=(X_val, y_val), verbose=False)

  txt = print_scores(y_train, y_val, model.predict(X_train), model.predict(X_val), len(X_train.columns))
  return model, txt


In [11]:
def split_run_test(df, target = 'SalePrice'):
  '''splits data and runs model'''
  X_train, X_test, y_train, y_test = split_data(df, target)
  model = train_model2(X_train, y_train)

  y_pred_train = model.predict(X_train)
  y_pred_test = model.predict(X_test)
  txt = print_scores(y_train, y_test, y_pred_train, y_pred_test, len(X_train.columns))
  return model, txt

In [51]:
def split_run_test_stack(df, target = 'SalePrice'):
  '''splits data and runs model'''
  X_train, X_test, y_train, y_test = split_data(df, target)
  model = train_model(X_train, y_train)

  y_pred_train = model.predict(X_train)
  y_pred_test = model.predict(X_test)
  txt = print_scores(y_train, y_test, y_pred_train, y_pred_test, len(X_train.columns))
  return model, txt

In [12]:
def get_permutations(df):
  '''Pick best columns'''
  X_train, X_test, y_train, y_test = split_data(df, 'SalePrice')
  model = train_model2(X_train, y_train)
  result = permutation_importance(model, X_train, y_train, n_repeats=10, random_state=42)
  importance_dict = dict(zip(X_train.columns, result.importances_mean))
  sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)
  print(sorted_importance)
  #get_shap(X_train, X_test, model)
  #grid(X_train, y_train)
  return [x[0] for x in sorted_importance]  
  
'''def get_shap(X_train, X_test, model):
  explainer = shap.Explainer(model.predict, X_train)
  shap_values = explainer(X_test)
  shap.plots.waterfall(shap_values[0])

def grid(X_train, y_train):
  estimators = [('LinReg',LinearRegression()), 
                ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), 
                ('Tree',DecisionTreeRegressor(max_depth=5)), 
                ('LGBR', LGBMRegressor(max_depth=5, learning_rate=0.05)),
                ('Elastic',ElasticNet(alpha=0.0005,random_state=42))]
                
  pipe = make_pipeline(RobustScaler(), 
                      StackingRegressor(estimators=estimators, 
                                        final_estimator=RandomForestRegressor(max_depth=6)))

  param_grid = {
      'stackingregressor__final_estimator__n_estimators': [10, 50, 100],
      'stackingregressor__final_estimator__max_depth': [3, 5, 7],
      'stackingregressor__final_estimator__min_samples_split': [2, 4, 8],
      'stackingregressor__Elastic__l1_ratio': [0.1, 0.5, 0.9],      
  }

  grid = GridSearchCV(pipe, param_grid=param_grid, cv=5)
  grid.fit(X_train, y_train)
  print('grid best params: ', grid.best_params_)
  print('grid best score: ', grid.best_score_)'''

"def get_shap(X_train, X_test, model):\n  explainer = shap.Explainer(model.predict, X_train)\n  shap_values = explainer(X_test)\n  shap.plots.waterfall(shap_values[0])\n\ndef grid(X_train, y_train):\n  estimators = [('LinReg',LinearRegression()), \n                ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), \n                ('Tree',DecisionTreeRegressor(max_depth=5)), \n                ('LGBR', LGBMRegressor(max_depth=5, learning_rate=0.05)),\n                ('Elastic',ElasticNet(alpha=0.0005,random_state=42))]\n                \n  pipe = make_pipeline(RobustScaler(), \n                      StackingRegressor(estimators=estimators, \n                                        final_estimator=RandomForestRegressor(max_depth=6)))\n\n  param_grid = {\n      'stackingregressor__final_estimator__n_estimators': [10, 50, 100],\n      'stackingregressor__final_estimator__max_depth': [3, 5, 7],\n      'stackingregressor__final_estimator__min_samples_split': [2, 4, 8],\n      'stackingregressor_

#Work with columns

In [46]:
def create_cols(df):
  #create Total SF
  df["TotalSF"] = df["1stFlrSF"] + df["2ndFlrSF"] + df["TotalBsmtSF"]+df['LotArea']
  #create Porch
  df['PorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
  #Create green area
  df["OutsideArea"] = df["LotArea"] - df["GrLivArea"] - df["GarageArea"] 
  #Create month sold * year
  df['MonthSold'] = df['YrSold']*12 + df['MoSold'] #-df['YrSold'].min()
  #dates_frames  = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold','MoSold']
  #Create booleans
  df['Has2Floors'] = df['2ndFlrSF'].apply(lambda x: 1 if x>0 else 0)
  df['Has1Floors'] = df['1stFlrSF'].apply(lambda x: 1 if x>0 else 0)
  df['HasPorch'] = df['PorchSF'].apply(lambda x: 1 if x>0 else 0)
  df['HasWood'] = df['WoodDeckSF'].apply(lambda x: 1 if x>0 else 0)
  df['HasFireplace'] = df['Fireplaces'].apply(lambda x: 1 if x>0 else 0)
  #create bath
  fullbsmtb = df['BsmtFullBath'].apply(lambda x: x if x > 0 else 0)
  halfbsmtb = df['BsmtHalfBath'].apply(lambda x: x*0.5 if x > 0 else 0)
  fullb = df['FullBath'].apply(lambda x: x if x > 0 else 0)
  halfb = df['HalfBath'].apply(lambda x: x*0.5 if x > 0 else 0)
  df['Bath'] = fullbsmtb + halfbsmtb + fullb + halfb
  return df

def log_cols(df, log_cols):
  '''adds 1 to all numeric columns and np.logs handpicked columns'''
  for column in [col for col in df.columns if not is_cat(df, col)]:
    df[column] = df[column]+1
  for col in log_cols:
    df[col] = np.log(df[col])
  return df

def plotme(df, cols):
  for col in cols:
    if col != 'SalePrice':
      sns.scatterplot(y = df['SalePrice'], x = df[col])

def categorize_cols(df):
  """fill NaNs"""
  for col in df.columns:
    if df[col].isna().any():
      if is_cat(df, col):
        df[col] = df[col].cat.add_categories(['MISSING'])
        df[col] = df[col].fillna('MISSING')
        df[col] = df[col].cat.remove_unused_categories()
      else:
        if col not in ['GarageArea', 'KitchenAbvGr', 'TotRmsAbvGrd', 'LotArea',\
                       'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces', 'LotFrontage', \
                       'WoodDeckSF', 'MasVnrArea', '2ndFlrSF','GarageArea', 'WoodDeckSF',\
                       'BsmtFinSF1', 'BsmtFinSF2','BsmtUnfSF', 'LowQualFinSF', '1stFlrSF']:
          df[col] = df[col].fillna(df[col].mean())
        else:
          df[col] = df[col].fillna(0)
  return df

"""def encode_cols(df, cols):
  '''Use Label encoder'''
  for col in cols:
    encoder = LabelEncoder()
    df[col+'_e'] = encoder.fit_transform(df[col])
  return df"""

def drop_categories(df):
  '''drop cat values'''
  cats = [col for col in df.columns if is_cat(df, col)]
  return df.drop(columns = cats)

def iqr(df, columns, mult=3):
  '''cut outliers'''
  df = df.copy()
  for col in columns:
    d=df[col].describe()
    val =(d['50%'] + (d['75%']-d['25%'])) * mult
    df = df[df[col] <= val]
  return df

def work_df(df, to_log = ['LotFrontage', 'LotArea', 'GrLivArea', 'GarageArea'], target =[], to_drop = []):
  df = df.copy()
  df = improve_cats(df)
  df = create_cols(df)
  df = df.drop(columns = to_drop)
  df = categorize_cols(df)

  df['OutsideArea'] = df['OutsideArea'].apply(lambda x: x if x>0 else 0)

  df = log_cols(df, target+to_log)
  return df

#Creating variable

In [36]:
cols_to_log = ['LotFrontage', 'LotArea', 'GrLivArea','TotalSF','OutsideArea', 
               'MonthSold', 'TotalBsmtSF', 'YrSold', 'YearBuilt','YearRemodAdd', 
               'GarageYrBlt', '2ndFlrSF','GarageArea', 'WoodDeckSF', 'BsmtFinSF1', 
               'BsmtFinSF2','BsmtUnfSF', 'LowQualFinSF', '1stFlrSF']

cols_to_iqr = ['SalePrice','LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF', 
               'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 
               'GarageArea', 'PorchSF', 'OutsideArea', 'TotalSF', '2ndFlrSF', '1stFlrSF']

cols_to_drop = ['OpenPorchSF','EnclosedPorch', '3SsnPorch', 'ScreenPorch', 
                'PoolArea', 'MiscVal', 'GarageCars', 'BsmtFullBath', 'BsmtHalfBath', 
                'FullBath', 'HalfBath']

In [59]:
df = work_df(raw_df_train, to_log = cols_to_log, to_drop = cols_to_drop, target = ['SalePrice'])                                                             
df_test = work_df(raw_df_test, to_log = cols_to_log, to_drop = cols_to_drop)

df = iqr(df, cols_to_iqr)

cols = ['Neighborhood']#, 'LotShape']#, 'FireplaceQu']
for col in cols:
  encoder = TargetEncoder(cols = col)
  df[col+'_te'] = encoder.fit_transform(df[col], df['SalePrice'])
  df_test[col+'_te'] = encoder.transform(df_test[col])

df = drop_categories(df)
df_test = drop_categories(df_test)

#Getting best columns 

In [60]:
#to_drop = []#'Modern', 'HasBsmt', 'HasGarage', 'HasPool','HasVnr']
df1 = df.copy()           #.drop(columns = to_drop)
df_test1 = df_test.copy() #.drop(columns = to_drop)
columns = get_permutations(df1) 

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[('MonthSold', 0.543887289983012), ('MoSold', 0.490816150756795), ('YrSold', 0.44628999427373167), ('OverallQual', 0.17471263649485078), ('GrLivArea', 0.13353440726011545), ('Neighborhood_te', 0.08740985162459057), ('Has2Floors', 0.07424221555311337), ('2ndFlrSF', 0.06892984098414806), ('TotalSF', 0.05192511941877963), ('OverallCond', 0.022333367430735884), ('YearBuilt', 0.021946206444635685), ('1stFlrSF', 0.01900616043447446), ('Bath', 0.018587601079643433), ('LotArea', 0.017305691019497072), ('TotalBsmtSF', 0.01494054903138845), ('BsmtFinSF1', 0.013206559290255337), ('YearRemodAdd', 0.010211319578703314), ('GarageCars', 0.0076837269767437875), ('WoodDeckSF', 0.007070297978800944), ('GarageArea', 0.0062750234245133926), ('Fireplaces', 0.003982382647186322), ('ScreenPorch', 0.0032014669260609697), ('MSSubClass', 0.0019466300239541433), ('OpenPorchSF', 0.0018839965308532713), ('GarageYrBlt', 0.0017012501941415458), ('OutsideArea', 0.001631160006574517), ('BsmtFullBath', 0.00155945130423

In [61]:
cols = ['SalePrice']+columns
model, txt0 = split_run_test(df1[cols])
cat, txt1 = train_cat(df1[cols])
stack,txt2 = split_run_test_stack(df1[cols])

train r2: 0.9199342141091336
train rmse: 0.008823543897884263

test r2: 0.8816284947192414
test rmse: 0.010478496487312808

train r2: 0.9924919558713039
train rmse: 0.002702701628125035

test r2: 0.8904629527276704
test rmse: 0.010047321322835066

train r2: 0.9250417225049554
train rmse: 0.00854938229989566

test r2: 0.8894851173475649
test rmse: 0.01014516972614318



In [65]:
y_pred_test = model.predict(df_test[columns]) * 0.3 + cat.predict(df_test[columns]) *0.3 + stack.predict(df_test[columns]) *0.4

In [66]:

df_test_result = df_test.copy()
df_test_result['SalePrice'] = y_pred_test
#pd.concat([df_test[columns], pd.DataFrame({'SalePrice':y_pred_test})], axis=1)
new_df = pd.concat([df1, df_test_result.sample(frac=0.6, random_state=42)])
new_df = new_df.dropna()
new_df.reset_index(drop=True)
columns = get_permutations(new_df)




[('MonthSold', 0.2184440660919716), ('MoSold', 0.1980911289207913), ('YrSold', 0.18609830374640932), ('OverallQual', 0.18443723861153266), ('GrLivArea', 0.09536457316898103), ('Neighborhood_te', 0.06448812509745436), ('2ndFlrSF', 0.03404489351815464), ('Has2Floors', 0.02961597671176174), ('Bath', 0.026730074647957135), ('1stFlrSF', 0.024964546505195438), ('TotalSF', 0.02210068894157511), ('TotalBsmtSF', 0.016138396126574472), ('YearBuilt', 0.015149806519218777), ('OverallCond', 0.015079404945304343), ('BsmtFinSF1', 0.009642656044242203), ('YearRemodAdd', 0.009195058089797325), ('GarageCars', 0.008983188871921533), ('GarageArea', 0.006920402311692742), ('WoodDeckSF', 0.004421675491468801), ('HasFireplace', 0.002563749600862053), ('ScreenPorch', 0.0022893904830858514), ('Fireplaces', 0.0021906649542350887), ('BsmtFullBath', 0.0014431850084716813), ('LotArea', 0.0013302639279397788), ('OpenPorchSF', 0.0011985979592420315), ('MSSubClass', 0.0011777009457769716), ('TotRmsAbvGrd', 0.00117352

In [69]:
model, txt0 = split_run_test(new_df[['SalePrice']+columns])
cat, txt1 = train_cat(new_df[['SalePrice']+columns])
stack,txt2 = split_run_test_stack(new_df[['SalePrice']+columns])
y_pred_test = model.predict(df_test[columns]) * 0.3 + cat.predict(df_test[columns]) *0.3 + stack.predict(df_test[columns]) *0.4

#model, txt = train_cat(new_df[['SalePrice']+columns])#split_run_test(new_df[['SalePrice']+columns])
#y_pred_test = model.predict(df_test[columns])


train r2: 0.9480514873507241
train rmse: 0.007151242463592941

test r2: 0.9198842055260552
test rmse: 0.008786348960599644

train r2: 0.9805288902480642
train rmse: 0.0043722459427323595

test r2: 0.9253386936731139
test rmse: 0.008491453722529144

train r2: 0.954023059480296
train rmse: 0.00672808988525285

test r2: 0.92913754743643
test rmse: 0.008264729304207201



In [29]:
df_test.shape

(1459, 36)

In [70]:
ids = np.arange(1461, 2920)
my_result = pd.DataFrame({'Id': ids, 'SalePrice': np.e**y_pred_test})
my_result

Unnamed: 0,Id,SalePrice
0,1461,125106.710465
1,1462,154352.970134
2,1463,178395.187207
3,1464,185810.422026
4,1465,186628.068959
...,...,...
1454,2915,88650.071326
1455,2916,89969.048634
1456,2917,171556.761329
1457,2918,116292.131362


In [71]:
my_result.to_csv('submission.csv', index=False)
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>