<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/goradient-boosting/notebooks/Housing_prices_kaggle_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder,\
Normalizer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoLarsCV, LinearRegression, Lasso, Ridge, LogisticRegression, LogisticRegressionCV
import itertools
from sklearn.tree import DecisionTreeRegressor
from random import shuffle

In [16]:
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 80)

In [17]:
raw_df_train = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/train.csv', index_col=0)
raw_df_test = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/test.csv', index_col=0)


In [18]:
def improve_cats(dataframe) -> pd.DataFrame:
  df = dataframe.copy()
  for col in df.columns:
    if df[col].dtype == 'int64':
      df[col] = df[col].astype('int32')
    elif df[col].dtype == 'float64':
      df[col] = df[col].astype('float32')
    elif df[col].dtype == 'object':
      df[col] = df[col].astype('category')
    else:
      print('Unknown data type')
      return
  return df

In [19]:
df = improve_cats(raw_df_train) #pd.concat([df_train_1, df_test_1])
df_test = improve_cats(raw_df_test)

In [20]:
dates_frames  = ['YearBuilt', 'YeatRemodAdd', 'GarageYrBlt', "YrSold",'MoSold']

In [21]:
def cut_iqr(df, col, mult):
  d=df[col].describe()
  val =(d['50%'] + (d['75%']-d['25%'])) * mult
  return df[df[col] <= val]

In [22]:
def plotme(df, col):
    if col != 'SalePrice':
      sns.scatterplot(y = df['SalePrice'], x = df[col])

In [23]:
#we need to remove empty values in categories if there are ones 
def cut_cats(df):
  for catcol in df.columns:
    if df[catcol].dtype == 'category':
      df[catcol].cat.remove_unused_categories() 
  return df

def create_mt_catcol(df_new):
  df = df_new.copy()
  for col in df.columns:
      if df[col].dtype == 'category':
        if df[col].isna().any():
          df[col] = df[col].cat.add_categories(['MISSING'])
          df[col] = df[col].fillna('MISSING')
  cut_cats(df)
  return df

In [24]:
#Change after we get encoding
def get_valid_cols(df, ok_cols, to_drop=[]):
  return [x for x in df.columns if df[x].dtype not in ['category', 'object'] and x not in ok_cols+to_drop]

In [25]:
def split_data(df, target):
  X = df.drop(target, axis=1)
  y = df[target]
  return train_test_split(X, y, test_size=0.2, random_state=42)
  

In [26]:
#!pip install catboost

In [27]:
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor

In [28]:
#Could run separate times for y_pred_train, y_pred_test but training two could take longer 

#Change model
def run_model(X_train, y_train, X_test, model = LinearRegression):
  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=1000)), ('Forest',RandomForestRegressor(max_depth = 6)),
                ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.04))]#, ('KNN', KNeighborsRegressor))]
                #'Bag',BaggingRegressor(estimator=LinearRegression(), n_estimators=5, random_state=42)),

  pipe = make_pipeline(StandardScaler(), StackingRegressor(estimators=estimators, final_estimator=  LGBMRegressor(max_depth = 5, learning_rate = 0.04))) #RandomForestRegressor(max_depth=7)))
                       #KNeighborsRegressor(n_neighbors = 10))#BaggingRegressor(estimator=LinearRegression(), n_estimators=5, random_state=42))#LogisticRegressionCV())#( max_depth=10))
  pipe.fit(X_train, y_train)

  y_pred_train = pipe.predict(X_train)
  y_pred_test = pipe.predict(X_test)
  return y_pred_train, y_pred_test, pipe

In [29]:
def split_run_test(df, target = 'SalePrice'):
  X_train, X_test, y_train, y_test = split_data(df, target)
  y_pred_train, y_pred_test, model = run_model(X_train, y_train, X_test)
  print_scores(y_train, y_test, y_pred_train, y_pred_test, print_train=True)
  return model

In [30]:
def print_scores(y_train, y_test, y_pred_train, y_pred_test, print_train=False):
    #print('Column: ', col)
    if print_train:
      print('train: r2', r2_score(y_train, y_pred_train))
      print("train: root mean squared error:", np.sqrt(mean_squared_error(np.log(y_pred_train), np.log(y_train))))
      print('Test:')
    print('test: r2', r2_score(y_test, y_pred_test))
    print("test: root mean squared error:", np.sqrt(mean_squared_error(np.log(y_pred_test), np.log(y_test))))
    print()

In [31]:
def find_best_param(dataset: pd.DataFrame, valid_cols: list, ok_cols: list, to_drop = [], r2_threshold = 0.67, target = "SalePrice"):
  dataset = dataset.copy()
  best_col = ['', r2_threshold]
  
  for col in valid_cols:
    #old good col + a test col
    test_cols = ok_cols + [col]
    df = dataset[test_cols]
  
    try:
      X_train, X_test, y_train, y_test = split_data(df, target)
      y_pred_train, y_pred_test, _ = run_model(X_train, y_train, X_test)

      r2 = r2_score(y_test, y_pred_test) #compare_scores(y_train, y_test, y_pred_train, y_pred_test, r2_threshold)
      if r2 > r2_threshold:
        r2_threshold = r2
        best_col = [col, r2]
        #print_scores(y_train, y_test, y_pred_train, y_pred_test)
    except ValueError:
      continue
      #print(col, 'has NaN')
  
  result = ok_cols + [best_col[0]]
  #print('Best result:', result, best_col[1])
  return best_col


In [32]:
def get_best_cols(df, ok_cols: list, to_drop =[],  r2_threshold=0.67):
  valid_cols = get_valid_cols(df_cats, ok_cols, to_drop)#
  valid_nodups = valid_cols.copy()
  shuffle(valid_cols)
  #best_result = r2_threshold
  for col in valid_cols:
    #print(new_valid)
    best_col, r2 = find_best_param(df_cats, valid_nodups, ok_cols, r2_threshold)
    if r2 - r2_threshold > 0.001:
      r2_threshold = r2
      ok_cols.append(best_col)
      valid_nodups.remove(col)
  return ok_cols, r2_threshold


In [33]:
def many_shuffles(df, my_cols = ['SalePrice', 'OverallQual'], to_drop = [], num=10, threshold = 0.67):
  bestestbest = [my_cols, threshold]
  for x in range(num):
    print('shuffle:', x+1)
    result = get_best_cols(df, my_cols, to_drop)
    if result[1] > bestestbest[1]:
      bestestbest = result
      print(bestestbest)
  print('\n\nThe result is:')
  print(bestestbest)
  return bestestbest

In [34]:
def drop_n_log(df):
  df_num = df.copy()
  #Create a new parameter that combines all porches
  df_num["TotalSF"] = df_num["1stFlrSF"] + df_num["2ndFlrSF"] + df_num["TotalBsmtSF"] 
  df_num['PorchSF'] = df_num['OpenPorchSF'] + df_num['EnclosedPorch'] + df_num['3SsnPorch'] + df_num['ScreenPorch']
  df_num["OutsideArea"] = df_num["LotArea"] - df_num["GrLivArea"] - df_num["GarageArea"]

  #drop low numbers 
  #cols = ['KitchenAbvGr', 'BedroomAbvGr']
  #df_num = drop_low_zeros(df_num, cols)

  df_num['GarageYrBlt'] = df_num['GarageYrBlt'].fillna(df_num['GarageYrBlt'].mean()) #REPLACE

  #Drop numeric columns
  df_num = df_num.drop(columns=['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']) #drop porches
  df_num = df_num.drop(columns=['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF'])                    #drop basements
  df_num = df_num.drop(columns=['GarageCars'])                                               #drop garage
  df_num = df_num.drop(columns=['LowQualFinSF', 'PoolArea', 'MiscVal', 'TotRmsAbvGrd', '1stFlrSF'])      #drop misc

  #wierdly it produced worse result
  for column in [col for col in df_num.columns if df_num[col].dtype not in ['category', 'object']]:
    df_num[column] = df_num[column]+1
  #  df_num[column] = np.log(df_num[column] + 1)

  to_log = ['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice']#, 'OutsideArea']#, 'TotalSF']#, 'GarageArea','WoodDeckSF', 'MasVnrArea','TotalBsmtSF','2ndFlrSF', 'PorchSF']#]#,]#, ]#, ]#]]
  for col in to_log:
    df_num[col] = np.log(df_num[col])
  print(df_num.shape)
  return df_num

In [35]:
#Cut iqr
def make_iqr(df_num):
  df_iqr = df_num.copy()
  for col in ['SalePrice', 'LotFrontage', 'LotArea', 'MasVnrArea', \
              'TotalBsmtSF', 'GrLivArea', \
              'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageArea', 'PorchSF']:

      df_iqr = cut_iqr(df_iqr, col, 3)
      #plotme(df_iqr, col)

  print(df_iqr.shape)
  return df_iqr

In [36]:
def make_cats(df_iqr):
  '''df_cats = df_iqr.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'Utilities', 'Condition1', \
                        'Condition2', 'Street', 'LandContour', 'LandSlope', \
                  'RoofMatl', 'ExterCond', 'BsmtCond', 'BsmtFinType2', 'Heating', 'CentralAir', \
                  'Electrical', 'Functional', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType'], axis = 1)'''
  df_cats = create_mt_catcol(df_iqr)

  #factorize
  #df_cats =  factorize_cats(df_cats)

  return df_cats

In [37]:
def factorize_cats(df):
  df = df.copy()
  new_df=df.select_dtypes(include=['category']).apply(lambda x: x.factorize()[0])  
  print(type(new_df))
  for col in new_df.columns:
    #print(col)
    df[col] = new_df[col]
  print(df.shape)
  return df

In [38]:
df_num = drop_n_log(df)

(1460, 70)


In [39]:
df_iqr= make_iqr(df_num)

(1140, 70)


In [40]:
df_cats = make_cats(df_iqr)

In [41]:
#model.predict()

In [42]:
#Find second best parameter
my_cols = ['SalePrice']#, 'OverallQual']
valids = get_valid_cols(df_cats, my_cols, ['OverallQual'])
find_best_param(df_cats, valids, my_cols, ['OverallQual'], r2_threshold = 0.5)

['TotalSF', 0.7127141442912897]

In [43]:
#test on best combination so far
split_run_test(df_cats[['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'BedroomAbvGr', 'Fireplaces', 'BsmtFullBath', 'GarageArea', 'HalfBath', 'YearRemodAdd', 'WoodDeckSF', 'YrSold']])#, 0.9173624653156374)#'SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'GarageArea', 'BsmtFullBath', 'Fireplaces', 'BedroomAbvGr', 'YearRemodAdd', 'HalfBath']])

train: r2 0.908033278188526
train: root mean squared error: 0.010210806342658993
Test:
test: r2 0.9073966444423274
test: root mean squared error: 0.009956040383679973



In [44]:
#get current best combination 
get_best_cols(df_cats, ['SalePrice', 'OverallQual'], r2_threshold = 0.67)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


(['SalePrice',
  'OverallQual',
  'TotalSF',
  'YearBuilt',
  'OutsideArea',
  'BedroomAbvGr',
  'HalfBath',
  'OverallCond',
  'GrLivArea',
  'GarageArea',
  'GrLivArea'],
 0.9033590578506254)

In [45]:
#get best orders of best combinations
#result = many_shuffles(df_cats, num=5, threshold = 0.67)

In [46]:
#split_run_test(df_cats[result[0]])

train: r2 0.8873116507908816
train: root mean squared error: 0.13261506109954052
Test:
test: r2 0.9173624653156374
test: root mean squared error: 0.11168377778450472

(['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'BedroomAbvGr', 'Fireplaces', 'BsmtFullBath', 'GarageArea', 'HalfBath', 'YearRemodAdd', 'WoodDeckSF', 'YrSold'], 
0.9173624653156374)'

log of 'LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice'

['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'GarageArea', 'BsmtFullBath', 'Fireplaces', 'BedroomAbvGr', 'YearRemodAdd', 'HalfBath']

log of ['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice','GarageArea']

(['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'OverallCond', 'YearBuilt', 'LotArea', 'GarageArea', 'Fireplaces', 'HalfBath', 'BsmtFullBath', 'BedroomAbvGr', 'GarageYrBlt', 'WoodDeckSF', 'WoodDeckSF', 'BsmtHalfBath'], 0.9218701457673271)


In [47]:
one = [['SalePrice', 'OverallQual', 'TotalSF', 'GarageArea', 'LotArea', 'Foundation', 'Fireplaces', 'OverallCond', 'YearBuilt', 'GrLivArea', 'BedroomAbvGr', 'BsmtFullBath', 'Functional', 'MasVnrType', 'CentralAir', 'HalfBath', 'BsmtExposure', 'PavedDrive'], 0.9248235848301374, 'lognorm', 'LassoLarsCV']


## LassoLarsCV 0.92

(['SalePrice',
  'OverallQual',
  'TotalSF',
  'GarageArea',
  'LotArea',
  'Foundation',
  'Fireplaces',
  'OverallCond',
  'HalfBath',
  'FullBath',
  'CentralAir',
  'BsmtExposure',
  'BsmtFullBath',
  'PavedDrive',
  'MasVnrType',
  'Functional',
  'BedroomAbvGr',
  'GrLivArea',
  '2ndFlrSF',
  'Condition1',
  'ExterCond'],
 0.9236066685585789) - basic log['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice']. LassoLarsCV

## Stacking .91 = .91

Stack LR(), LassoLarsCV(iter = 2500), LGBMRegressor (depth = 5, learning_rate = 0.04), final estimator = Random Forest Regressor (depth=6)

train: r2 0.9170645972588807
train: root mean squared error: 0.009690338119708751

test: r2 0.9108202976462205
test: root mean squared error: 0.009789630655685054

log = ['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice']

cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'BedroomAbvGr', 'Fireplaces', 'BsmtFullBath', 'GarageArea', 'HalfBath', 'YearRemodAdd', 'WoodDeckSF', 'YrSold']]