<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/gradient-boosting/notebooks/Housing_prices_kaggle_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder,\
Normalizer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LassoLarsCV, LinearRegression, Lasso, Ridge, LogisticRegression, LogisticRegressionCV
import itertools
from sklearn.tree import DecisionTreeRegressor
from random import shuffle

In [67]:
pd.set_option('display.max_rows', 100)
pd.set_option('max_colwidth', 80)

In [68]:
raw_df_train = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/train.csv', index_col=0)
raw_df_test = pd.read_csv('https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/test.csv', index_col=0)


In [69]:
def improve_cats(dataframe) -> pd.DataFrame:
  df = dataframe.copy()
  for col in df.columns:
    if df[col].dtype == 'int64':
      df[col] = df[col].astype('int32')
    elif df[col].dtype == 'float64':
      df[col] = df[col].astype('float32')
    elif df[col].dtype == 'object':
      df[col] = df[col].astype('category')
    else:
      print('Unknown data type')
      return
  return df

In [70]:
df = improve_cats(raw_df_train) #pd.concat([df_train_1, df_test_1])
df_test = improve_cats(raw_df_test)

In [71]:
dates_frames  = ['YearBuilt', 'YeatRemodAdd', 'GarageYrBlt', "YrSold",'MoSold']

In [72]:
def cut_iqr(df, col, mult):
  d=df[col].describe()
  val =(d['50%'] + (d['75%']-d['25%'])) * mult
  return df[df[col] <= val]

In [73]:
def plotme(df, col):
    if col != 'SalePrice':
      sns.scatterplot(y = df['SalePrice'], x = df[col])

In [74]:
#we need to remove empty values in categories if there are ones 
def cut_cats(df):
  for catcol in df.columns:
    if df[catcol].dtype == 'category':
      df[catcol].cat.remove_unused_categories() 
  return df

def create_mt_catcol(df_new):
  df = df_new.copy()
  for col in df.columns:
      if df[col].dtype == 'category':
        if df[col].isna().any():
          df[col] = df[col].cat.add_categories(['MISSING'])
          df[col] = df[col].fillna('MISSING')
  cut_cats(df)
  return df

In [75]:
#Change after we get encoding
def get_valid_cols(df, ok_cols, to_drop=[]):

  return [x for x in df.columns if x not in ok_cols+to_drop]# and df[x].dtype not in ['category', 'object']]

In [76]:
def split_data(df, target):
  X = df.drop(target, axis=1)
  y = df[target]
  return train_test_split(X, y, test_size=0.2, random_state=42)
  

In [77]:
#!pip install catboost

In [78]:
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor

In [103]:
#Could run separate times for y_pred_train, y_pred_test but training two could take longer 



#Change model
def run_model(X_train, y_train, X_test, model = LinearRegression):
  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=12,eps=0.01)), ('Tree',DecisionTreeRegressor(max_depth=5)),
                ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.04))]#, ('KNN', KNeighborsRegressor))]
                #'Bag',BaggingRegressor(estimator=LinearRegression(), n_estimators=5, random_state=42)),

  estimator = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), ('Tree',DecisionTreeRegressor(max_depth=5)), ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.05))]

  pipe = make_pipeline(StandardScaler(), LinearRegression())#StackingRegressor(estimators=estimators, final_estimator=  RandomForestRegressor(max_depth = 6))) #RandomForestRegressor(max_depth=7)))
                       #KNeighborsRegressor(n_neighbors = 10))#BaggingRegressor(estimator=LinearRegression(), n_estimators=5, random_state=42))#LogisticRegressionCV())#( max_depth=10))
  pipe.fit(X_train, y_train)

  y_pred_train = pipe.predict(X_train)
  y_pred_test = pipe.predict(X_test)
  return y_pred_train, y_pred_test, pipe

In [104]:
def split_run_test(df, target = 'SalePrice'):
  X_train, X_test, y_train, y_test = split_data(df, target)
  y_pred_train, y_pred_test, model = run_model(X_train, y_train, X_test)
  print_scores(y_train, y_test, y_pred_train, y_pred_test, print_train=True)
  return model

In [105]:
def print_scores(y_train, y_test, y_pred_train, y_pred_test, print_train=False):
    #print('Column: ', col)
    if print_train:
      print('train: r2', r2_score(y_train, y_pred_train))
      print("train: root mean squared error:", np.sqrt(mean_squared_error(np.log(y_pred_train), np.log(y_train))))
      print('Test:')
    print('test: r2', r2_score(y_test, y_pred_test))
    print("test: root mean squared error:", np.sqrt(mean_squared_error(np.log(y_pred_test), np.log(y_test))))

    print()

In [106]:
def find_best_param(dataset: pd.DataFrame, valid_cols: list, ok_cols: list, to_drop = [], r2_threshold = 0.67, target = "SalePrice"):
  dataset = dataset.copy()
  best_col = ['', r2_threshold]
  
  for col in valid_cols:
    #old good col + a test col
    test_cols = ok_cols + [col]
    df = dataset[test_cols]
  
    try:
      X_train, X_test, y_train, y_test = split_data(df, target)
      y_pred_train, y_pred_test, _ = run_model(X_train, y_train, X_test)

      r2 = r2_score(y_test, y_pred_test) #compare_scores(y_train, y_test, y_pred_train, y_pred_test, r2_threshold)
      if r2 > r2_threshold:
        r2_threshold = r2
        best_col = [col, r2]
        #print_scores(y_train, y_test, y_pred_train, y_pred_test)
    except ValueError:
      continue
      #print(col, 'has NaN')
  
  result = ok_cols + [best_col[0]]
  #print('Best result:', result, best_col[1])
  return best_col


In [125]:
def get_best_cols(df, ok_cols: list, to_drop =[],  r2_threshold=0.67):
  valid_cols = get_valid_cols(df_cats, ok_cols, to_drop)#
  valid_nodups = valid_cols.copy()
  shuffle(valid_cols)
  #best_result = r2_threshold
  for col in valid_cols:
    #print(new_valid)
    best_col, r2 = find_best_param(df_cats, valid_nodups, ok_cols, r2_threshold)
    if r2 > r2_threshold:# > 0.001:
      r2_threshold = r2
      ok_cols.append(best_col)
      valid_nodups.remove(col)
  return ok_cols, r2_threshold


In [126]:
def many_shuffles(df, my_cols = ['SalePrice', 'OverallQual'], to_drop = [], num=10, threshold = 0.67):
  bestestbest = [my_cols, threshold]
  for x in range(num):
    print('shuffle:', x+1)
    result = get_best_cols(df, my_cols.copy(), to_drop)
    if result[1] > bestestbest[1]:
      bestestbest = result
      print(bestestbest)
  print('\n\nThe result is:')
  print(bestestbest)
  return bestestbest

In [127]:
def drop_n_log(df, testdf=False):
  df_num = df.copy()
  #Create a new parameter that combines all porches
  df_num["TotalSF"] = df_num["1stFlrSF"] + df_num["2ndFlrSF"] + df_num["TotalBsmtSF"] 
  df_num['PorchSF'] = df_num['OpenPorchSF'] + df_num['EnclosedPorch'] + df_num['3SsnPorch'] + df_num['ScreenPorch']
  df_num["OutsideArea"] = df_num["LotArea"] - df_num["GrLivArea"] - df_num["GarageArea"]

  #drop low numbers 
  #cols = ['KitchenAbvGr', 'BedroomAbvGr']
  #df_num = drop_low_zeros(df_num, cols)

  df_num['GarageYrBlt'] = df_num['GarageYrBlt'].fillna(df_num['GarageYrBlt'].mean()) #REPLACE

  #Drop numeric columns
  df_num = df_num.drop(columns=['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']) #drop porches
  df_num = df_num.drop(columns=['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF'])                    #drop basements
  df_num = df_num.drop(columns=['GarageCars'])                                               #drop garage
  df_num = df_num.drop(columns=['LowQualFinSF', 'PoolArea', 'MiscVal', 'TotRmsAbvGrd', '1stFlrSF'])      #drop misc

  #wierdly it produced worse result
  for column in [col for col in df_num.columns if df_num[col].dtype not in ['category', 'object']]:
    df_num[column] = df_num[column]+1
  #  df_num[column] = np.log(df_num[column] + 1)
  testdf = [] if testdf else ['SalePrice']
  to_log = testdf+['LotFrontage', 'LotArea', 'GrLivArea']#, 'OutsideArea']#, 'TotalSF']#, 'GarageArea','WoodDeckSF', 'MasVnrArea','TotalBsmtSF','2ndFlrSF', 'PorchSF']#]#,]#, ]#, ]#]]
  for col in to_log:
    df_num[col] = np.log(df_num[col])
  print(df_num.shape)
  return df_num

In [128]:
#Cut iqr
def make_iqr(df_num):
  df_iqr = df_num.copy()
  for col in ['SalePrice', 'LotFrontage', 'LotArea', 'MasVnrArea', \
              'TotalBsmtSF', 'GrLivArea', \
              'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces', 'GarageArea', 'PorchSF']:

      df_iqr = cut_iqr(df_iqr, col, 3)
      #plotme(df_iqr, col)

  print(df_iqr.shape)
  return df_iqr

In [143]:
def make_cats(df_iqr):
  df_cats = df_iqr.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'Utilities', 'Condition1', \
                        'Condition2', 'Street', 'LandContour', 'LandSlope', \
                  'RoofMatl', 'ExterCond', 'BsmtCond', 'BsmtFinType2', 'Heating', 'CentralAir', \
                  'Electrical', 'Functional', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType'], axis = 1)
  df_cats = create_mt_catcol(df_iqr)

  #factorize
  df_cats =  factorize_cats(df_cats)

  return df_cats

In [144]:
def factorize_cats(df):
  df = df.copy()
  new_df=df.select_dtypes(include=['category']).apply(lambda x: x.factorize()[0])  
  print(type(new_df))
  for col in new_df.columns:
    #print(col)
    df[col] = new_df[col]
  print(df.shape)
  return df

In [145]:
df_num = drop_n_log(df)

(1460, 70)


In [146]:
df_iqr= make_iqr(df_num)

(1140, 70)


In [147]:
df_cats = make_cats(df_iqr)

<class 'pandas.core.frame.DataFrame'>
(1140, 70)


In [148]:
#model.predict()

In [149]:
#Find second best parameter
my_cols = ['SalePrice']#, 'OverallQual']
valids = get_valid_cols(df_cats, my_cols, ['OverallQual'])
find_best_param(df_cats, valids, my_cols, ['OverallQual'], r2_threshold = 0.5)

['TotalSF', 0.7168602300401753]

In [150]:
#test on best combination so far
split_run_test(df_cats[['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'BedroomAbvGr', 'Fireplaces', 'BsmtFullBath', 'GarageArea', 'HalfBath', 'YearRemodAdd', 'WoodDeckSF', 'YrSold']])#, 0.9173624653156374)#'SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'GarageArea', 'BsmtFullBath', 'Fireplaces', 'BedroomAbvGr', 'YearRemodAdd', 'HalfBath']])

train: r2 0.8873116507908816
train: root mean squared error: 0.011216538269260832
Test:
test: r2 0.9173624653156374
test: root mean squared error: 0.009411391258568358



In [151]:
#get current best combination 
mymod = get_best_cols(df_cats, ['SalePrice', 'OverallQual'], r2_threshold = 0.67)


In [152]:
mymod

(['SalePrice',
  'OverallQual',
  'TotalSF',
  'GarageArea',
  'LotArea',
  'Foundation',
  'OverallCond',
  'YearBuilt',
  'GrLivArea',
  'BedroomAbvGr',
  'FireplaceQu',
  'BsmtFullBath',
  'Functional',
  'CentralAir',
  'Fence',
  'PavedDrive',
  'YearRemodAdd',
  'ExterCond',
  'Condition1',
  'MiscFeature',
  'Neighborhood',
  'WoodDeckSF',
  'Exterior2nd',
  'GarageCond',
  'KitchenQual',
  'LandContour',
  'BsmtExposure',
  'HouseStyle',
  'LotShape',
  'GarageType',
  'ExterQual',
  'PoolQC',
  'PoolQC'],
 0.9268455191787586)

In [153]:
#test on best combination so far
d = split_run_test(df_cats[['SalePrice', 'OverallQual', 'TotalSF', 'MSSubClass', 'GarageArea', 'BsmtHalfBath', 'MSSubClass', 'Fireplaces', 'LotArea', 'OverallCond', 'HalfBath', 'YearRemodAdd', 'OverallCond', 'HalfBath', 'FullBath', 'YearBuilt', 'BsmtFullBath', 'GrLivArea', 'GarageYrBlt', '2ndFlrSF', 'OutsideArea', 'YrSold', 'YrSold', 'PorchSF', 'WoodDeckSF', 'PorchSF', 'MasVnrArea', 'TotalBsmtSF']])
print(d)

train: r2 0.8926461206999482
train: root mean squared error: 0.010965143754793035
Test:
test: r2 0.9138478764768979
test: root mean squared error: 0.009617197495655032

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearregression', LinearRegression())])


In [154]:
#get best orders of best combinations
result = many_shuffles(df_cats, num=5, threshold = 0.67)

shuffle: 1
(['SalePrice', 'OverallQual', 'TotalSF', 'GarageArea', 'LotArea', 'Foundation', 'Fireplaces', 'OverallCond', 'YearBuilt', 'GrLivArea', 'BedroomAbvGr', 'BsmtFullBath', 'Functional', 'MasVnrType', 'CentralAir', 'PavedDrive', 'YearRemodAdd', 'ExterCond', 'BsmtExposure', 'Condition1', 'GarageCond', 'MasVnrArea', 'LotShape', 'FireplaceQu', 'HouseStyle', 'LandContour', 'PoolQC', 'ExterQual', 'GarageType', 'Heating', 'ExterCond', 'BsmtFinType2', 'GrLivArea'], 0.9270453794892924)
shuffle: 2
(['SalePrice', 'OverallQual', 'TotalSF', 'GarageArea', 'LotArea', 'Foundation', 'Fireplaces', 'OverallCond', 'YearBuilt', 'GrLivArea', 'BedroomAbvGr', 'BsmtFullBath', 'Functional', 'MasVnrType', 'CentralAir', 'MiscFeature', 'ExterCond', 'PavedDrive', 'Neighborhood', 'BsmtExposure', 'Fence', 'BsmtHalfBath', 'Condition1', 'BsmtHalfBath', 'Fence'], 0.9271797319053682)
shuffle: 3
shuffle: 4
shuffle: 5


The result is:
(['SalePrice', 'OverallQual', 'TotalSF', 'GarageArea', 'LotArea', 'Foundation', 'Fi

In [155]:
my_model = split_run_test(df_cats[result[0]])

train: r2 0.8931950303764852
train: root mean squared error: 0.010933384295665106
Test:
test: r2 0.9271797319053682
test: root mean squared error: 0.008841494510114298



train: r2 0.8873116507908816
train: root mean squared error: 0.13261506109954052
Test:
test: r2 0.9173624653156374
test: root mean squared error: 0.11168377778450472

(['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'BedroomAbvGr', 'Fireplaces', 'BsmtFullBath', 'GarageArea', 'HalfBath', 'YearRemodAdd', 'WoodDeckSF', 'YrSold'], 
0.9173624653156374)'

log of 'LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice'

['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'LotArea', 'YearBuilt', 'OverallCond', 'GarageArea', 'BsmtFullBath', 'Fireplaces', 'BedroomAbvGr', 'YearRemodAdd', 'HalfBath']

log of ['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice','GarageArea']


(['SalePrice', 'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'OverallCond', 'YearBuilt', 'LotArea', 'GarageArea', 'Fireplaces', 'HalfBath', 'BsmtFullBath', 'BedroomAbvGr', 'GarageYrBlt', 'WoodDeckSF', 'WoodDeckSF', 'BsmtHalfBath'], 0.9218701457673271)


In [156]:

subm_df = make_cats(drop_n_log(df_test.copy()))

pred_ys = d.pridict(subm_df)
pred_ys = np.e**pred_ys

KeyError: ignored

## LassoLarsCV 0.92

(['SalePrice',
  'OverallQual',
  'TotalSF',
  'GarageArea',
  'LotArea',
  'Foundation',
  'Fireplaces',
  'OverallCond',
  'HalfBath',
  'FullBath',
  'CentralAir',
  'BsmtExposure',
  'BsmtFullBath',
  'PavedDrive',
  'MasVnrType',
  'Functional',
  'BedroomAbvGr',
  'GrLivArea',
  '2ndFlrSF',
  'Condition1',
  'ExterCond'],
 0.9236066685585789) - basic log['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice']. LassoLarsCV

## Stacking .92 = .92

  estimators = [('LinReg',LinearRegression()), ('Lasso',LassoLarsCV(max_iter=15,eps=0.01)), ('Tree',DecisionTreeRegressor(max_depth=5)),
                ('LGBR', LGBMRegressor(max_depth = 5, learning_rate = 0.05))]
final = RandomForestRegressor(max_depth=6)

train: r2 0.9259061491493182
train: root mean squared error: 0.009157887674229528

Test:
test: r2 0.9217500638734336
test: root mean squared error: 0.009142663976721768

log = ['LotFrontage', 'LotArea', 'GrLivArea', 'SalePrice']

cols = ['SalePrice', 'OverallQual', 'TotalSF', 'MSSubClass', 'GarageArea', 'BsmtHalfBath', 'MSSubClass', 'Fireplaces', 'LotArea', 'OverallCond', 'HalfBath', 'YearRemodAdd', 'OverallCond', 'HalfBath', 'FullBath', 'YearBuilt', 'BsmtFullBath', 'GrLivArea', 'GarageYrBlt', '2ndFlrSF', 'OutsideArea', 'YrSold', 'YrSold', 'PorchSF', 'WoodDeckSF', 'PorchSF', 'MasVnrArea', 'TotalBsmtSF']

factorized?

Best so far - 0.93 stacking of linear regression with factorized cats.

In [None]:
Создать датасет в который добавляются условия и параметры если он лучше