<a href="https://colab.research.google.com/github/Dimonfordont/DS_course/blob/main/Basics/Models/House_prices_kaggle_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!pip install category_encoders

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import timeit

from category_encoders import TargetEncoder
from google.colab import files
from lightgbm import LGBMRegressor
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, StackingRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LassoLarsCV, LinearRegression, ElasticNet#, SGDRegressor,LogisticRegression, LogisticRegressionCV,
from sklearn.model_selection import train_test_split#, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Display limits for DataFrame output in this notebook.
pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 80

# Train and test CSVs; the first column of each file becomes the index.
raw_df_train = pd.read_csv(
    'https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/train.csv',
    index_col=0,
)
raw_df_test = pd.read_csv(
    'https://raw.githubusercontent.com/Dimildizio/DS_course/main/gradient-boosting/data/test.csv',
    index_col=0,
)

# Work with categories

In [None]:
def is_cat(df, col):
  '''Tell whether a column is stored as `object` or `category`.

  The dtype is inspected with pandas' own type checks instead of comparing it
  against strings: comparing a NumPy dtype with the string 'category' makes
  NumPy try to parse it as a dtype name, which can raise TypeError on older
  NumPy releases.

  Args:
    df: DataFrame holding the column.
    col: column label to inspect.

  Returns:
    True when the column dtype is object or categorical, False otherwise.
  '''
  dtype = df[col].dtype
  return isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(dtype)

def improve_cats(dataframe) -> pd.DataFrame:
  '''Return a copy with narrower dtypes: int64->int32, float64->float32, object->category.

  Columns of any other dtype (bool, already-narrowed numerics, category, ...)
  are kept unchanged, so the function always returns a DataFrame and can be
  applied to its own output.

  Args:
    dataframe: input frame; it is not modified.

  Returns:
    A new DataFrame with the converted columns.
  '''
  df = dataframe.copy()
  int32_bounds = np.iinfo(np.int32)
  for col in df.columns:
    dtype = df[col].dtype
    if dtype == 'int64':
      # Narrow only when every value fits; astype would otherwise wrap around silently.
      if df[col].between(int32_bounds.min, int32_bounds.max).all():
        df[col] = df[col].astype('int32')
    elif dtype == 'float64':
      df[col] = df[col].astype('float32')
    elif dtype == 'object':
      df[col] = df[col].astype('category')
    # Any other dtype is left untouched.
  return df

# Show statistics

In [None]:
def adjusted_r2(yt,yp, colnum):
  '''Compute the adjusted R^2 score.

  Uses 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number of
  samples and p the number of predictors.

  Args:
    yt: true target values; must expose `.shape[0]`.
    yp: predicted target values.
    colnum: number of predictors (p).

  Returns:
    The adjusted R^2 as a float, or NaN when n - p - 1 <= 0, where the
    statistic is undefined (fewer samples than predictors + 1).
  '''
  n_samples = yt.shape[0]
  dof = n_samples - colnum - 1
  if dof <= 0:
    # Too few samples for this many predictors: report "undefined" rather than
    # a value that can exceed 1 or flip sign.
    return float('nan')
  return 1 - (1 - r2_score(yt, yp)) * (n_samples - 1) / dof

def print_scores(y_train, y_test, y_pred_train, y_pred_test, colnum, log_inputs=True):
  '''Print and return adjusted R^2 and RMSE for the train and validation sets.

  Args:
    y_train, y_test: true targets.
    y_pred_train, y_pred_test: matching predictions.
    colnum: number of predictors, forwarded to `adjusted_r2`.
    log_inputs: when True (default, the historical behaviour) the RMSE is
      computed on `np.log` of targets and predictions. Pass False when the
      targets are already log-transformed, as they are after `work_df` logs
      the target column; otherwise the reported value is an RMSE of
      log(log(price)).

  Returns:
    The report text that was printed.
  '''
  transform = np.log if log_inputs else np.asarray
  train_r2 =  adjusted_r2(y_train, y_pred_train, colnum)
  test_r2 = adjusted_r2(y_test,y_pred_test, colnum)
  # mean_squared_error is symmetric, but keep the documented (y_true, y_pred) order.
  train_rmse = np.sqrt(mean_squared_error(transform(y_train), transform(y_pred_train)))
  test_rmse = np.sqrt(mean_squared_error(transform(y_test), transform(y_pred_test)))
  text = f'train r2: {train_r2}\ntrain rmse: {train_rmse}\n\ntest r2: {test_r2}\ntest rmse: {test_rmse}\n'
  print(text)
  return text

# Split dataframe into X, y

In [None]:
def split_data(df, target, test_size=0.01, random_state=13):
  '''Split a frame into train/validation features and targets.

  Args:
    df: frame containing the feature columns and the target column.
    target: label of the target column.
    test_size: share (or count) of rows held out, forwarded to
      `train_test_split`. The default keeps the original 1% hold-out.
    random_state: seed forwarded to `train_test_split`.

  Returns:
    X_train, X_test, y_train, y_test as returned by `train_test_split`.
  '''
  X = df.drop(columns=target)
  y = df[target]
  return train_test_split(X, y, test_size=test_size, random_state=random_state)

# Work with model

In [None]:
def train_model(X_train, y_train, random_state=42):
  '''Fit the stacked regression pipeline.

  The pipeline is a RobustScaler followed by a StackingRegressor whose base
  learners are linear regression, LassoLarsCV, a depth-5 decision tree, a
  LightGBM regressor and ElasticNet; a depth-6 random forest combines them.

  Args:
    X_train: training features.
    y_train: training target.
    random_state: seed given to every estimator that accepts one, so repeated
      runs produce the same model. The original code seeded only ElasticNet.

  Returns:
    The fitted pipeline.
  '''
  estimators = [
    ('LinReg', LinearRegression()),
    ('Lasso', LassoLarsCV(max_iter=15, eps=0.01)),
    ('Tree', DecisionTreeRegressor(max_depth=5, random_state=random_state)),
    ('LGBR', LGBMRegressor(max_depth=5, learning_rate=0.05, random_state=random_state)),
    ('Elastic', ElasticNet(alpha=0.0005, random_state=random_state)),
  ]
  final_estimator = RandomForestRegressor(max_depth=6, random_state=random_state)
  pipe = make_pipeline(
    RobustScaler(),
    StackingRegressor(estimators=estimators, final_estimator=final_estimator),
  )
  pipe.fit(X_train, y_train)
  return pipe

In [None]:
def split_run_test(df, target = 'SalePrice'):
  '''Split the frame, fit the model and report scores.

  Returns the fitted model together with the score report text.
  '''
  X_train, X_test, y_train, y_test = split_data(df, target)
  model = train_model(X_train, y_train)

  # Predictions are computed for the training rows first, then the held-out rows.
  predictions = {'train': model.predict(X_train), 'test': model.predict(X_test)}
  feature_count = X_train.shape[1]
  txt = print_scores(y_train, y_test, predictions['train'], predictions['test'], feature_count)
  return model, txt

In [None]:
def get_permutations(df, target='SalePrice', n_repeats=10, random_state=42):
  '''Rank feature columns by permutation importance.

  A model is fitted on the training part of the split and the importance is
  measured on that same training part.

  Args:
    df: frame with the features and the target column.
    target: label of the target column (defaults to 'SalePrice', matching
      `split_run_test`).
    n_repeats: number of shuffles per feature.
    random_state: seed for the shuffling.

  Returns:
    Column names ordered from most to least important. The (name, importance)
    pairs are also printed.
  '''
  X_train, _, y_train, _ = split_data(df, target)
  model = train_model(X_train, y_train)
  result = permutation_importance(model, X_train, y_train, n_repeats=n_repeats, random_state=random_state)
  importance_dict = dict(zip(X_train.columns, result.importances_mean))
  sorted_importance = sorted(importance_dict.items(), key=lambda pair: pair[1], reverse=True)
  print(sorted_importance)
  return [name for name, _ in sorted_importance]

# Work with columns

In [None]:
def create_cols(df):
  '''Add engineered feature columns to `df` in place and return it.

  New columns:
    TotalSF      - 1stFlrSF + 2ndFlrSF + TotalBsmtSF + LotArea
    PorchSF      - sum of the four porch area columns
    OutsideArea  - LotArea - GrLivArea - GarageArea
    MonthSold    - YrSold * 12 + MoSold
    Has2Floors, Has1Floors, HasPorch, Has2Wood, HasFireplace
                 - 1 when the source column is > 0, else 0
    Bath         - full baths + 0.5 * half baths (basement and above ground)

  Missing values propagate through the plain sums, count as 0 in the flags
  and count as 0 in Bath.

  Args:
    df: frame with the source columns; it is modified.

  Returns:
    The same DataFrame object, with the new columns added.
  '''
  def _positive_or_zero(series):
    # Keep strictly positive values; NaN and non-positive values become 0.
    return series.where(series > 0, 0)

  #create Total SF
  df["TotalSF"] = df["1stFlrSF"] + df["2ndFlrSF"] + df["TotalBsmtSF"] + df['LotArea']
  #create Porch
  df['PorchSF'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['3SsnPorch'] + df['ScreenPorch']
  #Create green area
  df["OutsideArea"] = df["LotArea"] - df["GrLivArea"] - df["GarageArea"]
  #Create month sold * year
  df['MonthSold'] = df['YrSold']*12 + df['MoSold']

  #Create booleans (vectorized; a NaN comparison is False and therefore maps to 0)
  flag_sources = {
    'Has2Floors': '2ndFlrSF',
    'Has1Floors': '1stFlrSF',
    'HasPorch': 'PorchSF',
    'Has2Wood': 'WoodDeckSF',
    'HasFireplace': 'Fireplaces',
  }
  for flag, source in flag_sources.items():
    df[flag] = (df[source] > 0).astype('int64')

  #create bath
  df['Bath'] = (_positive_or_zero(df['BsmtFullBath'])
                + 0.5 * _positive_or_zero(df['BsmtHalfBath'])
                + _positive_or_zero(df['FullBath'])
                + 0.5 * _positive_or_zero(df['HalfBath']))
  return df

def log_cols(df, log_cols):
  '''Shift every non-categorical column by +1, then take the natural log of the listed columns.

  The frame is modified in place and also returned. The +1 shift is applied to
  all non-categorical columns, not only the ones that are logged afterwards.
  '''
  numeric_names = [name for name in df.columns if not is_cat(df, name)]
  for name in numeric_names:
    df[name] = df[name].add(1)
  for name in log_cols:
    df[name] = df[name].pipe(np.log)
  return df

def plotme(df, cols):
  '''Draw one SalePrice-vs-column scatter plot per column in `cols`.

  Each column gets its own figure; drawing them all through the implicit
  current Axes would overlay every scatter on a single plot. 'SalePrice'
  itself is skipped.

  Args:
    df: frame containing 'SalePrice' and the columns to plot.
    cols: iterable of column labels.
  '''
  for col in cols:
    if col == 'SalePrice':
      continue
    fig, ax = plt.subplots()
    sns.scatterplot(y = df['SalePrice'], x = df[col], ax = ax)
    ax.set_title(f'SalePrice vs {col}')
    plt.show()
    # Release the figure so long column lists do not accumulate open figures.
    plt.close(fig)

def categorize_cols(df):
  """Fill missing values in place and return the frame.

  Per column that contains NaN:
    * categorical: NaN becomes the category 'MISSING' (added only if absent),
      then unused categories are dropped;
    * object: NaN becomes the string 'MISSING';
    * other columns listed in the zero-fill set: NaN becomes 0;
    * remaining columns: NaN becomes the column mean.

  Columns without NaN are left untouched.
  """
  # Columns whose gaps are filled with 0 instead of the mean.
  zero_fill = {'GarageArea', 'KitchenAbvGr', 'TotRmsAbvGrd', 'LotArea',
               'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces', 'LotFrontage',
               'WoodDeckSF', 'MasVnrArea', '2ndFlrSF', 'BsmtFinSF1',
               'BsmtFinSF2', 'BsmtUnfSF', 'LowQualFinSF', '1stFlrSF'}
  for col in df.columns:
    if not df[col].isna().any():
      continue
    dtype = df[col].dtype
    if isinstance(dtype, pd.CategoricalDtype):
      values = df[col]
      # add_categories raises if the label is already a category.
      if 'MISSING' not in values.cat.categories:
        values = values.cat.add_categories(['MISSING'])
      df[col] = values.fillna('MISSING').cat.remove_unused_categories()
    elif pd.api.types.is_object_dtype(dtype):
      # Object columns have no .cat accessor; fill them directly.
      df[col] = df[col].fillna('MISSING')
    elif col in zero_fill:
      df[col] = df[col].fillna(0)
    else:
      df[col] = df[col].fillna(df[col].mean())
  return df

# Disabled experiment: the block below is a bare triple-quoted string, so
# `encode_cols` is never defined. It label-encoded each listed column into a
# new '<col>_e' column.
"""def encode_cols(df, cols):
  '''Use Label encoder'''
  for col in cols:
    encoder = LabelEncoder()
    df[col+'_e'] = encoder.fit_transform(df[col])
  return df"""

def drop_categories(df):
  '''Return the frame without its object/category columns (as judged by `is_cat`).'''
  categorical = list(filter(lambda name: is_cat(df, name), df.columns))
  return df.drop(columns = categorical)

def iqr(df, columns, mult=3):
  '''Drop rows whose value exceeds an upper threshold, one column after another.

  For each column the threshold is (median + (Q3 - Q1)) * mult, computed on
  the rows that survived the previous columns. Only the upper tail is cut.
  The input frame is not modified.

  NOTE(review): this is not the usual Q3 + mult * IQR fence; the multiplier
  scales the median as well. Confirm that this is intended.
  '''
  kept = df.copy()
  for name in columns:
    stats = kept[name].describe()
    spread = stats['75%'] - stats['25%']
    ceiling = (stats['50%'] + spread) * mult
    kept = kept.loc[kept[name] <= ceiling]
  return kept

def work_df(df, to_log = None, target = None, to_drop = None):
  '''Run the preprocessing steps on a copy of `df`.

  Steps, in order: narrow dtypes, add engineered columns, drop `to_drop`,
  fill missing values, clamp OutsideArea at 0, then shift numeric columns by
  +1 and log the `target` and `to_log` columns.

  Args:
    df: raw frame; it is not modified.
    to_log: columns to log-transform. Defaults to
      ['LotFrontage', 'LotArea', 'GrLivArea', 'GarageArea'].
    target: target column(s) to log-transform as well; a single name or a
      list. Defaults to none.
    to_drop: columns removed after feature creation. Defaults to none.

  Returns:
    The processed DataFrame.

  Raises:
    ValueError: if the dtype-narrowing step yields no DataFrame.
  '''
  def _as_list(value, default):
    # None -> default; a bare string -> one-element list; anything else -> list copy.
    if value is None:
      return list(default)
    if isinstance(value, str):
      return [value]
    return list(value)

  to_log = _as_list(to_log, ['LotFrontage', 'LotArea', 'GrLivArea', 'GarageArea'])
  target = _as_list(target, [])
  to_drop = _as_list(to_drop, [])

  df = df.copy()
  df = improve_cats(df)
  if df is None:
    raise ValueError('improve_cats returned no DataFrame (unsupported column dtype?)')
  df = create_cols(df)
  df = df.drop(columns = to_drop)
  df = categorize_cols(df)

  # Negative outside areas are clamped to 0 before the +1 shift and log.
  df['OutsideArea'] = df['OutsideArea'].apply(lambda x: x if x>0 else 0)

  df = log_cols(df, target+to_log)
  return df

# Creating variables

In [None]:
# Columns that are log-transformed by work_df.
cols_to_log = [
    'LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF', 'OutsideArea',
    'MonthSold', 'TotalBsmtSF', 'YrSold', 'YearBuilt', 'YearRemodAdd',
    'GarageYrBlt', '2ndFlrSF', 'GarageArea', 'WoodDeckSF', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'LowQualFinSF', '1stFlrSF',
]

# Columns passed to iqr() for upper-tail trimming.
cols_to_iqr = [
    'SalePrice', 'LotFrontage', 'LotArea', 'MasVnrArea', 'TotalBsmtSF',
    'GrLivArea', 'BedroomAbvGr', 'KitchenAbvGr', 'Fireplaces',
    'GarageArea', 'PorchSF', 'OutsideArea', 'TotalSF', '2ndFlrSF', '1stFlrSF',
]

# Columns removed after the engineered features have been built from them.
cols_to_drop = [
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'GarageCars', 'BsmtFullBath', 'BsmtHalfBath',
    'FullBath', 'HalfBath',
]

In [None]:
# Same preprocessing for both frames. Only the training call passes the target,
# so SalePrice is shifted by +1 and logged there together with cols_to_log.
df = work_df(raw_df_train, to_log = cols_to_log, to_drop = cols_to_drop, target = ['SalePrice'])
df_test = work_df(raw_df_test, to_log = cols_to_log, to_drop = cols_to_drop)

# Upper-tail trimming is applied to the training rows only, after the log transform.
df = iqr(df, cols_to_iqr)

# Target-encode the listed categorical columns into '<col>_te'. The encoder is
# fitted on the training frame and reused for the test frame.
# NOTE(review): fitting happens before the train/validation split done in
# split_data, so validation rows contribute to their own encoding.
cols = ['Neighborhood']#, 'LotShape']#, 'FireplaceQu']
for col in cols:
  encoder = TargetEncoder(cols = col)
  df[col+'_te'] = encoder.fit_transform(df[col], df['SalePrice'])
  df_test[col+'_te'] = encoder.transform(df_test[col])

# Remaining object/category columns are dropped; only numeric features reach the model.
df = drop_categories(df)
df_test = drop_categories(df_test)

# Getting best columns

In [None]:
# Extra columns to exclude from modelling; the list is currently empty, so
# df1 and df_test1 are plain copies of df and df_test.
to_drop = []#'Modern', 'HasBsmt', 'HasGarage', 'HasPool','HasVnr']
df1 = df.copy().drop(columns = to_drop)
df_test1 = df_test.copy().drop(columns = to_drop)
# Feature names ordered by permutation importance, most important first.
columns = get_permutations(df1)

[('MonthSold', 0.2682867385596045), ('MoSold', 0.24922580498735147), ('YrSold', 0.23246554353010956), ('OverallQual', 0.16205822248210247), ('GrLivArea', 0.15353910189296355), ('Has2Floors', 0.10598184728778386), ('2ndFlrSF', 0.08735412265171047), ('TotalSF', 0.07067098914982259), ('Neighborhood_te', 0.06263170717416695), ('YearBuilt', 0.04682110035012317), ('OverallCond', 0.04179169272639469), ('1stFlrSF', 0.03582048355341118), ('Bath', 0.02335105742132554), ('WoodDeckSF', 0.02308061764458409), ('LotArea', 0.018783982464302407), ('BsmtFinSF1', 0.014727096876807077), ('GarageArea', 0.013963444552012894), ('YearRemodAdd', 0.012406097262180471), ('TotalBsmtSF', 0.012376029024834357), ('Fireplaces', 0.008436888878886484), ('PorchSF', 0.008095869281135449), ('GarageYrBlt', 0.007342359323277414), ('BedroomAbvGr', 0.007196398708019513), ('Has2Wood', 0.0063019872757096975), ('TotRmsAbvGrd', 0.003406528902063455), ('KitchenAbvGr', 0.002649563168616176), ('BsmtFinSF2', 0.002052786928299488), ('

In [None]:
# Target first, then the ranked features; fit the model and print its scores.
# NOTE(review): `cols` is reused here; earlier it held the columns to target-encode.
cols = ['SalePrice']+columns
model, txt = split_run_test(df1[cols])

train r2: 0.9309937150793202
train rmse: 0.00836349649226021

test r2: 1.0430737832343562
test rmse: 0.007551981269388807



In [None]:
# Predict for the test frame; the model was fitted on the logged target, so
# these predictions are on that log scale.
# NOTE(review): df_test1 is built earlier but df_test is used here; the two are
# equivalent only while to_drop is empty.
y_pred_test = model.predict(df_test[columns])

In [None]:
# Take the ids from the test frame's index (the CSV's first column) instead of a
# hard-coded range, so they always line up with the predicted rows.
ids = df_test.index.to_numpy()
# The target was transformed as log(SalePrice + 1), so invert with expm1.
my_result = pd.DataFrame({'Id': ids, 'SalePrice': np.expm1(y_pred_test)})
my_result

Unnamed: 0,Id,SalePrice
0,1461,126932.485780
1,1462,156034.167337
2,1463,173411.301264
3,1464,185059.773791
4,1465,183938.810189
...,...,...
1454,2915,93230.582971
1455,2916,94607.662694
1456,2917,175704.332953
1457,2918,118641.277116


In [None]:
# Write the submission file; the DataFrame index is omitted because Id is a column.
my_result.to_csv('submission.csv', index=False)
# `files` comes from google.colab, so this download step only works when run in Colab.
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>