In [129]:
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from sklearn import metrics

In [130]:
import pandas as pd
import numpy as np

In [131]:
import math

In [132]:
Path = "data/"

In [134]:
df = pd.read_csv(f"{Path}train.csv",low_memory=False)

In [202]:
df.head(100)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,4,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,12.247694
1,2,20,4,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,12.109011
2,3,60,4,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,12.317167
3,4,70,4,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,11.849398
4,5,60,4,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,12.429216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,60,4,,9765,Pave,,IR2,Lvl,AllPub,...,0,,,Shed,480,4,2009,WD,Normal,12.128111
96,97,20,4,78.0,10264,Pave,,IR1,Lvl,AllPub,...,0,,,,0,8,2006,WD,Normal,12.273731
97,98,20,4,73.0,10921,Pave,,Reg,HLS,AllPub,...,0,,,,0,5,2007,WD,Normal,11.458997
98,99,30,4,85.0,10625,Pave,,Reg,Lvl,AllPub,...,0,,,Shed,400,5,2010,COD,Abnorml,11.326596


## RMSLE

In [137]:
df['SalePrice'] = np.log(df.SalePrice)

## Categorical Transformation

In [138]:
df['MSZoning'].unique()

array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object)

In [144]:
for n,c in df[199:200].items():
    print(n) # name of column
    print(c) # index values
    print("----------------------------")
    

Id
199    200
Name: Id, dtype: int64
----------------------------
MSSubClass
199    20
Name: MSSubClass, dtype: int64
----------------------------
MSZoning
199    RL
Name: MSZoning, dtype: category
Categories (5, object): ['C (all)' < 'FV' < 'RH' < 'RL' < 'RM']
----------------------------
LotFrontage
199    76.0
Name: LotFrontage, dtype: float64
----------------------------
LotArea
199    9591
Name: LotArea, dtype: int64
----------------------------
Street
199    Pave
Name: Street, dtype: category
Categories (2, object): ['Grvl' < 'Pave']
----------------------------
Alley
199    NaN
Name: Alley, dtype: category
Categories (2, object): ['Grvl' < 'Pave']
----------------------------
LotShape
199    Reg
Name: LotShape, dtype: category
Categories (4, object): ['IR1' < 'IR2' < 'IR3' < 'Reg']
----------------------------
LandContour
199    Lvl
Name: LandContour, dtype: category
Categories (4, object): ['Bnk' < 'HLS' < 'Low' < 'Lvl']
----------------------------
Utilities
199    AllPub
Name

In [145]:
def train_categorize(df):
    for n,c in df.items():
        if pd.api.types.is_string_dtype(c): # if value is a string in train data
            df[n] = c.astype("category").cat.as_ordered() # categorize the value

When I apply categorization for train set then I have to use the same categorization for validation and test sets.
Because of that I wil write a second method that uses train's categorization for validation and test sets.

In [146]:
def apply_categorize(df,train):
    for n,c in df.items():
        if train[n].dtype == "category":
            df[n] = pd.Categorical(c,categories= train[n].cat.categories,ordered=True)

In [147]:
train_categorize(df)

In [149]:
df['MSZoning']

0       RL
1       RL
2       RL
3       RL
4       RL
        ..
1455    RL
1456    RL
1457    RL
1458    RL
1459    RL
Name: MSZoning, Length: 1460, dtype: category
Categories (5, object): ['C (all)' < 'FV' < 'RH' < 'RL' < 'RM']

In [153]:
df['MSZoning'].cat.categories

Index(['C (all)', 'FV', 'RH', 'RL', 'RM'], dtype='object')

When we categorize the variables in the front we do not see any differences but in the backend we can see that it is mapping and it will store the variables as numerical values by typing ".cat.codes" Also missing values are -1 for pandas. To change the column in the front too we are going to write a numericalize method 

In [203]:
def numericalize(df,col,name):
    if not pd.api.types.is_numeric_dtype(col):
        df[name] = col.cat.codes + 1

In [204]:
numericalize(df,df['MSZoning'],"MSZoning")
df['MSZoning']

0       4
1       4
2       4
3       4
4       4
       ..
1455    4
1456    4
1457    4
1458    4
1459    4
Name: MSZoning, Length: 1460, dtype: int8

In [220]:
def fix_missing(df,col,name):
    if pd.api.types.is_numeric_dtype(col):
        if pd.isnull(col).sum:
            df[name+"na"] = pd.isnull(col)
        df[name] = col.fillna(df[name].median())

In [221]:
def proc_df(df):
    
    df = df.copy()
    for n, c in df.items():
        fix_missing(df,c,n)
        numericalize(df,c,n)
    
    return df

In [222]:
df_processed,y = proc_df(df,"SalePrice")

In [223]:
df_processed.head(100)

0     12.247694
1     12.109011
2     12.317167
3     11.849398
4     12.429216
        ...    
95    12.128111
96    12.273731
97    11.458997
98    11.326596
99    11.767180
Name: SalePrice, Length: 100, dtype: float64

## TRAIN VALIDATION SPLIT

In [224]:
df_processed["YrSold"]

0       2008
1       2007
2       2008
3       2006
4       2008
        ... 
1455    2007
1456    2010
1457    2010
1458    2010
1459    2008
Name: YrSold, Length: 1460, dtype: int64

In [225]:
df_sorted = df_processed.sort_values(by='YrSold')
df_sorted["YrSold"].tail(100)

142     2010
1358    2010
234     2010
233     2010
230     2010
        ... 
353     2010
1322    2010
162     2010
158     2010
1284    2010
Name: YrSold, Length: 100, dtype: int64

In [228]:
def split_train_val(df,n):
    # df = dataset of the validation and train
    # the number of the train dataset
    return df[:n].copy(),df[n:].copy() # We are giving the current data to the validation

In [229]:
len(df_sorted)

1460

In [236]:
n_valid = 360
n_train = len(df_sorted) - n_valid
train,valid = split_train_val(df_sorted,n_train)

In [237]:
y_train = train["SalePrice"].values
train.drop("SalePrice", axis=1, inplace=True)
x_train = train

In [239]:
y_valid = valid["SalePrice"].values
valid.drop("SalePrice", axis=1, inplace=True)
x_valid = valid

In [240]:
print(f"x train length: {len(x_train)}")
print(f"y train length: {len(y_train)}")
print(f"x valid length: {len(x_valid)}")
print(f"y valid length: {len(y_valid)}")

x train length: 1100
y train length: 1100
x valid length: 360
y valid length: 360


## Train Random Forest

In [241]:
m = RandomForestRegressor(n_estimators=1, bootstrap=False,n_jobs=-1)
m.fit(x_train,y)

RandomForestRegressor(bootstrap=False, n_estimators=1, n_jobs=-1)

In [242]:
def rmse(x,y):
    return math.sqrt(((x-y)**2).mean())

In [None]:
def print_score(m):
    print("RMSE ERRORS")
    print(f"Train set: {rmse(m.predict(x_train,y_train))}")
    print(f"Validation set: {rmse(m.predict(x_train,y_train))}")
    print("R^2 ERRORS")
    print(f"Train set: {m.score(x_train,y_train)}")
    print(f"Validation set: {m.score(x_train,y_train)}")