In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
import xgboost
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv('kaggle/train.csv',low_memory=False)
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [3]:
df0 = train.copy()
df0.isna().sum()

id        0
cat0      0
cat1      0
cat2      0
cat3      0
cat4      0
cat5      0
cat6      0
cat7      0
cat8      0
cat9      0
cont0     0
cont1     0
cont2     0
cont3     0
cont4     0
cont5     0
cont6     0
cont7     0
cont8     0
cont9     0
cont10    0
cont11    0
cont12    0
cont13    0
target    0
dtype: int64

In [4]:
df0 = df0.dropna(subset=['target'])
df0.isna().sum()

id        0
cat0      0
cat1      0
cat2      0
cat3      0
cat4      0
cat5      0
cat6      0
cat7      0
cat8      0
cat9      0
cont0     0
cont1     0
cont2     0
cont3     0
cont4     0
cont5     0
cont6     0
cont7     0
cont8     0
cont9     0
cont10    0
cont11    0
cont12    0
cont13    0
target    0
dtype: int64

In [5]:
df1 = df0.copy()

In [6]:
for key,value in df1.items():
  if not pd.api.types.is_numeric_dtype(value):
    df1[key] = pd.Categorical(value).codes

In [7]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300000 entries, 0 to 299999
Data columns (total 26 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      300000 non-null  int64  
 1   cat0    300000 non-null  int8   
 2   cat1    300000 non-null  int8   
 3   cat2    300000 non-null  int8   
 4   cat3    300000 non-null  int8   
 5   cat4    300000 non-null  int8   
 6   cat5    300000 non-null  int8   
 7   cat6    300000 non-null  int8   
 8   cat7    300000 non-null  int8   
 9   cat8    300000 non-null  int8   
 10  cat9    300000 non-null  int8   
 11  cont0   300000 non-null  float64
 12  cont1   300000 non-null  float64
 13  cont2   300000 non-null  float64
 14  cont3   300000 non-null  float64
 15  cont4   300000 non-null  float64
 16  cont5   300000 non-null  float64
 17  cont6   300000 non-null  float64
 18  cont7   300000 non-null  float64
 19  cont8   300000 non-null  float64
 20  cont9   300000 non-null  float64
 21  cont10  30

In [8]:
X = df1.drop("target",axis=1)
y = df1["target"]

### Splitting data

In [9]:
#let's create our validation set
X_train,X_valid,y_train,y_valid = train_test_split(X,y,test_size=0.2)
X_train.shape,X_valid.shape,y_train.shape,y_valid.shape

((240000, 25), (60000, 25), (240000,), (60000,))

### first random forest model

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
scaler = StandardScaler()
X_tr_sc = scaler.fit_transform(X_train)
X_valid_sc = scaler.transform(X_valid)

In [12]:
model_scaled = RandomForestRegressor(random_state=42)
model_scaled.fit(X_tr_sc,y_train)

RandomForestRegressor(random_state=42)

In [13]:
#making predictions of scaled valid set
y_preds_sc = model_scaled.predict(X_valid_sc)

In [23]:
from sklearn.metrics import mean_squared_error as mse

In [15]:
#evaluating our scaled model with mse
mse = mean_squared_error(y_valid,y_preds_sc)

In [16]:
#root mean square error
rmse = np.sqrt(mse)
rmse

0.738092413786461

### let's tune hyperparameters of our random forest model

In [17]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [18]:
#let's create our grid
grid = { "n_estimators": [10,50,100],
         "max_depth" : [None, 3, 5, 10],
         "min_samples_split" : np.arange(2,20,2),
         "min_samples_leaf" : np.arange(1,20,2)}

In [20]:
#random search for random forest
rf_mo = RandomizedSearchCV(RandomForestRegressor(),
                           param_distributions=grid,
                           n_iter=3,
                           n_jobs = -1,
                           cv=3,
                           verbose=0)
rf_mo.fit(X_tr_sc,y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=3, n_jobs=-1,
                   param_distributions={'max_depth': [None, 3, 5, 10],
                                        'min_samples_leaf': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]),
                                        'n_estimators': [10, 50, 100]})

In [21]:
rf_mo.best_estimator_

RandomForestRegressor(max_depth=10, min_samples_leaf=11)

In [22]:
#evaluating model fct
from sklearn.metrics import mean_absolute_error as mae

In [24]:
from sklearn.metrics import r2_score

In [27]:
def eval0(model):
    train_pred = model.predict(X_train)
    valid_pred = model.predict(X_valid)
    dict_met = {"MAE metric train score":mae(y_train,train_pred),
               "MAE metric valid score":mae(y_valid,valid_pred),
               "MSE metric train score":mse(y_train,train_pred),
               "MSE metric valid score":mse(y_valid,valid_pred),
               "RMSE metric valid score":np.sqrt(mse(y_valid,valid_pred)),
               "R2 metric train score":r2_score(y_train,train_pred),
               "R2 metric valid score":r2_score(y_valid,valid_pred)}
    return dict_met

In [28]:
eval0(rf_mo)

{'MAE metric train score': 0.592436753839625,
 'MAE metric valid score': 0.5954358597831769,
 'MSE metric train score': 0.5565633477677252,
 'MSE metric valid score': 0.5607706639072993,
 'RMSE metric valid score': 0.7488462218021129,
 'R2 metric train score': -0.00011325710029885805,
 'R2 metric valid score': -0.0001147344665333172}

In [29]:
model_scaled1 = RandomForestRegressor(min_samples_leaf=11,random_state=42)
model_scaled1.fit(X_tr_sc,y_train)

RandomForestRegressor(min_samples_leaf=11, random_state=42)

In [30]:
eval0(model_scaled1)

{'MAE metric train score': 0.592325967210032,
 'MAE metric valid score': 0.5955090494812938,
 'MSE metric train score': 0.5594058939176837,
 'MSE metric valid score': 0.5636584062158716,
 'RMSE metric valid score': 0.7507718736179929,
 'R2 metric train score': -0.005221153802255962,
 'R2 metric valid score': -0.005264921197098316}

In [31]:
X.shape

(300000, 25)

### let's check a simple model

In [32]:
#let's have our predicted value be the mean of all our target values
y.mean()

8.241978552366994

In [34]:
#let's evaluate how our model performed
np.sqrt(mse(y_train,(np.ones(len(y_train))*y.mean())))

0.7459896084954153

### Let's create our XGBOOST model