## Training a Random Forest regressor with 5-fold cross-validation

In [1]:
# Data handling and model persistence.
import pandas as pd
import joblib
# Estimator and exhaustive hyper-parameter search.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import warnings
# NOTE(review): this silences every warning raised in the session, not just noisy ones.
warnings.filterwarnings('ignore')
# Base estimator with default settings; it is the estimator passed to GridSearchCV.
RNF_model = RandomForestRegressor()
# Regression metrics used by the evaluation cells.
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
# NOTE(review): `time` is not used by any cell visible in this notebook.
from time import time

In [2]:
# Training features and their labels, read from the project's Data folder.
tr_feature, tr_label = (
    pd.read_csv('Data/train_features.csv'),
    pd.read_csv('Data/train_label.csv'),
)

### Creating a helper function to print the grid-search results

In [3]:
def print_result(results):
    """Print the best parameters and the score of every searched candidate.

    Args:
        results: A fitted search object exposing ``best_params_`` and a
            ``cv_results_`` mapping with ``mean_test_score``,
            ``std_test_score`` and ``params`` entries (for example a fitted
            ``GridSearchCV``).
    """
    print(f'BEST PARAMS : {results.best_params_}')

    cv_results = results.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    params = cv_results['params']

    for mean, std, param in zip(means, stds, params):
        # The spread is reported as twice `std_test_score`; the closing
        # parenthesis after it was missing from the original format string.
        print(f'{round(mean, 3)} (+/-{round(std * 2, 3)}) for {param}')

#### Tuning the model's hyperparameters

In [None]:
# Candidate values explored exhaustively by the grid search.
# NOTE(review): random_state is searched like a hyper-parameter, so the grid
# holds 7 * 7 * 6 = 294 candidates, each evaluated with 5-fold CV.
parameters = {
    'n_estimators': [5, 10, 25, 30, 100, 150, 200],
    'max_depth': [2, 4, 8, 16, 32, 64, None],
    'random_state': [0, 10, 42, 40, 75, 100],
}
cv = GridSearchCV(estimator=RNF_model, param_grid=parameters, cv=5)
# ravel() flattens the label values into the 1-D target array.
cv.fit(X=tr_feature, y=tr_label.values.ravel())
print_result(cv)

In [None]:
# Echo the estimator selected by the grid search (shown as the cell output).
cv.best_estimator_

In [None]:
# Persist the estimator selected by the grid search as a .pkl file.
joblib.dump(value=cv.best_estimator_, filename='Data/RNF_model.pkl')

In [4]:
# Validation split: features and labels.
val_features, val_labels = (
    pd.read_csv('Data/val_features.csv'),
    pd.read_csv('Data/val_label.csv'),
)

# Test split: features and labels.
te_features, te_labels = (
    pd.read_csv('Data/test_features.csv'),
    pd.read_csv('Data/test_label.csv'),
)

#### Loading the saved .pkl file into `model`

In [5]:
# Reload the persisted estimator. joblib.load unpickles the file, so only load
# model files that come from a trusted source.
model = joblib.load('Data/RNF_model.pkl')

### Evaluating on validation dataset

In [6]:
# Predictions of the reloaded model for the validation features.
val = model.predict(X=val_features)

In [7]:
# Mean absolute error on the validation split.
mean_absolute_error(y_true=val_labels, y_pred=val)

1230.011329883635

In [8]:
# Mean squared error on the validation split.
mean_squared_error(y_true=val_labels, y_pred=val)

5178736.651257776

In [9]:
# Coefficient of determination (R^2) on the validation split.
r2_score(y_true=val_labels, y_pred=val)

0.7859793072885788

### Evaluating on test dataset

In [10]:
# Predictions of the reloaded model for the test features.
tst = model.predict(X=te_features)

In [13]:
# Mean absolute error on the test split.
mean_absolute_error(y_true=te_labels, y_pred=tst)

1148.05068012082

In [14]:
# Mean squared error on the test split.
mean_squared_error(y_true=te_labels, y_pred=tst)

3392378.144676696

### The R² score on the test set (0.835) is higher than on the validation set (0.786)

In [9]:
# Coefficient of determination (R^2) on the test split.
r2_score(y_true=te_labels, y_pred=tst)

0.8351807590049238