# Modelling : Random Forest (Bagging Technique)

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation and other library warnings that may matter when debugging.
warnings.simplefilter('ignore')

In [8]:
# Load the pre-cleaned dataset (relative path, resolved against the current
# working directory) and preview its first five rows.
data = pd.read_csv('cleaned_data.csv')
data.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [10]:
# Split the frame column-wise: the last column is the target (DEATH_EVENT in
# the displayed preview) and every column before it is a feature.
last_col = data.shape[1] - 1
X = data.iloc[:, :last_col]
y = data.iloc[:, last_col]

In [12]:
# Keep 80% of the rows for training and hold out the rest for testing; the
# fixed seed makes the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=12,
)

**BASELINE MODEL**

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Baseline: a forest with default hyperparameters. The seed is fixed (same
# value the tuned model uses) so the scores are reproducible across re-runs
# and directly comparable with the tuned model; an unseeded forest gives
# slightly different numbers every time this cell is executed.
RF_model = RandomForestClassifier(random_state=0)
RF_model.fit(X_train, y_train)

# Train accuracy (predictions on the data the model was fitted on)
Xtrain_pred = RF_model.predict(X_train)
print("Train accuracy : " , accuracy_score(y_train, Xtrain_pred))

# Test accuracy (predictions on the held-out split)
Xtest_pred = RF_model.predict(X_test)
print("Test accuracy : " , accuracy_score(y_test, Xtest_pred))

# CV score: mean 5-fold accuracy on the training split. cross_val_score clones
# the estimator and refits it per fold, so the fit above does not leak in.
print("CV Score : " , cross_val_score(RF_model, X_train, y_train, cv=5, scoring='accuracy').mean())

Train accuracy :  1.0
Test accuracy :  0.8833333333333333
CV Score :  0.8410460992907801


**Fine-Tuning Hyperparameters**

In [138]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over the number of trees (1 to 100 inclusive), scored by
# 5-fold cross-validated accuracy on the training split.
estimator = RandomForestClassifier(random_state=0)
params_grid = {'n_estimators': list(range(1, 101))}

fine_model = GridSearchCV(
    estimator=estimator,
    param_grid=params_grid,
    scoring='accuracy',
    cv=5,
)
fine_model.fit(X_train, y_train)

# Tree count that achieved the best mean CV accuracy
fine_model.best_params_

{'n_estimators': 98}

In [130]:
# Impurity-based importance of each feature for the best forest found by the
# grid search; the values follow the column order of X_train.
best_forest = fine_model.best_estimator_
best_forest.feature_importances_

array([0.11257136, 0.01626174, 0.09302001, 0.01359768, 0.11909618,
       0.01797102, 0.07746581, 0.14357751, 0.08663459, 0.0130415 ,
       0.01408574, 0.29267687])

In [140]:
# Refit using the tree count selected by the grid search rather than a
# hand-copied number, so this cell stays in sync if the grid, the data or the
# split changes. Requires the grid-search cell (fine_model) to have been run.
best_n_estimators = fine_model.best_params_['n_estimators']
RF_model = RandomForestClassifier(n_estimators=best_n_estimators, random_state=0)
RF_model.fit(X_train, y_train)

# Train accuracy (predictions on the data the model was fitted on)
Xtrain_pred = RF_model.predict(X_train)
print("Train accuracy : " , accuracy_score(y_train, Xtrain_pred))

# Test accuracy (predictions on the held-out split)
Xtest_pred = RF_model.predict(X_test)
print("Test accuracy : " , accuracy_score(y_test, Xtest_pred))

# CV score: mean 5-fold accuracy on the training split. cross_val_score clones
# the estimator and refits it per fold, so the fit above does not leak in.
print("CV Score : " , cross_val_score(RF_model, X_train, y_train, cv=5, scoring='accuracy').mean())

Train accuracy :  1.0
Test accuracy :  0.8833333333333333
CV Score :  0.8453900709219859
