 Random Forest (RF)

In [22]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('../dataset/Maternal Health Risk Data Set.csv')

In [3]:
data.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel
0,25,130,80,15.0,98.0,86,high risk
1,35,140,90,13.0,98.0,70,high risk
2,29,90,70,8.0,100.0,80,high risk
3,30,140,85,7.0,98.0,70,high risk
4,35,120,60,6.1,98.0,76,low risk


In [4]:
data['codeRisk'] = pd.factorize(data.RiskLevel)[0]

In [6]:
data.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel,codeRisk
0,25,130,80,15.0,98.0,86,high risk,0
1,35,140,90,13.0,98.0,70,high risk,0
2,29,90,70,8.0,100.0,80,high risk,0
3,30,140,85,7.0,98.0,70,high risk,0
4,35,120,60,6.1,98.0,76,low risk,1


In [8]:
data.codeRisk.value_counts()

1    406
2    336
0    272
Name: codeRisk, dtype: int64

In [9]:
data.RiskLevel.value_counts()

low risk     406
mid risk     336
high risk    272
Name: RiskLevel, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder

In [12]:
le = LabelEncoder()

In [13]:
data['code_Risk'] = le.fit_transform(data.RiskLevel) 

In [14]:
data.head()

Unnamed: 0,Age,SystolicBP,DiastolicBP,BS,BodyTemp,HeartRate,RiskLevel,codeRisk,code_Risk
0,25,130,80,15.0,98.0,86,high risk,0,0
1,35,140,90,13.0,98.0,70,high risk,0,0
2,29,90,70,8.0,100.0,80,high risk,0,0
3,30,140,85,7.0,98.0,70,high risk,0,0
4,35,120,60,6.1,98.0,76,low risk,1,1


In [15]:
data.RiskLevel.value_counts()

low risk     406
mid risk     336
high risk    272
Name: RiskLevel, dtype: int64

In [40]:
df = pd.read_csv('../dataset/Maternal Health Risk Data Set.csv')


In [41]:
df['codeRisk'] = le.fit_transform(df.RiskLevel)

In [42]:
X = df.drop(['codeRisk', 'RiskLevel'], axis=1)
y = df.codeRisk

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.05 , random_state= 2)
print(y_test.shape)

(51,)


In [55]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [56]:
confusion_matrix(y_test, model.predict(X_test))

array([[12,  1,  0],
       [ 0, 21,  2],
       [ 0,  0, 15]], dtype=int64)

In [57]:
classification_report(y_test, model.predict(X_test))

'              precision    recall  f1-score   support\n\n           0       1.00      0.92      0.96        13\n           1       0.95      0.91      0.93        23\n           2       0.88      1.00      0.94        15\n\n    accuracy                           0.94        51\n   macro avg       0.95      0.95      0.94        51\nweighted avg       0.94      0.94      0.94        51\n'

In [58]:
# Definition of hyperparameters
def best_param_for_Random_Forest(number_cv=3):
    n_estimators = [int(x) for x in np.linspace(10, 100, 10)]
    max_features = ['auto', 'sqrt']
    max_depth = [2, 10]
    min_samples_split = [2, 5]
    min_samples_leaf = [1, 2]
    criterion = ['gini', 'entropy']
    bootstrap = [True, False]
    params_grid = {'n_estimators': n_estimators, 
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion': criterion,
               'bootstrap': bootstrap
              }
    # grid sarch
    list_best_param = []
    list_best_scores = []
    for i in range(2, number_cv):   
        df_grid = GridSearchCV(estimator=model, param_grid= params_grid, cv= i)
        df_grid.fit(X_train, y_train)
        list_best_param.append(df_grid.best_params_)
        list_best_scores.append(df_grid.best_score_)
# crate a data frame with the cv and the bst score
    print(pd.DataFrame(list_best_scores, index = range(2, number_cv), 
            columns = ['Best score']).sort_index())
    best_score_index = 0
# looks for the params with the best score
    for i in list_best_scores:
        if i > i+1:
            best_score_index = i.index
# best score
    best = list_best_scores.sort()
    print('best score: ',best[0])
# give the bst params

    print(list_best_param[best_score_index])

In [59]:
best_param_for_Random_Forest(number_cv = 10)

   Best score
2    0.802676
3    0.798546
4    0.825562
5    0.823467
6    0.842165
7    0.836975
8    0.834892
9    0.842160
{'bootstrap': False, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 80}
