In [None]:
# Random Forest with scikit-learn and GridSearchCV

In [None]:
# Importing libraries
import pandas as pd
import numpy as np

In [None]:
# Read the dataset; header=None means the CSV has no header row to consume
DATA_PATH = 'leukemia.csv'
data = pd.read_csv(DATA_PATH, header=None)
data.head()

In [None]:
from sklearn.preprocessing import LabelEncoder, scale

# Scale every column except the last one and wrap the result in a DataFrame.
# NOTE(review): scaling uses all rows and runs before the train/test split
# further down — confirm that is acceptable for this analysis.
feature_columns = data.iloc[:, :-1]
x = pd.DataFrame(scale(feature_columns))

x.head()

In [None]:
# Encode the last column (used as the target) into integer class ids
raw_target = data.iloc[:, -1]
label = LabelEncoder().fit(raw_target)
y = label.transform(raw_target)

In [None]:
# Distinct encoded classes together with the number of samples in each
class_values, class_counts = np.unique(y, return_counts=True)
class_values, class_counts

In [None]:
# Hold out 20% of the samples for testing; the seed keeps the split repeatable
from sklearn.model_selection import train_test_split

split_parts = train_test_split(x, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Baseline random forest: only the seed is set, everything else is default
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Score the current predictions against the held-out labels
from sklearn.metrics import accuracy_score, matthews_corrcoef,confusion_matrix, make_scorer

acc = accuracy_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
for caption, value in (("MCC:", mcc), ("Accuracy:", acc)):
    print(caption, value)

In [None]:
# Grid search over random forests that split on entropy
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(random_state=42)

# 'sqrt' is what 'auto' meant for classifiers. 'auto' was deprecated in
# scikit-learn 1.1 and is rejected from 1.3 on, so it is spelled out here
# to keep the search runnable on current versions.
param_grid = {'n_estimators': [50, 100, 200],
              'max_features': [0.01, 0.1, 0.2, 0.5, 'sqrt'],
              'max_depth': [None, 10, 20, 50, 100],
              'criterion': ['entropy']}

# 5-fold cross-validation over every combination; n_jobs=-1 uses all cores
grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)

grid.fit(X_train, y_train)


In [None]:
# Grid search over random forests that split on gini impurity
from sklearn.model_selection import GridSearchCV

rfc_gini = RandomForestClassifier(random_state=42)

# 'sqrt' is what 'auto' meant for classifiers. 'auto' was deprecated in
# scikit-learn 1.1 and is rejected from 1.3 on, so it is spelled out here
# to keep the search runnable on current versions.
param_grid = {'n_estimators': [50, 100, 200],
              'max_features': [0.01, 0.1, 0.2, 0.5, 'sqrt'],
              'max_depth': [None, 10, 20, 50, 100],
              'criterion': ['gini']}

# Search with the estimator created in this cell (the original passed `rfc`,
# leaving `rfc_gini` unused and making this cell depend on the previous one).
grid_gini = GridSearchCV(estimator=rfc_gini, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)

grid_gini.fit(X_train, y_train)


In [None]:
# Take the best estimator found by `grid`, fit it on the training split,
# and report the Matthews correlation coefficient on the test split
clf = grid.best_estimator_
y_pred = clf.fit(X_train, y_train).predict(X_test)
matthews_corrcoef(y_test, y_pred)

In [None]:
# Show the hyper-parameter combination selected by the grid search
print(grid.best_params_)

In [None]:
# Per-feature importance scores of the fitted best estimator
feature_score = clf.feature_importances_

In [None]:
# Keep the positions of the k largest importance scores.
# argsort is ascending, so the last k entries are the top k
# (most important feature last).
k = 100
importance_order = feature_score.argsort()
selected_features = importance_order[-k:]

In [None]:
# Display the selected column positions
selected_features

In [None]:
# `selected_features` comes from argsort, i.e. it holds column *positions*.
# Select by position with .iloc; .loc would treat them as column labels and
# only works while the columns happen to be a default 0..n-1 range.
X_new = x.iloc[:, selected_features]
X_new.shape

In [None]:
# Redo the 80/20 split on the reduced feature set (same seed as before)
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X_new, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Default random forest again, now trained on the current X_train
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Score of the model on the same split it was trained on
model.score(X_train,y_train)

In [None]:
# Score the current predictions against the held-out labels
acc = accuracy_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
for caption, value in (("MCC:", mcc), ("Accuracy:", acc)):
    print(caption, value)

In [None]:
# Grid search (gini criterion) on the current training split
from sklearn.model_selection import GridSearchCV

rfc = RandomForestClassifier(random_state=42)

# 'sqrt' is what 'auto' meant for classifiers. 'auto' was deprecated in
# scikit-learn 1.1 and is rejected from 1.3 on, so it is spelled out here
# to keep the search runnable on current versions.
param_grid = {'n_estimators': [50, 100, 200],
              'max_features': [0.01, 0.1, 0.2, 0.5, 'sqrt'],
              'max_depth': [None, 10, 20, 50, 100],
              'criterion': ['gini']}

# 5-fold cross-validation over every combination; n_jobs=-1 uses all cores
grid = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, verbose=1, n_jobs=-1)

grid.fit(X_train, y_train)


In [None]:
# Take the best estimator found by `grid`, fit it on the training split,
# and report the Matthews correlation coefficient on the test split
clf = grid.best_estimator_
y_pred = clf.fit(X_train, y_train).predict(X_test)
matthews_corrcoef(y_test, y_pred)

In [None]:
# Show the hyper-parameter combination selected by the grid search
print(grid.best_params_)

In [None]:
# Fit the best estimator from `grid_gini` on the current training split and
# report the Matthews correlation coefficient on the test split.
# NOTE(review): grid_gini was tuned before X_train was re-split from X_new,
# so its hyper-parameters were chosen on the full feature set — confirm that
# reusing them on the reduced features is intended.
clf_gini = grid_gini.best_estimator_
y_pred = clf_gini.fit(X_train, y_train).predict(X_test)
matthews_corrcoef(y_test, y_pred)

In [None]:
# find out on default

In [None]:
# Cross-validated MCC of `clf` on the full and on the reduced feature set.
# Fixed the misspelled `StrtifiedKFold` import, which raised ImportError.
from sklearn.model_selection import cross_val_score, StratifiedKFold

scorer = make_scorer(matthews_corrcoef)
# Pass the splitter explicitly; previously it was created but never used
kfold = StratifiedKFold(n_splits=5)

# NOTE(review): the printed labels say "Accuracy" and "entropy", but the
# scorer is MCC and `clf` was last assigned from the gini grid search on the
# reduced features — confirm which estimator and label are intended.
acc_cv = cross_val_score(clf,x,y,cv=kfold,scoring=scorer)
print("Accuracy on entropy on original data",acc_cv.mean())

acc_xnew = cross_val_score(clf,X_new,y,cv=kfold,scoring=scorer)
print("Accuracy on entropy on k=100 :",acc_xnew.mean())


In [None]:
# 5-fold cross-validation of clf_gini, scored with the Matthews correlation
# coefficient, on the full feature set and on the reduced one
from sklearn.model_selection import cross_val_score

scorer = make_scorer(matthews_corrcoef)
cv_settings = dict(cv=5, scoring=scorer)

acc_cv = cross_val_score(clf_gini, x, y, **cv_settings)
print("Accuracy on gini on original data",acc_cv.mean())

acc_xnew = cross_val_score(clf_gini, X_new, y, **cv_settings)
print("Accuracy on gini on k=100 :",acc_xnew.mean())

In [None]:
# Per-feature importance scores of the fitted gini estimator.
# NOTE(review): clf_gini was last fitted on the split taken from X_new, so
# these scores presumably index X_new's columns, not the original ones — confirm.
feature_score = clf_gini.feature_importances_

In [None]:
# Keep the positions of the k largest importance scores.
# argsort is ascending, so the top k are the last k entries: [-k:].
# The previous [::-k] was a reversed stride (every k-th element), which
# returned only a handful of indices instead of the top k.
k = 100
selected_features = feature_score.argsort()[-k:]