In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, recall_score, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score,cross_validate

## Data preparing

In [5]:
cancer_data = pd.read_csv('../../data/cancer_data_v4.csv')

cancer_data.drop(['WBC', 'EO%', 'EO', 'GLO', 'NEUT%', 'HCT', 'TCH','TBIL', 'IBIL', 'LDH-L'], axis=1, inplace=True)
for fea in cancer_data.columns[6:]:
    median = cancer_data.groupby(['Origin','Age_group', 'A=male B=female'])[fea].median()
    cancer_data.set_index(['Origin','Age_group', 'A=male B=female'], inplace=True)
    cancer_data[fea].fillna(median, inplace=True)
    cancer_data.reset_index(inplace=True)
cancer_data['A=Case, B=Control'] = cancer_data['A=Case, B=Control'].apply(lambda x: 1 if x=='A' else 0)

group_b = cancer_data[cancer_data['Origin']=='B']
group_c = cancer_data[cancer_data['Origin']=='C'].sample(n=212,random_state=0)

data_bc = pd.concat([group_b,group_c], axis=0).sample(frac=1,random_state=0)# 424
data_ac = cancer_data.append(data_bc).drop_duplicates(keep=False).sample(frac=1,random_state=0) # 7368

In [6]:
X1, y1 = data_bc.drop(['Origin','Age_group','A=male B=female','ID', 'A=Case, B=Control', 'Pattern'], axis=1), data_bc['A=Case, B=Control']
X2, y2 = data_ac.drop(['Origin','Age_group','A=male B=female','ID', 'A=Case, B=Control', 'Pattern'], axis=1), data_ac['A=Case, B=Control']

## RandomForestClassifier

In [32]:
rf = RandomForestClassifier(n_estimators= 260, oob_score=True,random_state=0)
rf.fit(X2, y2)
print(rf.oob_score_) # 0.9698697068403909

0.9698697068403909


# n_estimators

In [9]:
%%time
param_test1 = {'n_estimators':range(150,350,10)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), param_grid = param_test1, scoring='recall',cv=5)
gsearch1.fit(X2,y2)
gsearch1.best_params_, gsearch1.best_score_

Wall time: 6min 2s


({'n_estimators': 260}, 0.9682710923794147)

### max_depth & min_samples_split

In [19]:
%%time
param_test2 = {'max_depth':range(10,51,5), 'min_samples_split':range(10,101,10)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 260,random_state=0), param_grid = param_test2, scoring='recall', cv=5)
gsearch2.fit(X2,y2)
gsearch2.best_params_, gsearch2.best_score_

Wall time: 26min 38s


({'max_depth': 20, 'min_samples_split': 10}, 0.9672132407931615)

### min_samples_split & min_samples_leaf

In [23]:
%%time
param_test3 = {'min_samples_split':range(10,101,20), 'min_samples_leaf':range(10,101,20)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 260, max_depth=20,oob_score=True, random_state=0), param_grid = param_test3, scoring='recall', cv=5)
gsearch3.fit(X2,y2)
gsearch3.best_params_, gsearch3.best_score_

Wall time: 5min 53s


({'min_samples_leaf': 10, 'min_samples_split': 10}, 0.9566399670098482)

### max_features

In [28]:
%%time
param_test4 = {'max_features':range(2,30,2)}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 260, oob_score=True, random_state=0), param_grid = param_test4, scoring='recall', cv=5)
gsearch4.fit(X2,y2)
gsearch4.best_params_, gsearch4.best_score_

Wall time: 11min 27s


({'max_features': 22}, 0.9706520447603669)

In [33]:
rf2 = RandomForestClassifier(n_estimators= 260, oob_score=True, random_state=0)
rf2.fit(X2,y2)

y_pred = rf2.predict(X1)
tn, fp, fn, tp = confusion_matrix(y1, y_pred).ravel()

rf_prob = rf2.predict_proba(X1)[:,1]
rf_fpr, rf_tpr, _ = roc_curve(y1, rf_prob)
rf_auc = auc(rf_fpr, rf_tpr)

print("ACC:",accuracy_score(y1, y_pred))
print("REC:",recall_score(y1, y_pred))
print("SPC:",tn/(tn+fp))
print("AUC:",rf_auc)

ACC: 0.5377358490566038
REC: 0.08490566037735849
SPC: 0.9905660377358491
AUC: 0.7150787646849414


In [375]:
importances = pd.DataFrame(rf2.feature_importances_, columns = ['rf_importance'], index=X_train.columns).sort_values(by='rf_importance', ascending=False)

## XGBoost

In [35]:
from xgboost import XGBClassifier

### n_estimators

In [38]:
%%time
param_test1 = {'n_estimators':range(50,200,10)}
gsearch1 = GridSearchCV(estimator = XGBClassifier(random_state=0), 
                       param_grid = param_test1, scoring='recall',cv=5)
gsearch1.fit(X2,y2)
gsearch1.best_params_, gsearch1.best_score_

Wall time: 27.1 s


({'n_estimators': 120}, 1.0)

### min_child_weight & max_depth

In [40]:
%%time
param_test2 = {
 'max_depth':[2,3,4,5,6],
 'min_child_weight':[2,3,4,5,6,7]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier(n_estimators=120,random_state=0), 
                       param_grid = param_test2, scoring='recall',cv=5)
gsearch2.fit(X2,y2)
gsearch2.best_params_, gsearch2.best_score_

Wall time: 39.7 s


({'max_depth': 5, 'min_child_weight': 3}, 1.0)

### Gamma

In [368]:
%%time
param_test3 = {
 'gamma':[i/100.0 for i in range(25,45, 1)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier(n_estimators=90,max_depth=5, min_child_weight=3, random_state=0), 
                       param_grid = param_test3, scoring='recall',cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

Wall time: 2.39 s


({'gamma': 0.42}, 0.7793103448275863)

### subsample & colsample_bytree

In [369]:
%%time
param_test4 = {
 'subsample':[i/100.0 for i in range(80,100)],
 'colsample_bytree':[i/100.0 for i in range(90,101)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier(n_estimators=90,max_depth=5, min_child_weight=3, gamma=0.42, random_state=0), 
                       param_grid = param_test4, scoring='recall',cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

Wall time: 26.2 s


({'colsample_bytree': 0.92, 'subsample': 0.81}, 0.7862068965517242)

### reg_alpha & reg_lambda

In [370]:
%%time
param_test5 = {
 'reg_alpha':[0, 1e-3, 2e-3, 3e-3, 4e-3],
 'reg_lambda': [0.5, 1, 1.5, 2]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier(n_estimators=90,max_depth=5, min_child_weight=3, gamma=0.42, colsample_bytree=0.92, subsample=0.81,random_state=0), 
                       param_grid = param_test5, scoring='recall',cv=5)
gsearch5.fit(X_train,y_train)
gsearch5.best_params_, gsearch5.best_score_

Wall time: 2.44 s


({'reg_alpha': 0, 'reg_lambda': 1}, 0.7862068965517242)

### learning_rate

In [377]:
%%time
param_test6 = {
    'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2, 0.25, 0.3]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier(n_estimators=90,max_depth=5, min_child_weight=3, gamma=0.42, colsample_bytree=0.92, subsample=0.81, reg_alpha=0, reg_lambda=1, random_state=0), 
                       param_grid = param_test6, scoring='recall',cv=5)
gsearch6.fit(X_train,y_train)
gsearch6.best_params_, gsearch6.best_score_

Wall time: 980 ms


({'learning_rate': 0.3}, 0.7862068965517242)

In [42]:
xgb2 = XGBClassifier(n_estimators=120, random_state=0)
xgb2.fit(X2,y2)
y_pred = xgb2.predict(X1)
tn, fp, fn, tp = confusion_matrix(y1, y_pred).ravel()

xgb_prob = xgb2.predict_proba(X1)[:,1]
xgb_fpr, xgb_tpr, _ = roc_curve(y1, xgb_prob)
xgb_auc = auc(xgb_fpr, xgb_tpr)

print("ACC:",accuracy_score(y1, y_pred))
print("REC:",recall_score(y1, y_pred))
print("SPC:",tn/(tn+fp))
print("AUC:",xgb_auc)

ACC: 0.5023584905660378
REC: 0.0047169811320754715
SPC: 1.0
AUC: 0.6608001067995729


In [334]:
importances['xgb_importance'] = pd.DataFrame(xgb2.feature_importances_, index=X_train.columns)
importances.sort_values(by='xgb_importance', ascending=False)

Unnamed: 0,rf_importance,xgb_importance
BASO,0.033166,0.048534
RBC,0.042941,0.037253
PLT,0.034554,0.036376
BUN,0.030496,0.034813
MCV,0.027678,0.034582
RDW-CV,0.034931,0.033044
TP,0.032907,0.032932
RDW-SD,0.006911,0.031402
NEUT,0.054752,0.031271
GLU,0.049081,0.031186


# model export

In [43]:
from joblib import dump, load

In [44]:
# dump(rf2, '../model/rf_clf')
# dump(xgb2, '../model/xgb_clf')

['../model/xgb_clf']