In [285]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, recall_score, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score,cross_validate

## Data preparing

### Full data

In [336]:
cancer_data = pd.read_csv('../../data/cancer_data_v4.csv')
cancer_data.drop(['WBC', 'EO%', 'EO', 'GLO', 'NEUT%', 'HCT', 'TCH','TBIL', 'IBIL', 'LDH-L'], axis=1, inplace=True)
for fea in cancer_data.columns[6:]:
    median = cancer_data.groupby(['Age_group', 'A=male B=female'])[fea].median()
    cancer_data.set_index(['Age_group', 'A=male B=female'], inplace=True)
    cancer_data[fea].fillna(median, inplace=True)
    cancer_data.reset_index(inplace=True)
cancer_data['A=Case, B=Control'] = cancer_data['A=Case, B=Control'].apply(lambda x: 1 if x=='A' else 0)
X_train, X_test, y_train, y_test = train_test_split(cancer_data.drop(['Origin','Age_group','A=male B=female','ID', 'A=Case, B=Control', 'Pattern'], axis=1), cancer_data['A=Case, B=Control'], test_size=0.3, random_state=0)

### Physical Center

In [293]:
cancer_data = pd.read_csv('../../data/cancer_data_v4.csv')
cancer_data.drop(['WBC', 'EO%', 'EO', 'GLO', 'NEUT%', 'HCT', 'TCH','TBIL', 'IBIL', 'LDH-L'], axis=1, inplace=True)
for fea in cancer_data.columns[6:]:
    median = cancer_data.groupby(['Age_group', 'A=male B=female'])[fea].median()
    cancer_data.set_index(['Age_group', 'A=male B=female'], inplace=True)
    cancer_data[fea].fillna(median, inplace=True)
    cancer_data.reset_index(inplace=True)
cancer_data['A=Case, B=Control'] = cancer_data['A=Case, B=Control'].apply(lambda x: 1 if x=='A' else 0)

group_b = cancer_data[cancer_data['Origin']=='B']
group_c = cancer_data[cancer_data['Origin']=='C'].sample(n=212)

X_train, X_test, y_train, y_test = train_test_split(pd.concat([group_b,group_c], axis=0).drop(['Origin','Age_group','A=male B=female','ID', 'A=Case, B=Control', 'Pattern'], axis=1), pd.concat([group_b,group_c], axis=0)['A=Case, B=Control'].sample(frac=1,random_state=0), test_size=0.2, random_state=0)

In [294]:
print(len(X_train), len(X_test))

339 85


## RandomForestClassifier

In [296]:
rf = RandomForestClassifier(oob_score=True,random_state=0)
rf.fit(X_train, y_train)
print(rf.oob_score_)

0.5191740412979351


# n_estimators

In [300]:
%%time
param_test1 = {'n_estimators':range(30,80,5)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), 
                       param_grid = param_test1, scoring='recall',cv=5)
gsearch1.fit(X_train,y_train)
gsearch1.best_params_, gsearch1.best_score_

Wall time: 4.48 s


({'n_estimators': 35}, 0.5299465240641711)

### max_depth & min_samples_split

In [303]:
%%time
param_test2 = {'max_depth':range(1,20,2), 'min_samples_split':range(2,20,2)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 35,random_state=0),
   param_grid = param_test2, scoring='recall', cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

Wall time: 24.3 s


({'max_depth': 13, 'min_samples_split': 6}, 0.5593582887700534)

### min_samples_split & min_samples_leaf

In [307]:
%%time
param_test3 = {'min_samples_split':range(2,10,1), 'min_samples_leaf':range(1,20,2)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 35, max_depth=13,oob_score=True, random_state=10),
            param_grid = param_test3, scoring='recall', cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

Wall time: 24.8 s


({'min_samples_leaf': 15, 'min_samples_split': 2}, 0.5477718360071301)

### max_features

In [308]:
%%time
param_test4 = {'max_features':range(4,30,2)}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 35, max_depth=13, min_samples_split=2,
                                  min_samples_leaf=15 ,oob_score=True, random_state=0),
param_grid = param_test4, scoring='recall', cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

Wall time: 4.66 s


({'max_features': 14}, 0.5)

In [312]:
rf2 = RandomForestClassifier(n_estimators= 35, max_depth=13, min_samples_split=2,min_samples_leaf=15,oob_score=True, max_features=14, random_state=0)
rf2.fit(X_train,y_train)

y_pred = rf2.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

rf_prob = rf2.predict_proba(X_test)[:,1]
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_prob)
rf_auc = auc(rf_fpr, rf_tpr)

print("ACC:",accuracy_score(y_test, y_pred))
print("REC:",recall_score(y_test, y_pred))
print("SPC:",tn/(tn+fp))
print("AUC:",rf_auc)

ACC: 0.43529411764705883
REC: 0.4772727272727273
SPC: 0.3902439024390244
AUC: 0.45177383592017734


In [313]:
importances = pd.DataFrame(rf2.feature_importances_, columns = ['rf_importance'], index=X_train.columns).sort_values(by='rf_importance', ascending=False)

## XGBoost

In [113]:
from xgboost import XGBClassifier

In [315]:
xgb = XGBClassifier(random_state=0)
xgb.fit(X_train,y_train)

y_pred = xgb.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
xgb_prob = xgb.predict_proba(X_test)[:,1]
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, xgb_prob)
xgb_auc = auc(xgb_fpr, xgb_tpr)
print("ACC:",accuracy_score(y_test, y_pred))
print("REC:",recall_score(y_test, y_pred))
print("SPC:",tn/(tn+fp))
print("AUC:",xgb_auc)

ACC: 0.49411764705882355
REC: 0.4772727272727273
SPC: 0.5121951219512195
AUC: 0.5210643015521064


In [144]:
xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### n_estimators

In [316]:
%%time
param_test1 = {'n_estimators':range(50,200,10)}
gsearch1 = GridSearchCV(estimator = XGBClassifier(random_state=0), 
                       param_grid = param_test1, scoring='recall',cv=5)
gsearch1.fit(X_train,y_train)
gsearch1.best_params_, gsearch1.best_score_

Wall time: 3.04 s


({'n_estimators': 130}, 0.4586452762923351)

### min_child_weight & max_depth

In [317]:
%%time
param_test2 = {
 'max_depth':[2,3,4,5,6],
 'min_child_weight':[2,3,4,5,6,7]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier(n_estimators=130,random_state=0), 
                       param_grid = param_test2, scoring='recall',cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

Wall time: 5.24 s


({'max_depth': 5, 'min_child_weight': 7}, 0.5351158645276293)

### Gamma

In [323]:
%%time
param_test3 = {
 'gamma':[i/100.0 for i in range(25,45, 1)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier(n_estimators=130,max_depth=5, min_child_weight=7, random_state=0), 
                       param_grid = param_test3, scoring='recall',cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

Wall time: 3.51 s


({'gamma': 0.34}, 0.5411764705882354)

### subsample & colsample_bytree

In [324]:
%%time
param_test4 = {
 'subsample':[i/100.0 for i in range(80,100)],
 'colsample_bytree':[i/100.0 for i in range(90,101)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier(n_estimators=130,max_depth=5, min_child_weight=7, gamma=0.34, random_state=0), 
                       param_grid = param_test4, scoring='recall',cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

Wall time: 37 s


({'colsample_bytree': 1.0, 'subsample': 0.94}, 0.5709447415329769)

### reg_alpha & reg_lambda

In [327]:
%%time
param_test5 = {
 'reg_alpha':[0, 1e-3, 2e-3, 3e-3, 4e-3],
 'reg_lambda': [0.5, 1, 1.5, 2]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier(n_estimators=130,max_depth=5, min_child_weight=7, gamma=0.34, colsample_bytree=1, subsample=0.94,random_state=0), 
                       param_grid = param_test5, scoring='recall',cv=5)
gsearch5.fit(X_train,y_train)
gsearch5.best_params_, gsearch5.best_score_

Wall time: 3.51 s


({'reg_alpha': 0, 'reg_lambda': 1}, 0.5709447415329769)

### learning_rate

In [330]:
%%time
param_test6 = {
    'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2, 0.3, 0.4]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier(n_estimators=130,max_depth=3, min_child_weight=7, gamma=0.34, colsample_bytree=1, subsample=0.94, reg_alpha=0, reg_lambda=1, random_state=0), 
                       param_grid = param_test6, scoring='recall',cv=5)
gsearch6.fit(X_train,y_train)
gsearch6.best_params_, gsearch6.best_score_

Wall time: 1.23 s


({'learning_rate': 0.1}, 0.5055258467023174)

In [333]:
xgb2 = XGBClassifier(n_estimators=130,max_depth=3, min_child_weight=7, gamma=0.34, colsample_bytree=1, subsample=0.94, reg_alpha=0, reg_lambda=1, learning_rate=0.1,random_state=0)
xgb2.fit(X_train,y_train)
y_pred = xgb2.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

xgb_prob = xgb2.predict_proba(X_test)[:,1]
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, xgb_prob)
xgb_auc = auc(xgb_fpr, xgb_tpr)

print("ACC:",accuracy_score(y_test, y_pred))
print("REC:",recall_score(y_test, y_pred))
print("SPC:",tn/(tn+fp))
print("AUC:",xgb_auc)

ACC: 0.611764705882353
REC: 0.5909090909090909
SPC: 0.6341463414634146
AUC: 0.5814855875831485


In [334]:
importances['xgb_importance'] = pd.DataFrame(xgb2.feature_importances_, index=X_train.columns)
importances.sort_values(by='xgb_importance', ascending=False)

Unnamed: 0,rf_importance,xgb_importance
BASO,0.033166,0.048534
RBC,0.042941,0.037253
PLT,0.034554,0.036376
BUN,0.030496,0.034813
MCV,0.027678,0.034582
RDW-CV,0.034931,0.033044
TP,0.032907,0.032932
RDW-SD,0.006911,0.031402
NEUT,0.054752,0.031271
GLU,0.049081,0.031186


# model export

In [198]:
from joblib import dump, load

In [335]:
# dump(rf2, '../model/rf_clf_mini')
# dump(xgb2, '../model/xgb_clf_mini')

['../model/xgb_clf_mini']