In [285]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, recall_score, accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score,cross_validate

## Data preparing

### Full data

In [336]:
cancer_data = pd.read_csv('../../data/cancer_data_v4.csv')
cancer_data.drop(['WBC', 'EO%', 'EO', 'GLO', 'NEUT%', 'HCT', 'TCH','TBIL', 'IBIL', 'LDH-L'], axis=1, inplace=True)
for fea in cancer_data.columns[6:]:
    median = cancer_data.groupby(['Age_group', 'A=male B=female'])[fea].median()
    cancer_data.set_index(['Age_group', 'A=male B=female'], inplace=True)
    cancer_data[fea].fillna(median, inplace=True)
    cancer_data.reset_index(inplace=True)
cancer_data['A=Case, B=Control'] = cancer_data['A=Case, B=Control'].apply(lambda x: 1 if x=='A' else 0)
X_train, X_test, y_train, y_test = train_test_split(cancer_data.drop(['Origin','Age_group','A=male B=female','ID', 'A=Case, B=Control', 'Pattern'], axis=1), cancer_data['A=Case, B=Control'], test_size=0.3, random_state=0)

### Physical Center

In [397]:
cancer_data = pd.read_csv('../../data/cancer_data_v4.csv')
cancer_data.drop(['WBC', 'EO%', 'EO', 'GLO', 'NEUT%', 'HCT', 'TCH','TBIL', 'IBIL', 'LDH-L'], axis=1, inplace=True)
for fea in cancer_data.columns[6:]:
    median = cancer_data.groupby(['Age_group', 'A=male B=female'])[fea].median()
    cancer_data.set_index(['Age_group', 'A=male B=female'], inplace=True)
    cancer_data[fea].fillna(median, inplace=True)
    cancer_data.reset_index(inplace=True)
cancer_data['A=Case, B=Control'] = cancer_data['A=Case, B=Control'].apply(lambda x: 1 if x=='A' else 0)

group_b = cancer_data[cancer_data['Origin']=='B']
group_c = cancer_data[cancer_data['Origin']=='C'].sample(n=212,random_state=0)

new = pd.concat([group_b,group_c], axis=0).sample(frac=1,random_state=0)
X_train, X_test, y_train, y_test = train_test_split(new.drop(['Origin','Age_group','A=male B=female','ID', 'A=Case, B=Control', 'Pattern'], axis=1), new['A=Case, B=Control'], test_size=0.3, random_state=0)

In [398]:
X_train

Unnamed: 0,Age,Smoke,MCV,ALT,AST:ALT,GLU,BUN,Cr,RBC,HGB,...,Cystatin-C,UA,TG,HDL-C,LDL-C,ALP,GGT,CK,HBDH,T-CEA
4539,59,0.0,83.5,23.0,0.91,5.97,4.50,67.0,5.70,147.0,...,0.88,404.0,1.11,1.50,3.84,50.0,23.0,119.0,101.0,1.44
1937,53,300.0,90.7,33.0,0.88,5.29,4.55,93.3,4.83,144.0,...,0.83,293.0,3.82,0.87,2.49,56.0,46.0,162.0,110.0,0.88
5451,47,0.0,92.5,19.0,1.16,5.89,3.90,74.0,5.59,169.0,...,0.82,271.0,1.36,1.34,4.73,79.0,17.0,116.0,157.0,1.08
5364,48,0.0,93.4,12.0,0.83,4.57,4.70,80.0,5.33,164.0,...,0.91,374.0,1.78,1.49,3.83,48.0,21.0,77.0,114.0,0.65
4975,74,0.0,93.1,14.0,1.36,4.87,4.90,82.0,4.51,135.0,...,1.16,390.0,1.41,0.84,1.55,98.0,26.0,107.0,141.0,2.73
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7621,51,900.0,93.1,29.0,0.76,4.78,4.60,81.0,5.53,173.0,...,0.95,499.0,4.61,0.97,2.82,91.0,23.0,100.0,136.0,1.43
7060,66,0.0,87.3,7.0,1.17,5.94,4.40,45.0,4.81,136.0,...,0.74,289.0,1.27,1.54,3.69,93.0,11.0,148.0,177.0,1.47
7241,65,0.0,87.6,18.0,1.33,7.63,9.70,121.0,4.99,143.0,...,0.90,424.0,0.57,1.44,2.15,58.0,8.0,100.0,154.0,7.05
6362,65,0.0,98.3,14.0,1.00,4.22,7.10,89.0,4.64,148.0,...,1.21,360.0,1.16,1.70,1.64,68.0,28.0,48.0,143.0,2.82


In [382]:
print(len(X_train), len(X_test))

296 128


## RandomForestClassifier

In [350]:
rf = RandomForestClassifier(oob_score=True,random_state=0)
rf.fit(X_train, y_train)
print(rf.oob_score_)

0.7128378378378378


# n_estimators

In [353]:
%%time
param_test1 = {'n_estimators':range(30,120,5)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(random_state=0), 
                       param_grid = param_test1, scoring='recall',cv=5)
gsearch1.fit(X_train,y_train)
gsearch1.best_params_, gsearch1.best_score_

Wall time: 8.67 s


({'n_estimators': 35}, 0.7448275862068965)

### max_depth & min_samples_split

In [354]:
%%time
param_test2 = {'max_depth':range(1,20,2), 'min_samples_split':range(2,20,2)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 35,random_state=0),
   param_grid = param_test2, scoring='recall', cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

Wall time: 21.4 s


({'max_depth': 3, 'min_samples_split': 10}, 0.7517241379310345)

### min_samples_split & min_samples_leaf

In [355]:
%%time
param_test3 = {'min_samples_split':range(2,10,1), 'min_samples_leaf':range(1,20,2)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 35, max_depth=3,oob_score=True, random_state=10),
            param_grid = param_test3, scoring='recall', cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

Wall time: 20.4 s


({'min_samples_leaf': 7, 'min_samples_split': 2}, 0.7655172413793103)

### max_features

In [356]:
%%time
param_test4 = {'max_features':range(4,30,2)}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 35, max_depth=3, min_samples_split=7,
                                  min_samples_leaf=2 ,oob_score=True, random_state=0), param_grid = param_test4, scoring='recall', cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

Wall time: 3.93 s


({'max_features': 28}, 0.7862068965517242)

In [394]:
rf2 = RandomForestClassifier(n_estimators= 35, max_depth=3, min_samples_split=7,min_samples_leaf=2, max_features=2, oob_score=True, random_state=0)
rf2.fit(X_train,y_train)

y_pred = rf2.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

rf_prob = rf2.predict_proba(X_test)[:,1]
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_prob)
rf_auc = auc(rf_fpr, rf_tpr)

print("ACC:",accuracy_score(y_test, y_pred))
print("REC:",recall_score(y_test, y_pred))
print("SPC:",tn/(tn+fp))
print("AUC:",rf_auc)

ACC: 0.640625
REC: 0.5373134328358209
SPC: 0.7540983606557377
AUC: 0.7122583802299975


In [375]:
importances = pd.DataFrame(rf2.feature_importances_, columns = ['rf_importance'], index=X_train.columns).sort_values(by='rf_importance', ascending=False)

## XGBoost

In [113]:
from xgboost import XGBClassifier

In [365]:
xgb = XGBClassifier(random_state=0)
xgb.fit(X_train,y_train)

y_pred = xgb.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
xgb_prob = xgb.predict_proba(X_test)[:,1]
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, xgb_prob)
xgb_auc = auc(xgb_fpr, xgb_tpr)
print("ACC:",accuracy_score(y_test, y_pred))
print("REC:",recall_score(y_test, y_pred))
print("SPC:",tn/(tn+fp))
print("AUC:",xgb_auc)

ACC: 0.578125
REC: 0.582089552238806
SPC: 0.5737704918032787
AUC: 0.630780523611451


In [144]:
xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

### n_estimators

In [366]:
%%time
param_test1 = {'n_estimators':range(50,200,10)}
gsearch1 = GridSearchCV(estimator = XGBClassifier(random_state=0), 
                       param_grid = param_test1, scoring='recall',cv=5)
gsearch1.fit(X_train,y_train)
gsearch1.best_params_, gsearch1.best_score_

Wall time: 2.3 s


({'n_estimators': 90}, 0.710344827586207)

### min_child_weight & max_depth

In [367]:
%%time
param_test2 = {
 'max_depth':[2,3,4,5,6],
 'min_child_weight':[2,3,4,5,6,7]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier(n_estimators=90,random_state=0), 
                       param_grid = param_test2, scoring='recall',cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

Wall time: 3.31 s


({'max_depth': 5, 'min_child_weight': 3}, 0.7793103448275862)

### Gamma

In [368]:
%%time
param_test3 = {
 'gamma':[i/100.0 for i in range(25,45, 1)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier(n_estimators=90,max_depth=5, min_child_weight=3, random_state=0), 
                       param_grid = param_test3, scoring='recall',cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

Wall time: 2.39 s


({'gamma': 0.42}, 0.7793103448275863)

### subsample & colsample_bytree

In [369]:
%%time
param_test4 = {
 'subsample':[i/100.0 for i in range(80,100)],
 'colsample_bytree':[i/100.0 for i in range(90,101)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier(n_estimators=90,max_depth=5, min_child_weight=3, gamma=0.42, random_state=0), 
                       param_grid = param_test4, scoring='recall',cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

Wall time: 26.2 s


({'colsample_bytree': 0.92, 'subsample': 0.81}, 0.7862068965517242)

### reg_alpha & reg_lambda

In [370]:
%%time
param_test5 = {
 'reg_alpha':[0, 1e-3, 2e-3, 3e-3, 4e-3],
 'reg_lambda': [0.5, 1, 1.5, 2]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier(n_estimators=90,max_depth=5, min_child_weight=3, gamma=0.42, colsample_bytree=0.92, subsample=0.81,random_state=0), 
                       param_grid = param_test5, scoring='recall',cv=5)
gsearch5.fit(X_train,y_train)
gsearch5.best_params_, gsearch5.best_score_

Wall time: 2.44 s


({'reg_alpha': 0, 'reg_lambda': 1}, 0.7862068965517242)

### learning_rate

In [377]:
%%time
param_test6 = {
    'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.2, 0.25, 0.3]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier(n_estimators=90,max_depth=5, min_child_weight=3, gamma=0.42, colsample_bytree=0.92, subsample=0.81, reg_alpha=0, reg_lambda=1, random_state=0), 
                       param_grid = param_test6, scoring='recall',cv=5)
gsearch6.fit(X_train,y_train)
gsearch6.best_params_, gsearch6.best_score_

Wall time: 980 ms


({'learning_rate': 0.3}, 0.7862068965517242)

In [393]:
xgb2 = XGBClassifier(n_estimators=90,max_depth=5, min_child_weight=3, gamma=0.34, colsample_bytree=0.92, subsample=0.81, reg_alpha=0, reg_lambda=1, learning_rate=0.3,random_state=0)
xgb2.fit(X_train,y_train)
y_pred = xgb2.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

xgb_prob = xgb2.predict_proba(X_test)[:,1]
xgb_fpr, xgb_tpr, _ = roc_curve(y_test, xgb_prob)
xgb_auc = auc(xgb_fpr, xgb_tpr)

print("ACC:",accuracy_score(y_test, y_pred))
print("REC:",recall_score(y_test, y_pred))
print("SPC:",tn/(tn+fp))
print("AUC:",xgb_auc)

ACC: 0.6640625
REC: 0.5970149253731343
SPC: 0.7377049180327869
AUC: 0.7700024467824811


In [334]:
importances['xgb_importance'] = pd.DataFrame(xgb2.feature_importances_, index=X_train.columns)
importances.sort_values(by='xgb_importance', ascending=False)

Unnamed: 0,rf_importance,xgb_importance
BASO,0.033166,0.048534
RBC,0.042941,0.037253
PLT,0.034554,0.036376
BUN,0.030496,0.034813
MCV,0.027678,0.034582
RDW-CV,0.034931,0.033044
TP,0.032907,0.032932
RDW-SD,0.006911,0.031402
NEUT,0.054752,0.031271
GLU,0.049081,0.031186


# model export

In [198]:
from joblib import dump, load

In [399]:
dump(rf2, '../model/rf_clf_mini')
dump(xgb2, '../model/xgb_clf_mini')

['../model/xgb_clf_mini']