In [1]:
import pandas as pd
from pandas import DataFrame
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Load the reduced genus relative-abundance table.
# Forward slashes are accepted on Windows as well as POSIX, whereas the
# previous back-slash path only resolved on Windows.
df = pd.read_csv("../data/genus.reabund_reduced.csv")
print(df.shape)
df.head()

(129, 9)


Unnamed: 0,id,Subgroup_6_ge,Sphingomonadaceae_unclassified,Enterococcus,Sphingomonas,Chitinophagaceae_unclassified,Mitsuokella,Methylobacterium,diagnosis
0,1063714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,3314627,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,3940838,0.002571,0.004284,0.094253,0.018851,0.0,0.005998,0.0,1
3,4380559,0.0,0.0,0.0,0.0,0.0,0.037661,0.0,0
4,5254362,0.0,0.0,0.155051,0.003844,0.0,0.003844,0.0,0


In [3]:
# Remove the sample identifier so it is not used as a feature.
# Rebinding (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second execution no longer raises KeyError because
# 'id' has already been dropped.
df = df.drop(columns='id', errors='ignore')
df.head()

Unnamed: 0,Subgroup_6_ge,Sphingomonadaceae_unclassified,Enterococcus,Sphingomonas,Chitinophagaceae_unclassified,Mitsuokella,Methylobacterium,diagnosis
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.002571,0.004284,0.094253,0.018851,0.0,0.005998,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.037661,0.0,0
4,0.0,0.0,0.155051,0.003844,0.0,0.003844,0.0,0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [5]:
# Positional split of the table: the final column is the target and every
# column before it is a predictor.
last_col = df.shape[1] - 1
X = df.iloc[:, :last_col]
y = df.iloc[:, [last_col]]  # list indexer keeps y as a one-column DataFrame

In [6]:
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV

## Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier as RFC

In [8]:
# Nested cross-validation for the random forest.
# Outer loop: 4-fold shuffled KFold whose held-out fold is scored.
# Inner loop: GridSearchCV (4-fold, ROC AUC) tunes the forest on the outer
# training portion only.
rf_seed = 42  # shared by the fold shuffling and the forests so reruns match

# The grid does not depend on the fold, so it is built once rather than on
# every iteration.
rfc_params = {'n_estimators': (5, 10, 20, 30, 50, 100),
              'min_samples_split': (8, 16, 20, 24, 32, 48),
              'min_samples_leaf': (2, 4, 6, 8, 16)
             }

rf_scores = []  # held-out ROC AUC of the tuned search, one per outer fold
rf_models = []  # unfitted forest configured with each fold's best parameters
rf_kf = KFold(n_splits=4, random_state=rf_seed, shuffle=True)
for train_index, test_index in rf_kf.split(X, y):

    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    # `iid` is no longer passed: the argument was removed from GridSearchCV in
    # scikit-learn 0.24 and passing it raises TypeError there.
    rfc = GridSearchCV(RFC(random_state=rf_seed),
                       rfc_params,
                       scoring='roc_auc',
                       cv=4,
                       n_jobs=-1)

    # y is a one-column DataFrame; flatten it to 1-D for fitting and scoring.
    rfc.fit(X_train, y_train.values.ravel())

    print(rfc.best_params_)
    # With scoring='roc_auc', score() reports ROC AUC of the refit best model.
    rf_scores.append(rfc.score(X_test, y_test.values.ravel()))
    # Keep the seed on the stored template so it matches what was tuned.
    rf_models.append(RFC(random_state=rf_seed, **rfc.best_params_))
    print()

{'min_samples_leaf': 6, 'min_samples_split': 16, 'n_estimators': 5}

{'min_samples_leaf': 6, 'min_samples_split': 16, 'n_estimators': 10}

{'min_samples_leaf': 16, 'min_samples_split': 20, 'n_estimators': 50}

{'min_samples_leaf': 2, 'min_samples_split': 16, 'n_estimators': 5}



In [29]:
# Report every outer-fold random-forest score to three decimals, then the mean.
print('[', ', '.join(f"{score:.3f}" for score in rf_scores), ']')
rf_score = np.asarray(rf_scores).mean()
print(f"The average score is {rf_score:.3f}")

[ 0.711, 0.656, 0.667, 0.649 ]
The average score is 0.671


In [10]:
# Evaluate the list of per-fold random-forest models; the trailing ';'
# suppresses the cell's output.
rf_models;

## XGBoost

In [11]:
import xgboost as xgb
from xgboost import XGBClassifier as XGB

In [12]:
# Nested cross-validation for XGBoost with the linear booster.
# Outer loop: 5-fold shuffled KFold whose held-out fold is scored.
# Inner loop: GridSearchCV (3-fold, ROC AUC) tunes n_estimators and
# learning_rate on the outer training portion only.

# Fixed (non-searched) estimator settings. They are kept in one place so the
# model stored in xgb_models uses the same configuration that was tuned.
# NOTE(review): colsample_bytree is documented as a tree-booster parameter;
# confirm whether it has any effect with booster='gblinear'.
xgb_base_params = {'booster': 'gblinear', 'colsample_bytree': .3, 'n_jobs': -1}

# The grid does not depend on the fold, so it is built once rather than on
# every iteration.
xgb_params = {'n_estimators': (2, 5, 10, 20, 30, 50),
              'learning_rate': (.01, .05, .1, .2, .3)
             }

xgb_scores = []  # held-out ROC AUC of the tuned search, one per outer fold
xgb_models = []  # unfitted classifier: base settings + each fold's best parameters
xgb_kf = KFold(n_splits=5, random_state=42, shuffle=True)
for train_index, test_index in xgb_kf.split(X, y):

    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    # `iid` is no longer passed: the argument was removed from GridSearchCV in
    # scikit-learn 0.24 and passing it raises TypeError there.
    xgbc = GridSearchCV(XGB(**xgb_base_params),
                        xgb_params,
                        scoring='roc_auc',
                        cv=3,
                        n_jobs=-1)

    # y is a one-column DataFrame; flatten it to 1-D for fitting and scoring.
    xgbc.fit(X_train, y_train.values.ravel())

    print(xgbc.best_params_)
    # With scoring='roc_auc', score() reports ROC AUC of the refit best model.
    xgb_scores.append(xgbc.score(X_test, y_test.values.ravel()))
    # Previously XGB(**best_params_) dropped booster='gblinear', so the stored
    # model was not the kind of model the parameters had been selected for.
    xgb_models.append(XGB(**xgb_base_params, **xgbc.best_params_))
    print()

{'learning_rate': 0.01, 'n_estimators': 5}

{'learning_rate': 0.01, 'n_estimators': 10}

{'learning_rate': 0.01, 'n_estimators': 2}

{'learning_rate': 0.01, 'n_estimators': 2}

{'learning_rate': 0.1, 'n_estimators': 50}



In [28]:
# Report every outer-fold XGBoost score to three decimals, then the mean.
print('[', ', '.join(f"{score:.3f}" for score in xgb_scores), ']')
xgb_score = np.asarray(xgb_scores).mean()
print(f"The average score is {xgb_score:.3f}")

[ 0.809, 0.719, 0.865, 0.712, 0.706 ]
The average score is 0.762


In [14]:
# Evaluate the list of per-fold XGBoost models; the trailing ';' suppresses
# the cell's output.
xgb_models;

## AdaBoost

In [15]:
from sklearn.ensemble import AdaBoostClassifier as ABC

In [16]:
# Nested cross-validation for AdaBoost.
# Outer loop: 4-fold shuffled KFold whose held-out fold is scored.
# Inner loop: GridSearchCV (4-fold, ROC AUC) tunes n_estimators and
# learning_rate on the outer training portion only.
ab_seed = 42  # shared by the fold shuffling and the classifiers so reruns match

# The grid does not depend on the fold, so it is built once rather than on
# every iteration.
ab_params = {'n_estimators': (2, 5, 10, 20, 30, 50),
             'learning_rate': (.01, .05, .1, .2, .3)
            }

ab_scores = []  # held-out ROC AUC of the tuned search, one per outer fold
ab_models = []  # unfitted classifier configured with each fold's best parameters
ab_kf = KFold(n_splits=4, random_state=ab_seed, shuffle=True)
for train_index, test_index in ab_kf.split(X, y):

    X_train, X_test, y_train, y_test = X.iloc[train_index], X.iloc[test_index], y.iloc[train_index], y.iloc[test_index]

    # `iid` is no longer passed: the argument was removed from GridSearchCV in
    # scikit-learn 0.24 and passing it raises TypeError there.
    ab = GridSearchCV(ABC(random_state=ab_seed),
                      ab_params,
                      scoring='roc_auc',
                      cv=4,
                      n_jobs=-1)
    # y is a one-column DataFrame; flatten it to 1-D for fitting and scoring.
    ab.fit(X_train, y_train.values.ravel())

    print(ab.best_params_)
    # With scoring='roc_auc', score() reports ROC AUC of the refit best model.
    ab_scores.append(ab.score(X_test, y_test.values.ravel()))
    # Keep the seed on the stored template so it matches what was tuned.
    ab_models.append(ABC(random_state=ab_seed, **ab.best_params_))
    print()

{'learning_rate': 0.01, 'n_estimators': 2}

{'learning_rate': 0.1, 'n_estimators': 10}

{'learning_rate': 0.05, 'n_estimators': 10}

{'learning_rate': 0.3, 'n_estimators': 20}



In [27]:
# Report every outer-fold AdaBoost score to three decimals, then the mean.
print('[', ', '.join(f"{score:.3f}" for score in ab_scores), ']')
ab_score = np.asarray(ab_scores).mean()
print(f"The average score is {ab_score:.3f}")

[ 0.671, 0.760, 0.623, 0.658 ]
The average score is 0.678


In [18]:
# Evaluate the list of per-fold AdaBoost models; the trailing ';' suppresses
# the cell's output. (This cell previously referenced xgb_models, a
# copy-paste slip from the XGBoost section.)
ab_models;