In [1]:
import pandas
import xgboost as xgb
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE


In [15]:
# Load the tab-separated feature matrix. The path is relative to the
# notebook's working directory.
df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')

# Move the 'name' column out of the frame; it is kept in `names` so the
# remaining columns can all be cast to float.
names = df.pop('name')

# Every remaining column (the label included) must be numeric from here on;
# astype raises if one of them is not convertible.
df = df.astype(float)

# Separate the target from the features.
labels = df.pop('label')

# Missing feature values and missing labels are both replaced by 0, and the
# labels are then stored as integers.
# NOTE(review): a missing label becomes class 0 here — confirm that unlabeled
# rows are meant to count as negatives rather than being dropped.
df = df.fillna(0)
labels = labels.fillna(0).astype(int)


In [3]:
"""
trainDf = df[:10000]
testDf = df[10000:]


dtrain = xgb.DMatrix(trainDf, label=labels[:10000])
dtest = xgb.DMatrix(testDf, label=labels[10000:])


evallist = [(dtest, 'eval'), (dtrain, 'train')]


# param_dist = dict(max_depth=[7],
#                   learning_rate=[0.1],
#                   n_estimators=[40], 
#                   gamma=[10],
#                   scale_pos_weight=[1],
#                   base_score=[rare_event_rate],
#                   subsample=[1])

param = dict(max_depth=7,
            learning_rate=0.1,
            n_estimators=10,
            gamma=10,
            scale_pos_weight=1,
            base_score=0.5,
            subsample=1)

num_round = 20

bst = xgb.train(param, dtrain, num_round, evallist)
"""

"\ntrainDf = df[:10000]\ntestDf = df[10000:]\n\n\ndtrain = xgb.DMatrix(trainDf, label=labels[:10000])\ndtest = xgb.DMatrix(testDf, label=labels[10000:])\n\n\nevallist = [(dtest, 'eval'), (dtrain, 'train')]\n\n\n# param_dist = dict(max_depth=[7],\n#                   learning_rate=[0.1],\n#                   n_estimators=[40], \n#                   gamma=[10],\n#                   scale_pos_weight=[1],\n#                   base_score=[rare_event_rate],\n#                   subsample=[1])\n\nparam = dict(max_depth=7,\n            learning_rate=0.1,\n            n_estimators=10,\n            gamma=10,\n            scale_pos_weight=1,\n            base_score=0.5,\n            subsample=1)\n\nnum_round = 20\n\nbst = xgb.train(param, dtrain, num_round, evallist)\n"

In [16]:
# NOTE(review): `split` is not read by any cell visible here; it is kept in
# case a later cell depends on it — confirm before removing.
split = 5000

# Seed shared by the train/test split and the resamplers so that a
# Restart & Run All reproduces the same partition and the same synthetic
# samples. Same value as the seed the model cell uses.
_RANDOM_STATE = 1337

X = df
y = labels

# Hold out 20% of the rows for evaluation; only the training part is resampled.
trainX, testX, trainY, testY = train_test_split(
    X, y, test_size=0.2, random_state=_RANDOM_STATE)

# Class counts before resampling.
print(Counter(trainY))

# Alternative samplers that were tried; enable exactly one assignment to `smt`.
#smt = SMOTETomek(sampling_strategy='auto')
#smt = RandomUnderSampler(sampling_strategy='auto')
#smt = TomekLinks(sampling_strategy='auto')
#smt = ClusterCentroids(sampling_strategy='auto')
#enn = EditedNearestNeighbours(sampling_strategy='auto', n_neighbors=7)

# Active sampler: SMOTE over-sampling followed by Edited Nearest Neighbours
# cleaning (enn=None lets SMOTEENN build its default ENN step).
# Previously every `smt = ...` line was commented out, so the fit_resample
# call below raised NameError on a fresh kernel.
# NOTE(review): the class counts recorded in this cell's output are unequal
# after resampling, which is consistent with this SMOTE + ENN combination —
# confirm it is the intended sampler.
smote = SMOTE(sampling_strategy='auto', k_neighbors=3,
              random_state=_RANDOM_STATE)
smt = SMOTEENN(sampling_strategy='auto', smote=smote, enn=None,
               random_state=_RANDOM_STATE)

X_smt, y_smt = smt.fit_resample(trainX, trainY)

# Shape and class counts after resampling.
print(X_smt.shape)
print(Counter(y_smt))

Counter({0: 10805, 1: 672})
(17625, 587)
Counter({1: 9607, 0: 8018})


In [17]:
_RANDOM_STATE = 1337
# class_balance = len(y) / sum(y) - 1  # n_negative / n_positive

# Mean of the resampled training labels, passed to the model as `base_score`
# (the initial prediction before any tree is added).
# NOTE(review): this is computed on the resampled set, not on the original
# class distribution — confirm that is the intended prior.
rare_event_rate = sum(y_smt) / len(y_smt)

# Keyword arguments for the classifier (a single fixed configuration, not a
# search distribution, despite the name).
param_dist = dict(max_depth=7,
                  learning_rate=0.1,
                  n_estimators=40,
                  gamma=10,
                  scale_pos_weight=1,
                  base_score=rare_event_rate,
                  subsample=1,
                  objective='binary:logistic')

#param_dist = { 'objective':'binary:logistic', 'n_estimators': 2 }

# `eval_metric` is given to the constructor rather than to fit(): the fit()
# keyword was deprecated in XGBoost 1.6 and removed in 2.0, while the
# constructor form is accepted by both older and current releases.
clf = xgb.XGBClassifier(**param_dist,
                        booster='gbtree',
                        n_jobs=-1,
                        random_state=_RANDOM_STATE,
                        eval_metric='logloss')

# Log loss is reported per boosting round for the resampled training data
# (validation_0) and for the held-out split (validation_1).
clf.fit(X_smt, y_smt,
        eval_set=[(X_smt, y_smt), (testX, testY)],
        verbose=True)

#cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
#cross_val_score(clf, X, y, cv=cv)

# Per-round metric history recorded for the two evaluation sets above.
evals_result = clf.evals_result()

[0]	validation_0-logloss:0.61720	validation_1-logloss:0.70059
[1]	validation_0-logloss:0.55613	validation_1-logloss:0.63615
[2]	validation_0-logloss:0.50309	validation_1-logloss:0.58193
[3]	validation_0-logloss:0.45806	validation_1-logloss:0.53804
[4]	validation_0-logloss:0.41725	validation_1-logloss:0.49703
[5]	validation_0-logloss:0.38204	validation_1-logloss:0.46363
[6]	validation_0-logloss:0.35187	validation_1-logloss:0.43489
[7]	validation_0-logloss:0.32666	validation_1-logloss:0.41058
[8]	validation_0-logloss:0.30332	validation_1-logloss:0.38899
[9]	validation_0-logloss:0.28201	validation_1-logloss:0.36923
[10]	validation_0-logloss:0.26431	validation_1-logloss:0.35289
[11]	validation_0-logloss:0.24704	validation_1-logloss:0.33702
[12]	validation_0-logloss:0.23205	validation_1-logloss:0.32363
[13]	validation_0-logloss:0.21778	validation_1-logloss:0.31083
[14]	validation_0-logloss:0.20535	validation_1-logloss:0.30014
[15]	validation_0-logloss:0.19425	validation_1-logloss:0.29120
[1

In [18]:
# Cell-local imports: the `performance` helper module and pyplot.
import performance
import matplotlib.pyplot as plt

# NOTE(review): `num_round` is not read anywhere in this cell; kept in case a
# later cell uses it.
num_round = 25

# Class predictions for the held-out split, followed by the printed report
# from the `performance` helper.
preds = clf.predict(testX)
performance.printStats(testY, preds)

# Confusion matrix for the same predictions.
conf_mat = performance.getConfusionMatrix(testY, preds)
print('Confusion matrix:\n', conf_mat)

=== Performance Stats ===

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2702
           1       0.42      0.40      0.41       168

    accuracy                           0.93      2870
   macro avg       0.69      0.68      0.69      2870
weighted avg       0.93      0.93      0.93      2870


Sensitivity (ability to correctly predict true): 0.39880952380952384
Specificity (ability to correctly predict false): 0.9652109548482606
Informedness (probability of informed decision): 0.36402047865778453
Accuracy: 0.9320557491289199
ROC AUC: 0.6820102393288922
Confusion matrix:
 [[2608   94]
 [ 101   67]]
