In [1]:
import pandas
import xgboost as xgb
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split

from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import TomekLinks
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import ClusterCentroids
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE

import crossValidate
import performance


In [6]:
# --- Hold-out split + K-fold training of one XGBoost model per fold ---------
# One seed drives both the hold-out sampling and every XGBoost model, so that
# Restart & Run All reproduces the same split and the same models.
_RANDOM_STATE = 1337

df = pandas.read_csv('osu18_cerenkov_feat_mat.tsv', sep='\t')

# Reserve a random 20% of the rows as the final hold-out set.
test_size = 0.2
randomRows = df.sample(frac=test_size, random_state=_RANDOM_STATE).index

# `randomRows` holds index *labels*, so select with .loc (label-based); this
# matches the label-based drop() below and stays correct for any index.
testData = df.loc[randomRows, :]

df = df.drop(randomRows)
k = 3
# NOTE(review): the ValueError captured in this cell's output ("Invalid file
# path or buffer object type: DataFrame") suggests getKfolds may expect a file
# path instead of a DataFrame as its first argument — confirm in crossValidate.
foldsCollection = crossValidate.getKfolds(df, 'osu18_groups.tsv', k, nreps=1)

models = []
for foldGroup in foldsCollection:
    # Each repetition is consumed as (list of feature folds, list of label folds).
    foldList = foldGroup[0]
    labelList = foldGroup[1]

    avg_auc = 0
    for idx in range(len(foldList)):
        # Fold `idx` is held out for validation in this round.
        testX = foldList[idx]
        testY = labelList[idx]
        print('Fold {}'.format(idx))
        print('Test Size: {}'.format(testX.shape[0]))

        # Train on every other fold. The empty float arrays are kept as the
        # first element so the stacked result has the same dtype and shape as
        # growing an empty array fold by fold.
        otherFolds = [j for j in range(len(foldList)) if j != idx]
        trainX = np.concatenate(
            [np.empty(shape=[0, testX.shape[1]])] + [foldList[j] for j in otherFolds],
            axis=0)
        trainY = np.concatenate(
            [np.empty(shape=[0,])] + [labelList[j] for j in otherFolds],
            axis=0)

        # Fraction of positive labels in the training folds; used as the
        # model's initial prediction (base_score).
        rare_event_rate = sum(trainY) / len(trainY)

        param_dist = dict(max_depth=7,
                    learning_rate=0.1,
                    n_estimators=40,
                    gamma=10,
                    scale_pos_weight=1,
                    base_score=rare_event_rate,
                    subsample=1,
                    objective= 'binary:logistic' )

        clf = xgb.XGBClassifier(**param_dist, booster='gbtree', n_jobs=-1, random_state=_RANDOM_STATE)

        # NOTE(review): recent xgboost releases expect eval_metric on the
        # constructor rather than on fit() — confirm against the installed version.
        clf.fit( trainX, trainY,
                eval_set=[(trainX, trainY), (testX, testY)],
                eval_metric='logloss',
                verbose=False)

        # NOTE(review): AUC is computed from hard 0/1 predictions here; if
        # performance.getAUC wraps a ROC-AUC scorer, probabilities from
        # predict_proba would be the usual input — confirm.
        preds = clf.predict(testX)
        curr_auc = performance.getAUC(testY, preds)
        print('Current fold AUC: {}'.format(curr_auc))
        print('Current fold accuracy: {}'.format(performance.getAccuracy(testY, preds)))
        avg_auc += curr_auc

        models.append(clf)

    # Average over the folds actually evaluated instead of the requested k,
    # so the mean stays correct if getKfolds returns a different fold count.
    avg_auc /= max(len(foldList), 1)
    print('Average K-Fold AUC for all folds: {}'.format(avg_auc))

def getEnsemblePredictions(models, testX):
    """Combine the predictions of several fitted models by averaging.

    Every model's ``predict(testX)`` output is summed, the sum is divided by
    the number of models, and the mean is thresholded at 0.5 (ties count as
    the positive class).

    Parameters
    ----------
    models : sequence
        Fitted estimators exposing ``predict(testX)``; each call must yield
        one value per row of ``testX``.
    testX : array-like with a ``shape`` attribute
        Samples to score; ``testX.shape[0]`` is the number of samples.

    Returns
    -------
    list of int
        One 0/1 prediction per sample.

    Raises
    ------
    ValueError
        If ``models`` is empty (there is nothing to average).
    """
    if len(models) == 0:
        raise ValueError('getEnsemblePredictions requires at least one model')

    # 1-D accumulator: predict() output is flattened to (n_samples,) below.
    # A (n_samples, 1) accumulator cannot be added in place to an (n_samples,)
    # array, because the broadcast result would be (n_samples, n_samples).
    voteSum = np.zeros(testX.shape[0], dtype=float)
    for model in models:
        preds = np.asarray(model.predict(testX), dtype=float).reshape(-1)
        # Sum up all predictions for averaging
        voteSum += preds

    # Average the predictions
    meanVote = voteSum / len(models)

    # Force the predictions to a binary value for testing
    return [1 if val >= 0.5 else 0 for val in meanVote]

# --- Evaluate the fold ensemble on the hold-out rows -------------------------
# Keep only hold-out rows whose label is 0 or 1, drop the identifier column
# and make every remaining column numeric.
testData = testData.drop(testData.query('label != 0 & label != 1').index)
del testData['name']
testData = testData.astype(float)

# The hold-out targets must come from the hold-out frame itself so they line
# up row-for-row with the predictions, and the label column must not be fed
# to the models as a feature.
# NOTE(review): assumes the folds returned by crossValidate.getKfolds carry
# the same feature columns (no 'name' / 'label') in the same order — confirm.
testY = testData['label'].astype(int)
testX = testData.drop(columns=['label'])

# Split the label column off the remaining (non-hold-out) frame; the
# under-sampling cell further down reads `df` as features and `labels` as target.
labels = df['label']
del df['label']
labels = labels.astype(int)

ensembledPredictions = getEnsemblePredictions(models, testX)

print('Final Ensemble Predictions')
performance.printStats(testY, ensembledPredictions)


ValueError: Invalid file path or buffer object type: <class 'pandas.core.frame.DataFrame'>

"\ntrainDf = df[:10000]\ntestDf = df[10000:]\n\n\ndtrain = xgb.DMatrix(trainDf, label=labels[:10000])\ndtest = xgb.DMatrix(testDf, label=labels[10000:])\n\n\nevallist = [(dtest, 'eval'), (dtrain, 'train')]\n\n\n# param_dist = dict(max_depth=[7],\n#                   learning_rate=[0.1],\n#                   n_estimators=[40], \n#                   gamma=[10],\n#                   scale_pos_weight=[1],\n#                   base_score=[rare_event_rate],\n#                   subsample=[1])\n\nparam = dict(max_depth=7,\n            learning_rate=0.1,\n            n_estimators=10,\n            gamma=10,\n            scale_pos_weight=1,\n            base_score=0.5,\n            subsample=1)\n\nnum_round = 20\n\nbst = xgb.train(param, dtrain, num_round, evallist)\n"

In [12]:
# NOTE(review): `split` is not read anywhere in this cell; kept in case
# another cell still refers to it.
split = 5000

# Seed for the train/test split and the under-sampler so the class counts
# printed below are reproducible across kernel restarts.
SPLIT_SEED = 1337

X = df
y = labels

trainX, testX, trainY, testY = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED)

# Class distribution of the training split before resampling.
print(Counter(trainY))

# Balance the classes by randomly discarding majority-class rows.
# Alternatives tried previously (disabled):
#smt = SMOTETomek(sampling_strategy='auto')
#smt = TomekLinks(sampling_strategy='auto')
#smt = ClusterCentroids(sampling_strategy='auto')
#enn = EditedNearestNeighbours(sampling_strategy='auto', n_neighbors=7)
#smote = SMOTE(sampling_strategy='auto', k_neighbors=3)
#smt = SMOTEENN(sampling_strategy='auto', smote=smote, enn=None)
smt = RandomUnderSampler(sampling_strategy='auto', random_state=SPLIT_SEED)

X_smt, y_smt = smt.fit_resample(trainX, trainY)

# Class distribution after resampling.
print(Counter(y_smt))

Counter({0: 10816, 1: 661})
Counter({0: 661, 1: 661})


In [13]:


# Disabled experiment: shuffle-split cross-validation of `clf` on the full X, y.
#cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
#cross_val_score(clf, X, y, cv=cv)

# Fetch the evaluation history that `clf` recorded while it was being fitted.
# NOTE(review): this cell does not fit `clf` itself; it relies on a classifier
# fitted with an eval_set earlier in the kernel session — confirm which one.
evals_result = clf.evals_result()

[0]	validation_0-logloss:0.65347	validation_1-logloss:0.66155
[1]	validation_0-logloss:0.62318	validation_1-logloss:0.63667
[2]	validation_0-logloss:0.60094	validation_1-logloss:0.61960
[3]	validation_0-logloss:0.57827	validation_1-logloss:0.60174
[4]	validation_0-logloss:0.55796	validation_1-logloss:0.58671
[5]	validation_0-logloss:0.53929	validation_1-logloss:0.57369
[6]	validation_0-logloss:0.52831	validation_1-logloss:0.56744
[7]	validation_0-logloss:0.51772	validation_1-logloss:0.56042
[8]	validation_0-logloss:0.50701	validation_1-logloss:0.55281
[9]	validation_0-logloss:0.49522	validation_1-logloss:0.54283
[10]	validation_0-logloss:0.48798	validation_1-logloss:0.53702
[11]	validation_0-logloss:0.48045	validation_1-logloss:0.53441
[12]	validation_0-logloss:0.47124	validation_1-logloss:0.52729
[13]	validation_0-logloss:0.46861	validation_1-logloss:0.52466
[14]	validation_0-logloss:0.46274	validation_1-logloss:0.51935
[15]	validation_0-logloss:0.45586	validation_1-logloss:0.51512
[1

In [14]:


# NOTE(review): `num_round` is not read in this cell; kept in case another
# cell still refers to it.
num_round=25

# Hard class predictions of the current classifier on the held-out split,
# followed by the project's summary report.
preds = clf.predict(testX)
performance.printStats(testY, preds)

# pyplot is already imported as `plt` in the notebook's import cell, so it is
# not re-imported here.
conf_mat = performance.getConfusionMatrix(testY, preds)
print('Confusion matrix:\n', conf_mat)

=== Performance Stats ===

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.83      0.90      2691
           1       0.23      0.78      0.36       179

    accuracy                           0.82      2870
   macro avg       0.61      0.80      0.63      2870
weighted avg       0.94      0.82      0.86      2870


Sensitivity (ability to correctly predict true): 0.7821229050279329
Specificity (ability to correctly predict false): 0.8272017837235228
Informedness (probability of informed decision): 0.6093246887514558
Accuracy: 0.824390243902439
ROC AUC: 0.8046623443757278
Confusion matrix:
 [[2226  465]
 [  39  140]]
