In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
import pickle
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
from sklearn.metrics import roc_curve, auc
from scipy import interp

# Default figure size for every plot in this notebook: (width, height).
rcParams['figure.figsize'] = (12, 4)



In [45]:
def onehot(x,numlabels):
    """Return a list of `numlabels` zeros with a single 1 at index `x - 1`.

    `x` is treated as a 1-based label. Ordinary Python list indexing is used,
    so `x = 0` marks the last slot (index -1) and an out-of-range `x` raises
    IndexError.
    """
    encoded = [0] * numlabels
    encoded[x - 1] = 1
    return encoded

In [57]:
def GetEER_(y_score, y_test):
    """Estimate an equal error rate (EER) from multi-class scores.

    A ROC curve is computed for every class column, then two averaged curves
    are built: a micro-average (all columns flattened into one binary problem)
    and a macro-average (per-class TPRs interpolated onto the union of all
    per-class FPR values and averaged). On each averaged curve the point where
    the false-positive rate is closest to the miss rate (1 - TPR) is located,
    and the smaller of the two FPR values at those points is returned.

    Parameters
    ----------
    y_score : 2-D array indexed as [sample, class]
        Per-class scores; the number of classes is taken from `shape[1]`.
    y_test : 2-D array indexed as [sample, class]
        Ground-truth indicators with the same layout as `y_score`
        (presumably one-hot rows -- confirm against the caller).

    Returns
    -------
    The FPR value at the estimated EER operating point.
    """
    n_classes = y_score.shape[1]

    # Per-class ROC curves, keyed by class index.
    fpr = dict()
    tpr = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])

    # Micro-average: every (sample, class) cell is one binary decision.
    micro_fpr, micro_tpr, _ = roc_curve(y_test.ravel(), y_score.ravel())
    micro_miss = 1 - micro_tpr

    # Macro-average: interpolate each class curve at the union of all FPR
    # points, then average. np.interp is used instead of the bare `interp`
    # imported from scipy, which was only a deprecated alias of numpy.interp.
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
    mean_tpr /= n_classes
    macro_miss = 1 - mean_tpr

    # EER point on each curve: where FPR and miss rate are closest.
    micro_eer = micro_fpr[np.argmin(abs(micro_fpr - micro_miss))]
    macro_eer = all_fpr[np.argmin(abs(all_fpr - macro_miss))]
    return min(micro_eer, macro_eer)


In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# Load the pickled dataset; it is indexed below with the keys 'data' and 'labels'.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory.
with open('/Users/chaitanya/Documents/python/keystrokes/data_augmentation/CMU/CMU2X_dat.pickle','rb') as _pickle_file:
    # The context manager closes the file handle, which the bare open() call
    # used previously never did.
    data = pickle.load(_pickle_file)


In [3]:
# Turn the loaded samples and labels into arrays.
X = np.array(data['data'])
Y = np.array(data['labels'])

# Number of samples; each sample is flattened to a row of 31 features.
n = X.shape[0]
X = X.reshape(n, 31)

# Hold out 15% of the samples for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.15,
    random_state=42,
)

In [49]:
def GetEER(y_score, y_test):
    """Custom evaluation metric returning an ('EER', value) pair.

    Intended for use as an XGBoost `feval` callback (see the commented-out
    `feval = GetEER` arguments in the training cells).

    Parameters
    ----------
    y_score : 2-D array indexed as [sample, class]
        Per-class scores; the number of classes is taken from `shape[1]`.
    y_test : object exposing `get_label()`
        Source of the true labels (presumably an xgb.DMatrix -- confirm).
        Labels are interpreted as 0-based integer class indices.

    Returns
    -------
    tuple
        ('EER', value), where value is computed by `GetEER_`.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of score rows.
    IndexError
        If a label is not a valid class index for `y_score`.
    """
    n_classes = y_score.shape[1]
    labels = np.asarray(y_test.get_label()).astype(int).ravel()
    if labels.shape[0] != y_score.shape[0]:
        raise ValueError(
            "got {} labels for {} score rows".format(labels.shape[0], y_score.shape[0])
        )

    # One-hot encode with as many columns as the score matrix. The width used
    # to be hard-coded to 100, which disagrees with the score matrix whenever
    # the model has a different number of classes and makes the flattened
    # (micro-average) ROC computation fail on mismatched lengths.
    one_hot = np.zeros((labels.shape[0], n_classes), dtype=int)
    one_hot[np.arange(labels.shape[0]), labels] = 1

    # The ROC / EER computation itself is shared with GetEER_.
    return 'EER', GetEER_(y_score, one_hot)


In [112]:
# Collapse each label row to the index of its largest entry.
y_train_ = list(map(np.argmax, y_train))
y_test_ = list(map(np.argmax, y_test))

# Wrap features and integer labels in XGBoost's DMatrix containers.
xg_train = xgb.DMatrix(X_train, label=y_train_)
xg_test = xgb.DMatrix(X_test, label=y_test_)

# Upper bound on boosting rounds.
num_boost_round = 999

params = {
    # Parameters tuned by the grid searches below.
    'max_depth': 10,
    'eval_metric': 'merror',
    'min_child_weight': 8,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 1,
    # Fixed settings.
    'objective': 'multi:softprob',
    'num_class': 51,
    'tree_method': 'hist',
}

In [113]:
# Train a booster with the current `params`, evaluating on xg_test (reported as
# "Test") after every round. Training runs for at most num_boost_round rounds
# and stops early once the monitored metric has not improved for 20 rounds.
# NOTE(review): xg_test is used for early stopping here and is also the data
# scored with GetEER_ afterwards, so the reported error is likely optimistic;
# consider a separate validation split.
model = xgb.train(
    params,
    xg_train,
    num_boost_round=num_boost_round,
    evals=[(xg_test, "Test")],
    early_stopping_rounds=20,
    #feval = GetEER,
    
)

[0]	Test-merror:0.134858
Will train until Test-merror hasn't improved in 20 rounds.
[1]	Test-merror:0.110022
[2]	Test-merror:0.099455
[3]	Test-merror:0.092375
[4]	Test-merror:0.086383
[5]	Test-merror:0.078758
[6]	Test-merror:0.075272
[7]	Test-merror:0.070697
[8]	Test-merror:0.068627
[9]	Test-merror:0.065033
[10]	Test-merror:0.062636
[11]	Test-merror:0.060893
[12]	Test-merror:0.058497
[13]	Test-merror:0.056209
[14]	Test-merror:0.055229
[15]	Test-merror:0.053595
[16]	Test-merror:0.051634
[17]	Test-merror:0.050218
[18]	Test-merror:0.049673
[19]	Test-merror:0.047821
[20]	Test-merror:0.04695
[21]	Test-merror:0.045752
[22]	Test-merror:0.044662
[23]	Test-merror:0.043791
[24]	Test-merror:0.042702
[25]	Test-merror:0.042375
[26]	Test-merror:0.041721
[27]	Test-merror:0.041503
[28]	Test-merror:0.040414
[29]	Test-merror:0.039651
[30]	Test-merror:0.038998
[31]	Test-merror:0.038235
[32]	Test-merror:0.038344
[33]	Test-merror:0.0378
[34]	Test-merror:0.036819
[35]	Test-merror:0.036275
[36]	Test-merror:0

In [104]:
# Sanity check: shape of the label vector stored in the training DMatrix.
xg_train.get_label().shape

(52020,)

In [122]:
# Score the held-out DMatrix with the trained booster; the result is passed to
# GetEER_ as the score matrix.
# NOTE(review): after early stopping, some XGBoost releases predict with every
# trained round rather than the best iteration -- confirm for the installed version.
pred = model.predict(xg_test)

In [123]:
# EER of the predictions against y_test from train_test_split
# (presumably one-hot label rows -- confirm).
GetEER_(pred,y_test)

0.003043572984749455

In [68]:
# Candidate (max_depth, min_child_weight) pairs: depths 9-11 crossed with
# child weights 5-7, depth varying slowest.
gridsearch_params = [(depth, weight) for depth in range(9, 12) for weight in range(5, 8)]

In [116]:
# Persist the trained booster to 'CMU.model' (relative path, so it lands in the
# current working directory).
model.save_model('CMU.model')

In [72]:
# Grid search over (max_depth, min_child_weight) with 5-fold cross-validation
# on the training DMatrix; the pair with the lowest mean test merror wins.
min_merror = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters. This mutates the shared `params` dict, so after
    # the loop it holds the last pair tried, not necessarily the best one.
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgb.cv(
        params,
        xg_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'merror'},
        early_stopping_rounds=10,
        #feval = GetEER,
    )

    # Update best merror
    merror_means = cv_results['test-merror-mean']
    mean_merror = merror_means.min()
    # Positional argmin on the raw values: Series.argmin() emitted a
    # deprecation warning and its own message recommends this form.
    # The value is the 0-based index of the best round.
    boost_rounds = merror_means.values.argmin()
    print("\tMerror {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, Merror: {}".format(best_params[0], best_params[1], min_merror))

CV with max_depth=9, min_child_weight=5


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	Merror 0.027239399999999997 for 110 rounds
CV with max_depth=9, min_child_weight=6
	Merror 0.027278000000000004 for 148 rounds
CV with max_depth=9, min_child_weight=7
	Merror 0.028066199999999996 for 123 rounds
CV with max_depth=10, min_child_weight=5
	Merror 0.027278000000000004 for 121 rounds
CV with max_depth=10, min_child_weight=6
	Merror 0.027835200000000004 for 146 rounds
CV with max_depth=10, min_child_weight=7
	Merror 0.0284508 for 85 rounds
CV with max_depth=11, min_child_weight=5
	Merror 0.027028000000000003 for 143 rounds
CV with max_depth=11, min_child_weight=6
	Merror 0.0274702 for 139 rounds
CV with max_depth=11, min_child_weight=7
	Merror 0.026989599999999996 for 175 rounds
Best params: 11, 7, Merror: 0.026989599999999996


In [74]:
# Second pass of the (max_depth, min_child_weight) search over a shifted
# range: depths 10-13 crossed with child weights 7-9.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(10,14)
    for min_child_weight in range(7,10)
]
min_merror = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters. This mutates the shared `params` dict, so after
    # the loop it holds the last pair tried, not necessarily the best one.
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgb.cv(
        params,
        xg_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'merror'},
        early_stopping_rounds=10,
        #feval = GetEER,
    )

    # Update best merror
    merror_means = cv_results['test-merror-mean']
    mean_merror = merror_means.min()
    # Positional argmin on the raw values: Series.argmin() emitted a
    # deprecation warning and its own message recommends this form.
    # The value is the 0-based index of the best round.
    boost_rounds = merror_means.values.argmin()
    print("\tMerror {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, Merror: {}".format(best_params[0], best_params[1], min_merror))

CV with max_depth=10, min_child_weight=7


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	Merror 0.0284508 for 85 rounds
CV with max_depth=10, min_child_weight=8
	Merror 0.028316 for 117 rounds
CV with max_depth=10, min_child_weight=9
	Merror 0.0284888 for 160 rounds
CV with max_depth=11, min_child_weight=7
	Merror 0.026989599999999996 for 175 rounds
CV with max_depth=11, min_child_weight=8
	Merror 0.028200799999999998 for 127 rounds
CV with max_depth=11, min_child_weight=9
	Merror 0.028604199999999996 for 119 rounds
CV with max_depth=12, min_child_weight=7
	Merror 0.028277399999999998 for 78 rounds
CV with max_depth=12, min_child_weight=8
	Merror 0.0283736 for 150 rounds
CV with max_depth=12, min_child_weight=9
	Merror 0.0287774 for 133 rounds
CV with max_depth=13, min_child_weight=7
	Merror 0.028681400000000003 for 72 rounds
CV with max_depth=13, min_child_weight=8
	Merror 0.028642800000000003 for 117 rounds
CV with max_depth=13, min_child_weight=9
	Merror 0.0287582 for 122 rounds
Best params: 11, 7, Merror: 0.026989599999999996


In [75]:
# Lock in the best (max_depth, min_child_weight) pair reported by the searches above.
params.update({'max_depth': 11, 'min_child_weight': 7})

In [76]:
# Candidate (subsample, colsample) pairs: subsample 0.7-1.0 in steps of 0.1,
# with colsample fixed at 1.0.
gridsearch_params = [
    (row_tenths / 10., col_tenths / 10.)
    for row_tenths in range(7, 11)
    for col_tenths in range(10, 11)
]

In [77]:
# Grid search over (subsample, colsample_bytree) with 5-fold cross-validation
# on the training DMatrix; the pair with the lowest mean test merror wins.
min_merror = float("Inf")
best_params = None

# We start with the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))

    # Update our parameters. This mutates the shared `params` dict, so after
    # the loop it holds the last pair tried, not necessarily the best one.
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample

    # Run CV
    cv_results = xgb.cv(
        params,
        xg_train,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'merror'},
        early_stopping_rounds=10
    )

    # Update best score
    merror_means = cv_results['test-merror-mean']
    mean_merror = merror_means.min()
    # Positional argmin on the raw values: Series.argmin() emitted a
    # deprecation warning and its own message recommends this form.
    # The value is the 0-based index of the best round.
    boost_rounds = merror_means.values.argmin()
    print("\tMerror {} for {} rounds".format(mean_merror, boost_rounds))
    if mean_merror < min_merror:
        min_merror = mean_merror
        best_params = (subsample,colsample)

print("Best params: {}, {}, Merror: {}".format(best_params[0], best_params[1], min_merror))

CV with subsample=1.0, colsample=1.0


will be corrected to return the positional minimum in the future.
Use 'series.values.argmin' to get the position of the minimum now.


	Merror 0.026989599999999996 for 175 rounds
CV with subsample=1.0, colsample=0.9
	Merror 0.0274702 for 130 rounds
CV with subsample=1.0, colsample=0.8
	Merror 0.027450999999999996 for 104 rounds
CV with subsample=1.0, colsample=0.7
	Merror 0.028200600000000003 for 99 rounds
CV with subsample=0.9, colsample=1.0
	Merror 0.027950799999999998 for 88 rounds
CV with subsample=0.9, colsample=0.9
	Merror 0.027085599999999998 for 154 rounds
CV with subsample=0.9, colsample=0.8
	Merror 0.027124199999999998 for 134 rounds
CV with subsample=0.9, colsample=0.7
	Merror 0.027335599999999998 for 128 rounds
CV with subsample=0.8, colsample=1.0
	Merror 0.0272588 for 137 rounds
CV with subsample=0.8, colsample=0.9
	Merror 0.0284122 for 96 rounds
CV with subsample=0.8, colsample=0.8
	Merror 0.0281236 for 135 rounds
CV with subsample=0.8, colsample=0.7
	Merror 0.0284506 for 94 rounds
CV with subsample=0.7, colsample=1.0
	Merror 0.0279892 for 120 rounds
CV with subsample=0.7, colsample=0.9
	Merror 0.0284892

In [80]:
# Set the sampling ratios and learning rate used for the final training run.
params.update({
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'eta': 0.1,
})

In [81]:
# Retrain the booster with the current `params`, evaluating on xg_test
# (reported as "Test") after every round. Training runs for at most
# num_boost_round rounds and stops early once the monitored metric has not
# improved for 20 rounds.
# NOTE(review): xg_test is used for early stopping here and is also the data
# scored with GetEER_ afterwards, so the reported error is likely optimistic;
# consider a separate validation split.
model = xgb.train(
    params,
    xg_train,
    num_boost_round=num_boost_round,
    evals=[(xg_test, "Test")],
    early_stopping_rounds=20,
    #feval = GetEER,
    
)

[0]	Test-merror:0.120261
Will train until Test-merror hasn't improved in 20 rounds.
[1]	Test-merror:0.097168
[2]	Test-merror:0.086928
[3]	Test-merror:0.082244
[4]	Test-merror:0.076688
[5]	Test-merror:0.072331
[6]	Test-merror:0.068627
[7]	Test-merror:0.06634
[8]	Test-merror:0.063072
[9]	Test-merror:0.06122
[10]	Test-merror:0.059041
[11]	Test-merror:0.05719
[12]	Test-merror:0.055447
[13]	Test-merror:0.054139
[14]	Test-merror:0.052614
[15]	Test-merror:0.05098
[16]	Test-merror:0.050218
[17]	Test-merror:0.049129
[18]	Test-merror:0.048039
[19]	Test-merror:0.047059
[20]	Test-merror:0.046841
[21]	Test-merror:0.046296
[22]	Test-merror:0.04488
[23]	Test-merror:0.044444
[24]	Test-merror:0.043355
[25]	Test-merror:0.043246
[26]	Test-merror:0.042484
[27]	Test-merror:0.041721
[28]	Test-merror:0.041176
[29]	Test-merror:0.040632
[30]	Test-merror:0.040523
[31]	Test-merror:0.039434
[32]	Test-merror:0.03878
[33]	Test-merror:0.037582
[34]	Test-merror:0.037037
[35]	Test-merror:0.036492
[36]	Test-merror:0.03

In [82]:
# Score the held-out DMatrix with the retrained booster; the result is passed
# to GetEER_ as the score matrix.
# NOTE(review): after early stopping, some XGBoost releases predict with every
# trained round rather than the best iteration -- confirm for the installed version.
pred = model.predict(xg_test)

In [83]:
# EER of the retrained model's predictions against y_test from train_test_split
# (presumably one-hot label rows -- confirm).
GetEER_(pred,y_test)

0.0034531590413943354

In [117]:
!which python3


/usr/local/bin/python3


In [121]:
!which python3.6


/usr/local/bin/python3.6
