In [34]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import hyperopt
import sys
sys.path.append('../spock/')
#from simsetup import get_sim
#from modelfitting import ROC_curve, stable_unstable_hist, calibration_plot, unstable_error_fraction
try:
    plt.style.use('paper')
except:
    pass
%matplotlib inline

In [35]:
dataset1 = pd.read_csv('../secondMMRintData/fullFeature2MMRsPlusOuter.csv')
dataset2 = pd.read_csv('../secondMMRintData/fullFeature2MMRsPlusOuter5thOrder.csv')
dataset1 = dataset1.iloc[25000:]
dataset2 = dataset2.iloc[25000:]


In [36]:
def hasnull(row):
    numnulls = row.isnull().sum()
    if numnulls == 0:
        return 0
    else:
        return 1

def tmax(row):
    #sim = get_sim(row, csvfolder)
    tmax = 1e4 # replace with a calculation of tmax
    return tmax

In [37]:
%%time
if 'hasnull' not in dataset1.columns:
    dataset1['hasnull'] = dataset1.apply(hasnull, axis=1)
if 'hasnull' not in dataset2.columns:
    dataset2['hasnull'] = dataset2.apply(hasnull, axis=1)

CPU times: user 9.14 s, sys: 52.1 ms, total: 9.19 s
Wall time: 9.19 s


In [38]:
near = ['EMcrossnear', 'EMfracstdnear', 'EPstdnear', 'MMRstrengthnear','twoMMRstrengthnear']
far = ['EMcrossfar', 'EMfracstdfar', 'EPstdfar', 'MMRstrengthfar','twoMMRstrengthfar']
outer = ['EMcrossouter', 'EMfracstdouter', 'EPstdouter', 'MMRstrengthouter','twoMMRstrengthouter']
megno = ['MEGNO', 'MEGNOstd']
#what is being used to train
features = near + far + outer + megno
pfeatures = ['EMcrossnear', 'EMfracstdnear', 'EPstdnear', 'MMRstrengthnear','EMcrossfar', 'EMfracstdfar', 'EPstdfar', 'MMRstrengthfar','MEGNO', 'MEGNOstd']
Oldfeatures = near + far + megno

In [39]:
mask = (dataset1['hasnull'] == 0 )
data1 = dataset1[mask]
mask = (dataset2['hasnull'] == 0 )
data2 = dataset2[mask]

In [42]:
Nrows = int(0.8*data1.shape[0])
train1 = data1.iloc[:Nrows, :]
test1 = data1.iloc[Nrows:, :]
train2 = data2.iloc[:Nrows, :]
test2 = data2.iloc[Nrows:, :]

In [43]:
resultBalance1 = train1['Stable'].value_counts()[False]/train1['Stable'].value_counts()[True]
resultBalance2 = train2['Stable'].value_counts()[False]/train2['Stable'].value_counts()[True]


In [44]:
realTest1 = test1.loc[test1['prelimStable']==True]
realTest2 = test2.loc[test2['prelimStable']==True]


In [45]:
from sklearn.metrics import accuracy_score

In [46]:
OuterModel1 = XGBClassifier(learning_rate = 0.05, 
                         max_depth = 40, 
                         subsample = 0.95,
                         min_child_weight = 5,
                         n_estimators = 400,
                         scale_pos_weight=resultBalance1)

OuterModel2 = XGBClassifier(learning_rate = 0.05, 
                         max_depth = 40, 
                         subsample = 0.95,
                         min_child_weight = 5,
                         n_estimators = 400,
                         scale_pos_weight=resultBalance2)

In [47]:
OuterModel1.fit(train1[features], train1["Stable"])

In [48]:
OuterModel2.fit(train2[features], train2["Stable"])

In [49]:
resPred = OuterModel1.predict_proba(realTest1[features])[:,1]
outMMRAUC1 = metrics.roc_auc_score(realTest1['Stable'],resPred)
outMMRacc1 = accuracy_score(OuterModel1.predict(realTest1[features]), realTest1["Stable"])
print('2nd order')
print (metrics.roc_auc_score(realTest1['Stable'],resPred))
print(accuracy_score(OuterModel1.predict(realTest1[features]), realTest1["Stable"]))

2nd order
0.962717649474162
0.9065950315457413


In [50]:
resPred = OuterModel2.predict_proba(realTest2[features])[:,1]
outMMRAUC2 = metrics.roc_auc_score(realTest2['Stable'],resPred)
outMMRacc2 = accuracy_score(OuterModel2.predict(realTest2[features]), realTest2["Stable"])
print('5th order')
print (metrics.roc_auc_score(realTest2['Stable'],resPred))
print(accuracy_score(OuterModel2.predict(realTest2[features]), realTest2["Stable"]))

5th order
0.9667173964659711
0.9137421135646687


In [52]:
oneModel = XGBClassifier(learning_rate = 0.03, 
                         max_depth = 20, 
                         subsample = 0.95,
                         min_child_weight = 5,
                         n_estimators = 100,
                         scale_pos_weight=resultBalance1)
oneModel.fit(train1[pfeatures], train1["Stable"])
resPred = oneModel.predict_proba(realTest1[pfeatures])[:,1]
oneMMRAUC = metrics.roc_auc_score(realTest1['Stable'],resPred)
oneMMRacc = accuracy_score(oneModel.predict(realTest1[pfeatures]), realTest1["Stable"])
print (metrics.roc_auc_score(realTest1['Stable'],resPred))
print(accuracy_score(oneModel.predict(realTest1[pfeatures]), realTest1["Stable"]))

0.948078645260755
0.8832807570977917


In [54]:
OuterModel2.save_model('SPOCKalt.bin')

In [53]:
print('compaired to improved model')
print(f'decrease in AOC: {((1-oneMMRAUC)-(1-outMMRAUC2))/(1-oneMMRAUC)}')
print(f'decrease in error: {((1-oneMMRacc)-(1-outMMRacc2))/(1-oneMMRacc)}')

compaired to improved model
decrease in AOC: 0.3589804483881831
decrease in error: 0.26097972972972966
