### stage_model_classifier

Classifiers that predict the sign (+, 0, -) of the rank change (`target`) on the stage dataset.

data format:
    target , eventid ,    car_number,    stageid,     features...

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

# Restrict this process to a single GPU (device index "7").
# This setting is meant for the machine r-001;
# comment out the assignment below when running elsewhere.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="7"

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import xgboost as xgb


In [3]:
# Build a classification model (or a rule-based baseline) from its short name.
# Every name in this list is handled by get_classifier() below.
classifiers = ['currank','avgrank','dice','lr','lrl1','lsvc','lsvcl2','rf','lrbias','xgb']
def get_classifier(classifier = 'lr'):
    """
    Create an unfitted model for the given short name.

    Supported names:
        'lsvc'    - LinearSVC with L1 penalty
        'lsvcl2'  - LinearSVC with L2 penalty
        'rf'      - RandomForestClassifier (100 trees, entropy criterion)
        'lr'      - LogisticRegression without intercept
        'lrbias'  - LogisticRegression with intercept (verbose)
        'lrl1'    - LogisticRegression with L1 penalty
        'xgb'     - XGBClassifier (gbtree, 600 estimators, depth 6)
        'dice'    - RandomDice baseline
        'currank' - CurRank baseline
        'avgrank' - AverageRank baseline

    Returns the model object exposing fit(x, y) / predict(x), or None when
    the name is not recognized.
    """
    # Single knob shared by all sklearn models; None means unweighted classes.
    class_weight = None

    if classifier == "lsvc":
        clf = LinearSVC(penalty='l1',dual=False, tol=1e-3, class_weight=class_weight )
    elif classifier == "lsvcl2":
        clf = LinearSVC(penalty='l2', tol=1e-4, class_weight=class_weight)
    elif classifier == 'rf':
        clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,criterion='entropy', class_weight = class_weight)
    elif classifier == 'lr':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = False, verbose = 0)
    elif classifier == 'lrbias':
        clf = LogisticRegression(class_weight = class_weight, n_jobs=-1, fit_intercept = True, verbose = 1)
    elif classifier == 'lrl1':
        # The L1 penalty is not supported by the 'lbfgs' solver, which is the
        # default in newer scikit-learn releases, so the solver is pinned to
        # 'liblinear'. n_jobs is omitted because liblinear does not use it
        # (it only triggers a warning).
        clf = LogisticRegression(class_weight = class_weight, penalty='l1', solver='liblinear')
    elif classifier == 'xgb':
        clf = xgb.XGBClassifier(booster = 'gbtree', nthread = -1, subsample = 1,
                                n_estimators = 600, colsample_bytree = 1, max_depth = 6, min_child_weight = 1)
    elif classifier == 'dice':
        clf = RandomDice('1234')
    elif classifier == 'currank':
        clf = CurRank()
    elif classifier == 'avgrank':
        clf = AverageRank()
    else:
        # Unknown name: callers receive None (kept for backward compatibility).
        clf = None

    return clf


class CurRank():
    """
    Baseline that keeps the current rank: every sample is predicted as
    class 0 (no rank change).
    """

    def fit(self, x, y):
        # Nothing to learn; the method exists so this baseline can be used
        # interchangeably with the trainable models.
        return None

    def predict(self, test_x):
        # One zero per row of the input.
        n_samples = test_x.shape[0]
        return np.array([0] * n_samples)
    
class AverageRank():
    """
    Baseline that predicts the sign of the value stored at index 13 of each
    feature row.

    According to the notes in this file, index 13 of the feature matrix is
    'change_in_rank_all' (index 15 of the full table, before the first two
    columns are dropped by the split functions).
    """

    def fit(self, x, y):
        # Rule-based model: nothing to train.
        return None

    def predict(self, test_x):
        signs = []
        for row in test_x:
            value = row[13]  # change_in_rank_all
            if value > 0:
                signs.append(1)
            elif value < 0:
                signs.append(-1)
            else:
                signs.append(0)
        return np.array(signs)

class RandomDice():
    """
    Random baseline: draws each prediction from the label distribution
    observed in fit().

    Attributes:
        val  - labels seen in the last fit(), in set-iteration order
        dist - cumulative label frequencies, aligned with `val`
    """
    def __init__(self, seed='1234'):
        self.dist = []
        self.val = []
        # Private generator: yields the same sequence as seeding the global
        # `random` module with this seed, but without reseeding it for
        # everyone else and without being disturbed by other random users.
        self._rng = random.Random(seed)

    def fit(self, x, y):
        """
        Record the empirical label distribution of `y` (`x` is ignored).

        The state is rebuilt from scratch on every call, so refitting replaces
        the previous distribution instead of appending to it.
        """
        total = y.shape[0]
        vals = []
        dist = []

        ratio = 0.
        for val in set(y):
            vals.append(val)
            ratio += np.sum(y==val)*1.0 / total
            dist.append(ratio)

        self.val = vals
        self.dist = dist

    def predict(self, test_x):
        """
        Return one randomly drawn label per row of `test_x`.
        """
        pred_y = []
        for _ in test_x:
            dice = self._rng.random()
            # First bucket whose cumulative frequency covers the draw; fall
            # back to the last label (-1) if rounding left the final
            # cumulative value slightly below the draw.
            find_idx = next(
                (idx for idx, ratio in enumerate(self.dist) if dice <= ratio),
                -1)
            pred_y.append(self.val[find_idx])

        return np.array(pred_y)

def evaluate(test_y, pred_y):
    """
    Print per-class precision / recall / F1 plus overall accuracy for the
    given predictions, and return the accuracy.
    """
    per_class = tuple(
        scorer(test_y, pred_y, average=None)
        for scorer in (metrics.precision_score,
                       metrics.recall_score,
                       metrics.f1_score)
    )
    accuracy = metrics.accuracy_score(test_y, pred_y)
    print('precision=%s, recall=%s, f1=%s, accuracy=%.2f'%(per_class + (accuracy,)))
    return accuracy
    
#
#features
#    cols=[Myidx, 'target','eventid','car_number','stageid',
#             'firststage','pit_in_caution','start_position',
#             'start_rank','start_rank_ratio','top_pack','bottom_pack',
#             'average_rank','average_rank_all',
#             'change_in_rank','change_in_rank_all','rate_of_change','rate_of_change_all']    
def split_by_eventid(stagedata, eventid):
    """
    Leave-one-event-out split.

    Rows whose 'eventid' equals `eventid` form the test set; all other rows
    form the training set.

    Returns (train, test, train_x, train_y, test_x, test_y) where train/test
    are the full rows as numpy arrays, *_x are the columns from index 2
    onward and *_y is the sign (1 / 0 / -1) of column 1.
    """
    def to_sign(values):
        return np.array([1 if v > 0 else (-1 if v < 0 else 0) for v in values])

    in_train = stagedata['eventid'] != eventid
    in_test = stagedata['eventid'] == eventid
    train = stagedata[in_train].to_numpy()
    test = stagedata[in_test].to_numpy()

    # Column 1 holds the target; everything from column 2 onward is used as
    # model input (per the column listing above, index 2 is 'eventid').
    train_x, test_x = train[:,2:], test[:,2:]
    train_y, test_y = to_sign(train[:,1]), to_sign(test[:,1])

    return train, test, train_x, train_y, test_x, test_y


def split_by_stageid(stagedata, stageid):
    """
    Split on the stage number.

    Rows with 'stageid' <= `stageid` form the training set; rows with a
    larger 'stageid' form the test set.

    Returns (train, test, train_x, train_y, test_x, test_y) where train/test
    are the full rows as numpy arrays, *_x are the columns from index 2
    onward and *_y is the sign (1 / 0 / -1) of column 1.
    """
    def to_sign(values):
        return np.array([1 if v > 0 else (-1 if v < 0 else 0) for v in values])

    in_train = stagedata['stageid'] <= stageid
    in_test = stagedata['stageid'] > stageid
    train = stagedata[in_train].to_numpy()
    test = stagedata[in_test].to_numpy()

    # Column 1 holds the target; everything from column 2 onward is model input.
    train_x, test_x = train[:,2:], test[:,2:]
    train_y, test_y = to_sign(train[:,1]), to_sign(test[:,1])

    return train, test, train_x, train_y, test_x, test_y


### baseline
def baseline_model():
    """
    Score the two rule-based baselines on the current test split.

    Reads the module-level variables `test` and `test_y`, which must have
    been set by one of the split functions beforehand.

    Returns (score1, score2): accuracy of the "no change" baseline and of
    the "sign of column 15" baseline.
    """
    # 1. current rank is kept, i.e. rank change = 0 for every sample
    print('[*] predict with current rank, rankchg = 0')
    score1 = evaluate(test_y, np.zeros_like(test_y))

    # 2. sign of the average rank change (column 15 of the full table)
    print('[*] predict with average rankchg (change_in_rank_all):idx = 15')
    avg_signs = []
    for value in test[:,15]:
        if value > 0:
            avg_signs.append(1)
        elif value < 0:
            avg_signs.append(-1)
        else:
            avg_signs.append(0)
    score2 = evaluate(test_y, np.array(avg_signs))

    return score1, score2

def classifier_model(name='lr'):
    """
    Train the model called `name` on the current split and return its
    accuracy on the test part.

    Reads the module-level variables train_x, train_y, test_x and test_y,
    which must have been set by one of the split functions beforehand.
    """
    print('[*] predict with %s model'%name)
    model = get_classifier(name)
    model.fit(train_x, train_y)
    return evaluate(test_y, model.predict(test_x))

In [4]:
# Load the 2018 stage dataset; missing values are replaced with 0.
suffix='-withneighbor-newfeatures-timediff'
stagedata = pd.read_csv('stage-2018%s.csv'%suffix).fillna(0)
stagedata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 805 entries, 0 to 804
Data columns (total 35 columns):
Unnamed: 0                   805 non-null int64
target                       805 non-null int64
eventid                      805 non-null int64
car_number                   805 non-null int64
stageid                      805 non-null int64
firststage                   805 non-null int64
pit_in_caution               805 non-null int64
start_position               805 non-null int64
start_rank                   805 non-null int64
start_rank_ratio             805 non-null float64
top_pack                     805 non-null int64
bottom_pack                  805 non-null int64
average_rank                 805 non-null float64
average_rank_all             805 non-null float64
change_in_rank               805 non-null int64
change_in_rank_all           805 non-null float64
rate_of_change               805 non-null int64
rate_of_change_all           805 non-null float64
laptime_green_mean_pr

In [5]:
# Preview the first five rows to sanity-check the loaded columns.
stagedata.head()

Unnamed: 0.1,Unnamed: 0,target,eventid,car_number,stageid,firststage,pit_in_caution,start_position,start_rank,start_rank_ratio,...,laptime_std_all,laps_prev,laps_after_last_pitstop,pittime_prev,prev_nb0_change_in_rank,prev_nb1_change_in_rank,prev_nb2_change_in_rank,follow_nb0_change_in_rank,follow_nb1_change_in_rank,follow_nb2_change_in_rank
0,0,0,0,1,0,0,0,7,7,0.304348,...,0.0,0,0,0.0,0,0,0,0,0,0
1,1,-2,0,1,1,1,1,7,7,0.304348,...,6.431794,43,43,11.54325,-2,-1,1,3,0,-1
2,2,6,0,1,2,1,0,7,5,0.217391,...,7.771622,76,76,59.63585,0,-2,7,-3,18,2
3,3,-7,0,1,3,1,0,7,11,0.478261,...,6.740054,57,57,40.4385,-12,-9,-18,-6,-7,4
4,4,-3,0,1,4,1,1,7,4,0.173913,...,6.458577,56,56,39.5124,-8,-2,-2,-4,2,-3


In [6]:
# Leave-one-event-out cross validation: each event is held out once as the
# test set and every classifier is scored on it.
cols = ['runid','trainsize','testsize','testdistribution']
cols.extend(classifiers)
print('cols:%s'%cols)

eventsname = ['Phoenix','Indy500','Texas','Iowa','Pocono','Gateway']
# Sorted so the run order does not depend on set iteration order.
events = sorted(set(stagedata['eventid']))

# One record per held-out event; the result frame is built once at the end
# instead of being grown with pd.concat inside the loop, which also gives
# each row a distinct index label.
records = []
for eventid in events:
    print('Testset = %s'%eventsname[eventid])

    # These names must stay module-level: classifier_model() reads
    # train_x / train_y / test_x / test_y as globals.
    train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

    #record
    rec = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

    # accuracy of every classifier, in the order of `classifiers`
    acc = [classifier_model(name) for name in classifiers]

    rec.extend(acc)
    print('rec:%s'%rec)
    records.append(rec)

retdf = pd.DataFrame(records, columns=cols)
retdf.to_csv('crossvalid_stagedata_splitbyevent%s.csv'%suffix)
df_event = retdf

cols:['runid', 'trainsize', 'testsize', 'testdistribution', 'currank', 'avgrank', 'dice', 'lr', 'lrl1', 'lsvc', 'lsvcl2', 'rf', 'lrbias', 'xgb']
Testset = Phoenix
[*] predict with currank model
precision=[0.        0.1754386 0.       ], recall=[0. 1. 0.], f1=[0.         0.29850746 0.        ], accuracy=0.18
[*] predict with avgrank model
precision=[0.3877551  0.22580645 0.17647059], recall=[0.34545455 0.35       0.15384615], f1=[0.36538462 0.2745098  0.16438356], accuracy=0.28
[*] predict with dice model
precision=[0.50877193 0.125      0.33333333], recall=[0.52727273 0.15       0.28205128], f1=[0.51785714 0.13636364 0.30555556], accuracy=0.38
[*] predict with lr model
precision=[0.54545455 0.36363636 0.61538462], recall=[0.76363636 0.2        0.41025641], f1=[0.63636364 0.25806452 0.49230769], accuracy=0.54
[*] predict with lrl1 model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.55555556 0.42857143 0.52272727], recall=[0.63636364 0.15       0.58974359], f1=[0.59322034 0.22222222 0.55421687], accuracy=0.54
[*] predict with lsvc model




precision=[0.54       0.41666667 0.48076923], recall=[0.49090909 0.25       0.64102564], f1=[0.51428571 0.3125     0.54945055], accuracy=0.50
[*] predict with lsvcl2 model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


precision=[0.80952381 0.         0.38709677], recall=[0.30909091 0.         0.92307692], f1=[0.44736842 0.         0.54545455], accuracy=0.46
[*] predict with rf model
precision=[0.53246753 0.25       0.54545455], recall=[0.74545455 0.05       0.46153846], f1=[0.62121212 0.08333333 0.5       ], accuracy=0.53
[*] predict with lrbias model
[LibLinear]precision=[0.55737705 0.42857143 0.52173913], recall=[0.61818182 0.15       0.61538462], f1=[0.5862069  0.22222222 0.56470588], accuracy=0.54
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.55555556 0.3        0.48780488], recall=[0.63636364 0.15       0.51282051], f1=[0.59322034 0.2        0.5       ], accuracy=0.51
rec:['Phoenix', 691, 114, '+:39,0:20,-:55', 0.17543859649122806, 0.2807017543859649, 0.37719298245614036, 0.543859649122807, 0.5350877192982456, 0.5, 0.4649122807017544, 0.5263157894736842, 0.5350877192982456, 0.5087719298245614]
Testset = Indy500
[*] predict with currank model
precision=[0.         0.10222222 0.        ], recall=[0. 1. 0.], f1=[0.         0.18548387 0.        ], accuracy=0.10
[*] predict with avgrank model
precision=[0.39784946 0.02222222 0.20689655], recall=[0.3245614  0.04347826 0.20454545], f1=[0.35748792 0.02941176 0.20571429], accuracy=0.25
[*] predict with dice model
precision=[0.46902655 0.10638298 0.35384615], recall=[0.46491228 0.2173913  0.26136364], f1=[0.46696035 0.14285714 0.30065359], accuracy=0.36
[*] predict with lr model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.5477707  0.125      0.72222222], recall=[0.75438596 0.17391304 0.29545455], f1=[0.63468635 0.14545455 0.41935484], accuracy=0.52
[*] predict with lrl1 model
precision=[0.55172414 0.17948718 0.73170732], recall=[0.70175439 0.30434783 0.34090909], f1=[0.61776062 0.22580645 0.46511628], accuracy=0.52
[*] predict with lsvc model




precision=[0.54716981 0.17241379 0.7027027 ], recall=[0.76315789 0.2173913  0.29545455], f1=[0.63736264 0.19230769 0.416     ], accuracy=0.52
[*] predict with lsvcl2 model




precision=[0.55263158 0.09677419 0.69047619], recall=[0.73684211 0.13043478 0.32954545], f1=[0.63157895 0.11111111 0.44615385], accuracy=0.52
[*] predict with rf model
precision=[0.48245614 0.1686747  0.67857143], recall=[0.48245614 0.60869565 0.21590909], f1=[0.48245614 0.26415094 0.32758621], accuracy=0.39
[*] predict with lrbias model
[LibLinear]precision=[0.54482759 0.15789474 0.71428571], recall=[0.69298246 0.26086957 0.34090909], f1=[0.61003861 0.19672131 0.46153846], accuracy=0.51
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.48484848 0.1547619  0.69047619], recall=[0.42105263 0.56521739 0.32954545], f1=[0.45070423 0.24299065 0.44615385], accuracy=0.40
rec:['Indy500', 580, 225, '+:88,0:23,-:114', 0.10222222222222223, 0.24888888888888888, 0.36, 0.5155555555555555, 0.52, 0.5244444444444445, 0.5155555555555555, 0.39111111111111113, 0.5111111111111111, 0.4]
Testset = Texas
[*] predict with currank model
precision=[0.         0.23622047 0.        ], recall=[0. 1. 0.], f1=[0.         0.38216561 0.        ], accuracy=0.24
[*] predict with avgrank model
precision=[0.42857143 0.16666667 0.19047619], recall=[0.33870968 0.2        0.22857143], f1=[0.37837838 0.18181818 0.20779221], accuracy=0.28
[*] predict with dice model
precision=[0.42857143 0.32       0.28205128], recall=[0.43548387 0.26666667 0.31428571], f1=[0.432      0.29090909 0.2972973 ], accuracy=0.36
[*] predict with lr model
precision=[0.67346939 0.35       0.37931034], recall=[0.53225806 0.23333333 0.62857143], f1=[0.59459459 0.28       0.47

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.67391304 0.44444444 0.34722222], recall=[0.5        0.13333333 0.71428571], f1=[0.57407407 0.20512821 0.46728972], accuracy=0.47
[*] predict with lsvc model
precision=[0.68571429 0.6        0.31034483], recall=[0.38709677 0.1        0.77142857], f1=[0.49484536 0.17142857 0.44262295], accuracy=0.43
[*] predict with lsvcl2 model




precision=[0.57142857 0.375      0.28846154], recall=[0.06451613 0.2        0.85714286], f1=[0.11594203 0.26086957 0.43165468], accuracy=0.31
[*] predict with rf model
precision=[0.53225806 0.         0.3125    ], recall=[0.53225806 0.         0.57142857], f1=[0.53225806 0.         0.4040404 ], accuracy=0.42
[*] predict with lrbias model
[LibLinear]precision=[0.70454545 0.5        0.36231884], recall=[0.5        0.23333333 0.71428571], f1=[0.58490566 0.31818182 0.48076923], accuracy=0.50
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.55263158 0.         0.31034483], recall=[0.33870968 0.         0.77142857], f1=[0.42       0.         0.44262295], accuracy=0.38
rec:['Texas', 678, 127, '+:35,0:30,-:62', 0.23622047244094488, 0.2755905511811024, 0.36220472440944884, 0.4881889763779528, 0.47244094488188976, 0.4251968503937008, 0.31496062992125984, 0.41732283464566927, 0.49606299212598426, 0.3779527559055118]
Testset = Iowa
[*] predict with currank model
precision=[0.        0.2293578 0.       ], recall=[0. 1. 0.], f1=[0.         0.37313433 0.        ], accuracy=0.23
[*] predict with avgrank model
precision=[0.35714286 0.17241379 0.31578947], recall=[0.33333333 0.2        0.30769231], f1=[0.34482759 0.18518519 0.31168831], accuracy=0.29
[*] predict with dice model
precision=[0.43636364 0.30434783 0.22580645], recall=[0.53333333 0.28       0.17948718], f1=[0.48       0.29166667 0.2       ], accuracy=0.35
[*] predict with lr model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.53424658 0.7        0.73076923], recall=[0.86666667 0.28       0.48717949], f1=[0.66101695 0.4        0.58461538], accuracy=0.60
[*] predict with lrl1 model
precision=[0.53424658 0.63636364 0.76      ], recall=[0.86666667 0.28       0.48717949], f1=[0.66101695 0.38888889 0.59375   ], accuracy=0.60
[*] predict with lsvc model




precision=[0.52702703 0.63636364 0.75      ], recall=[0.86666667 0.28       0.46153846], f1=[0.65546218 0.38888889 0.57142857], accuracy=0.59
[*] predict with lsvcl2 model




precision=[0.58823529 0.26530612 0.33333333], recall=[0.66666667 0.52       0.07692308], f1=[0.625      0.35135135 0.125     ], accuracy=0.42
[*] predict with rf model
precision=[0.45454545 0.4        0.59259259], recall=[0.77777778 0.08       0.41025641], f1=[0.57377049 0.13333333 0.48484848], accuracy=0.49
[*] predict with lrbias model
[LibLinear]precision=[0.53424658 0.7        0.73076923], recall=[0.86666667 0.28       0.48717949], f1=[0.66101695 0.4        0.58461538], accuracy=0.60
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.4556962  0.11111111 0.57142857], recall=[0.8        0.04       0.30769231], f1=[0.58064516 0.05882353 0.4       ], accuracy=0.45
rec:['Iowa', 696, 109, '+:39,0:25,-:45', 0.22935779816513763, 0.29357798165137616, 0.3486238532110092, 0.5963302752293578, 0.5963302752293578, 0.5871559633027523, 0.42201834862385323, 0.48623853211009177, 0.5963302752293578, 0.44954128440366975]
Testset = Pocono
[*] predict with currank model
precision=[0.         0.30952381 0.        ], recall=[0. 1. 0.], f1=[0.         0.47272727 0.        ], accuracy=0.31
[*] predict with avgrank model
precision=[0.33333333 0.3        0.22222222], recall=[0.53061224 0.23076923 0.10526316], f1=[0.40944882 0.26086957 0.14285714], accuracy=0.31
[*] predict with dice model
precision=[0.38461538 0.33333333 0.37837838], recall=[0.51020408 0.20512821 0.36842105], f1=[0.43859649 0.25396825 0.37333333], accuracy=0.37
[*] predict with lr model
precision=[0.5        0.38461538 0.45454545], recall=[0.57142857 0.25641026 0

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.44262295 0.33333333 0.37735849], recall=[0.55102041 0.1025641  0.52631579], f1=[0.49090909 0.15686275 0.43956044], accuracy=0.40
[*] predict with lsvc model




precision=[0.40677966 0.25       0.40425532], recall=[0.48979592 0.12820513 0.5       ], f1=[0.44444444 0.16949153 0.44705882], accuracy=0.38
[*] predict with lsvcl2 model




precision=[0.63157895 0.33333333 0.29411765], recall=[0.24489796 0.76923077 0.13157895], f1=[0.35294118 0.46511628 0.18181818], accuracy=0.37
[*] predict with rf model
precision=[0.43589744 0.5        0.47727273], recall=[0.69387755 0.05128205 0.55263158], f1=[0.53543307 0.09302326 0.51219512], accuracy=0.45
[*] predict with lrbias model
[LibLinear]precision=[0.44067797 0.41176471 0.4       ], recall=[0.53061224 0.17948718 0.52631579], f1=[0.48148148 0.25       0.45454545], accuracy=0.42
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.4375     0.27272727 0.45098039], recall=[0.57142857 0.07692308 0.60526316], f1=[0.49557522 0.12       0.51685393], accuracy=0.43
rec:['Pocono', 679, 126, '+:38,0:39,-:49', 0.30952380952380953, 0.30952380952380953, 0.373015873015873, 0.4603174603174603, 0.40476190476190477, 0.38095238095238093, 0.373015873015873, 0.4523809523809524, 0.42063492063492064, 0.42857142857142855]
Testset = Gateway
[*] predict with currank model
precision=[0.         0.25961538 0.        ], recall=[0. 1. 0.], f1=[0.         0.41221374 0.        ], accuracy=0.26
[*] predict with avgrank model
precision=[0.38095238 0.22580645 0.25806452], recall=[0.35555556 0.25925926 0.25      ], f1=[0.36781609 0.24137931 0.25396825], accuracy=0.30
[*] predict with dice model
precision=[0.4        0.25       0.31034483], recall=[0.48888889 0.18518519 0.28125   ], f1=[0.44       0.21276596 0.29508197], accuracy=0.35
[*] predict with lr model
precision=[0.51470588 0.57142857 0.48275862], recall=[0.77777778 0.14814815

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.51666667 0.5        0.46153846], recall=[0.68888889 0.33333333 0.375     ], f1=[0.59047619 0.4        0.4137931 ], accuracy=0.50
[*] predict with lsvc model
precision=[0.50819672 0.47368421 0.5       ], recall=[0.68888889 0.33333333 0.375     ], f1=[0.58490566 0.39130435 0.42857143], accuracy=0.50
[*] predict with lsvcl2 model




precision=[0.51515152 0.6        0.45454545], recall=[0.75555556 0.11111111 0.46875   ], f1=[0.61261261 0.1875     0.46153846], accuracy=0.50
[*] predict with rf model
precision=[0.48571429 0.5        0.54545455], recall=[0.75555556 0.22222222 0.375     ], f1=[0.59130435 0.30769231 0.44444444], accuracy=0.50
[*] predict with lrbias model
[LibLinear]precision=[0.52459016 0.5        0.48      ], recall=[0.71111111 0.33333333 0.375     ], f1=[0.60377358 0.4        0.42105263], accuracy=0.51
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.53125    0.52173913 0.58823529], recall=[0.75555556 0.44444444 0.3125    ], f1=[0.62385321 0.48       0.40816327], accuracy=0.54
rec:['Gateway', 701, 104, '+:32,0:27,-:45', 0.25961538461538464, 0.2980769230769231, 0.34615384615384615, 0.5096153846153846, 0.5, 0.5, 0.5, 0.5, 0.5096153846153846, 0.5384615384615384]


In [7]:
# Show the per-event accuracy table (one row per held-out event).
df_event

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lrl1,lsvc,lsvcl2,rf,lrbias,xgb
0,Phoenix,691,114,"+:39,0:20,-:55",0.175439,0.280702,0.377193,0.54386,0.535088,0.5,0.464912,0.526316,0.535088,0.508772
0,Indy500,580,225,"+:88,0:23,-:114",0.102222,0.248889,0.36,0.515556,0.52,0.524444,0.515556,0.391111,0.511111,0.4
0,Texas,678,127,"+:35,0:30,-:62",0.23622,0.275591,0.362205,0.488189,0.472441,0.425197,0.314961,0.417323,0.496063,0.377953
0,Iowa,696,109,"+:39,0:25,-:45",0.229358,0.293578,0.348624,0.59633,0.59633,0.587156,0.422018,0.486239,0.59633,0.449541
0,Pocono,679,126,"+:38,0:39,-:49",0.309524,0.309524,0.373016,0.460317,0.404762,0.380952,0.373016,0.452381,0.420635,0.428571
0,Gateway,701,104,"+:32,0:27,-:45",0.259615,0.298077,0.346154,0.509615,0.5,0.5,0.5,0.5,0.509615,0.538462


In [8]:
# Validation split by stage: for each threshold, train on stages up to and
# including `stageid` and test on all later stages.
# One record per threshold; the result frame is built once at the end
# instead of being grown with pd.concat inside the loop, which also gives
# each row a distinct index label.
records = []

for stageid in range(8):
    # These names must stay module-level: classifier_model() reads
    # train_x / train_y / test_x / test_y as globals.
    train, test, train_x, train_y, test_x, test_y =split_by_stageid(stagedata, stageid)
    test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

    #record
    rec = ['stage%d'%stageid,train_x.shape[0],test_x.shape[0],test_distribution]

    # accuracy of every classifier, in the order of `classifiers`
    acc = [classifier_model(name) for name in classifiers]

    rec.extend(acc)
    print('rec:%s'%rec)
    records.append(rec)

retdf = pd.DataFrame(records, columns=cols)
retdf.to_csv('crossvalid_stagedata_splitbystage%s.csv'%suffix)
df_stage = retdf

[*] predict with currank model
precision=[0.         0.21472393 0.        ], recall=[0. 1. 0.], f1=[0.         0.35353535 0.        ], accuracy=0.21
[*] predict with avgrank model
precision=[0.3796034 0.2244898 0.224    ], recall=[0.44816054 0.07857143 0.2629108 ], f1=[0.41104294 0.11640212 0.24190065], accuracy=0.31
[*] predict with dice model
precision=[0.45398773 0.19587629 0.28820961], recall=[0.49498328 0.13571429 0.30985915], f1=[0.4736     0.16033755 0.29864253], accuracy=0.36
[*] predict with lr model
precision=[0.51450677 0.         0.55555556], recall=[0.88963211 0.         0.35211268], f1=[0.65196078 0.         0.43103448], accuracy=0.52
[*] predict with lrl1 model
precision=[0.5        0.         0.45945946], recall=[0.71906355 0.         0.47887324], f1=[0.58984911 0.         0.46896552], accuracy=0.49
[*] predict with lsvc model
precision=[0.51356589 0.         0.55147059], recall=[0.88628763 0.         0.35211268], f1=[0.65030675 0.         0.42979943], accuracy=0.52
[*]

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


precision=[0.4664723  0.27083333 0.38697318], recall=[0.53511706 0.09285714 0.4741784 ], f1=[0.49844237 0.13829787 0.42616034], accuracy=0.42
[*] predict with lrbias model
[LibLinear]precision=[0.5083682  0.         0.52325581], recall=[0.81270903 0.         0.42253521], f1=[0.62548263 0.         0.46753247], accuracy=0.51
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.46302251 0.26373626 0.344     ], recall=[0.48160535 0.17142857 0.40375587], f1=[0.47213115 0.20779221 0.37149028], accuracy=0.39
rec:['stage0', 153, 652, '+:213,0:140,-:299', 0.2147239263803681, 0.30828220858895705, 0.35736196319018404, 0.5230061349693251, 0.48619631901840493, 0.5214723926380368, 0.5076687116564417, 0.42024539877300615, 0.5107361963190185, 0.3895705521472393]
[*] predict with currank model
precision=[0.         0.23791103 0.        ], recall=[0. 1. 0.], f1=[0.       0.384375 0.      ], accuracy=0.24
[*] predict with avgrank model
precision=[0.38111888 0.23529412 0.23350254], recall=[0.47391304 0.06504065 0.2804878 ], f1=[0.42248062 0.10191083 0.25484765], accuracy=0.32
[*] predict with dice model
precision=[0.48351648 0.26153846 0.35195531], recall=[0.57391304 0.13821138 0.38414634], f1=[0.52485089 0.18085106 0.36734694], accuracy=0.41
[*] predict with lr model
precision=[0.46153846 0.6        0.33192389], recall=[0.07826087 0.02439024 0.95731707], f1=[0.1

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.51612903 0.52       0.34490239], recall=[0.06956522 0.10569106 0.9695122 ], f1=[0.12260536 0.17567568 0.5088    ], accuracy=0.36
[*] predict with lsvcl2 model
precision=[0.         0.         0.31782946], recall=[0. 0. 1.], f1=[0.         0.         0.48235294], accuracy=0.32
[*] predict with rf model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


precision=[0.52186589 0.25       0.53529412], recall=[0.77826087 0.00813008 0.55487805], f1=[0.62478185 0.01574803 0.54491018], accuracy=0.52
[*] predict with lrbias model
[LibLinear]precision=[0.41666667 0.57142857 0.32921811], recall=[0.04347826 0.03252033 0.97560976], f1=[0.07874016 0.06153846 0.49230769], accuracy=0.34
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.49802372 0.30769231 0.42211055], recall=[0.54782609 0.16260163 0.51219512], f1=[0.52173913 0.21276596 0.46280992], accuracy=0.44
rec:['stage1', 288, 517, '+:164,0:123,-:230', 0.2379110251450677, 0.3152804642166344, 0.41005802707930367, 0.344294003868472, 0.32495164410058025, 0.36363636363636365, 0.31721470019342357, 0.5241779497098646, 0.3365570599613153, 0.4448742746615087]
[*] predict with currank model
precision=[0.         0.26822917 0.        ], recall=[0. 1. 0.], f1=[0.         0.42299795 0.        ], accuracy=0.27
[*] predict with avgrank model
precision=[0.36619718 0.26086957 0.21621622], recall=[0.46428571 0.05825243 0.28318584], f1=[0.40944882 0.0952381  0.24521073], accuracy=0.30
[*] predict with dice model
precision=[0.46190476 0.275      0.29104478], recall=[0.57738095 0.10679612 0.34513274], f1=[0.51322751 0.15384615 0.31578947], accuracy=0.38
[*] predict with lr model
precision=[0.46689895 0.30188679 0.59090909], recall=[0.79761905 0.15533981 0.2300885 ], f1

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.47011952 0.27419355 0.54929577], recall=[0.70238095 0.16504854 0.34513274], f1=[0.56324582 0.20606061 0.42391304], accuracy=0.45
[*] predict with lsvc model
precision=[0.46315789 0.2745098  0.60416667], recall=[0.78571429 0.13592233 0.25663717], f1=[0.58278146 0.18181818 0.36024845], accuracy=0.46
[*] predict with lsvcl2 model




precision=[0.45859873 0.31034483 0.43902439], recall=[0.85714286 0.08737864 0.15929204], f1=[0.59751037 0.13636364 0.23376623], accuracy=0.45
[*] predict with rf model
precision=[0.48339483 0.57142857 0.53773585], recall=[0.7797619  0.03883495 0.50442478], f1=[0.59681093 0.07272727 0.52054795], accuracy=0.50
[*] predict with lrbias model
[LibLinear]precision=[0.46289753 0.2962963  0.59574468], recall=[0.7797619  0.15533981 0.24778761], f1=[0.58093126 0.20382166 0.35      ], accuracy=0.46
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.46696035 0.37931034 0.4296875 ], recall=[0.63095238 0.10679612 0.48672566], f1=[0.53670886 0.16666667 0.45643154], accuracy=0.45
rec:['stage2', 421, 384, '+:113,0:103,-:168', 0.2682291666666667, 0.3020833333333333, 0.3828125, 0.4583333333333333, 0.453125, 0.4557291666666667, 0.4453125, 0.5, 0.4557291666666667, 0.4479166666666667]
[*] predict with currank model
precision=[0.        0.3372093 0.       ], recall=[0. 1. 0.], f1=[0.         0.50434783 0.        ], accuracy=0.34
[*] predict with avgrank model
precision=[0.33561644 0.22222222 0.21276596], recall=[0.48514851 0.04597701 0.28571429], f1=[0.39676113 0.07619048 0.24390244], accuracy=0.28
[*] predict with dice model
precision=[0.38926174 0.4        0.29113924], recall=[0.57425743 0.13793103 0.32857143], f1=[0.464      0.20512821 0.30872483], accuracy=0.36
[*] predict with lr model
precision=[0.44198895 0.45454545 0.5       ], recall=[0.79207921 0.05747126 0.47142857], f1=[0.56737589 0.10204082 0.48529412], accuracy=0.4

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.43617021 0.44444444 0.49180328], recall=[0.81188119 0.04597701 0.42857143], f1=[0.56747405 0.08333333 0.45801527], accuracy=0.45
[*] predict with lsvc model
precision=[0.44324324 0.5        0.47761194], recall=[0.81188119 0.03448276 0.45714286], f1=[0.57342657 0.06451613 0.46715328], accuracy=0.45
[*] predict with lsvcl2 model




precision=[0.25      0.        0.2811245], recall=[0.01980198 0.         1.        ], f1=[0.03669725 0.         0.43887147], accuracy=0.28
[*] predict with rf model
precision=[0.41818182 0.         0.5       ], recall=[0.91089109 0.         0.25714286], f1=[0.57320872 0.         0.33962264], accuracy=0.43
[*] predict with lrbias model
[LibLinear]precision=[0.43820225 0.42857143 0.46575342], recall=[0.77227723 0.03448276 0.48571429], f1=[0.55913978 0.06382979 0.47552448], accuracy=0.45
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.425      0.44444444 0.42857143], recall=[0.84158416 0.04597701 0.3       ], f1=[0.56478405 0.08333333 0.35294118], accuracy=0.43
rec:['stage3', 547, 258, '+:70,0:87,-:101', 0.3372093023255814, 0.28294573643410853, 0.36046511627906974, 0.4573643410852713, 0.4496124031007752, 0.45348837209302323, 0.27906976744186046, 0.4263565891472868, 0.44573643410852715, 0.4263565891472868]
[*] predict with currank model
precision=[0.         0.35135135 0.        ], recall=[0. 1. 0.], f1=[0.   0.52 0.  ], accuracy=0.35
[*] predict with avgrank model
precision=[0.39759036 0.2        0.23636364], recall=[0.50769231 0.03846154 0.41935484], f1=[0.44594595 0.06451613 0.30232558], accuracy=0.32
[*] predict with dice model
precision=[0.41772152 0.34782609 0.23913043], recall=[0.50769231 0.15384615 0.35483871], f1=[0.45833333 0.21333333 0.28571429], accuracy=0.35
[*] predict with lr model
precision=[0.575      0.41269841 0.31111111], recall=[0.35384615 0.5        0.4516129 ], f1=[0.43809524 0.452

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.57446809 0.45762712 0.30952381], recall=[0.41538462 0.51923077 0.41935484], f1=[0.48214286 0.48648649 0.35616438], accuracy=0.45
[*] predict with lsvc model
precision=[0.57142857 0.48837209 0.28571429], recall=[0.49230769 0.40384615 0.4516129 ], f1=[0.52892562 0.44210526 0.35      ], accuracy=0.45
[*] predict with lsvcl2 model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


precision=[0.5        0.36923077 0.        ], recall=[0.13846154 0.92307692 0.        ], f1=[0.21686747 0.52747253 0.        ], accuracy=0.39
[*] predict with rf model
precision=[0.55263158 0.61904762 0.29411765], recall=[0.64615385 0.25       0.48387097], f1=[0.59574468 0.35616438 0.36585366], accuracy=0.47
[*] predict with lrbias model
[LibLinear]precision=[0.58536585 0.4375     0.3255814 ], recall=[0.36923077 0.53846154 0.4516129 ], f1=[0.45283019 0.48275862 0.37837838], accuracy=0.45
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.59615385 0.44680851 0.32653061], recall=[0.47692308 0.40384615 0.51612903], f1=[0.52991453 0.42424242 0.4       ], accuracy=0.46
rec:['stage4', 657, 148, '+:31,0:52,-:65', 0.35135135135135137, 0.32432432432432434, 0.35135135135135137, 0.42567567567567566, 0.4527027027027027, 0.4527027027027027, 0.38513513513513514, 0.47297297297297297, 0.44594594594594594, 0.4594594594594595]
[*] predict with currank model
precision=[0.     0.3625 0.    ], recall=[0. 1. 0.], f1=[0.         0.53211009 0.        ], accuracy=0.36
[*] predict with avgrank model
precision=[0.33333333 0.14285714 0.17857143], recall=[0.45454545 0.03448276 0.27777778], f1=[0.38461538 0.05555556 0.2173913 ], accuracy=0.26
[*] predict with dice model
precision=[0.4        0.30769231 0.25925926], recall=[0.48484848 0.13793103 0.38888889], f1=[0.43835616 0.19047619 0.31111111], accuracy=0.34
[*] predict with lr model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.55263158 0.55555556 0.83333333], recall=[0.63636364 0.68965517 0.27777778], f1=[0.5915493  0.61538462 0.41666667], accuracy=0.57
[*] predict with lrl1 model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.54054054 0.54054054 0.83333333], recall=[0.60606061 0.68965517 0.27777778], f1=[0.57142857 0.60606061 0.41666667], accuracy=0.56
[*] predict with lsvc model




precision=[0.53488372 0.53571429 0.55555556], recall=[0.6969697  0.51724138 0.27777778], f1=[0.60526316 0.52631579 0.37037037], accuracy=0.54
[*] predict with lsvcl2 model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


precision=[0.4125 0.     0.    ], recall=[1. 0. 0.], f1=[0.5840708 0.        0.       ], accuracy=0.41
[*] predict with rf model
precision=[0.525      0.57142857 0.41666667], recall=[0.63636364 0.55172414 0.27777778], f1=[0.57534247 0.56140351 0.33333333], accuracy=0.53
[*] predict with lrbias model
[LibLinear]precision=[0.55555556 0.55263158 0.83333333], recall=[0.60606061 0.72413793 0.27777778], f1=[0.57971014 0.62686567 0.41666667], accuracy=0.57
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.5483871  0.51282051 0.4       ], recall=[0.51515152 0.68965517 0.22222222], f1=[0.53125    0.58823529 0.28571429], accuracy=0.51
rec:['stage5', 725, 80, '+:18,0:29,-:33', 0.3625, 0.2625, 0.3375, 0.575, 0.5625, 0.5375, 0.4125, 0.525, 0.575, 0.5125]
[*] predict with currank model
precision=[0.         0.42105263 0.        ], recall=[0. 1. 0.], f1=[0.         0.59259259 0.        ], accuracy=0.42
[*] predict with avgrank model
precision=[0.40909091 0.2        0.09090909], recall=[0.5625     0.0625     0.16666667], f1=[0.47368421 0.0952381  0.11764706], accuracy=0.29
[*] predict with dice model
precision=[0.42105263 0.33333333 0.1       ], recall=[0.5        0.1875     0.16666667], f1=[0.45714286 0.24       0.125     ], accuracy=0.32
[*] predict with lr model
precision=[0.58333333 0.72727273 0.        ], recall=[0.875 0.5   0.   ], f1=[0.7        0.59259259 0.        ], accuracy=0.58
[*] predict with lrl1 model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.56521739 0.69230769 0.        ], recall=[0.8125 0.5625 0.    ], f1=[0.66666667 0.62068966 0.        ], accuracy=0.58
[*] predict with lsvc model
precision=[0.56 0.75 0.  ], recall=[0.875  0.5625 0.    ], f1=[0.68292683 0.64285714 0.        ], accuracy=0.61
[*] predict with lsvcl2 model




precision=[0.57142857 0.52941176 0.28571429], recall=[0.5        0.5625     0.33333333], f1=[0.53333333 0.54545455 0.30769231], accuracy=0.50
[*] predict with rf model
precision=[0.53846154 0.8        0.        ], recall=[0.875 0.5   0.   ], f1=[0.66666667 0.61538462 0.        ], accuracy=0.58
[*] predict with lrbias model
[LibLinear]precision=[0.56521739 0.69230769 0.        ], recall=[0.8125 0.5625 0.    ], f1=[0.66666667 0.62068966 0.        ], accuracy=0.58
[*] predict with xgb model


  " = {}.".format(effective_n_jobs(self.n_jobs)))


precision=[0.52631579 0.61538462 0.16666667], recall=[0.625      0.5        0.16666667], f1=[0.57142857 0.55172414 0.16666667], accuracy=0.50
rec:['stage6', 767, 38, '+:6,0:16,-:16', 0.42105263157894735, 0.2894736842105263, 0.3157894736842105, 0.5789473684210527, 0.5789473684210527, 0.6052631578947368, 0.5, 0.5789473684210527, 0.5789473684210527, 0.5]
[*] predict with currank model
precision=[0.    0.375 0.   ], recall=[0. 1. 0.], f1=[0.         0.54545455 0.        ], accuracy=0.38
[*] predict with avgrank model
precision=[0.45454545 0.         0.        ], recall=[0.625 0.    0.   ], f1=[0.52631579 0.         0.        ], accuracy=0.31
[*] predict with dice model
precision=[0.55555556 0.5        0.33333333], recall=[0.625      0.33333333 0.5       ], f1=[0.58823529 0.4        0.4       ], accuracy=0.50
[*] predict with lr model
precision=[0.5        0.66666667 0.        ], recall=[0.75       0.33333333 0.        ], f1=[0.6        0.44444444 0.        ], accuracy=0.50
[*] predict with

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


precision=[0.5 0.5 0. ], recall=[0.75       0.33333333 0.        ], f1=[0.6 0.4 0. ], accuracy=0.50
[*] predict with lsvc model
precision=[0.53846154 0.66666667 0.        ], recall=[0.875      0.33333333 0.        ], f1=[0.66666667 0.44444444 0.        ], accuracy=0.56
[*] predict with lsvcl2 model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


precision=[0.5 0.  0. ], recall=[1. 0. 0.], f1=[0.66666667 0.         0.        ], accuracy=0.50
[*] predict with rf model


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


precision=[0.57142857 1.         0.        ], recall=[1.         0.33333333 0.        ], f1=[0.72727273 0.5        0.        ], accuracy=0.62
[*] predict with lrbias model
[LibLinear]precision=[0.5 0.5 0. ], recall=[0.75       0.33333333 0.        ], f1=[0.6 0.4 0. ], accuracy=0.50
[*] predict with xgb model
precision=[0.63636364 1.         0.5       ], recall=[0.875 0.5   0.5  ], f1=[0.73684211 0.66666667 0.5       ], accuracy=0.69
rec:['stage7', 789, 16, '+:2,0:6,-:8', 0.375, 0.3125, 0.5, 0.5, 0.5, 0.5625, 0.5, 0.625, 0.5, 0.6875]


In [9]:
# Result table for the run labelled "xgb max_tree_depth=3": one row per stage
# split, one accuracy column per classifier.
# NOTE(review): the table recorded under this cell is identical to the one
# recorded under the "xgb max_tree_depth=6" cell - confirm the experiment was
# actually re-run between the two displays.
df_stage

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lrl1,lsvc,lsvcl2,rf,lrbias,xgb
0,stage0,153,652,"+:213,0:140,-:299",0.214724,0.308282,0.357362,0.523006,0.486196,0.521472,0.507669,0.420245,0.510736,0.389571
0,stage1,288,517,"+:164,0:123,-:230",0.237911,0.31528,0.410058,0.344294,0.324952,0.363636,0.317215,0.524178,0.336557,0.444874
0,stage2,421,384,"+:113,0:103,-:168",0.268229,0.302083,0.382812,0.458333,0.453125,0.455729,0.445312,0.5,0.455729,0.447917
0,stage3,547,258,"+:70,0:87,-:101",0.337209,0.282946,0.360465,0.457364,0.449612,0.453488,0.27907,0.426357,0.445736,0.426357
0,stage4,657,148,"+:31,0:52,-:65",0.351351,0.324324,0.351351,0.425676,0.452703,0.452703,0.385135,0.472973,0.445946,0.459459
0,stage5,725,80,"+:18,0:29,-:33",0.3625,0.2625,0.3375,0.575,0.5625,0.5375,0.4125,0.525,0.575,0.5125
0,stage6,767,38,"+:6,0:16,-:16",0.421053,0.289474,0.315789,0.578947,0.578947,0.605263,0.5,0.578947,0.578947,0.5
0,stage7,789,16,"+:2,0:6,-:8",0.375,0.3125,0.5,0.5,0.5,0.5625,0.5,0.625,0.5,0.6875


In [10]:
# Result table for the run labelled "xgb max_tree_depth=6": one row per stage
# split, one accuracy column per classifier.
# NOTE(review): the table recorded under this cell is identical to the one
# recorded under the "xgb max_tree_depth=3" cell - confirm the experiment was
# actually re-run between the two displays.
df_stage

Unnamed: 0,runid,trainsize,testsize,testdistribution,currank,avgrank,dice,lr,lrl1,lsvc,lsvcl2,rf,lrbias,xgb
0,stage0,153,652,"+:213,0:140,-:299",0.214724,0.308282,0.357362,0.523006,0.486196,0.521472,0.507669,0.420245,0.510736,0.389571
0,stage1,288,517,"+:164,0:123,-:230",0.237911,0.31528,0.410058,0.344294,0.324952,0.363636,0.317215,0.524178,0.336557,0.444874
0,stage2,421,384,"+:113,0:103,-:168",0.268229,0.302083,0.382812,0.458333,0.453125,0.455729,0.445312,0.5,0.455729,0.447917
0,stage3,547,258,"+:70,0:87,-:101",0.337209,0.282946,0.360465,0.457364,0.449612,0.453488,0.27907,0.426357,0.445736,0.426357
0,stage4,657,148,"+:31,0:52,-:65",0.351351,0.324324,0.351351,0.425676,0.452703,0.452703,0.385135,0.472973,0.445946,0.459459
0,stage5,725,80,"+:18,0:29,-:33",0.3625,0.2625,0.3375,0.575,0.5625,0.5375,0.4125,0.525,0.575,0.5125
0,stage6,767,38,"+:6,0:16,-:16",0.421053,0.289474,0.315789,0.578947,0.578947,0.605263,0.5,0.578947,0.578947,0.5
0,stage7,789,16,"+:2,0:6,-:8",0.375,0.3125,0.5,0.5,0.5,0.5625,0.5,0.625,0.5,0.6875


### save the trained models

In [22]:
import pickle

# Fit one classifier with a single event as the test set and pickle it,
# together with that test split, so a later cell can reload both.
eventsname = ['Phoenix','Indy500','Texas','Iowa','Pocono','Gateway']
events = set(stagedata['eventid'])
# Only one event is processed here; eventid is also the index into eventsname.
eventid = 1
name = 'lsvc'
modelname = eventsname[eventid] + '-' + name + '.pkl'

print('Testset = %s'%eventsname[eventid])

train, test, train_x, train_y, test_x, test_y = split_by_eventid(stagedata, eventid)

print('[*] predict with %s model'%name)
clf = get_classifier(name)
clf.fit(train_x, train_y)

pred_y = clf.predict(test_x)
score = evaluate(test_y, pred_y)

print('rec:%s'%score)

# Save the fitted model with the exact test split it was scored on.
with open(modelname, 'wb') as fout:
    pickle.dump((clf, test_x, test_y), fout)

# The split computed above is reused as-is, so the in-memory test_x/test_y
# stay the same objects that were just pickled.
test_distribution = '+:%d,0:%d,-:%d'%(np.sum(test_y>0),np.sum(test_y==0),np.sum(test_y<0))

# Record headers: event name, train size, test size, test label distribution.
rec0 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]
rec1 = [eventsname[eventid],train_x.shape[0],test_x.shape[0],test_distribution]

# NOTE(review): `regressors` and `regressor_model` are not defined in the part
# of the notebook visible here and look like leftovers from a regression
# notebook - confirm they exist in this kernel or drop this tail.
acc0 = [0] * len(regressors)
acc1 = [0] * len(regressors)
# The loop variable must not be called `clf`: that would overwrite the fitted
# classifier above with an entry of `regressors`.
for idx, regname in enumerate(regressors):
    # Call once per entry so both recorded fields come from the same run.
    result = regressor_model(regname)
    acc0[idx] = result[0]
    acc1[idx] = result[2]
        
    


Testset = Indy500
[*] predict with lsvc model
precision=[0.54658385 0.17241379 0.71428571], recall=[0.77192982 0.2173913  0.28409091], f1=[0.64       0.19230769 0.40650407], accuracy=0.52
rec:0.5244444444444445




In [28]:
# Reload the (classifier, test_x, test_y) tuple from the pickle file named by
# `modelname`.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# this notebook wrote itself.
with open(modelname, 'rb') as fin:
    clf, test_x, test_y = pickle.load(fin)
    
# Sentinel returned by predict() when no test row matches the query.
# NOTE(review): assumes 100 can never be a real class label - confirm.
EMPTY = 100
def predict(carno, stageid):
    """Predict the label for one (car, stage) pair found in the global test_x.

    The feature rows are laid out as <eventid, car_num, stageid, ...>, so
    column 1 is matched against ``carno`` and column 2 against ``stageid``.
    ``stageid`` is the id of the pit stop and starts from 0.

    Returns the first element of ``clf.predict`` for the first matching row,
    or ``EMPTY`` when test_x is empty or contains no matching row.
    """
    rows = np.asarray(test_x)
    if rows.size == 0:
        return EMPTY

    # Element-wise `&` is required here; Python's `and` is ambiguous on arrays.
    match = (rows[:, 1] == carno) & (rows[:, 2] == stageid)
    hits = rows[match]
    if len(hits) == 0:
        return EMPTY

    # hits[:1] keeps the 2-D (1, n_features) shape the classifier expects.
    pred_y = clf.predict(hits[:1])
    return pred_y[0]
    
# Batch prediction over every row of the reloaded test split.
yhat = clf.predict(test_x)

# Spot-check one car (number 12): compare the batch predictions with the
# per-stage predict() helper.
carno=12
# Boolean row mask: column 1 of test_x is compared against the car number.
idx = (test_x[:,1]==carno)
_yhat = yhat[idx]

# Query stage by stage (at most 10 stages) and stop at the first stage for
# which predict() returns the EMPTY sentinel.
ret_y = []
for stageid in range(10):
    Y = predict(carno, stageid)
    if Y == EMPTY:
        break
    ret_y.append(Y)
    
#predict(12, 3)
# The first 'prediction:' line is the batch path (_yhat); the second is the
# per-stage path (ret_y).
print('trueth:', test_y[idx])
print('prediction:', _yhat)
print('prediction:', ret_y)




trueth: [ 1  1 -1  1  1 -1]
prediction: [ 1  1 -1  1  1  0]
prediction: [1, 1, -1, 1, 1, 0]


In [26]:
# Scratch check left over from debugging: shows the type of an empty list
# literal bound to input_x.
input_x = []
type(input_x)

list