# Training for each cluster

In [37]:
import numpy as np
import os
import pandas as pd
import pprint
import csv
import pickle
import xgboost as xgb
# Vehicle-type identifiers; each one is used as a sub-directory name when
# building the cluster-CSV and model paths.
vehicle_types = ['ZVe44', 'ZV573', 'ZV63d', 'ZVfd4', 'ZVa9c', 'ZVa78', 'ZV252']
# Number of cluster files read per vehicle type (cluster0.csv .. cluster49.csv).
n_clusters = 50

In [38]:
def xgboost_train(vehicle_type):
    """Fit one XGBoost classifier per cluster of ``vehicle_type`` and pickle it.

    For every index ``i`` in ``range(n_clusters)`` the function reads
    ``../../EDA/Clustering/cluster/train/<vehicle_type>/cluster<i>.csv``,
    trains an ``xgb.XGBClassifier`` on columns 0-12 (features) against
    column 14 (label), prints the accuracy measured on those same training
    rows, and writes the fitted model to
    ``../../Models/XGBoost/<vehicle_type>/trainedXGB<i>.pkl``.

    A cluster whose label column is all zeros, or contains no zeros at all,
    is reported and skipped, so no model file is written for it.

    Args:
        vehicle_type: Sub-directory name identifying the vehicle type.

    Returns:
        None. Results are printed and models are written to disk.
    """
    feature_cols = slice(0, 13)  # positional columns used as model inputs
    label_col = 14               # positional column used as the target

    data_dir = '../../EDA/Clustering/cluster/train/' + vehicle_type
    model_dir = '../../Models/XGBoost/' + vehicle_type

    print('vehicle_type :',vehicle_type)
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(model_dir, exist_ok=True)

    for i in range(n_clusters):

        #load data
        sample = pd.read_csv(data_dir + '/cluster' + str(i) + '.csv')
        feature = sample.iloc[:, feature_cols]
        label = sample.iloc[:, label_col]

        # Decide whether the cluster is trainable from the very column that is
        # used as the target, so the guard cannot drift from the label if the
        # CSV layout changes.
        n_negative = int((label == 0).sum())
        if n_negative == 0 or n_negative == len(label):
            note = ' only has negative samples'
            if n_negative == 0:
                note = ' only has positive samples'
            print('cluster'+str(i)+note)
            continue
        print('cluster' + str(i) + ', shape: ', feature.shape, label.shape)

        #xgboost
        # NOTE(review): 'eta' is the native-booster name; XGBClassifier's own
        # keyword is `learning_rate` -- confirm the installed xgboost version
        # honours 'eta' when passed this way.
        params = {'booster': 'gbtree', 'eta': 0.05, 'max_depth': 10}
        bst = xgb.XGBClassifier(**params)
        bst.fit(feature, label)
        # Scored on the rows the model was fitted on, so this is training
        # accuracy and not an estimate of generalisation.
        acc = bst.score(feature, label)
        print("training accuracy: ", acc)

        #save the trained model
        with open(model_dir + '/trainedXGB' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(bst,f)

In [39]:
# Train and pickle the per-cluster models for every vehicle type.
for vehicle_type in vehicle_types:
    xgboost_train(vehicle_type)
    # Note from a previous run: ZV252 cluster30 only has negative samples
    # (xgboost_train reports such clusters and skips them).

vehicle_type : ZVe44
cluster0, shape:  (59515, 13) (59515,)
training accuracy:  0.8480719146433672
cluster1, shape:  (27570, 13) (27570,)
training accuracy:  0.9613347841857091
cluster2, shape:  (27163, 13) (27163,)
training accuracy:  0.9034348194234805
cluster3, shape:  (51135, 13) (51135,)
training accuracy:  0.8631270167204459
cluster4, shape:  (23587, 13) (23587,)
training accuracy:  0.8893034298554289
cluster5, shape:  (22661, 13) (22661,)
training accuracy:  0.9422796875689511
cluster6, shape:  (98354, 13) (98354,)
training accuracy:  0.7392988592228074
cluster7, shape:  (26391, 13) (26391,)
training accuracy:  0.9164108976545035
cluster8, shape:  (48955, 13) (48955,)
training accuracy:  0.8290879379021551
cluster9, shape:  (14655, 13) (14655,)
training accuracy:  0.959467758444217
cluster10, shape:  (16929, 13) (16929,)
training accuracy:  0.9302971232795795
cluster11, shape:  (66246, 13) (66246,)
training accuracy:  0.8558705431271322
cluster12, shape:  (17851, 13) (17851,)
tr

training accuracy:  0.9529367469879518
cluster4, shape:  (12251, 13) (12251,)
training accuracy:  0.9657987103093625
cluster5, shape:  (11876, 13) (11876,)
training accuracy:  0.9773492758504547
cluster6, shape:  (2648, 13) (2648,)
training accuracy:  1.0
cluster7, shape:  (3182, 13) (3182,)
training accuracy:  0.9990571967316153
cluster8, shape:  (6704, 13) (6704,)
training accuracy:  0.9898568019093079
cluster9, shape:  (43444, 13) (43444,)
training accuracy:  0.8079826903600037
cluster10, shape:  (648, 13) (648,)
training accuracy:  1.0
cluster11, shape:  (19072, 13) (19072,)
training accuracy:  0.9385486577181208
cluster12, shape:  (11179, 13) (11179,)
training accuracy:  0.9926648179622507
cluster13, shape:  (9055, 13) (9055,)
training accuracy:  0.9840971838763114
cluster14, shape:  (10345, 13) (10345,)
training accuracy:  0.9852102464958917
cluster15, shape:  (11273, 13) (11273,)
training accuracy:  0.9666459682427038
cluster16, shape:  (4025, 13) (4025,)
training accuracy:  0.9

training accuracy:  1.0
cluster21, shape:  (4796, 13) (4796,)
training accuracy:  0.9968723936613845
cluster22, shape:  (11842, 13) (11842,)
training accuracy:  0.9155548049315994
cluster23, shape:  (14128, 13) (14128,)
training accuracy:  0.9745186862967158
cluster24, shape:  (9277, 13) (9277,)
training accuracy:  0.9874959577449607
cluster25, shape:  (2643, 13) (2643,)
training accuracy:  0.9984865682936057
cluster26, shape:  (2268, 13) (2268,)
training accuracy:  0.9991181657848325
cluster27, shape:  (3814, 13) (3814,)
training accuracy:  1.0
cluster28, shape:  (8913, 13) (8913,)
training accuracy:  0.9943902165376417
cluster29, shape:  (5510, 13) (5510,)
training accuracy:  0.9996370235934664
cluster30, shape:  (5940, 13) (5940,)
training accuracy:  0.9983164983164983
cluster31, shape:  (1787, 13) (1787,)
training accuracy:  1.0
cluster32, shape:  (6745, 13) (6745,)
training accuracy:  1.0
cluster33, shape:  (13656, 13) (13656,)
training accuracy:  0.9927504393673111
cluster34, sha

training accuracy:  1.0
cluster36, shape:  (1471, 13) (1471,)
training accuracy:  1.0
cluster37, shape:  (2726, 13) (2726,)
training accuracy:  1.0
cluster38, shape:  (116, 13) (116,)
training accuracy:  1.0
cluster39, shape:  (554, 13) (554,)
training accuracy:  1.0
cluster40, shape:  (1152, 13) (1152,)
training accuracy:  1.0
cluster41, shape:  (1336, 13) (1336,)
training accuracy:  1.0
cluster42, shape:  (1154, 13) (1154,)
training accuracy:  1.0
cluster43, shape:  (1209, 13) (1209,)
training accuracy:  1.0
cluster44, shape:  (460, 13) (460,)
training accuracy:  1.0
cluster45, shape:  (405, 13) (405,)
training accuracy:  1.0
cluster46, shape:  (1479, 13) (1479,)
training accuracy:  1.0
cluster47, shape:  (241, 13) (241,)
training accuracy:  1.0
cluster48, shape:  (1721, 13) (1721,)
training accuracy:  1.0
cluster49, shape:  (1235, 13) (1235,)
training accuracy:  1.0


# Cross Validation

In [35]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

def xgboost_cv(vehicle_type, random_state=0):
    """Run 5-fold cross-validation of an XGBoost classifier on each cluster.

    For every index ``i`` in ``range(n_clusters)`` the function reads
    ``../../EDA/Clustering/cluster/train/<vehicle_type>/cluster<i>.csv``,
    takes columns 0-12 as features and column 14 as the label, and prints the
    per-fold scores returned by ``cross_val_score`` together with their mean
    and standard deviation. A cluster whose label column is all zeros, or
    contains no zeros at all, is skipped silently.

    Args:
        vehicle_type: Sub-directory name identifying the vehicle type.
        random_state: Seed for the shuffled ``KFold`` split, so repeated runs
            produce the same folds. Pass ``None`` for a different shuffle on
            every call.

    Returns:
        None. Results are printed only.
    """
    feature_cols = slice(0, 13)  # positional columns used as model inputs
    label_col = 14               # positional column used as the target

    data_dir = '../../EDA/Clustering/cluster/train/' + vehicle_type

    for i in range(n_clusters):

        #load data
        sample = pd.read_csv(data_dir + '/cluster' + str(i) + '.csv')
        feature = sample.iloc[:, feature_cols]
        label = sample.iloc[:, label_col]

        # Derive the single-class guard from the column actually used as the
        # target so the two cannot disagree.
        n_negative = int((label == 0).sum())
        if n_negative == 0 or n_negative == len(label):
            continue
        print('cluster' + str(i) + ', shape: ', feature.shape, label.shape)

        #cv
        # NOTE(review): max_depth is 12 here but 10 in xgboost_train, so these
        # scores describe a different model from the one that gets pickled --
        # confirm which depth is intended.
        params = {'booster': 'gbtree', 'eta': 0.05, 'max_depth': 12}
        bst = xgb.XGBClassifier(**params)
        kfold = KFold(n_splits = 5, shuffle = True, random_state = random_state)
        cv_result = cross_val_score(bst, feature, label, cv = kfold)
        print("Accuracy: ", cv_result)
        # The stray ')' that used to follow the SD percentage has been removed.
        print("Mean Acc: %.2f%%, SD: %.2f%%\n" % (cv_result.mean() * 100, cv_result.std() * 100))

In [36]:
# Cross-validate a single vehicle type (the first entry of vehicle_types).
xgboost_cv('ZVe44')

cluster0, shape:  (59515, 13) (59515,)
Accuracy:  [0.78845669 0.79517769 0.78568428 0.78946484 0.79198521]
Mean Acc: 79.02%, SD: 0.32%)

cluster1, shape:  (27570, 13) (27570,)
Accuracy:  [0.89046065 0.89209285 0.88792165 0.89553863 0.89046065]
Mean Acc: 89.13%, SD: 0.25%)

cluster2, shape:  (27163, 13) (27163,)
Accuracy:  [0.80084668 0.80379164 0.80139886 0.79952135 0.80044183]
Mean Acc: 80.12%, SD: 0.14%)

cluster3, shape:  (51135, 13) (51135,)
Accuracy:  [0.81030605 0.79906131 0.80336365 0.80649262 0.79622568]
Mean Acc: 80.31%, SD: 0.50%)

cluster4, shape:  (23587, 13) (23587,)
Accuracy:  [0.78740992 0.78613819 0.78757685 0.79096884 0.78990884]
Mean Acc: 78.84%, SD: 0.18%)

cluster5, shape:  (22661, 13) (22661,)
Accuracy:  [0.86785793 0.87224184 0.8729038  0.87422771 0.87511033]
Mean Acc: 87.25%, SD: 0.25%)

cluster6, shape:  (98354, 13) (98354,)


KeyboardInterrupt: 

In [None]:
cluster0, shape:  (59515, 13) (59515,)
Accuracy:  [0.78845669 0.79517769 0.78568428 0.78946484 0.79198521]
Mean Acc: 79.02%, SD: 0.32%)

cluster1, shape:  (27570, 13) (27570,)
Accuracy:  [0.89046065 0.89209285 0.88792165 0.89553863 0.89046065]
Mean Acc: 89.13%, SD: 0.25%)

cluster2, shape:  (27163, 13) (27163,)
Accuracy:  [0.80084668 0.80379164 0.80139886 0.79952135 0.80044183]
Mean Acc: 80.12%, SD: 0.14%)

cluster3, shape:  (51135, 13) (51135,)
Accuracy:  [0.81030605 0.79906131 0.80336365 0.80649262 0.79622568]
Mean Acc: 80.31%, SD: 0.50%)

cluster4, shape:  (23587, 13) (23587,)
Accuracy:  [0.78740992 0.78613819 0.78757685 0.79096884 0.78990884]
Mean Acc: 78.84%, SD: 0.18%)

cluster5, shape:  (22661, 13) (22661,)
Accuracy:  [0.86785793 0.87224184 0.8729038  0.87422771 0.87511033]
Mean Acc: 87.25%, SD: 0.25%)
