# Training for each cluster

In [1]:
import numpy as np
import os
import pandas as pd
import pprint
import csv
import pickle
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

n_clusters = 50

## Non linear SVM

In [2]:
from sklearn import svm

def non_linear_svm_train(vehicle_type):
    
    weight = []
    train_avg = []
    cv_avg = []
    for i in range(n_clusters):
        
        #load data
        sample = pd.read_csv('../../EDA/Clustering/cluster/train/' + vehicle_type + '/cluster' + str(i) + '.csv')
        sample_= sample[sample.iloc[: , -2] ==0]
        if len(sample_) == 0 or len(sample_) == len(sample): continue
        feature = sample.iloc[:,1:13]
        label = sample.iloc[:,14]
        print('cluster' + str(i) + ', shape: ', feature.shape, label.shape)
        weight.append(feature.shape[0])
        
        #non-linear svm
        clf = svm.SVC(C = 0.9)
        clf.fit(feature, label)
        acc = clf.score(feature, label)
        print('training accuracy: ', acc)
        train_avg.append(acc)
        
        #cv
        kfold = KFold(n_splits = 5, shuffle = True, random_state = None)
        cv_result = cross_val_score(clf, feature, label, cv = kfold)
        print("Accuracy: ", cv_result)
        print("Mean Acc: %.2f%%, SD: %.2f%%)\n" % (cv_result.mean() * 100, cv_result.std() * 100))
        cv_avg.append(cv_result.mean() * 100)
    
    print("Overall training accuracy: ", np.average(train_avg, weights = weight))
    print("Overall CV accuracy: ", np.average(cv_avg, weights = weight))

In [10]:
non_linear_svm_train('ZVe44')

cluster0, shape:  (59515, 12) (59515,)
training accuracy:  0.5367722422918592
Accuracy:  [0.53734353 0.54036797 0.53146266 0.53616735 0.5385197 ]
Mean Acc: 53.68%, SD: 0.30%)

cluster1, shape:  (27570, 12) (27570,)
training accuracy:  0.6022850924918389
Accuracy:  [0.60065288 0.60536815 0.60917664 0.60246645 0.59212913]
Mean Acc: 60.20%, SD: 0.57%)

cluster2, shape:  (27163, 12) (27163,)
training accuracy:  0.5773662702941501
Accuracy:  [0.58181484 0.57537272 0.57242776 0.5769514  0.5802651 ]
Mean Acc: 57.74%, SD: 0.34%)

cluster3, shape:  (51135, 12) (51135,)
training accuracy:  0.5719761415859979
Accuracy:  [0.56741958 0.57543757 0.56839738 0.57641537 0.57221081]
Mean Acc: 57.20%, SD: 0.36%)

cluster4, shape:  (23587, 12) (23587,)
training accuracy:  0.5480561326154237
Accuracy:  [0.54726579 0.54853752 0.55607378 0.54526182 0.54314183]
Mean Acc: 54.81%, SD: 0.44%)

cluster5, shape:  (22661, 12) (22661,)
training accuracy:  0.5625965314858127
Accuracy:  [0.56562983 0.56332745 0.566637

training accuracy:  0.5947309417040358
Accuracy:  [0.59342302 0.59192825 0.59865471 0.58258595 0.60706278]
Mean Acc: 59.47%, SD: 0.81%)

cluster48, shape:  (13977, 12) (13977,)
training accuracy:  0.6072833941475281
Accuracy:  [0.61123033 0.60228898 0.59463327 0.59821109 0.61610018]
Mean Acc: 60.45%, SD: 0.80%)

cluster49, shape:  (5751, 12) (5751,)
training accuracy:  0.8236828377673449
Accuracy:  [0.8210252  0.82608696 0.83217391 0.82347826 0.81565217]
Mean Acc: 82.37%, SD: 0.55%)

Overall training accuracy:  0.5638400011681944
Overall CV accuracy:  56.281711317289464


In [3]:
# default kernel = 'rbf', default gamma = 'scale'

# in term of train accuracy:
# svm.NuSVC() > SVC() > NuSVC(kernel = 'poly') > NuSVC(kernel = 'sigmoid')

# svm.NuSVC()
'''
cluster0, shape:  (59515, 12) (59515,)
training accuracy:  0.552012097790473
cluster1, shape:  (27570, 12) (27570,)
training accuracy:  0.638411316648531
cluster2, shape:  (27163, 12) (27163,)
training accuracy:  0.5764459006737106
cluster3, shape:  (51135, 12) (51135,)
training accuracy:  0.5741468661386526
cluster4, shape:  (23587, 12) (23587,)
training accuracy:  0.5552634926018569
'''

# svm.NuSVC(kernel = 'sigmoid')
'''
cluster0, shape:  (59515, 12) (59515,)
training accuracy:  0.5087288918759977
cluster1, shape:  (27570, 12) (27570,)
training accuracy:  0.5147986942328618
cluster2, shape:  (27163, 12) (27163,)
training accuracy:  0.5016750727092
cluster3, shape:  (51135, 12) (51135,)
training accuracy:  0.5065806199276425
cluster4, shape:  (23587, 12) (23587,)
training accuracy:  0.5015474625853225
'''

# svm.NuSVC(kernel = 'poly')
'''
cluster0, shape:  (59515, 12) (59515,)
training accuracy:  0.5397462824498026
cluster1, shape:  (27570, 12) (27570,)
training accuracy:  0.5965179542981501
cluster2, shape:  (27163, 12) (27163,)
training accuracy:  0.5546883628465191
cluster3, shape:  (51135, 12) (51135,)
training accuracy:  0.564994622078811
cluster4, shape:  (23587, 12) (23587,)
training accuracy:  0.5445372450926358
'''

'\ncluster0, shape:  (59515, 12) (59515,)\ntraining accuracy:  0.5397462824498026\ncluster1, shape:  (27570, 12) (27570,)\ntraining accuracy:  0.5965179542981501\ncluster2, shape:  (27163, 12) (27163,)\ntraining accuracy:  0.5546883628465191\ncluster3, shape:  (51135, 12) (51135,)\ntraining accuracy:  0.564994622078811\ncluster4, shape:  (23587, 12) (23587,)\ntraining accuracy:  0.5445372450926358\n'

## DNN

In [4]:
from sklearn.neural_network import MLPClassifier
def dnn_train(vehicle_type):
    
    weight = []
    train_avg = []
    cv_avg = []
    for i in range(n_clusters):
        
        #load data
        sample = pd.read_csv('../../EDA/Clustering/cluster/train/' + vehicle_type + '/cluster' + str(i) + '.csv')
        sample_= sample[sample.iloc[: , -2] ==0]
        if len(sample_) == 0 or len(sample_) == len(sample): continue
        feature = sample.iloc[:,1:13]
        label = sample.iloc[:,14]
        print('cluster' + str(i) + ', shape: ', feature.shape, label.shape)
        weight.append(feature.shape[0])
        
        #decision tree
        clf = MLPClassifier()
        clf.fit(feature, label)
        acc = clf.score(feature, label)
        print('training accuracy: ', acc)
        train_avg.append(acc)
        
        #cv
        kfold = KFold(n_splits = 5, shuffle = True, random_state = None)
        cv_result = cross_val_score(clf, feature, label, cv = kfold)
        print("Accuracy: ", cv_result)
        print("Mean Acc: %.2f%%, SD: %.2f%%)\n" % (cv_result.mean() * 100, cv_result.std() * 100))
        cv_avg.append(cv_result.mean() * 100)
    
    print("Overall training accuracy: ", np.average(train_avg, weights = weight))
    print("Overall CV accuracy: ", np.average(cv_avg, weights = weight))

In [11]:
dnn_train('ZVe44')

cluster0, shape:  (59515, 12) (59515,)
training accuracy:  0.5655212971519785
Accuracy:  [0.54767706 0.54633286 0.54792909 0.55725447 0.54742502]
Mean Acc: 54.93%, SD: 0.40%)

cluster1, shape:  (27570, 12) (27570,)




training accuracy:  0.668190061661226




Accuracy:  [0.65034458 0.65614799 0.64961915 0.674465   0.64925644]
Mean Acc: 65.60%, SD: 0.96%)

cluster2, shape:  (27163, 12) (27163,)




training accuracy:  0.6021058056915657




Accuracy:  [0.59414688 0.59617154 0.58807289 0.59204713 0.59756996]
Mean Acc: 59.36%, SD: 0.33%)

cluster3, shape:  (51135, 12) (51135,)
training accuracy:  0.5855089469052508
Accuracy:  [0.58345556 0.58355334 0.58404224 0.58629119 0.59166911]
Mean Acc: 58.58%, SD: 0.31%)

cluster4, shape:  (23587, 12) (23587,)
training accuracy:  0.5709500996311527




Accuracy:  [0.56061891 0.56952098 0.56688573 0.56010176 0.5738817 ]
Mean Acc: 56.62%, SD: 0.53%)

cluster5, shape:  (22661, 12) (22661,)




training accuracy:  0.661488901637174




Accuracy:  [0.60842709 0.62577229 0.60525154 0.63989409 0.6548985 ]
Mean Acc: 62.68%, SD: 1.88%)

cluster6, shape:  (98354, 12) (98354,)
training accuracy:  0.5831282916810704
Accuracy:  [0.57805907 0.56921356 0.58598953 0.57419552 0.58525674]
Mean Acc: 57.85%, SD: 0.64%)

cluster7, shape:  (26391, 12) (26391,)




training accuracy:  0.5737562047667766




Accuracy:  [0.565448   0.53940887 0.57502842 0.54945055 0.5591133 ]
Mean Acc: 55.77%, SD: 1.24%)

cluster8, shape:  (48955, 12) (48955,)
training accuracy:  0.5548360739454601




Accuracy:  [0.54161985 0.56031049 0.54723726 0.55019916 0.55193545]
Mean Acc: 55.03%, SD: 0.61%)

cluster9, shape:  (14655, 12) (14655,)




training accuracy:  0.6169907881269191




Accuracy:  [0.60047765 0.59706585 0.60696008 0.60730126 0.6066189 ]
Mean Acc: 60.37%, SD: 0.42%)

cluster10, shape:  (16929, 12) (16929,)




training accuracy:  0.5937149270482603




Accuracy:  [0.59775546 0.59539279 0.58712345 0.58771412 0.59025111]
Mean Acc: 59.16%, SD: 0.42%)

cluster11, shape:  (66246, 12) (66246,)
training accuracy:  0.5609395284243577




Accuracy:  [0.55818868 0.56464639 0.55468337 0.56577855 0.56713714]
Mean Acc: 56.21%, SD: 0.48%)

cluster12, shape:  (17851, 12) (17851,)
training accuracy:  0.5949246540810038
Accuracy:  [0.60683282 0.5952381  0.58459384 0.59803922 0.60504202]
Mean Acc: 59.79%, SD: 0.79%)

cluster13, shape:  (39628, 12) (39628,)




training accuracy:  0.5921570606641768




Accuracy:  [0.58200858 0.59626546 0.57115821 0.58851735 0.59179811]
Mean Acc: 58.59%, SD: 0.87%)

cluster14, shape:  (19542, 12) (19542,)




training accuracy:  0.605669839320438




Accuracy:  [0.5860834  0.59273471 0.60056295 0.58674514 0.59032753]
Mean Acc: 59.13%, SD: 0.52%)

cluster15, shape:  (31498, 12) (31498,)




training accuracy:  0.5564797764937456
Accuracy:  [0.54698413 0.54666667 0.5452381  0.54072075 0.53992697]
Mean Acc: 54.39%, SD: 0.30%)

cluster16, shape:  (4436, 12) (4436,)




training accuracy:  0.5737150586113616




Accuracy:  [0.53603604 0.53664036 0.56708005 0.56595265 0.55918828]
Mean Acc: 55.30%, SD: 1.39%)

cluster17, shape:  (28909, 12) (28909,)




training accuracy:  0.5766370334497907




Accuracy:  [0.57177447 0.5814597  0.58267036 0.56952612 0.56720291]
Mean Acc: 57.45%, SD: 0.63%)

cluster18, shape:  (11896, 12) (11896,)




training accuracy:  0.7082212508406187




Accuracy:  [0.70504202 0.70659941 0.71164355 0.70659941 0.71290458]
Mean Acc: 70.86%, SD: 0.31%)

cluster19, shape:  (13348, 12) (13348,)




training accuracy:  0.7170362601138748




Accuracy:  [0.69925094 0.71685393 0.7247191  0.69838891 0.70813039]
Mean Acc: 70.95%, SD: 1.02%)

cluster20, shape:  (19486, 12) (19486,)




training accuracy:  0.6494406240377707




Accuracy:  [0.63596716 0.63125481 0.63228124 0.6410059  0.64998717]
Mean Acc: 63.81%, SD: 0.69%)

cluster21, shape:  (41572, 12) (41572,)
training accuracy:  0.5798614452035024




Accuracy:  [0.58051714 0.57294047 0.59201347 0.57685831 0.58335338]
Mean Acc: 58.11%, SD: 0.65%)

cluster22, shape:  (73268, 12) (73268,)




training accuracy:  0.567068843151171




Accuracy:  [0.57021974 0.56789955 0.56673946 0.5620692  0.56650515]
Mean Acc: 56.67%, SD: 0.27%)

cluster23, shape:  (13842, 12) (13842,)




training accuracy:  0.709724028319607




Accuracy:  [0.70892019 0.68761286 0.70556358 0.70339595 0.69039017]
Mean Acc: 69.92%, SD: 0.85%)

cluster24, shape:  (10927, 12) (10927,)




training accuracy:  0.6281687562917544




Accuracy:  [0.60384263 0.62351327 0.62700229 0.61510297 0.58993135]
Mean Acc: 61.19%, SD: 1.36%)

cluster25, shape:  (23492, 12) (23492,)
training accuracy:  0.5713008683807254




Accuracy:  [0.5701213  0.57161098 0.5538527  0.58003406 0.56406982]
Mean Acc: 56.79%, SD: 0.87%)

cluster26, shape:  (18738, 12) (18738,)




training accuracy:  0.60646813960935




Accuracy:  [0.58084312 0.59044824 0.59178228 0.60688551 0.59194022]
Mean Acc: 59.24%, SD: 0.83%)

cluster27, shape:  (17491, 12) (17491,)




training accuracy:  0.6037962380652907




Accuracy:  [0.59331237 0.59062321 0.55974843 0.56803888 0.5754717 ]
Mean Acc: 57.74%, SD: 1.29%)

cluster28, shape:  (7079, 12) (7079,)




training accuracy:  0.725667467156378




Accuracy:  [0.70833333 0.72881356 0.72316384 0.73658192 0.71378092]
Mean Acc: 72.21%, SD: 1.01%)

cluster29, shape:  (56138, 12) (56138,)
training accuracy:  0.5539028821831914
Accuracy:  [0.55432846 0.55753473 0.55281439 0.54832101 0.56016745]
Mean Acc: 55.46%, SD: 0.41%)

cluster30, shape:  (15713, 12) (15713,)




training accuracy:  0.6592630306115955




Accuracy:  [0.65383392 0.66115177 0.63633471 0.6470401  0.64672183]
Mean Acc: 64.90%, SD: 0.83%)

cluster31, shape:  (28559, 12) (28559,)




training accuracy:  0.5886060436289786




Accuracy:  [0.54376751 0.55042017 0.55917367 0.580007   0.58010856]
Mean Acc: 56.27%, SD: 1.50%)

cluster32, shape:  (6426, 12) (6426,)




training accuracy:  0.6573295985060691




Accuracy:  [0.67340591 0.67315175 0.65447471 0.65525292 0.6381323 ]
Mean Acc: 65.89%, SD: 1.32%)

cluster33, shape:  (40977, 12) (40977,)




training accuracy:  0.5838397149620519




Accuracy:  [0.58418741 0.57747682 0.57803539 0.56949359 0.56863941]
Mean Acc: 57.56%, SD: 0.58%)

cluster34, shape:  (7098, 12) (7098,)




training accuracy:  0.6287686672302057




Accuracy:  [0.60985915 0.63943662 0.61901408 0.5961945  0.62297393]
Mean Acc: 61.75%, SD: 1.43%)

cluster35, shape:  (4986, 12) (4986,)




training accuracy:  0.6119133574007221




Accuracy:  [0.61923848 0.61685055 0.58776329 0.56569709 0.58475426]
Mean Acc: 59.49%, SD: 2.04%)

cluster36, shape:  (36872, 12) (36872,)
training accuracy:  0.5663376003471469
Accuracy:  [0.56677966 0.55972881 0.55505831 0.56509357 0.56265256]
Mean Acc: 56.19%, SD: 0.41%)

cluster37, shape:  (20737, 12) (20737,)




training accuracy:  0.5721656941698413




Accuracy:  [0.55930569 0.55641273 0.5565469  0.56209308 0.57005064]
Mean Acc: 56.09%, SD: 0.50%)

cluster38, shape:  (9386, 12) (9386,)
training accuracy:  0.6031323247389729
Accuracy:  [0.5798722  0.56259989 0.60415557 0.60628663 0.59243474]
Mean Acc: 58.91%, SD: 1.63%)

cluster39, shape:  (7784, 12) (7784,)




training accuracy:  0.6736896197327852




Accuracy:  [0.66217084 0.68721901 0.6596018  0.65510597 0.66773779]
Mean Acc: 66.64%, SD: 1.12%)

cluster40, shape:  (67174, 12) (67174,)
training accuracy:  0.5646827641647065
Accuracy:  [0.55169334 0.55950875 0.57059918 0.56278377 0.56223016]
Mean Acc: 56.14%, SD: 0.61%)

cluster41, shape:  (8837, 12) (8837,)




training accuracy:  0.56716080117687




Accuracy:  [0.56957014 0.57126697 0.57272213 0.57159027 0.57838144]
Mean Acc: 57.27%, SD: 0.30%)

cluster42, shape:  (28772, 12) (28772,)
training accuracy:  0.5729181148338662




Accuracy:  [0.56055604 0.56090356 0.57194995 0.56013208 0.57055961]
Mean Acc: 56.48%, SD: 0.53%)

cluster43, shape:  (6006, 12) (6006,)




training accuracy:  0.6615051615051615




Accuracy:  [0.6530782  0.67277269 0.65029142 0.65362198 0.63863447]
Mean Acc: 65.37%, SD: 1.10%)

cluster44, shape:  (29669, 12) (29669,)




training accuracy:  0.6663857898816947




Accuracy:  [0.66616111 0.65958881 0.66093697 0.66329626 0.66863307]
Mean Acc: 66.37%, SD: 0.33%)

cluster45, shape:  (7638, 12) (7638,)




training accuracy:  0.6459806231997905




Accuracy:  [0.64725131 0.64856021 0.66230366 0.64963982 0.64112639]
Mean Acc: 64.98%, SD: 0.69%)

cluster46, shape:  (64211, 12) (64211,)
training accuracy:  0.5721916805531763
Accuracy:  [0.56754652 0.57436536 0.56759072 0.57218502 0.57016041]
Mean Acc: 57.04%, SD: 0.26%)

cluster47, shape:  (26760, 12) (26760,)




training accuracy:  0.6171150971599402
Accuracy:  [0.60519432 0.58893871 0.58893871 0.59940209 0.59118087]
Mean Acc: 59.47%, SD: 0.65%)

cluster48, shape:  (13977, 12) (13977,)
training accuracy:  0.6132932675109107




Accuracy:  [0.60264664 0.60765379 0.63005367 0.62432916 0.59928444]
Mean Acc: 61.28%, SD: 1.22%)

cluster49, shape:  (5751, 12) (5751,)




training accuracy:  0.8396800556424969




Accuracy:  [0.85925282 0.81391304 0.81043478 0.81043478 0.83304348]
Mean Acc: 82.54%, SD: 1.89%)

Overall training accuracy:  0.5922906467781562
Overall CV accuracy:  58.54377007862186


## Decision Tree

In [5]:
from sklearn.tree import DecisionTreeClassifier

def decision_tree_train(vehicle_type):
    
    weight = []
    train_avg = []
    cv_avg = []
    for i in range(n_clusters):
        
        #load data
        sample = pd.read_csv('../../EDA/Clustering/cluster/train/' + vehicle_type + '/cluster' + str(i) + '.csv')
        sample_= sample[sample.iloc[: , -2] ==0]
        if len(sample_) == 0 or len(sample_) == len(sample): continue
        feature = sample.iloc[:,1:13]
        label = sample.iloc[:,14]
        print('cluster' + str(i) + ', shape: ', feature.shape, label.shape)
        weight.append(feature.shape[0])
        
        #decision tree
        clf = DecisionTreeClassifier()
        clf.fit(feature, label)
        acc = clf.score(feature, label)
        print('training accuracy: ', acc)
        train_avg.append(acc)
        
        #cv
        kfold = KFold(n_splits = 5, shuffle = True, random_state = None)
        cv_result = cross_val_score(clf, feature, label, cv = kfold)
        print("Accuracy: ", cv_result)
        print("Mean Acc: %.2f%%, SD: %.2f%%)\n" % (cv_result.mean() * 100, cv_result.std() * 100))
        cv_avg.append(cv_result.mean() * 100)
    
    print("Overall training accuracy: ", np.average(train_avg, weights = weight))
    print("Overall CV accuracy: ", np.average(cv_avg, weights = weight))

In [8]:
decision_tree_train('ZVe44')

cluster0, shape:  (59515, 12) (59515,)
training accuracy:  1.0
Accuracy:  [0.64294716 0.62706881 0.62337226 0.62967319 0.62958918]
Mean Acc: 63.05%, SD: 0.66%)

cluster1, shape:  (27570, 12) (27570,)
training accuracy:  1.0
Accuracy:  [0.75262967 0.76332971 0.76278564 0.75897715 0.75535002]
Mean Acc: 75.86%, SD: 0.42%)

cluster2, shape:  (27163, 12) (27163,)
training accuracy:  1.0
Accuracy:  [0.6813915  0.66869133 0.6637217  0.67525773 0.6732327 ]
Mean Acc: 67.25%, SD: 0.60%)

cluster3, shape:  (51135, 12) (51135,)
training accuracy:  1.0
Accuracy:  [0.62755451 0.63175907 0.63293243 0.62237215 0.63175907]
Mean Acc: 62.93%, SD: 0.39%)

cluster4, shape:  (23587, 12) (23587,)
training accuracy:  1.0
Accuracy:  [0.63819415 0.65684612 0.64235743 0.63854145 0.64278143]
Mean Acc: 64.37%, SD: 0.68%)

cluster5, shape:  (22661, 12) (22661,)
training accuracy:  1.0
Accuracy:  [0.75534966 0.76169462 0.76279788 0.76699029 0.76368049]
Mean Acc: 76.21%, SD: 0.38%)

cluster6, shape:  (98354, 12) (983

## Random Forest

In [6]:
from sklearn.tree import ExtraTreeClassifier

def extra_tree_train(vehicle_type):
    
    weight = []
    train_avg = []
    cv_avg = []
    for i in range(n_clusters):
        
        #load data
        sample = pd.read_csv('../../EDA/Clustering/cluster/train/' + vehicle_type + '/cluster' + str(i) + '.csv')
        sample_= sample[sample.iloc[: , -2] ==0]
        if len(sample_) == 0 or len(sample_) == len(sample): continue
        feature = sample.iloc[:,1:13]
        label = sample.iloc[:,14]
        print('cluster' + str(i) + ', shape: ', feature.shape, label.shape)
        weight.append(feature.shape[0])
        
        #random forest
        clf = ExtraTreeClassifier()
        clf.fit(feature, label)
        acc = clf.score(feature, label)
        print('training accuracy: ', acc)
        train_avg.append(acc)
        
        #cv
        kfold = KFold(n_splits = 5, shuffle = True, random_state = None)
        cv_result = cross_val_score(clf, feature, label, cv = kfold)
        print("Accuracy: ", cv_result)
        print("Mean Acc: %.2f%%, SD: %.2f%%)\n" % (cv_result.mean() * 100, cv_result.std() * 100))
        cv_avg.append(cv_result.mean() * 100)
    
    print("Overall training accuracy: ", np.average(train_avg, weights = weight))
    print("Overall CV accuracy: ", np.average(cv_avg, weights = weight))

In [9]:
extra_tree_train('ZVe44')

cluster0, shape:  (59515, 12) (59515,)
training accuracy:  1.0
Accuracy:  [0.56859615 0.55565824 0.56061497 0.5513736  0.55918676]
Mean Acc: 55.91%, SD: 0.57%)

cluster1, shape:  (27570, 12) (27570,)
training accuracy:  1.0
Accuracy:  [0.69151251 0.68897352 0.69151251 0.69822271 0.68171926]
Mean Acc: 69.04%, SD: 0.53%)

cluster2, shape:  (27163, 12) (27163,)
training accuracy:  1.0
Accuracy:  [0.6051905  0.6228603  0.61917909 0.6283137  0.62242268]
Mean Acc: 61.96%, SD: 0.78%)

cluster3, shape:  (51135, 12) (51135,)
training accuracy:  1.0
Accuracy:  [0.56253056 0.56957074 0.55842378 0.5732864  0.57406864]
Mean Acc: 56.76%, SD: 0.61%)

cluster4, shape:  (23587, 12) (23587,)
training accuracy:  1.0
Accuracy:  [0.5930479  0.57799915 0.57790969 0.57684969 0.60398558]
Mean Acc: 58.60%, SD: 1.08%)

cluster5, shape:  (22661, 12) (22661,)
training accuracy:  1.0
Accuracy:  [0.683212   0.68667255 0.68534863 0.67872904 0.66240071]
Mean Acc: 67.93%, SD: 0.89%)

cluster6, shape:  (98354, 12) (983

## GBDT

In [7]:
import xgboost as xgb

def gbdt_train(vehicle_type):
    
    weight = []
    train_avg = []
    cv_avg = []
    for i in range(n_clusters):
        
        #load data
        sample = pd.read_csv('../../EDA/Clustering/cluster/train/' + vehicle_type + '/cluster' + str(i) + '.csv')
        sample_= sample[sample.iloc[: , -2] ==0]
        if len(sample_) == 0 or len(sample_) == len(sample): continue
        feature = sample.iloc[:,1:13]
        label = sample.iloc[:,14]
        print('cluster' + str(i) + ', shape: ', feature.shape, label.shape)
        weight.append(feature.shape[0])
        
        #xgboost
        params = {'booster': 'gbtree', 'eta': 0.05, 'max_depth': 10}
        bst = xgb.XGBClassifier(**params)
        bst.fit(feature, label)
        acc = bst.score(feature, label)
        print("training accuracy: ", acc)
        train_avg.append(acc)
        
        #cv
        kfold = KFold(n_splits = 5, shuffle = True, random_state = None)
        cv_result = cross_val_score(bst, feature, label, cv = kfold)
        print("Accuracy: ", cv_result)
        print("Mean Acc: %.2f%%, SD: %.2f%%)\n" % (cv_result.mean() * 100, cv_result.std() * 100))
        cv_avg.append(cv_result.mean() * 100)
    
    print("Overall training accuracy: ", np.average(train_avg, weights = weight))
    print("Overall CV accuracy: ", np.average(cv_avg, weights = weight))

In [12]:
gbdt_train('ZVe44')

cluster0, shape:  (59515, 12) (59515,)
training accuracy:  0.804452658993531
Accuracy:  [0.67991263 0.68654961 0.68722171 0.68100479 0.68419726]
Mean Acc: 68.38%, SD: 0.29%)

cluster1, shape:  (27570, 12) (27570,)
training accuracy:  0.934639100471527
Accuracy:  [0.81193326 0.82245194 0.83079434 0.82081973 0.81628582]
Mean Acc: 82.05%, SD: 0.63%)

cluster2, shape:  (27163, 12) (27163,)
training accuracy:  0.8887825350660825
Accuracy:  [0.72814283 0.73881833 0.73550525 0.73472018 0.73011782]
Mean Acc: 73.35%, SD: 0.38%)

cluster3, shape:  (51135, 12) (51135,)
training accuracy:  0.8174049085753398
Accuracy:  [0.69893419 0.70333431 0.70098758 0.70392099 0.70998338]
Mean Acc: 70.34%, SD: 0.37%)

cluster4, shape:  (23587, 12) (23587,)
training accuracy:  0.8737440115317759
Accuracy:  [0.69605765 0.6907588  0.69514522 0.69556922 0.7006572 ]
Mean Acc: 69.56%, SD: 0.31%)

cluster5, shape:  (22661, 12) (22661,)
training accuracy:  0.9145227483341424
Accuracy:  [0.79792632 0.80428067 0.81222418

training accuracy:  0.8725710014947683
Accuracy:  [0.6887145  0.69843049 0.68684604 0.68740658 0.69488042]
Mean Acc: 69.13%, SD: 0.46%)

cluster48, shape:  (13977, 12) (13977,)
training accuracy:  0.9446233097231166
Accuracy:  [0.78147353 0.78540773 0.80178891 0.80465116 0.78604651]
Mean Acc: 79.19%, SD: 0.94%)

cluster49, shape:  (5751, 12) (5751,)
training accuracy:  0.9996522343940184
Accuracy:  [0.95482189 0.94521739 0.95565217 0.94956522 0.95130435]
Mean Acc: 95.13%, SD: 0.38%)

Overall training accuracy:  0.8450192934613967
Overall CV accuracy:  71.7611610489365
