# Training a weak model for each cluster

In [9]:
import numpy as np
import os
import pandas as pd
import pprint
import csv
import pickle
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Vehicle-type identifiers. Each one names a sub-directory under
# ../data/final/clusters/ (input cluster CSVs) and ../models/final/ (outputs).
vehicle_types = ['ZVe44', 'ZV573', 'ZV63d', 'ZVfd4', 'ZVa9c', 'ZVa78', 'ZV252']
# Number of cluster files per vehicle type (cluster0.csv .. cluster35.csv).
n_clusters = 36

In [21]:
def xgboost_train(vehicle_type, cn=0):
    """Train and pickle one XGBoost classifier per cluster of a vehicle type.

    For every cluster index ``i`` in ``range(cn, n_clusters)`` the file
    ``../data/final/clusters/<vehicle_type>/cluster<i>.csv`` is read and a
    status code is recorded:

    * ``0.5`` -- the CSV has no rows (cluster skipped),
    * ``1``   -- no row has ``0`` in the last column ("only positive"),
    * ``0``   -- every row has ``0`` in the last column ("only negative"),
    * ``2``   -- both classes are present and a model was trained.

    For status ``2`` every combination of the hyper-parameter grid is fitted
    on the whole cluster, scored with (at most) 5-fold cross-validation, and
    the candidate with the highest mean CV accuracy is pickled to
    ``../models/final/<vehicle_type>/trainedXGB<i>.pkl``. The status codes
    are written to ``status.csv`` in the same directory.

    Args:
        vehicle_type: Name of the vehicle type; used as the sub-directory
            name for both the input clusters and the output models.
        cn: Index of the first cluster to process (defaults to 0).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    # Local import keeps this cell self-contained.
    from itertools import product

    print('vehicle_type :',vehicle_type)
    model_path = '../models/final' + '/' + vehicle_type
    # Creates the intermediate '../models/final' directory as well.
    os.makedirs(model_path, exist_ok=True)

    status_indicator = list()
    #status:
    ok = 2
    not_found = 0.5
    pos = 1
    neg = 0
    # Hyper-parameter grid. The single values below are the settings found
    # optimal during tuning; the tuning log is kept in the "Tuning Evidence"
    # section of this notebook. Add values here to re-run a search.
    eta = [0.05]
    gamma = [2]
    max_depth = [13]

    # NOTE(review): when cn > 0 the status file only holds entries for
    # clusters cn..n_clusters-1, so row k corresponds to cluster cn + k.
    for i in range(cn, n_clusters):

        #load data
        sample = pd.read_csv('../data/final/clusters/' + vehicle_type + '/cluster' + str(i) + '.csv')
        if len(sample) == 0:
            print(vehicle_type+' cluster'+str(i)+': 0 record, skip.')
            status_indicator.append(not_found)
            continue

        # Rows whose last column is 0, used to detect single-class clusters.
        sample_ = sample[sample.iloc[:, -1] == 0]
        if len(sample_) == 0:
            status_indicator.append(pos)
            print(vehicle_type+' cluster'+str(i)+': only has positive samples')
            continue
        if len(sample_) == len(sample):
            status_indicator.append(neg)
            print(vehicle_type+' cluster'+str(i)+': only has negative samples')
            continue

        status_indicator.append(ok)
        feature = sample.iloc[:,0:13]
        # NOTE(review): the single-class check above reads the LAST column
        # while the label is column 14; these agree only if the CSV has
        # exactly 15 columns (column 13 is unused here) -- confirm.
        label = sample.iloc[:,14]
        print(vehicle_type+'cluster' + str(i) + ', shape: ', feature.shape, label.shape)

        #xgboost
        # KFold needs n_splits <= number of samples; tiny clusters get fewer folds.
        n_splits = min(5, len(feature))
        best_model = None
        best_acc = float('nan')
        # Same iteration order as nested loops over eta, max_depth, gamma.
        for e, m, g in product(eta, max_depth, gamma):
            print('[eta=',e, ',gamma=', g, ',max_depth=',m,']')
            bst = xgb.XGBClassifier(booster='gbtree', eta=e, max_depth=m, gamma=g, n_jobs=12)
            bst.fit(feature, label)
            train_acc = bst.score(feature, label)
            print("Train Acc: %.2f%%"%(train_acc*100))

            #cv
            cv_model = xgb.XGBClassifier(booster='gbtree', eta=e, max_depth=m, gamma=g, n_jobs=12)
            kfold = KFold(n_splits=n_splits, shuffle=False)
            cv_result = cross_val_score(cv_model, feature, label, cv=kfold, n_jobs=-1)
            cv_acc = cv_result.mean() * 100
            print("CV Acc: %.2f%%, SD: %.2f%%)\n" % (cv_acc, cv_result.std() * 100))

            # Keep the candidate with the best CV accuracy. The first
            # candidate is always accepted so that a 0% or NaN score can
            # never leave an unfitted model to be pickled.
            if best_model is None or np.isnan(best_acc) or cv_acc > best_acc:
                best_acc = cv_acc
                best_model = bst

        #save the trained model
        with open(model_path+'/trainedXGB' + str(i) + '.pkl', 'wb') as f:
            pickle.dump(best_model,f)

    arr = np.array(status_indicator)
    np.savetxt(model_path+"/status.csv", arr, delimiter=",")

In [11]:

# Train and save the per-cluster models for every vehicle type, starting
# from cluster 0.
for vehicle_type in vehicle_types:
        xgboost_train(vehicle_type,0)
    #ZV252 cluster30 only has negative samples
    # NOTE(review): the recorded output below reports cluster31 (not
    # cluster30) as negative-only for ZV252 -- confirm which is current.

vehicle_type : ZVe44
ZVe44cluster0, shape:  (580695, 13) (580695,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 76.55%
CV Acc: 68.32%, SD: 0.72%)

ZVe44cluster1, shape:  (453163, 13) (453163,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 77.87%
CV Acc: 67.74%, SD: 1.30%)

ZVe44cluster2, shape:  (34899, 13) (34899,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 91.60%
CV Acc: 67.12%, SD: 0.92%)

ZVe44cluster3, shape:  (7959, 13) (7959,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 98.42%
CV Acc: 76.39%, SD: 3.13%)

ZVe44cluster4, shape:  (104, 13) (104,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 97.12%
CV Acc: 59.86%, SD: 22.09%)

ZVe44cluster5, shape:  (214, 13) (214,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 97.20%
CV Acc: 77.14%, SD: 20.48%)

ZVe44 cluster6: only has positive samples
ZVe44 cluster7: 0 record, skip.
ZVe44 cluster8: only has positive samples
ZVe44 cluster9: only has positive samples
ZVe44 cluster10: 0 record, skip.
ZVe44 cluster11: 0 recor

CV Acc: 37.33%, SD: 16.79%)

ZVa9ccluster5, shape:  (44, 13) (44,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 88.64%
CV Acc: 58.89%, SD: 11.97%)

ZVa9ccluster6, shape:  (18, 13) (18,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 77.78%
CV Acc: 60.00%, SD: 16.16%)

ZVa9c cluster7: 0 record, skip.
ZVa9c cluster8: 0 record, skip.
ZVa9c cluster9: 0 record, skip.
ZVa9c cluster10: 0 record, skip.
ZVa9c cluster11: 0 record, skip.
ZVa9ccluster12, shape:  (865, 13) (865,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 98.38%
CV Acc: 67.51%, SD: 4.50%)

ZVa9ccluster13, shape:  (1026, 13) (1026,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 98.15%
CV Acc: 64.73%, SD: 6.36%)

ZVa9ccluster14, shape:  (320, 13) (320,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 97.81%
CV Acc: 62.50%, SD: 8.09%)

ZVa9ccluster15, shape:  (159, 13) (159,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 96.23%
CV Acc: 44.54%, SD: 14.63%)

ZVa9ccluster16, shape:  (40402, 13) (40402,)
[eta= 0.05 ,gam

Train Acc: 96.77%
CV Acc: 67.49%, SD: 19.70%)

ZV252cluster23, shape:  (82, 13) (82,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 100.00%
CV Acc: 65.88%, SD: 4.66%)

ZV252 cluster24: 0 record, skip.
ZV252 cluster25: 0 record, skip.
ZV252 cluster26: 0 record, skip.
ZV252 cluster27: 0 record, skip.
ZV252 cluster28: 0 record, skip.
ZV252 cluster29: 0 record, skip.
ZV252 cluster30: 0 record, skip.
ZV252 cluster31: only has negative samples
ZV252cluster32, shape:  (95, 13) (95,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 90.53%
CV Acc: 90.53%, SD: 9.65%)

ZV252cluster33, shape:  (142, 13) (142,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 96.48%
CV Acc: 50.00%, SD: 11.35%)

ZV252cluster34, shape:  (1107, 13) (1107,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 99.01%
CV Acc: 69.63%, SD: 15.62%)

ZV252cluster35, shape:  (1519, 13) (1519,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 99.41%
CV Acc: 84.47%, SD: 8.85%)



# Refresh status of the weak models

In [12]:
def retrieve_status(vehicle_type, cn=0):
    """Recompute and save the per-cluster status codes without training.

    Reads ``../data/final/clusters/<vehicle_type>/cluster<i>.csv`` for each
    ``i`` in ``range(cn, n_clusters)`` and classifies the cluster by its
    last column:

    * ``0.5`` -- the CSV has no rows,
    * ``1``   -- no row has ``0`` in the last column ("only positive"),
    * ``0``   -- every row has ``0`` in the last column ("only negative"),
    * ``2``   -- both kinds of rows are present.

    The codes are written, one per line, to
    ``../models/final/<vehicle_type>/status.csv``.

    Args:
        vehicle_type: Name of the vehicle type (sub-directory name).
        cn: Index of the first cluster to inspect (defaults to 0).

    Returns:
        None.
    """
    print('vehicle_type :',vehicle_type)

    # Make sure the output directory tree exists.
    base_dir = '../models/final'
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)
    out_dir = base_dir + '/' + vehicle_type
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Status codes stored in status.csv.
    status_ok, status_empty, status_all_pos, status_all_neg = 2, 0.5, 1, 0

    codes = []
    for idx in range(cn, n_clusters):
        tag = vehicle_type+' cluster'+str(idx)
        frame = pd.read_csv('../data/final/clusters/' + vehicle_type + '/cluster' + str(idx) + '.csv')
        total = len(frame)

        if total == 0:
            print(tag+': 0 record, skip.')
            codes.append(status_empty)
            continue

        # Count the rows whose last column equals 0.
        zero_rows = len(frame[frame.iloc[:, -1] == 0])
        if zero_rows == 0:
            codes.append(status_all_pos)
            print(tag+': only has positive samples')
        elif zero_rows == total:
            codes.append(status_all_neg)
            print(tag+': only has negative samples')
        else:
            codes.append(status_ok)

    np.savetxt(out_dir+"/status.csv", np.array(codes), delimiter=",")

# Rewrite status.csv for every vehicle type without retraining any model.
for vehicle_type in vehicle_types:
        retrieve_status(vehicle_type, 0)

vehicle_type : ZVe44
ZVe44 cluster6: only has positive samples
ZVe44 cluster7: 0 record, skip.
ZVe44 cluster8: only has positive samples
ZVe44 cluster9: only has positive samples
ZVe44 cluster10: 0 record, skip.
ZVe44 cluster11: 0 record, skip.
ZVe44 cluster27: only has positive samples
vehicle_type : ZV573
ZV573 cluster8: only has negative samples
ZV573 cluster9: 0 record, skip.
ZV573 cluster10: 0 record, skip.
ZV573 cluster11: 0 record, skip.
ZV573 cluster27: only has negative samples
vehicle_type : ZV63d
ZV63d cluster9: only has negative samples
ZV63d cluster10: 0 record, skip.
ZV63d cluster11: 0 record, skip.
ZV63d cluster25: only has positive samples
ZV63d cluster26: 0 record, skip.
ZV63d cluster27: only has positive samples
vehicle_type : ZVfd4
ZVfd4 cluster4: only has positive samples
ZVfd4 cluster5: 0 record, skip.
ZVfd4 cluster7: only has positive samples
ZVfd4 cluster8: 0 record, skip.
ZVfd4 cluster9: 0 record, skip.
ZVfd4 cluster10: 0 record, skip.
ZVfd4 cluster11: 0 record,

## Tuning Evidence

[eta= 0.05 ,gamma= 0.5 ,max_depth= 8 ]
Train Acc: 68.84%
CV Acc: 65.15%, SD: 1.12%)

[eta= 0.05 ,gamma= 0.9 ,max_depth= 8 ]
Train Acc: 69.05%
CV Acc: 65.13%, SD: 1.12%)

[eta= 0.05 ,gamma= 1.2 ,max_depth= 8 ]
Train Acc: 69.06%
CV Acc: 65.19%, SD: 1.15%)

[eta= 0.05 ,gamma= 1.5 ,max_depth= 8 ]
Train Acc: 68.85%
CV Acc: 65.22%, SD: 1.05%)

[eta= 0.05 ,gamma= 0.5 ,max_depth= 10 ]
Train Acc: 72.05%
CV Acc: 66.72%, SD: 0.82%)

[eta= 0.05 ,gamma= 0.9 ,max_depth= 10 ]
Train Acc: 71.90%
CV Acc: 66.76%, SD: 0.76%)

[eta= 0.05 ,gamma= 1.2 ,max_depth= 10 ]
Train Acc: 72.05%
CV Acc: 66.87%, SD: 0.77%)

[eta= 0.05 ,gamma= 1.5 ,max_depth= 10 ]
Train Acc: 71.94%
CV Acc: 66.89%, SD: 0.75%)

[eta= 0.05 ,gamma= 0.5 ,max_depth= 13 ]
Train Acc: 76.66%
CV Acc: 68.29%, SD: 0.71%)

[eta= 0.05 ,gamma= 0.9 ,max_depth= 13 ]
Train Acc: 76.82%
CV Acc: 68.30%, SD: 0.69%)

[eta= 0.05 ,gamma= 1.2 ,max_depth= 13 ]
Train Acc: 76.72%
CV Acc: 68.32%, SD: 0.70%)

[eta= 0.05 ,gamma= 1.5 ,max_depth= 13 ]
Train Acc: 76.86%
CV Acc: 68.30%, SD: 0.73%)

vehicle_type : ZVe44
ZVe44cluster0, shape:  (580695, 13) (580695,)


[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 76.55%
CV Acc: 68.32%, SD: 0.72%)

[eta= 0.05 ,gamma= 5 ,max_depth= 13 ]
Train Acc: 76.20%
CV Acc: 68.31%, SD: 0.68%)

[eta= 0.05 ,gamma= 10 ,max_depth= 13 ]
Train Acc: 75.91%
CV Acc: 68.19%, SD: 0.68%)

[eta= 0.05 ,gamma= 20 ,max_depth= 13 ]
Train Acc: 75.23%
CV Acc: 68.02%, SD: 0.62%)

ZVe44cluster1, shape:  (453163, 13) (453163,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 77.87%
CV Acc: 67.74%, SD: 1.30%)

[eta= 0.05 ,gamma= 5 ,max_depth= 13 ]
Train Acc: 77.29%
CV Acc: 67.65%, SD: 1.32%)

[eta= 0.05 ,gamma= 10 ,max_depth= 13 ]
Train Acc: 77.00%
CV Acc: 67.72%, SD: 1.31%)

[eta= 0.05 ,gamma= 20 ,max_depth= 13 ]
Train Acc: 75.38%
CV Acc: 67.52%, SD: 1.28%)

ZVe44cluster2, shape:  (34899, 13) (34899,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 91.60%
CV Acc: 67.12%, SD: 0.92%)

[eta= 0.05 ,gamma= 5 ,max_depth= 13 ]
Train Acc: 89.86%
CV Acc: 67.15%, SD: 1.01%)

[eta= 0.05 ,gamma= 10 ,max_depth= 13 ]
Train Acc: 86.90%
CV Acc: 66.55%, SD: 1.07%)

[eta= 0.05 ,gamma= 20 ,max_depth= 13 ]
Train Acc: 82.92%
CV Acc: 65.81%, SD: 1.42%)

ZVe44cluster3, shape:  (7959, 13) (7959,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 98.42%
CV Acc: 76.39%, SD: 3.13%)

[eta= 0.05 ,gamma= 5 ,max_depth= 13 ]
Train Acc: 96.62%
CV Acc: 76.27%, SD: 3.78%)

[eta= 0.05 ,gamma= 10 ,max_depth= 13 ]
Train Acc: 94.20%
CV Acc: 75.17%, SD: 2.95%)

[eta= 0.05 ,gamma= 20 ,max_depth= 13 ]
Train Acc: 92.02%
CV Acc: 74.52%, SD: 2.21%)

ZVe44cluster4, shape:  (104, 13) (104,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 97.12%
CV Acc: 59.86%, SD: 22.09%)

[eta= 0.05 ,gamma= 5 ,max_depth= 13 ]
Train Acc: 95.19%
CV Acc: 59.81%, SD: 21.24%)

[eta= 0.05 ,gamma= 10 ,max_depth= 13 ]
Train Acc: 92.31%
CV Acc: 60.76%, SD: 21.84%)

[eta= 0.05 ,gamma= 20 ,max_depth= 13 ]
Train Acc: 70.19%
CV Acc: 66.57%, SD: 27.02%)

ZVe44cluster5, shape:  (214, 13) (214,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 97.20%
CV Acc: 77.14%, SD: 20.48%)

[eta= 0.05 ,gamma= 5 ,max_depth= 13 ]
Train Acc: 94.86%
CV Acc: 77.13%, SD: 17.83%)

[eta= 0.05 ,gamma= 10 ,max_depth= 13 ]
Train Acc: 92.99%
CV Acc: 70.49%, SD: 19.54%)

[eta= 0.05 ,gamma= 20 ,max_depth= 13 ]
Train Acc: 85.51%
CV Acc: 56.28%, SD: 32.50%)


In [22]:
# Re-run training for every vehicle type, starting from cluster 0.
for vehicle_type in vehicle_types:
    xgboost_train(vehicle_type,0)

vehicle_type : ZVfd4
ZVfd4cluster0, shape:  (34940, 13) (34940,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 97.55%
CV Acc: 83.20%, SD: 1.66%)

ZVfd4cluster1, shape:  (7489, 13) (7489,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 99.32%
CV Acc: 79.72%, SD: 4.92%)

ZVfd4cluster2, shape:  (11501, 13) (11501,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 98.23%
CV Acc: 71.25%, SD: 3.00%)

ZVfd4cluster3, shape:  (4861, 13) (4861,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 99.59%
CV Acc: 80.19%, SD: 6.00%)

ZVfd4 cluster4: only has positive samples
ZVfd4 cluster5: 0 record, skip.
ZVfd4cluster6, shape:  (2, 13) (2,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 50.00%
CV Acc: 0.00%, SD: 0.00%)

ZVfd4 cluster7: only has positive samples
ZVfd4 cluster8: 0 record, skip.
ZVfd4 cluster9: 0 record, skip.
ZVfd4 cluster10: 0 record, skip.
ZVfd4 cluster11: 0 record, skip.
ZVfd4cluster12, shape:  (415, 13) (415,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 98.31%
CV Acc: 6

Train Acc: 98.03%
CV Acc: 64.97%, SD: 2.72%)

ZVa78cluster13, shape:  (5668, 13) (5668,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 95.77%
CV Acc: 63.60%, SD: 4.80%)

ZVa78cluster14, shape:  (104, 13) (104,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 99.04%
CV Acc: 92.38%, SD: 7.74%)

ZVa78 cluster15: 0 record, skip.
ZVa78cluster16, shape:  (88888, 13) (88888,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 89.05%
CV Acc: 70.24%, SD: 1.08%)

ZVa78cluster17, shape:  (357558, 13) (357558,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 80.37%
CV Acc: 67.65%, SD: 1.04%)

ZVa78cluster18, shape:  (4647, 13) (4647,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 99.87%
CV Acc: 91.48%, SD: 6.93%)

ZVa78 cluster19: 0 record, skip.
ZVa78cluster20, shape:  (1542, 13) (1542,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 98.25%
CV Acc: 57.84%, SD: 5.63%)

ZVa78cluster21, shape:  (4902, 13) (4902,)
[eta= 0.05 ,gamma= 2 ,max_depth= 13 ]
Train Acc: 98.25%
CV Acc: 60.28%, SD: 5.3