## Load pretrained weak models

In [6]:
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import pickle
import numpy as np
import sys
import sklearn.svm.classes
import pandas as pd
import os


#path constants
train_path = '../data/final/train/'
test_path = '../data/final/test/'

#type constants
vehicle_types = ['ZVe44', 'ZV573', 'ZV63d', 'ZVfd4', 'ZVa9c', 'ZVa78', 'ZV252']

#label dataframe
label_df = pd.read_csv('../data/final/label.csv', delimiter = ',', encoding = 'utf-8')

#sys.path.append(r'D:/ProgramData/Anaconda3/Lib/site-packages/sklearn/svm/')

cluster_n = 36

ok = 2
pos = 1
not_found = 0.5
neg = 0

#load all the cluster oriented models and map them with the corresponding vehicle type
saved_model_path = '../models/final/'
GBDT_map = dict()
for vehicle_type in vehicle_types:
    GBDT_list = list()
    df=pd.read_csv(saved_model_path+vehicle_type+'/status.csv', sep=',',header=None)
    df = df.iloc[:,0].to_numpy()
    for i in range(len(df)):
        if df[i] == ok:
            fn = saved_model_path +vehicle_type+'/trainedXGB'+str(i)+'.pkl'
            print(fn)
            with open(fn, 'rb') as file:
                pickle_model = pickle.load(file)
                GBDT_list.append(pickle_model)
        elif df[i] == neg:
            print('neg')
            GBDT_list.append(neg)
        elif df[i] == pos:
            print('pos')
            GBDT_list.append(pos)
        else:
            print('not found')
            GBDT_list.append(not_found)
    GBDT_map[vehicle_type] = GBDT_list

../models/final/ZVe44/trainedXGB0.pkl
../models/final/ZVe44/trainedXGB1.pkl
../models/final/ZVe44/trainedXGB2.pkl
../models/final/ZVe44/trainedXGB3.pkl
../models/final/ZVe44/trainedXGB4.pkl
../models/final/ZVe44/trainedXGB5.pkl
pos
not found
pos
pos
not found
not found
../models/final/ZVe44/trainedXGB12.pkl
../models/final/ZVe44/trainedXGB13.pkl
../models/final/ZVe44/trainedXGB14.pkl
../models/final/ZVe44/trainedXGB15.pkl
../models/final/ZVe44/trainedXGB16.pkl
../models/final/ZVe44/trainedXGB17.pkl
../models/final/ZVe44/trainedXGB18.pkl
../models/final/ZVe44/trainedXGB19.pkl
../models/final/ZVe44/trainedXGB20.pkl
../models/final/ZVe44/trainedXGB21.pkl
../models/final/ZVe44/trainedXGB22.pkl
../models/final/ZVe44/trainedXGB23.pkl
../models/final/ZVe44/trainedXGB24.pkl
../models/final/ZVe44/trainedXGB25.pkl
../models/final/ZVe44/trainedXGB26.pkl
pos
../models/final/ZVe44/trainedXGB28.pkl
../models/final/ZVe44/trainedXGB29.pkl
../models/final/ZVe44/trainedXGB30.pkl
../models/final/ZVe44/tr

In [37]:
GBDT_map['ZVe44']

[XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, eta=0.05, gamma=2,
               gpu_id=-1, importance_type='gain', interaction_constraints=None,
               learning_rate=0.0500000007, max_delta_step=0, max_depth=13,
               min_child_weight=1, missing=nan, monotone_constraints=None,
               n_estimators=100, n_jobs=12, num_parallel_tree=1,
               objective='binary:logistic', random_state=0, reg_alpha=0,
               reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
               validate_parameters=False, verbosity=None),
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, colsample_bytree=1, eta=0.05, gamma=2,
               gpu_id=-1, importance_type='gain', interaction_constraints=None,
               learning_rate=0.0500000007, max_delta_step=0, max_depth=13,
               min_child_weight=1, missing=nan, mo

## Load data to generate score tensor

In [38]:
def getLabel(filename, label_df):
    idx = label_df.loc[label_df['sample_file_name'] == filename]
    return idx.iloc[0]['label']

feature_thresholds = dict()
feature_thresholds[1] = [3000,5000] #engine rpm
feature_thresholds[2] = [4500,7000] #oil pump rpm
feature_thresholds[7] = [700, 1650, 2500] #displacement current
#3x3x4 = 36 clusters

cluster_dict = dict()

def clear_dict():
    global cluster_dict
    cluster_dict = dict()
    for i in range(36):
        cluster_dict[i] = None
        
def clustering(df, feature_thresholds, keys, cluster_n, this_num):
    if len(keys) == 0:
        #print('cluster '+str(this_num)+':'+note)
        global cluster_dict
        if len(df) == 0:
            cluster_dict[this_num] = None
        else:
            cluster_dict[this_num] = df
    else:
        keys_ = keys.copy()
        key = keys_.pop(0)
        thresholds = feature_thresholds[key]
        prev = 0
        cluster_n = int(cluster_n / (len(thresholds)+1))
        i = 0
        for val in thresholds:
            new_df = df[(df.iloc[:,key] > prev) & (df.iloc[:,key] <= val)]
            clustering(new_df, feature_thresholds, keys_, cluster_n, this_num + cluster_n*i)
            prev = val
            i+=1
        
        new_df = df[df.iloc[:,key] > prev]
        clustering(new_df, feature_thresholds, keys_, cluster_n, this_num + cluster_n*i)
        i+=1
        

def feature_tensor_gen(path, label_df, model_list):
#path: train_path or test_path
#vehicle_type: one string element under vehicle_types = ['ZVe44', 'ZV573', 'ZV63d', 'ZVfd4', 'ZVa9c', 'ZVa78', 'ZV252']
    n_cluster = 36
    #these are variables to calculate traversing progress (DO NOT CHANGE)
    counts_per_percent = int(len(os.listdir(path)) / 100)
    percentage_completion = 0
    counter = 0
    
    #pooling result from 50 weak learners then concatenated with the label
    feature_tensor = np.empty((0, cluster_n+1))
    
    #thresholds to categorize data points
    feature_thresholds = dict()
    feature_thresholds[1] = [3000,5000] #engine rpm
    feature_thresholds[2] = [4500,7000] #oil pump rpm
    feature_thresholds[7] = [700, 1650, 2500] #displacement current
    
    global cluster_dict
    
    for file in os.listdir(path):
        
        sample_df = pd.read_csv(path + '/' + file, delimiter = ',', encoding = 'utf-8')
        n = len(sample_df)
        label = getLabel(file, label_df)
        feature_vector = list()
        clear_dict()
        clustering(sample_df, feature_thresholds, list(feature_thresholds.keys()), cluster_n, 0)
        
        for i in range(cluster_n):
            df = cluster_dict[i]
            
            if df is None or len(df) == 0:
                feature_vector.append(np.nan)
                continue
                
            model = model_list[i]
            pooling_score = 0
            if model == 0:
                pooling_score = 0
            elif model == 0.5:
                pooling_score = np.nan
            elif model == 1:
                pooling_score = 1
            else:
                result = model.predict(df.iloc[:,:-1], validate_features=False)
                pooling_score = np.average(result)
            feature_vector.append(pooling_score)
        
        feature_vector.append(label)
        feature_vector = np.array(feature_vector) 
        feature_tensor = np.append(feature_tensor, [feature_vector], axis=0)
        # --------------------------------------------------------------------------
        # NO NEED TO CHANGE ANYTHING BELOW
        
        #belows are to show traversing progress (DO NOT CHANGE)
        counter += 1
        if counter == counts_per_percent:
            counter = 0
            percentage_completion += 1
            print('traversing files under', path, ':', percentage_completion, "%", end="\r", flush=True)
    return feature_tensor

In [39]:
train_tensor = dict()
test_tensor = dict()

train_path = '../data/final/train/'
test_path = '../data/final/test/'

for vehicle_type in vehicle_types:

    train_tensor[vehicle_type] = feature_tensor_gen(train_path+vehicle_type, label_df, GBDT_map[vehicle_type])
    test_tensor[vehicle_type] = feature_tensor_gen(test_path+vehicle_type, label_df, GBDT_map[vehicle_type])

traversing files under ../data/final/train/ZVe44 : 6 %

KeyboardInterrupt: 

traversing files under ../data/final/train/ZV252 : 115 %

## Output score tensor

In [7]:
#print(feature_tensor.shape)
tensor_path = '../data/final/feature_tensors'
if not os.path.exists(tensor_path):
        os.makedirs(tensor_path)
for vehicle_type in vehicle_types:
    trainset = train_tensor[vehicle_type]
    testset = test_tensor[vehicle_type]
    np.savetxt(tensor_path+'/'+vehicle_type+"_train.csv", trainset, delimiter=",")
    np.savetxt(tensor_path+'/'+vehicle_type+"_test.csv", testset, delimiter=",")
    

In [8]:
path = '../data/final/feature_tensors/'
print(path)
train_tensor = dict()
test_tensor = dict()
for vehicle_type in vehicle_types:
    train_tensor[vehicle_type] =pd.read_csv(path+vehicle_type+'_train.csv',sep=',',header=None).to_numpy()
    print(train_tensor[vehicle_type].shape)
    test_tensor[vehicle_type] = pd.read_csv(path+vehicle_type+'_test.csv', sep=',',header=None).to_numpy()
    print(test_tensor[vehicle_type].shape)

../data/final/feature_tensors/
(13883, 37)
(3471, 37)
(49193, 37)
(12299, 37)
(3869, 37)
(968, 37)
(938, 37)
(235, 37)
(4178, 37)
(1045, 37)
(8208, 37)
(2052, 37)
(345, 37)
(87, 37)


# Test report of GBDT kernel

In [4]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
def performance_summary(vehicle_type, train, test):
    print('Model performance report for vehicle type:', vehicle_type)
    params = {'booster': 'gbtree', 'eta': 1, 'max_depth': 16, 'gamma' : 1.5}
    bst = xgb.XGBClassifier(**params)
    bst.fit(train[:,:36],train[:,36])
    
    y_hat = bst.predict(train[:,:36])
    acc = accuracy_score(train[:,36], y_hat)
    print('train acc:',acc)
    y_hat = bst.predict(test[:,:36])
    acc = accuracy_score(test[:,36], y_hat)
    print(classification_report(test[:,36], y_hat,digits=4))
    print('test acc:',acc)
    fpr, tpr, thresholds = metrics.roc_curve(test[:,36], y_hat, pos_label=1)
    print('AUC:',metrics.auc(fpr, tpr))
    with open('../models/final/'+vehicle_type+'_integrated_model.pkl', 'wb') as f:
                pickle.dump(bst,f)
    return acc * test.shape[0]

correct = 0
nums = 0
for vehicle_type in vehicle_types:
    nums += test_tensor[vehicle_type].shape[0]
    correct += performance_summary(vehicle_type, train_tensor[vehicle_type], test_tensor[vehicle_type])
print('Overal test acc:', correct / nums)

Model performance report for vehicle type: ZVe44
train acc: 0.987682777497659
              precision    recall  f1-score   support

         0.0     0.6750    0.6672    0.6711      1731
         1.0     0.6727    0.6805    0.6766      1740

    accuracy                         0.6739      3471
   macro avg     0.6739    0.6739    0.6738      3471
weighted avg     0.6739    0.6739    0.6739      3471

test acc: 0.6738692019590896
AUC: 0.67385206876631
Model performance report for vehicle type: ZV573
train acc: 0.9791637021527453
              precision    recall  f1-score   support

         0.0     0.6441    0.6450    0.6446      6253
         1.0     0.6323    0.6315    0.6319      6046

    accuracy                         0.6383     12299
   macro avg     0.6382    0.6382    0.6382     12299
weighted avg     0.6383    0.6383    0.6383     12299

test acc: 0.6383445808602325
AUC: 0.6382311548346307
Model performance report for vehicle type: ZV63d
train acc: 0.9966399586456449
      

# Test report of AVG Pooling Kernel

In [9]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

def test_report(vehicle_type, train, test):
    print('summary of test accuracy for vehicle type:', vehicle_type)
    arr = np.copy(train)
    where_are_NaNs = np.isnan(arr)
    arr[where_are_NaNs] = 0.5
    scores = np.mean(arr[:,0:36],axis=1)
    scores = [1 if num >= 0.5 else 0 for num in scores]
    scores = np.array(scores)
    acc = accuracy_score(train[:,36], scores)
    print('Train acc:', acc)
    
    arr = np.copy(test)
    where_are_NaNs = np.isnan(arr)
    arr[where_are_NaNs] = 0.5
    scores = np.mean(arr[:,0:36],axis=1)
    scores = [1 if num >= 0.5 else 0 for num in scores]
    scores = np.array(scores)
    acc = accuracy_score(test[:,36], scores)
    print(classification_report(test[:,36], scores, digits=4))
    print('Test acc:', acc)
    fpr, tpr, thresholds = metrics.roc_curve(test[:,36], scores, pos_label=1)
    print('AUC:',metrics.auc(fpr, tpr))
    correct = int(acc * test.shape[0])
    #print(correct,'/',test.shape[0])
    return correct 
avg_acc = 0
for i in range(1):
    path = '../data/final/feature_tensors/'
    print(path)
    train_tensor = dict()
    test_tensor = dict()
    for vehicle_type in vehicle_types:
        train_tensor[vehicle_type] =pd.read_csv(path+vehicle_type+'_train.csv',sep=',',header=None).to_numpy()
        test_tensor[vehicle_type] = pd.read_csv(path+vehicle_type+'_test.csv', sep=',',header=None).to_numpy()



    correct = 0
    nums = 0
    for vehicle_type in vehicle_types:
        nums += test_tensor[vehicle_type].shape[0]
        correct += test_report(vehicle_type, train_tensor[vehicle_type], test_tensor[vehicle_type])
    avg_acc += (correct / nums)
    print('Test acc:', correct / nums)
print('average accuracy:', avg_acc)

../data/final/feature_tensors/
summary of test accuracy for vehicle type: ZVe44
Train acc: 0.8461427645321616
              precision    recall  f1-score   support

         0.0     0.6979    0.6979    0.6979      1731
         1.0     0.6994    0.6994    0.6994      1740

    accuracy                         0.6986      3471
   macro avg     0.6986    0.6986    0.6986      3471
weighted avg     0.6986    0.6986    0.6986      3471

Test acc: 0.6986459233650245
AUC: 0.6986438972887907
summary of test accuracy for vehicle type: ZV573
Train acc: 0.8176163275262741
              precision    recall  f1-score   support

         0.0     0.6738    0.6387    0.6558      6253
         1.0     0.6454    0.6801    0.6623      6046

    accuracy                         0.6591     12299
   macro avg     0.6596    0.6594    0.6590     12299
weighted avg     0.6598    0.6591    0.6590     12299

Test acc: 0.6590779738190097
AUC: 0.6594262474819232
summary of test accuracy for vehicle type: ZV63d
Tr