In [2]:
import numpy as np
import os
import pandas as pd
import pprint
import csv

# ---- path constants ----
# `path` is the prefix under which each '<vehicle_type>_label.csv' is read;
# `output_path` is handed to TraverseFiles as the destination prefix.
path = '../../data_all/'
output_path = '../data/'

# ---- type constants ----
# Identifiers of the vehicle types processed by this notebook.
vehicle_types = [
    'ZVe44',
    'ZV573',
    'ZV63d',
    'ZVfd4',
    'ZVa9c',
    'ZVa78',
    'ZV252',
]

# Map every vehicle type to the DataFrame parsed from its label CSV.
labels = {}
for vehicle_type in vehicle_types:
    df = pd.read_csv(
        '{}{}_label.csv'.format(path, vehicle_type),
        sep=',',
        encoding='utf-8',
    )
    labels[vehicle_type] = df


### Cluster the new training data using predefined thresholds on the most important power-input and vehicle-performance features

In [3]:
# Mapping: positional feature (column) index (int) -> ascending thresholds (list of int).
# n thresholds split a feature into n + 1 bins, so the settings below
# yield 3 x 3 x 4 = 36 clusters.
feature_thresholds = {
    1: [3000, 5000],        # engine rpm
    2: [4500, 7000],        # oil pump rpm
    7: [700, 1650, 2500],   # displacement current
}


def clustering(df, path, feature_thresholds, keys, note, cluster_n, this_num):
    """Recursively partition ``df`` by thresholded columns and write one CSV per leaf.

    For the first entry of ``keys`` the rows are binned on the column at that
    positional index into ``(0, t1]``, ``(t1, t2]``, ..., ``(t_last, inf)``;
    each bin is then partitioned further on the remaining keys. Rows whose
    value is not greater than 0 match no bin and end up in no cluster file.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows still to be partitioned.
    path : str
        Prefix prepended verbatim to ``'cluster<N>.csv'``.
    feature_thresholds : dict
        Positional column index -> list of thresholds.
    keys : list
        Column indices left to partition on; the caller's list is not modified.
    note : str
        Description of the bins chosen so far; printed at each leaf.
    cluster_n : int
        Number of leaf clusters below this call.
    this_num : int
        Cluster number of the first leaf below this call.
    """
    # Leaf: every feature has been consumed, so emit this cluster.
    if not keys:
        print('cluster ' + str(this_num) + ':' + note)
        df.to_csv(path + 'cluster' + str(this_num) + '.csv', index=False)
        return

    key, remaining = keys[0], keys[1:]
    bounds = feature_thresholds[key]
    # Each bin of this feature owns a contiguous run of `stride` cluster numbers.
    stride = int(cluster_n / (len(bounds) + 1))
    column = df.iloc[:, key]
    prefix = note + ' [' + str(key) + ']>'

    lower = 0
    for slot, upper in enumerate(bounds):
        in_bin = (column > lower) & (column <= upper)
        clustering(
            df[in_bin],
            path,
            feature_thresholds,
            remaining,
            prefix + str(lower) + 'and<=' + str(upper),
            stride,
            this_num + stride * slot,
        )
        lower = upper

    # Open-ended bin above the last threshold.
    clustering(
        df[column > lower],
        path,
        feature_thresholds,
        remaining,
        prefix + str(lower),
        stride,
        this_num + stride * len(bounds),
    )
        
    

### Randomly select 80% of the combined data samples to form the new training set and use the remaining 20% as the new test set

In [4]:
from sklearn.model_selection import train_test_split
from shutil import copyfile

def getLabel(filename, label_df):
    """Return the ``label`` of the first row whose ``sample_file_name`` equals ``filename``.

    Raises IndexError (from ``.iloc[0]``) when no row matches.
    """
    is_match = label_df['sample_file_name'] == filename
    first_row = label_df.loc[is_match].iloc[0]
    return first_row['label']

def CreateDir(dir_name):
    """Create ``dir_name`` (and any missing parents); do nothing if it already exists.

    Uses ``exist_ok=True`` rather than a separate existence check, so there is
    no window in which another process can create the directory between the
    check and the creation. If ``dir_name`` exists but is not a directory,
    ``FileExistsError`` is raised instead of being silently ignored.
    """
    os.makedirs(dir_name, exist_ok=True)
        
def TraverseFiles(path, vehicle_type, label_df, output_path, random_seed, iteration_num, feature_thresholds):
    """Split one vehicle type's sample files into train/test folders and cluster the train rows.

    Steps performed:
      1. Create ``<output_path>final/{train,test,clusters}/<vehicle_type>``.
      2. Append ``label_df`` to ``<output_path>final/label.csv`` (written fresh
         if the file does not exist yet).
      3. Split the file names in the first column of ``label_df`` 80/20 with
         ``train_test_split``.
      4. Copy every train file, read it as CSV, add an ``actual_label`` column,
         concatenate all of them and hand the result to ``clustering``.
      5. Copy every test file.

    Parameters
    ----------
    path : str
        Prefix of the input data; samples are read from ``path + vehicle_type + '/'``.
    vehicle_type : str
        Name of the sub-directory / vehicle type to process.
    label_df : pandas.DataFrame
        First column holds sample file names, second column is passed as the
        target to the split. ``getLabel`` additionally reads the columns
        ``sample_file_name`` and ``label``.
    output_path : str
        Prefix under which the ``final`` directory is created.
    random_seed : int
        ``random_state`` passed to ``train_test_split``.
    iteration_num : int
        Currently unused (the per-iteration directory name is commented out).
    feature_thresholds : dict
        Positional column index -> list of thresholds; forwarded to ``clustering``.
    """
    #create directory
    #dir_name = output_path+'iteration_'+str(iteration_num)
    dir_name = output_path+'final'
    CreateDir(dir_name)
    CreateDir(dir_name+'/train')
    CreateDir(dir_name+'/test')
    CreateDir(dir_name+'/train/'+vehicle_type)
    CreateDir(dir_name+'/test/'+vehicle_type)
    CreateDir(dir_name+'/clusters/')
    CreateDir(dir_name+'/clusters/'+vehicle_type)

    #export label file
    # NOTE: rows are appended whenever label.csv already exists, so running this
    # function again for the same vehicle type adds its labels a second time.
    file_name = dir_name + '/' + 'label.csv'
    if os.path.exists(file_name):
        existing_labels = pd.read_csv(file_name, delimiter = ',', encoding = 'utf-8')
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
        merged_labels = pd.concat([existing_labels, label_df], ignore_index=True)
        merged_labels.to_csv(file_name, index = False)
    else:
        label_df.to_csv(file_name, index = False)

    path = path + vehicle_type
    #these are variables to calculate traversing progress (DO NOT CHANGE)
    # NOTE: with fewer than 100 directory entries this is 0 and no progress is printed.
    counts_per_percent = int(len(os.listdir(path)) / 100)
    percentage_completion = 0
    counter = 0

    df_list = list()
    #random 80/20 hold-out split of the sample file names (seeded by random_seed)
    x_train, x_test, _y_train, _y_test = train_test_split(label_df.iloc[:,0], label_df.iloc[:,1], test_size=0.2, random_state=random_seed)
    print(label_df.shape)
    print(x_train.shape)
    #for train samples, copy and paste into the new directory, meanwhile cluster the data
    for file in x_train:

        copyfile(path+'/'+file, dir_name + '/train/' + vehicle_type +'/' + file)

        sample_df = pd.read_csv(path + '/' + file, delimiter = ',', encoding = 'utf-8')

        # tag every row of the sample with the label of the file it came from
        label = getLabel(file, label_df)
        label_vector = np.repeat(label, sample_df.shape[0])
        sample_df['actual_label'] = label_vector

        df_list.append(sample_df)
        #belows are to show traversing progress (DO NOT CHANGE)
        counter += 1
        if counter == counts_per_percent:
            counter = 0
            percentage_completion += 1
            print('traversing files under', path, ':', percentage_completion, "%", end="\r", flush=True)

    df = pd.concat(df_list, ignore_index=True)
    #extract clusters from df
    # Total number of leaf clusters = product over features of (thresholds + 1);
    # derived from the configuration so cluster numbering stays correct if the
    # thresholds change (3 * 3 * 4 = 36 for the notebook's current settings).
    cluster_n = int(np.prod([len(thresholds) + 1 for thresholds in feature_thresholds.values()]))
    clustering(df, dir_name+'/clusters/'+vehicle_type+'/', feature_thresholds, list(feature_thresholds.keys()), '', cluster_n, 0)

    #for test samples, just copy and paste into the new directory
    for file in x_test:
        copyfile(path+'/'+file, dir_name + '/test/' + vehicle_type +'/' + file)
        #belows are to show traversing progress (DO NOT CHANGE)
        counter += 1
        if counter == counts_per_percent:
            counter = 0
            percentage_completion += 1
            print('traversing files under', path, ':', percentage_completion, "%", end="\r", flush=True)

            

    

In [5]:
# Single pass (i == 1); the loop value serves as both the split's random seed
# and the iteration number. Every vehicle type is processed in turn.
for i in range(1, 2):
    for vehicle_type in vehicle_types:
        TraverseFiles(
            path,
            vehicle_type,
            labels[vehicle_type],
            output_path,
            random_seed=i,
            iteration_num=i,
            feature_thresholds=feature_thresholds,
        )


(17354, 2)
(13883,)
cluster 0: [1]>0and<=3000 [2]>0and<=4500 [7]>0and<=700
cluster 1: [1]>0and<=3000 [2]>0and<=4500 [7]>700and<=1650
cluster 2: [1]>0and<=3000 [2]>0and<=4500 [7]>1650and<=2500
cluster 3: [1]>0and<=3000 [2]>0and<=4500 [7]>2500
cluster 4: [1]>0and<=3000 [2]>4500and<=7000 [7]>0and<=700
cluster 5: [1]>0and<=3000 [2]>4500and<=7000 [7]>700and<=1650
cluster 6: [1]>0and<=3000 [2]>4500and<=7000 [7]>1650and<=2500
cluster 7: [1]>0and<=3000 [2]>4500and<=7000 [7]>2500
cluster 8: [1]>0and<=3000 [2]>7000 [7]>0and<=700
cluster 9: [1]>0and<=3000 [2]>7000 [7]>700and<=1650
cluster 10: [1]>0and<=3000 [2]>7000 [7]>1650and<=2500
cluster 11: [1]>0and<=3000 [2]>7000 [7]>2500
cluster 12: [1]>3000and<=5000 [2]>0and<=4500 [7]>0and<=700
cluster 13: [1]>3000and<=5000 [2]>0and<=4500 [7]>700and<=1650
cluster 14: [1]>3000and<=5000 [2]>0and<=4500 [7]>1650and<=2500
cluster 15: [1]>3000and<=5000 [2]>0and<=4500 [7]>2500
cluster 16: [1]>3000and<=5000 [2]>4500and<=7000 [7]>0and<=700
cluster 17: [1]>3000and<

1
