In [1]:
#libs
import numpy as np
import os
import pandas as pd
import pprint
import csv

#path constants
#input directories: the vehicle type is appended to these paths to locate
#the folder of CSV files for that type, hence the trailing '/'
train_path = '../data/final/train/'
test_path = '../data/final/test/'
#output directories that receive the min-max normalised copies of the files
train_cleaned_path = '../data/final/train_norm/'
test_cleaned_path = '../data/final/test_norm/'

#type constants
#names of the per-vehicle-type sub-directories under each of the paths above
vehicle_types = ['ZVe44', 'ZV573', 'ZV63d', 'ZVfd4', 'ZVa9c', 'ZVa78', 'ZV252']

In [2]:

def TraverseFiles(path, vehicle_type):
    """Load every CSV file of one vehicle type into a single NumPy array.

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per vehicle type.
    vehicle_type : str
        Name of the sub-directory to read, i.e. one element of
        ``vehicle_types``.

    Returns
    -------
    numpy.ndarray
        The rows of all files stacked on top of each other. Files are read
        in sorted file-name order so the row order is reproducible.

    Raises
    ------
    ValueError
        If the vehicle-type directory contains no files.
    """
    path = os.path.join(path, vehicle_type)

    # List the directory once; sorting removes the dependence on the
    # arbitrary order returned by os.listdir.
    file_names = sorted(os.listdir(path))
    total = len(file_names)
    if total == 0:
        raise ValueError('no files found under ' + path)

    percentage_completion = 0
    df_list = []
    for done, file in enumerate(file_names, start=1):
        sample_df = pd.read_csv(os.path.join(path, file), delimiter=',', encoding='utf-8')
        df_list.append(sample_df)

        # Derive the percentage from the number of files actually processed.
        # This stays within 0..100 and also reports progress when there are
        # fewer than 100 files (the old fixed step of int(total / 100) was 0
        # in that case and overshot 100 % otherwise).
        current = done * 100 // total
        if current != percentage_completion:
            percentage_completion = current
            print('traversing files under', path, ':', percentage_completion, "%", end="\r", flush=True)

    df = pd.concat(df_list, ignore_index=True)
    return df.to_numpy()

## Fetching records of each vehicle type

In [3]:
# Map each vehicle type to the array holding all of its training records.
train_data = {}
for vehicle_type in vehicle_types:
    train_data[vehicle_type] = TraverseFiles(path=train_path, vehicle_type=vehicle_type)

traversing files under ../data/final/train/ZV252 : 115 %32 % ../data/final/train/ZVfd4 : 42 % ../data/final/train/ZVfd4 : 43 % ../data/final/train/ZVfd4 : 44 % ../data/final/train/ZVfd4 : 50 % ../data/final/train/ZVfd4 : 51 % ../data/final/train/ZVfd4 : 52 % ../data/final/train/ZVfd4 : 53 % ../data/final/train/ZVfd4 : 54 % ../data/final/train/ZVfd4 : 55 % ../data/final/train/ZVfd4 : 56 % ../data/final/train/ZVfd4 : 57 % ../data/final/train/ZVfd4 : 58 % ../data/final/train/ZVfd4 : 59 % ../data/final/train/ZVfd4 : 60 % ../data/final/train/ZVfd4 : 61 % ../data/final/train/ZVfd4 : 62 % ../data/final/train/ZVfd4 : 63 % ../data/final/train/ZVfd4 : 66 % ../data/final/train/ZVfd4 : 67 % ../data/final/train/ZVfd4 : 68 % 69 % ../data/final/train/ZVfd4 : 70 % ../data/final/train/ZVfd4 : 71 % ../data/final/train/ZVfd4 : 75 % ../data/final/train/ZVfd4 : 76 % ../data/final/train/ZVfd4 : 77 % ../data/final/train/ZVfd4 : 78 % ../data/final/train/ZVfd4 : 79 % ../data/final/train/ZVfd4 : 80 % ../data/fi

In [4]:
# Show (rows, columns) of the loaded training array, one line per vehicle type.
for vehicle_type in vehicle_types:
    records = train_data[vehicle_type]
    print(records.shape)

(2066103, 14)
(7619485, 14)
(647055, 14)
(121510, 14)
(495118, 14)
(1091578, 14)
(69295, 14)


## Basic stats before excluding idling records

## Min Max Normalisation

In [6]:
# Number of columns to profile: every column except the last one.
cols = train_data[vehicle_types[0]].shape[1] - 1

# vehicle type -> list with one {'min': ..., 'max': ...} dict per profiled column
min_max_tbl = {}

for vehicle_type in vehicle_types:
    records = train_data[vehicle_type]
    print(vehicle_type, 'stats:')
    column_ranges = []
    for col_idx in range(cols):
        column = records[:, col_idx]
        bounds = {'min': np.min(column), 'max': np.max(column)}
        print('\t\tmin:', bounds['min'])
        print('\t\tmax:', bounds['max'])
        column_ranges.append(bounds)
    min_max_tbl[vehicle_type] = column_ranges

ZVe44 stats:
		min: 1.0
		max: 30.0
		min: 2000.02
		max: 7743.82
		min: 2000.68
		max: 10748.89
		min: 0.0
		max: 240.49
		min: -67.29
		max: 212.37
		min: 0.0
		max: 30.27
		min: 20.0
		max: 110.07
		min: 0.0
		max: 2729.99
		min: 0
		max: 0
		min: 1
		max: 1
		min: 0.0
		max: 1.0
		min: 0
		max: 1
		min: 0.0
		max: 1.0
ZV573 stats:
		min: 1.0
		max: 39.0
		min: 2000.0
		max: 7599.05
		min: 2000.09
		max: 16382.76
		min: 0.0
		max: 190.72
		min: -67.19
		max: 194.89
		min: 0.0
		max: 30.28
		min: 20.0
		max: 156.86
		min: 0.0
		max: 2993.83
		min: 0
		max: 1
		min: 0.0
		max: 1.0
		min: 0.0
		max: 1.0
		min: 0
		max: 1
		min: 0.0
		max: 1.0
ZV63d stats:
		min: 1.0
		max: 62.0
		min: 2000.87
		max: 5873.4
		min: 2010.22
		max: 8521.97
		min: 1.04
		max: 155.69
		min: -60.68
		max: 157.48
		min: 0.0
		max: 30.27
		min: 20.0
		max: 131.11
		min: 0.0
		max: 2865.11
		min: 0
		max: 1
		min: 0.0
		max: 1.0
		min: 0.0
		max: 1.0
		min: 0
		max: 1
		min: 0.0
		max: 1.0
ZVfd4 stats:
		min: 1.

In [7]:
def CreateDir(dir_name):
    """Create directory ``dir_name`` (and any missing parents) if it is absent.

    ``exist_ok=True`` makes the call idempotent and removes the window
    between a separate existence check and the creation.
    """
    os.makedirs(dir_name, exist_ok=True)


def Normalise(path, export_path, vehicle_type, mm_list):
    """Min-max rescale every CSV file of one vehicle type and write the result.

    For each file under ``path/vehicle_type`` the first ``len(mm_list)``
    columns are transformed as ``(value - min) / (max - min)`` and the file
    is written under ``export_path/vehicle_type`` with the same file name.
    Columns beyond ``len(mm_list)`` and columns whose ``min`` equals ``max``
    are left untouched. Values outside ``[min, max]`` are not clipped, so
    they map outside ``[0, 1]``.

    Parameters
    ----------
    path : str
        Directory that contains one sub-directory per vehicle type.
    export_path : str
        Directory under which the normalised files are written; the
        vehicle-type sub-directory is created when missing.
    vehicle_type : str
        Name of the sub-directory to process, i.e. one element of
        ``vehicle_types``.
    mm_list : list of dict
        One ``{'min': ..., 'max': ...}`` entry per column, by position.

    Returns
    -------
    None
    """
    path = os.path.join(path, vehicle_type)
    export_path = os.path.join(export_path, vehicle_type)
    CreateDir(export_path)

    # List the directory once so the total and the iteration agree.
    file_names = os.listdir(path)
    total = len(file_names)
    percentage_completion = 0

    for done, file in enumerate(file_names, start=1):
        sample_df = pd.read_csv(os.path.join(path, file), delimiter=',', encoding='utf-8')
        for i, bounds in enumerate(mm_list):
            f_min = bounds['min']
            diff = bounds['max'] - f_min
            if diff == 0:
                # constant column: rescaling would divide by zero, keep as is
                continue
            # Rescale the whole column at once. Assigning by column label
            # replaces the column, so an integer column becomes float
            # without per-cell writes or in-place dtype upcasting.
            column = sample_df.columns[i]
            sample_df[column] = (sample_df[column] - f_min) / diff
        sample_df.to_csv(os.path.join(export_path, file), index=False)

        # Derive the percentage from the number of files actually processed.
        # This stays within 0..100 and also reports progress when there are
        # fewer than 100 files.
        current = done * 100 // total
        if current != percentage_completion:
            percentage_completion = current
            print('normalising data under', path, ':', percentage_completion, "%", end="\r", flush=True)

In [8]:
# Rescale both splits of every vehicle type with that type's min/max table.
for vehicle_type in vehicle_types:
    type_ranges = min_max_tbl[vehicle_type]
    for source_dir, target_dir in ((train_path, train_cleaned_path),
                                   (test_path, test_cleaned_path)):
        Normalise(source_dir, target_dir, vehicle_type, type_ranges)
    

normalising data under ../data/final/train/ZV252 : 115 %