## Import Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import cluster
from sklearn import mixture
from sklearn import preprocessing
from collections import Counter
from mpl_toolkits.mplot3d import Axes3D

## Reading data

In [2]:
def read_data(folder):
    file_path = "../data/50mph/" + str(folder) + "/" + str(folder) + "_Step1.csv"
    data = np.abs(pd.read_csv(file_path))
    data["movement_x"] = 0
    data["movement_y"] = 0
    data["movement_z"] = 0

    for i in range(1,303):
        # create file path
        file_path_1 = "../data/50mph/" + str(folder) + "/" + str(folder) + "_Step" + str(i) + ".csv"
        file_path_2 = "../data/50mph/" + str(folder) + "/" + str(folder) + "_Step" + str(i+1) + ".csv"
        print(file_path_1)
        # read in file 
        data_1 = pd.read_csv(file_path_1)
        data_2 = pd.read_csv(file_path_2)

        # calculate movement in x,y,z directions
        data.iloc[:,20:23] = data.iloc[:,20:23] + np.abs(data_2.iloc[:,2:5] - data_1.iloc[:,2:5]).values

        # add moment of inertiain x,y,z directions
        data.iloc[:,5:20] = data.iloc[:,5:20] + np.abs(data_2.iloc[:,5:20])
        
        return data


## Data Preprocess

In [3]:
def feature_engineering(data):
    data["velocity"] = np.sqrt(data["velocity_vx"]**2 + 
                               data["velocity_vy"]**2 + 
                               data["velocity_vz"]**2)
    
    data["movement"] = np.sqrt(data["movement_x"]**2 + 
                               data["movement_y"]**2 + 
                               data["movement_z"]**2)
    
    data["Fsc"] = np.sqrt(data["Fsc_x"]**2 + 
                          data["Fsc_y"]**2 + 
                          data["Fsc_z"]**2)
    
    data["F"] = np.sqrt(data["F_x"]**2 + 
                        data["F_y"]**2 + 
                        data["F_z"]**2)
    return data

def util_winsorization(data):
    p = 0.95
    quantile_value = np.quantile(data, p)
    
    data = data.apply(lambda x: x if x < quantile_value else quantile_value)
    
    return data
    
def data_preprocess(data):
    # take natural logarithm for some columns
    min_value = 1e-5
    data.iloc[:,8:28] = np.log(data.iloc[:,8:28] + min_value)
    
    # winsorization
    data.iloc[:,8:28] = data.iloc[:,8:28].apply(util_winsorization, axis = 0)
    
    # data standardization 
    tmp = pd.DataFrame(preprocessing.scale(data.iloc[:,1:28]))
    tmp.columns = data.columns[1:28]
    data.iloc[:,1:28] = tmp
    
    return data

def plot_coordinate(data, label):
    color_map = {0: 'red', 1: 'blue', 2: 'yellow', 3: 'black', 4: 'white',
                 5: 'green', 6: 'magenta', 7: 'cyan'}
    color = [color_map[labels] for labels in label ]
    fig = plt.figure()
    ax = fig.add_subplot(1,1,1, projection = '3d')
    x, y, z = data["centroid_x"], data["centroid_y"], data["centroid_z"]
    ax.scatter(x, y, z, s = 2, c=color, marker='o')
    ax.set_xlabel('X Label')
    ax.set_ylabel('Y Label')
    ax.set_zlabel('Z Label')
    plt.show()


## Semi-supervised KMeans

### Center Binding

In [27]:
%matplotlib qt

from semi_kmeans_util import SemiKMeans

folder = "cb_d16out"
data = read_data(folder)

# define pre-labeled points
def check_range(x):
    if 1.75 < x["centroid_x"] < 3.25 and 0.2 < x["centroid_z"] < 0.4:
        return True
    else:
        return False
    
idx  = [i for i, v in data.iterrows() if check_range(v)]

data = feature_engineering(data)
data = data_preprocess(data)
data_noxyz = data.drop(['centroid_x', 'centroid_y', 'centroid_z'], axis=1)


../data/50mph/cb_d16out/cb_d16out_Step1.csv


In [28]:
len(idx)

2150

In [29]:
know_data = np.array([idx, [], [], [], []])

In [30]:
# kmeans clustering - semi-unsupurvised
kmeans = SemiKMeans(n_clusters = 5, labeled_data = know_data, weight=1, max_iter=100, verbose=True)
kmeans_semi_results = kmeans.fit_predict(data_noxyz.values)
kmeans_semi_results = [int(x) for x in kmeans_semi_results]

label = pd.DataFrame({'part_num':data["part_num"],'label':kmeans_semi_results})
label.to_csv('labels_semi_cb.csv',header=True, index=False)

Counter({1.0: 4667, 0.0: 3356, 3.0: 1874, 2.0: 833, 4.0: 696})
Iteration 1	Convergance: 12.060507276300825
Counter({1.0: 4075, 0.0: 3377, 3.0: 2424, 2.0: 823, 4.0: 727})
Iteration 2	Convergance: 6.330274031186519
Counter({1.0: 3743, 0.0: 3490, 3.0: 2622, 2.0: 821, 4.0: 750})
Iteration 3	Convergance: 4.5623985214065215
Counter({0.0: 3572, 1.0: 3526, 3.0: 2730, 2.0: 830, 4.0: 768})
Iteration 4	Convergance: 3.294613855707979
Counter({0.0: 3630, 1.0: 3379, 3.0: 2796, 2.0: 843, 4.0: 778})
Iteration 5	Convergance: 2.2971029712260815
Counter({0.0: 3660, 1.0: 3277, 3.0: 2844, 2.0: 850, 4.0: 795})
Iteration 6	Convergance: 1.770711616120849
Counter({0.0: 3690, 1.0: 3208, 3.0: 2864, 2.0: 860, 4.0: 804})
Iteration 7	Convergance: 1.2953433251423974
Counter({0.0: 3705, 1.0: 3158, 3.0: 2888, 2.0: 857, 4.0: 818})
Iteration 8	Convergance: 0.9309599010488796
Counter({0.0: 3711, 1.0: 3124, 3.0: 2899, 2.0: 867, 4.0: 825})
Iteration 9	Convergance: 0.7226207651007144
Counter({0.0: 3721, 1.0: 3100, 3.0: 2909

### Full Support

In [21]:
%matplotlib qt

from semi_kmeans_util import SemiKMeans

folder = "fs_d14out"
data = read_data(folder)

# define pre-labeled points
def check_range(x):
    if 1.75 < x["centroid_x"] < 3.25 and 0.2 < x["centroid_z"] < 0.4:
        return True
    else:
        return False
    
idx  = [i for i, v in data.iterrows() if check_range(v)]

data = feature_engineering(data)
data = data_preprocess(data)
data_noxyz = data.drop(['centroid_x', 'centroid_y', 'centroid_z'], axis=1)

know_data = np.array([idx, [], [], [], []])

../data/50mph/fs_d14out/fs_d14out_Step1.csv


In [31]:
# kmeans clustering - semi-unsupurvised
kmeans = SemiKMeans(n_clusters = 5, labeled_data = know_data, weight=1, max_iter=100, verbose=True)
kmeans_semi_results = kmeans.fit_predict(data_noxyz.values)
kmeans_semi_results = [int(x) for x in kmeans_semi_results]

label = pd.DataFrame({'part_num':data["part_num"],'label':kmeans_semi_results})
label.to_csv('labels_semi_fs.csv',header=True, index=False)

Counter({0.0: 4359, 1.0: 2575, 3.0: 1534, 2.0: 1481, 4.0: 1477})
Iteration 1	Convergance: 10.175194147949867
Counter({0.0: 4092, 1.0: 2542, 2.0: 1782, 3.0: 1522, 4.0: 1488})
Iteration 2	Convergance: 4.5007291666569245
Counter({0.0: 3983, 1.0: 2470, 2.0: 1931, 3.0: 1589, 4.0: 1453})
Iteration 3	Convergance: 3.9814748733380725
Counter({0.0: 3919, 1.0: 2401, 2.0: 2012, 3.0: 1676, 4.0: 1418})
Iteration 4	Convergance: 3.3539390784810337
Counter({0.0: 3875, 1.0: 2332, 2.0: 2065, 3.0: 1761, 4.0: 1393})
Iteration 5	Convergance: 2.7363209370713264
Counter({0.0: 3840, 1.0: 2268, 2.0: 2101, 3.0: 1839, 4.0: 1378})
Iteration 6	Convergance: 2.2211070449164367
Counter({0.0: 3810, 1.0: 2222, 2.0: 2127, 3.0: 1906, 4.0: 1361})
Iteration 7	Convergance: 1.9411473437278346
Counter({0.0: 3785, 1.0: 2173, 2.0: 2151, 3.0: 1963, 4.0: 1354})
Iteration 8	Convergance: 1.5361842875032892
Counter({0.0: 3765, 2.0: 2168, 1.0: 2135, 3.0: 2012, 4.0: 1346})
Iteration 9	Convergance: 1.3132138848270074
Counter({0.0: 3753,

### Lack of Center Support

In [32]:
%matplotlib qt

from semi_kmeans_util import SemiKMeans

folder = "locs_d16out"
data = read_data(folder)

# define pre-labeled points
def check_range(x):
    if 1.75 < x["centroid_x"] < 3.25 and 0.2 < x["centroid_z"] < 0.4:
        return True
    else:
        return False
    
idx  = [i for i, v in data.iterrows() if check_range(v)]

data = feature_engineering(data)
data = data_preprocess(data)
data_noxyz = data.drop(['centroid_x', 'centroid_y', 'centroid_z'], axis=1)

know_data = np.array([idx, [], [], [], []])

../data/50mph/locs_d16out/locs_d16out_Step1.csv


In [33]:
# kmeans clustering - semi-unsupurvised
kmeans = SemiKMeans(n_clusters = 5, labeled_data = know_data, weight=1, max_iter=100, verbose=True)
kmeans_semi_results = kmeans.fit_predict(data_noxyz.values)
kmeans_semi_results = [int(x) for x in kmeans_semi_results]

label = pd.DataFrame({'part_num':data["part_num"],'label':kmeans_semi_results})
label.to_csv('labels_semi_locs.csv',header=True, index=False)

Counter({3.0: 3606, 0.0: 2979, 1.0: 2420, 2.0: 1307, 4.0: 1093})
Iteration 1	Convergance: 5.828248165239046
Counter({3.0: 3340, 0.0: 3158, 1.0: 2450, 2.0: 1232, 4.0: 1225})
Iteration 2	Convergance: 5.044780016333874
Counter({0.0: 3253, 3.0: 3135, 1.0: 2466, 4.0: 1334, 2.0: 1217})
Iteration 3	Convergance: 4.2860716155308864
Counter({0.0: 3321, 3.0: 2977, 1.0: 2463, 4.0: 1426, 2.0: 1218})
Iteration 4	Convergance: 3.6006917110385928
Counter({0.0: 3366, 3.0: 2854, 1.0: 2449, 4.0: 1511, 2.0: 1225})
Iteration 5	Convergance: 2.935810690322654
Counter({0.0: 3414, 3.0: 2755, 1.0: 2434, 4.0: 1577, 2.0: 1225})
Iteration 6	Convergance: 2.3810799723235903
Counter({0.0: 3458, 3.0: 2675, 1.0: 2417, 4.0: 1625, 2.0: 1230})
Iteration 7	Convergance: 2.092779388589264
Counter({0.0: 3478, 3.0: 2610, 1.0: 2395, 4.0: 1682, 2.0: 1240})
Iteration 8	Convergance: 1.739209700445258
Counter({0.0: 3500, 3.0: 2555, 1.0: 2380, 4.0: 1721, 2.0: 1249})
Iteration 9	Convergance: 1.4822778620508643
Counter({0.0: 3530, 3.0:

### Lack of Rail Seat Support

In [34]:
%matplotlib qt

from semi_kmeans_util import SemiKMeans

folder = "lorss_d20out"
data = read_data(folder)

# define pre-labeled points
def check_range(x):
    if 1.75 < x["centroid_x"] < 3.25 and 0.2 < x["centroid_z"] < 0.4:
        return True
    else:
        return False
    
idx  = [i for i, v in data.iterrows() if check_range(v)]

data = feature_engineering(data)
data = data_preprocess(data)
data_noxyz = data.drop(['centroid_x', 'centroid_y', 'centroid_z'], axis=1)

know_data = np.array([idx, [], [], [], []])

../data/50mph/lorss_d20out/lorss_d20out_Step1.csv


In [35]:
# kmeans clustering - semi-unsupurvised
kmeans = SemiKMeans(n_clusters = 5, labeled_data = know_data, weight=1, max_iter=100, verbose=True)
kmeans_semi_results = kmeans.fit_predict(data_noxyz.values)
kmeans_semi_results = [int(x) for x in kmeans_semi_results]

label = pd.DataFrame({'part_num':data["part_num"],'label':kmeans_semi_results})
label.to_csv('labels_semi_lorss.csv',header=True, index=False)

Counter({0.0: 3051, 1.0: 3038, 2.0: 2421, 4.0: 1976, 3.0: 1291})
Iteration 1	Convergance: 3.6839593221530436
Counter({0.0: 3391, 1.0: 2766, 2.0: 2578, 4.0: 1810, 3.0: 1232})
Iteration 2	Convergance: 1.5477617978141893
Counter({0.0: 3496, 2.0: 2627, 1.0: 2598, 4.0: 1822, 3.0: 1234})
Iteration 3	Convergance: 1.6525782494678367
Counter({0.0: 3550, 2.0: 2622, 1.0: 2512, 4.0: 1845, 3.0: 1248})
Iteration 4	Convergance: 1.6111198211980593
Counter({0.0: 3579, 2.0: 2596, 1.0: 2468, 4.0: 1878, 3.0: 1256})
Iteration 5	Convergance: 1.26400590481353
Counter({0.0: 3597, 2.0: 2565, 1.0: 2443, 4.0: 1906, 3.0: 1266})
Iteration 6	Convergance: 1.1119873206886837
Counter({0.0: 3610, 2.0: 2535, 1.0: 2426, 4.0: 1933, 3.0: 1273})
Iteration 7	Convergance: 0.9211331981001933
Counter({0.0: 3622, 2.0: 2509, 1.0: 2415, 4.0: 1958, 3.0: 1273})
Iteration 8	Convergance: 0.6618550081897183
Counter({0.0: 3638, 2.0: 2487, 1.0: 2407, 4.0: 1972, 3.0: 1273})
Iteration 9	Convergance: 0.5976838504334125
Counter({0.0: 3645, 2