In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by later cells live
# under this mount point. force_remount=True asks Colab to remount even when a
# mount already exists.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)
# Third-party libraries used by the preprocessing cells that follow.
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
print("Start Classification Model Data Preprocessing")
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option("display.max_columns", None)

# Load the CSV written by the anomaly-detection step.
anomaly_output_path = "/content/drive/MyDrive/Data Mining Project/Notebooks/anomaly_detection_output.csv"
df = pd.read_csv(anomaly_output_path)

# Remove the two unnamed columns in the file (presumably DataFrame indexes
# written out by earlier to_csv calls -- confirm against the producing notebook).
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])

Start Classification Model Data Preprocessing


In [None]:
# Collapse the long table to one row per unit_id: each source column becomes a
# Python list of that unit's values, in the order the rows occur in df.
# NOTE(review): the _P1.._P10 suffixes assigned further down assume each unit's
# rows are already ordered by process step -- confirm upstream ordering.
list_aggregations = {
    'operator_id': ('operator_id', list),
    'dist_measure': ('dist_from_measure_avg', list),
    'dist_ct': ('dist_from_ct_avg', list),
    'timestamp': ('timestamp', list),
    'measurement': ('measurement', list),
    'cycle_time': ('cycle_time', list),
    'wounded': ('wounded', list),  # aggregated here but not carried into the wide table below
    'scores': ('scores', list),
}
result_df = df.groupby('unit_id').agg(**list_aggregations).reset_index()

# Spread every list column into its own block of positional columns (0, 1, 2, ...).
# Units with shorter lists are padded with NaN in the trailing positions.
expanded_blocks = {
    list_column: result_df[list_column].apply(pd.Series)
    for list_column in (
        'operator_id', 'dist_measure', 'dist_ct',
        'timestamp', 'measurement', 'cycle_time', 'scores',
    )
}
df_operator = expanded_blocks['operator_id']
df_distmeasure = expanded_blocks['dist_measure']
df_distct = expanded_blocks['dist_ct']
df_timestamp = expanded_blocks['timestamp']
df_measurement = expanded_blocks['measurement']
df_cycletime = expanded_blocks['cycle_time']
df_scores = expanded_blocks['scores']

# Put the unit id next to the expanded blocks, in the same block order as above.
result_df = pd.concat([result_df['unit_id'], *expanded_blocks.values()], axis=1)

# Final column names: 'Unit_id' followed by <prefix>_P1 .. <prefix>_P10 for each
# block. Ten positions per block are hard-wired; assigning the names raises a
# length-mismatch error if the expanded width differs.
column_prefixes = ['Op', 'dist_measurement', 'dist_ct', 'timestamp', 'measurement', 'cycle_time', 'scores']
new_column_names = ['Unit_id'] + [
    f'{prefix}_P{position}'
    for prefix in column_prefixes
    for position in range(1, 11)
]

# Rename the columns of the concatenated DataFrame
result_df.columns = new_column_names

In [None]:
# One-hot encode the operator id recorded at each of the ten positions.
# drop_first=True omits the first category of every column; dummy_na=True adds a
# separate indicator column for missing operator values.
operator_columns = [f'Op_P{position}' for position in range(1, 11)]
df_encoded = pd.get_dummies(
    result_df,
    columns=operator_columns,
    drop_first=True,
    dummy_na=True,
)

In [None]:
# Impute nulls with 0 and convert every column to a numeric dtype.
# DataFrame.apply returns a new frame, so the converted result has to be
# assigned back; calling it without assignment leaves df_encoded unchanged.
# Reassigning (instead of inplace=True) also keeps this cell safe to re-run.
# NOTE(review): the 0 fill also covers positions a unit has no record for
# (padding from the per-unit list expansion) -- confirm 0 is the intended value.
df_encoded = df_encoded.fillna(0).apply(pd.to_numeric)

# Feature engineering: add the row-wise sum of the ten anomaly-detection scores
# as a single aggregate feature. Work on a copy so df_encoded keeps its schema.
score_columns = [f'scores_P{position}' for position in range(1, 11)]
scores_sum_df = df_encoded.copy()
scores_sum_df['score_sum'] = scores_sum_df[score_columns].sum(axis=1)

In [None]:
# Min-max scale the ten per-position scores and their sum to the [0, 1] range.
# NOTE(review): the scaler is fitted on every row; if a train/test split happens
# later, fitting here lets test rows influence the scaling -- confirm intended.
scaler = MinMaxScaler()
scaled_input_columns = [f'scores_P{position}' for position in range(1, 11)] + ['score_sum']
# The scaled sum is published as 'scores_sum' (the input column is 'score_sum').
scaled_output_columns = scaled_input_columns[:-1] + ['scores_sum']

# Every column that is NOT scaled, kept in its original order. A set difference
# was used here before, which made the column order arbitrary and able to change
# between kernel sessions; drop() keeps the layout deterministic.
# (The name is historical: these columns are numeric, just not rescaled here.)
df_non_numeric = scores_sum_df.drop(columns=scaled_input_columns)

scaled_values = scaler.fit_transform(scores_sum_df[scaled_input_columns].to_numpy())
# Reuse the source index so the concat below aligns rows by label instead of
# depending on scores_sum_df having a default 0..n-1 index.
df_scaled = pd.DataFrame(scaled_values, columns=scaled_output_columns, index=scores_sum_df.index)

df_scaled = pd.concat([df_scaled, df_non_numeric], axis=1)
df_scaled.head(100)

Unnamed: 0,scores_P1,scores_P2,scores_P3,scores_P4,scores_P5,scores_P6,scores_P7,scores_P8,scores_P9,scores_P10,scores_sum,cycle_time_P10,measurement_P3,Op_P1_nan,Op_P9_P9-2,measurement_P10,Op_P4_P4-3,Op_P2_P2-3,cycle_time_P6,cycle_time_P1,dist_ct_P10,Op_P3_P3-4,Op_P2_P2-2,dist_measurement_P1,Op_P3_nan,timestamp_P6,Op_P10_P10-3,Op_P3_P3-2,Op_P5_P5-3,Op_P6_P6-2,dist_measurement_P4,dist_ct_P5,Op_P4_nan,Op_P6_P6-5,timestamp_P2,dist_measurement_P10,dist_ct_P4,Op_P7_P7-4,Op_P7_nan,dist_ct_P6,timestamp_P7,cycle_time_P5,Op_P10_nan,timestamp_P5,measurement_P9,dist_ct_P7,measurement_P8,Op_P1_P1-2,Op_P9_P9-5,Op_P7_P7-2,dist_measurement_P3,Op_P10_P10-5,timestamp_P9,Op_P3_P3-5,Op_P9_P9-3,dist_measurement_P6,Op_P5_P5-5,Op_P6_P6-4,Op_P3_P3-3,Op_P5_P5-2,Op_P5_nan,Op_P5_P5-4,Op_P10_P10-6,dist_ct_P8,Op_P9_P9-6,cycle_time_P8,Op_P6_P6-6,measurement_P7,Op_P8_nan,measurement_P5,Op_P4_P4-2,timestamp_P4,dist_ct_P9,timestamp_P1,dist_measurement_P9,Op_P10_P10-2,Op_P9_P9-4,measurement_P6,measurement_P1,Op_P9_nan,timestamp_P10,Op_P7_P7-5,dist_ct_P2,Op_P6_nan,cycle_time_P2,dist_measurement_P2,cycle_time_P4,Op_P8_P8-2,measurement_P4,Op_P10_P10-4,dist_measurement_P7,timestamp_P3,cycle_time_P3,Op_P6_P6-3,dist_ct_P1,Op_P2_nan,Unit_id,timestamp_P8,cycle_time_P9,Op_P1_P1-3,measurement_P2,dist_measurement_P8,dist_ct_P3,cycle_time_P7,Op_P7_P7-3,dist_measurement_P5
0,0.897995,0.113652,0.241847,0.272625,0.169099,0.841244,0.000000,0.000000,0.000000,0.000000,0.159741,0.000000,0.098645,0,0,0.000000,1,0,0.084559,0.132353,0.000000,0,1,0.549110,0,0.015417,0,1,1,0,0.351080,0.189110,0,0,0.002871,0.000000,0.010785,0,1,0.219605,0.000000,0.047794,1,0.012653,0.000000,0.000000,0.000000,1,0,0,0.351080,0,0.000000,0,0,1.000000,0,0,0,0,0,0,0,0.000000,0,0.000000,0,0.000000,1,0.304236,0,0.010420,0.000000,0.000000,0.000000,0,0,0.389469,0.011195,1,0.000000,0,0.788741,0,0.091912,0.351080,0.110294,0,0.098645,0,0.000000,0.007283,0.198529,1,0.079286,0,2,0.000000,0.000000,0,0.098645,0.000000,0.051940,0.000000,0,0.086031
1,1.000000,0.060888,0.028746,0.000629,0.659608,0.510918,0.000000,0.000000,0.000000,0.000000,0.131451,0.000000,0.099851,0,0,0.000000,0,1,0.077206,0.194853,0.000000,0,0,0.666961,0,0.015311,0,0,0,1,0.219519,0.123925,0,0,0.003243,0.000000,0.025212,0,1,0.234090,0.000000,0.080882,1,0.012653,0.000000,0.000000,0.000000,0,0,0,0.219519,0,0.000000,0,0,0.690772,0,0,1,1,0,0,0,0.000000,0,0.000000,0,0.000000,1,0.298351,1,0.009942,0.000000,0.000904,0.000000,0,0,0.405019,0.000000,1,0.000000,0,0.861169,0,0.055147,0.219519,0.091912,0,0.099851,0,0.000000,0.007071,0.158088,0,0.202413,0,3,0.000000,0.000000,1,0.099851,0.000000,0.131611,0.000000,0,0.551288
2,0.000000,0.159792,0.712903,0.683320,0.492516,0.075542,0.532958,0.767220,0.815451,0.758618,0.654982,0.283088,0.107205,0,0,0.893352,0,0,0.242647,0.091912,0.015028,1,0,0.113703,0,0.020893,0,0,0,0,0.577824,0.100383,0,0,0.005157,0.090513,0.075911,0,0,0.091616,0.024880,0.194853,0,0.015843,0.996373,0.011532,0.996373,0,0,0,0.577824,0,0.030569,0,0,0.169143,0,0,0,0,0,1,0,0.017732,0,0.018382,0,0.400234,0,0.307031,0,0.011483,0.079670,0.001808,0.406191,0,0,0.400234,0.007201,0,0.036204,0,0.723556,0,0.125000,0.577824,0.066176,0,0.107205,0,0.179246,0.008985,0.158088,0,0.000167,0,4,0.026688,0.161765,0,0.107205,0.406191,0.131611,0.169118,0,0.390698
3,0.840604,0.113136,0.275655,0.235905,0.275766,0.575001,0.812955,0.633489,0.714754,0.898407,0.734595,0.316176,0.098645,0,1,0.890393,0,0,0.250000,0.091912,0.049939,0,1,0.549110,0,0.022541,0,0,0,1,0.351080,0.100383,0,0,0.005795,0.413077,0.083154,0,0,0.106101,0.026582,0.194853,0,0.017384,0.991440,0.004289,0.991440,1,0,1,0.351080,0,0.033014,1,0,0.690772,0,0,0,0,0,0,0,0.039460,0,0.007353,0,0.405019,0,0.303882,1,0.013025,0.043239,0.002871,0.127282,1,0,0.405019,0.011195,0,0.039128,0,0.781498,0,0.095588,0.351080,0.062500,1,0.098645,0,0.700875,0.010579,0.224265,0,0.000167,0,5,0.028230,0.224265,0,0.098645,0.127282,0.001241,0.172794,0,0.047437
4,0.854002,0.055728,0.085445,0.071844,0.835362,0.822493,0.910037,0.751184,0.806130,0.954278,0.823044,0.286765,0.099851,0,0,0.899708,0,1,0.268382,0.088235,0.007786,0,0,0.666961,0,0.024030,0,0,0,0,0.219519,0.122111,0,0,0.006592,0.598060,0.097639,0,0,0.142315,0.027698,0.205882,0,0.018607,0.989019,0.054988,0.989019,0,0,0,0.219519,0,0.034131,0,1,0.900992,0,1,0,1,0,0,0,0.017732,0,0.018382,0,0.390378,0,0.298351,0,0.014088,0.021511,0.003721,0.391152,0,0,0.390378,0.000000,0,0.039819,0,0.788741,0,0.091912,0.219519,0.055147,0,0.099851,1,0.890889,0.011749,0.250000,0,0.007410,0,6,0.029506,0.213235,1,0.099851,0.391152,0.049241,0.147059,1,0.551288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000839,0.082977,0.258827,0.255199,0.322931,0.325656,0.000000,0.000000,0.000000,0.000000,0.051461,0.000000,0.098645,0,0,0.000000,1,0,0.069853,0.084559,0.000000,0,1,0.113703,0,0.108612,0,0,0,0,0.351080,0.167382,0,1,0.096332,0.000000,0.003484,0,1,0.248576,0.000000,0.058824,1,0.106061,0.000000,0.000000,0.000000,0,0,0,0.351080,0,0.000000,0,0,0.518850,0,0,1,0,0,1,0,0.000000,0,0.000000,0,0.000000,1,0.307031,0,0.103668,0.000000,0.090537,0.000000,0,0,0.403442,0.007201,1,0.000000,0,0.390390,0,0.294118,0.351080,0.102941,0,0.098645,0,0.000000,0.100638,0.191176,0,0.014653,0,97,0.000000,0.000000,0,0.098645,0.000000,0.066426,0.000000,0,0.390698
96,0.845625,0.050293,0.037853,0.022889,0.835362,0.545221,0.853742,0.756479,0.817175,0.897818,0.777141,0.316176,0.099851,0,0,0.890393,0,1,0.202206,0.095588,0.049939,0,0,0.549110,0,0.111696,0,0,0,0,0.219519,0.122111,0,0,0.095853,0.413077,0.090397,0,0,0.011946,0.116215,0.205882,0,0.107230,0.989019,0.060678,0.989019,1,0,0,0.219519,0,0.122754,0,1,0.726463,0,0,0,1,0,0,0,0.017732,0,0.018382,1,0.405346,0,0.298351,1,0.102711,0.035997,0.091760,0.391152,1,0,0.405346,0.011195,0,0.128868,0,0.622158,0,0.176471,0.219519,0.058824,0,0.099851,0,0.736566,0.100319,0.202206,0,0.006858,0,98,0.118022,0.220588,0,0.099851,0.391152,0.044698,0.205882,1,0.551288
97,0.854421,0.145334,0.748246,0.667610,0.354404,0.607263,0.823221,0.958450,0.980414,0.830342,0.892190,0.253676,0.107205,0,1,0.891771,0,0,0.264706,0.084559,0.072970,0,0,0.666961,0,0.115417,0,1,0,1,0.577824,0.129354,0,0,0.097129,0.262881,0.017969,0,0,0.135072,0.119617,0.209559,0,0.110048,1.000000,0.017222,1.000000,0,0,1,0.577824,1,0.126209,0,0,0.690772,0,0,0,0,0,0,0,0.003779,0,0.029412,0,0.405019,0,0.303882,0,0.105476,0.021511,0.092717,0.801623,0,0,0.405019,0.000000,0,0.131419,0,0.578702,0,0.198529,0.577824,0.095588,1,0.107205,0,0.700875,0.102552,0.268382,0,0.014653,0,99,0.121584,0.213235,1,0.107205,0.801623,0.085455,0.183824,0,0.047437
98,0.031206,0.043870,0.064451,0.075920,0.216667,0.048804,0.550317,0.757582,0.807195,0.815767,0.585996,0.279412,0.099851,0,0,0.896413,0,1,0.231618,0.113971,0.022271,1,0,0.113703,0,0.114354,1,0,1,0,0.219519,0.042441,0,0,0.098246,0.238963,0.061484,0,0,0.069888,0.118873,0.165441,0,0.109463,0.989019,0.060678,0.989019,0,0,0,0.219519,0,0.124349,0,0,0.169143,0,0,0,0,0,0,0,0.032218,0,0.011029,0,0.400234,0,0.304236,1,0.105529,0.094156,0.093727,0.391152,0,0,0.400234,0.007201,0,0.129931,0,0.564216,0,0.205882,0.219519,0.136029,0,0.099851,0,0.179246,0.102020,0.154412,0,0.043072,0,100,0.120574,0.154412,0,0.099851,0.391152,0.138853,0.205882,0,0.086031


In [None]:
# creating labels
df_labelled = df_scaled.copy()

# Control limits are mean +/- 3 standard deviations of measurement_P10, i.e. a
# band six standard deviations wide. The statistics use only rows where
# measurement_P10 > 0; rows at 0 are left out of the mean/std but are still
# compared against the limits below.
# NOTE(review): with a positive lower bound, rows whose measurement_P10 is 0 get
# pass = 0 -- confirm that is the intended label for them.
final_measurement = df_labelled['measurement_P10']
positive_measurements = final_measurement[final_measurement > 0]
stddev = positive_measurements.std()
mean = positive_measurements.mean()
upperbound = mean + (3 * stddev)
lowerbound = mean - (3 * stddev)
print(mean, stddev)
print(upperbound,lowerbound)

# pass = 1 when the measurement lies inside [lowerbound, upperbound], else 0.
outside_limits = (final_measurement > upperbound) | (final_measurement < lowerbound)
df_labelled["pass"] = (~outside_limits).astype("int64")

0.8942017206897043 0.0031190092321824936
0.9035587483862517 0.8848446929931568


In [None]:
# Write the labelled table to Drive. The DataFrame index is written as well
# (to_csv default), so it appears as an extra unnamed first column in the file.
ok_ng_output_path = "/content/drive/MyDrive/Data Mining Project/Notebooks/ok_ng_data.csv"
df_labelled.to_csv(ok_ng_output_path)

In [None]:
# Preview up to the first 100 labelled rows, including the new "pass" column.
df_labelled.head(100)