<a href="https://colab.research.google.com/github/Ferb168/Industrial_AI_and_eMaintenance_Assignments/blob/main/Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Grade 3 - Data Preprocessing and Transformation

---

1. Load the data from all three files.
2. Combine the three datasets into a single unified dataset.
3. Remove the columns: `start_time`, `axle`, `cluster`, `tsne_1`, and `tsne_2`.
4. Replace all 'normal' events with 0 and all other events with 1 to create a binary classification target.
5. Normalize the dataset to standardize feature scales.


In [22]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt

import pdb # for debugging

# Load the data from all three files

from google.colab import drive
drive.mount('/content/drive')

csv_path_trail1 = '/content/drive/My Drive/Course/LuleaUniversity/eMaintenance_and_IndustrialAI/Assignment3/Data/Trail1_extracted_features_acceleration_m1ai1.csv'
csv_path_trail2 = '/content/drive/My Drive/Course/LuleaUniversity/eMaintenance_and_IndustrialAI/Assignment3/Data/Trail2_extracted_features_acceleration_m1ai1.csv'
csv_path_trail3 = '/content/drive/My Drive/Course/LuleaUniversity/eMaintenance_and_IndustrialAI/Assignment3/Data/Trail3_extracted_features_acceleration_m2ai0.csv'

df_trail1 = pd.read_csv(csv_path_trail1)
df_trail2 = pd.read_csv(csv_path_trail2)
df_trail3 = pd.read_csv(csv_path_trail3)
print("Shape of combined dataframe1:", df_trail1.shape)
print("Shape of combined dataframe2:", df_trail2.shape)
print("Shape of combined dataframe3:", df_trail3.shape)

# Combine the three datasets into a single unified dataset.

combined_df = pd.concat([df_trail1, df_trail2, df_trail3], ignore_index=True) # ignore_index=True: disregard the original index from each DataFrame and instead create a new, sequential index for the table
print("Dataframes combined successfully.")
print("Shape of combined dataframe:", combined_df.shape)

# Remove the columns start_time, axle, cluster, tsne_1, and tsne_2 from the dataset.

columns_remove = ['start_time', 'axle', 'cluster', 'tsne_1', 'tsne_2']
processed_df = combined_df.drop(columns=columns_remove)
print(f"Columns {columns_remove} dropped successfully.")
print("Shape of combined dataframe after dropping columns:", processed_df.shape)

# Replace all normal events with 0 and all other events with 1.
## 0 = normal = the trail still works fine. 1 = abnormal = the trail might have things happen.
## Thus, the label of 'event' indicates if something happen or not.

processed_df['event'] = processed_df['event'].apply(lambda x: 0 if x == 'normal' else 1)
print("Event column transformed successfully.")
display(processed_df.head())

# Normalize the dataset by Min-Max scaling, i.e., X_scaled = (X - X_min) / (X_max - X_min)
## Identify the numerical columns to normalize, but exclude the 'event' column
normalized_df = processed_df.copy() # create a data frame space that will store the normalized data.
numerical_cols = processed_df.select_dtypes(include=np.number).columns.tolist()
if 'event' in numerical_cols:
    numerical_cols.remove('event')

for col in numerical_cols:
    min_val = processed_df[col].min()
    max_val = processed_df[col].max()

    if max_val - min_val != 0: # Avoid division by zero if all values in the column are the same
        normalized_df[col] = (processed_df[col] - min_val) / (max_val - min_val)
    else:
        normalized_df[col] = 0.5 # if all values in the column are the same, then set them to a value between 0-1 as they have been 'normalized'

print("Data normalized successfully using manual Min-Max Scaling.")
display(normalized_df.head())





Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Shape of combined dataframe1: (52, 19)
Shape of combined dataframe2: (49, 19)
Shape of combined dataframe3: (49, 22)
Dataframes combined successfully.
Shape of combined dataframe: (150, 22)
Columns ['start_time', 'axle', 'cluster', 'tsne_1', 'tsne_2'] dropped successfully.
Shape of combined dataframe after dropping columns: (150, 17)
Event column transformed successfully.


Unnamed: 0,mean,std,max,min,range,skewness,kurtosis,rms,crest_factor,variance,zero_crossings,dominant_freq,spectral_energy,spectral_centroid,spectral_bandwidth,spectral_flatness,event
0,-5e-06,0.00135,0.007542,-0.006189,0.013731,-0.004788,0.472182,0.00135,5.587349,2e-06,5798,475.0,7.037723e-08,1962.160093,2412.052659,0.274188,0
1,-6e-06,0.02436,0.215148,-0.249093,0.464241,-0.036717,26.678484,0.02436,8.831983,0.000593,2809,375.0,2.468464e-05,352.868951,257.055863,0.001911,1
2,1.6e-05,0.003036,0.013389,-0.014713,0.028103,-0.058478,0.208181,0.003036,4.409818,9e-06,2598,475.0,3.563915e-07,681.2514,1274.1871,0.066875,0
3,6.7e-05,0.024002,0.298642,-0.290638,0.589279,0.990779,39.908555,0.024002,12.442279,0.000576,1212,75.0,2.348424e-05,263.747571,322.445494,0.002548,1
4,-0.000148,0.008061,0.024657,-0.042391,0.067048,-0.331677,1.217695,0.008062,3.058305,6.5e-05,426,75.0,2.240564e-06,244.161218,566.499799,0.011984,0


Data normalized successfully using manual Min-Max Scaling.


Unnamed: 0,mean,std,max,min,range,skewness,kurtosis,rms,crest_factor,variance,zero_crossings,dominant_freq,spectral_energy,spectral_centroid,spectral_bandwidth,spectral_flatness,event
0,0.500636,0.004049,0.002282,0.998019,0.002108,0.357544,0.012879,0.004049,0.210791,0.000104,0.752236,0.75,9.9e-05,0.563868,0.920766,0.595135,0
1,0.499171,0.262517,0.13442,0.778827,0.170808,0.34958,0.31411,0.262517,0.415485,0.073124,0.359153,0.583333,0.076233,0.051548,0.042569,0.003571,1
2,0.540572,0.022993,0.006004,0.990327,0.00749,0.344152,0.009844,0.022993,0.136504,0.001017,0.331405,0.75,0.000983,0.156089,0.457067,0.144716,0
3,0.639169,0.258496,0.187563,0.741338,0.217631,0.605874,0.466185,0.258496,0.643248,0.070987,0.149132,0.083333,0.07252,0.023177,0.069216,0.004955,1
4,0.224558,0.079433,0.013175,0.965351,0.022074,0.276006,0.021448,0.079447,0.051241,0.007899,0.045765,0.083333,0.006811,0.016941,0.168672,0.025456,0


# Grade 4 - Dataset Splitting and Model Evaluation


---


1. Split the unified, preprocessed dataset into training and testing sets with an 80/20 ratio.
2. Perform k-fold cross-validation (e.g., 5-fold) on the training set to evaluate the stability of the model.
3. Train a Support Vector Machine (SVM) model using:
<br>3-1. The 80/20 train-test split.
<br>3-2. k-fold cross-validation on the training data.
4. Evaluate and compare the performance of both approaches in terms of accuracy, consistency of the results, and generalization ability.
5. Discuss the differences observed between train-test split and cross-validation results.


In [21]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm import SVC # Importing a Support Vector Classifier as a placeholder model

# we are going to use features to predict 'event' results, so these two dataset should be split.
feature_df = normalized_df.drop('event', axis=1) # axis=1=let panda library process based on column, where axis=0 indicate processing in row.
outcome_df = normalized_df['event']

# Split the data into training and testing sets by train_test_split function, which was imported from scikit-learn
# Complete syntax default = train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)
# test_size=0.2 means 20% of the data will be in the test set
# random_state: Set a random state for reproducibility. 42 = Answer to the Ultimate Question of Life, the Universe, and Everything
# other parameters are default value as we want dataset to be shuffled for train/test purpose, and no need for stratify as our data is not imbalance.
feature_df_2080train, feature_df_2080test, outcome_df_2080train, outcome_df_2080test = train_test_split(feature_df, outcome_df, test_size=0.2, random_state=42)

# Initialize a placeholder model (Support Vector Classifier)
# In a real scenario, you would tune the parameters of your model.
model = SVC(random_state=42)

# 1. Train the model on the entire training set
model.fit(feature_df_2080train, outcome_df_2080train)

# 2. Evaluate the trained model on the held-out test set
test_accuracy = model.score(feature_df_2080test, outcome_df_2080test) # For classifiers, .score() typically returns accuracy

print(f"Accuracy on the 20/80 test set: {test_accuracy:.4f}")


print("Data split into training and testing sets successfully.")
print(f"Shape of feature_df_train in 20/80 spliit: {feature_df_2080train.shape}")
print(f"Shape of feature_df_test in 20/80 spliit: {feature_df_2080test.shape}")
print(f"Shape of outcome_df_train in 20/80 spliit: {outcome_df_2080train.shape}")
print(f"Shape of outcome_df_test in 20/80 spliit: {outcome_df_2080test.shape}")

#############################

# Initialize the KFold cross-validator
# shuffle=True ensures the data is shuffled before splitting into folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation
# cross_val_score trains the model on k-1 folds and tests on the remaining fold,
# repeating this k times and returning the scores for each fold.
# scoring='accuracy' specifies that we want to use accuracy as the evaluation metric.
cv_scores = cross_val_score(model, feature_df, outcome_df, cv=kf, scoring='accuracy')

# Print the cross-validation scores
print(f"Cross-validation accuracy scores for each fold: {cv_scores}")
print(f"Mean cross-validation accuracy: {cv_scores.mean():.4f}")
print(f"Standard deviation of cross-validation accuracy: {cv_scores.std():.4f}")

Accuracy on the 20/80 test set: 0.9667
Data split into training and testing sets successfully.
Shape of feature_df_train in 20/80 spliit: (120, 16)
Shape of feature_df_test in 20/80 spliit: (30, 16)
Shape of outcome_df_train in 20/80 spliit: (120,)
Shape of outcome_df_test in 20/80 spliit: (30,)
Cross-validation accuracy scores for each fold: [0.96666667 0.93333333 1.         1.         0.96666667]
Mean cross-validation accuracy: 0.9733
Standard deviation of cross-validation accuracy: 0.0249
