## Import Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive into the Colab file system; later cells read and write
# files below this mount point
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

## Load the SELECTED (Top 30) Features Dataset
* Results of ML3-1 and ML3-2

In [None]:
# Read the saved feature table (the CSV has no header row) and transpose it.
# NOTE(review): presumably rows are samples and columns are features after the
# transpose — confirm against the notebook that wrote the file.
FEATURE_CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/SavedFiles/FeatureSelected.csv'
FeatureSelected = pd.read_csv(FEATURE_CSV_PATH, header=None).T

# Show (rows, columns) of the transposed table
FeatureSelected.shape

## Standardize the feature values

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a scaler on the whole feature table and rescale every column in one step.
# NOTE(review): the scaler is fitted before the train/test split performed later in
# this notebook, so test rows contribute to the scaling statistics — consider fitting
# on the training rows only.
feature_scaler = StandardScaler()
FeatureSelected_std = feature_scaler.fit_transform(FeatureSelected)

# Show (rows, columns) of the standardized array
FeatureSelected_std.shape

## Split Dataset into Training and Test Sets
- Use 'train_test_split' function
- It randomly samples the training and testing data according to the designated ratio.

In [None]:
# Each condition is taken to be exactly half of the rows
# (the original notes state 180 samples per condition — TODO confirm)
NoOfData = FeatureSelected_std.shape[0] // 2

# First half of the rows -> normal set, second half -> abnormal set (all columns kept)
# NOTE(review): assumes the rows are ordered normal first, then abnormal — verify against the data file
NormalSet, AbnormalSet = FeatureSelected_std[:NoOfData], FeatureSelected_std[NoOfData:]

NormalSet.shape, AbnormalSet.shape

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of each condition that is held out as test data
TestData_Ratio = 0.2

# The same settings are used for both conditions; the fixed seed keeps the split repeatable
split_options = {'test_size': TestData_Ratio, 'random_state': 777}
TrainData_Nor, TestData_Nor = train_test_split(NormalSet, **split_options)
TrainData_Abn, TestData_Abn = train_test_split(AbnormalSet, **split_options)

# Report (training shape, test shape) for the normal set, then for the abnormal set
for train_part, test_part in ((TrainData_Nor, TestData_Nor), (TrainData_Abn, TestData_Abn)):
    print(train_part.shape, test_part.shape)

## Label the data using np.zeros and np.ones
- In this tutorial, 0 refers to 'Normal' and 1 refers to 'Abnormal'.

In [None]:
# Build one label per row of each subset — 0: Normal, 1: Abnormal
TrainLabel_Nor, TestLabel_Nor = np.zeros(len(TrainData_Nor)), np.zeros(len(TestData_Nor))
TrainLabel_Abn, TestLabel_Abn = np.ones(len(TrainData_Abn)), np.ones(len(TestData_Abn))

# Report (training label shape, test label shape) per condition
for train_labels, test_labels in ((TrainLabel_Nor, TestLabel_Nor), (TrainLabel_Abn, TestLabel_Abn)):
    print(train_labels.shape, test_labels.shape)

## Prepare the final Data and Label for ML modeling


In [None]:
# Stack the normal rows on top of the abnormal rows (np.concatenate joins along
# the first axis by default)
TrainData = np.concatenate((TrainData_Nor, TrainData_Abn))
TestData  = np.concatenate((TestData_Nor, TestData_Abn))

# Labels are joined in the same normal-then-abnormal order so they stay aligned with the data
TrainLabel = np.concatenate((TrainLabel_Nor, TrainLabel_Abn))
TestLabel  = np.concatenate((TestLabel_Nor, TestLabel_Abn))

print(TrainData.shape,  TestData.shape)
print(TrainLabel.shape, TestLabel.shape)

.

.

.

.

.

.

.



## Grid search for Support Vector Machine (SVM) hyperparameters

### [Main hyperparameters of SVM]

1. **Kernel type**: The kernel is a function that transforms the input data into a higher-dimensional space, making it easier to find the optimal decision boundary. There are several kernel types available for SVM, but the most common ones are:

- Linear: $K(x, y) = x^T y$
- Polynomial: $K(x, y) = (\gamma \, x^T y + \mathrm{coef0})^{\mathrm{degree}}$
- Radial basis function (RBF): $K(x, y) = \exp(-\gamma \, \|x - y\|^2)$
- Sigmoid: $K(x, y) = \tanh(\gamma \, x^T y + \mathrm{coef0})$

.

2. **$C$ (Cost parameter)**: $C$ is a regularization parameter that controls the trade-off between maximizing the margin and minimizing the classification error on the training data. A smaller value of $C$ creates a wider margin but tolerates some misclassified training samples, which can be useful for noisy data. A larger value of $C$ penalizes misclassifications more heavily, pushing the SVM to fit the training samples more exactly, which leads to a narrower margin and a higher risk of overfitting. Choosing an appropriate value for $C$ is critical for the model's performance.

.

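3. **Gamma ($\gamma$)**: Gamma is the kernel coefficient used by the RBF, polynomial, and sigmoid kernels (the linear kernel does not use it). For the RBF kernel, it controls how far the influence of a single training sample reaches. A small gamma value gives each sample a far-reaching influence and results in a smoother, simpler decision boundary (risk of underfitting), while a large gamma value limits each sample's influence to its close neighborhood and results in a more complex, tightly fitted decision boundary (risk of overfitting). Selecting the right gamma value is essential for avoiding overfitting or underfitting.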

### Prepare lists of hyperparameters for grid search

In [None]:
# Candidate values for each SVM hyperparameter covered by the grid search
param_kernel = ['linear', 'poly', 'rbf', 'sigmoid']   # kernel type
param_C      = [0.01, 0.1, 1, 10, 100]                # regularization parameter
param_gamma  = [0.01, 0.1, 1, 10, 100]                # boundary shape parameter

# Size of the full grid = product of the lengths of the three candidate lists
NoOfCases = 1
for candidate_values in (param_kernel, param_C, param_gamma):
    NoOfCases *= len(candidate_values)
NoOfCases

In [None]:
# Create an empty dataframe to store the accuracy results (one row per grid case).
# The 'kernel' column is created with object dtype because it will hold kernel names
# (strings); writing strings into an all-float table triggers pandas' incompatible-dtype
# warning, which newer pandas versions turn into an error.
Accuracy_df = pd.DataFrame({
    'kernel'  : pd.Series([''] * NoOfCases, dtype=object),  # placeholder text, overwritten later
    'C'       : np.zeros(NoOfCases),
    'gamma'   : np.zeros(NoOfCases),
    'Accuracy': np.zeros(NoOfCases),
})
Accuracy_df

### Train the SVM models with different combinations of hyperparameters and save them

In [None]:
# Import necessary packages for the SVM grid search
import os
from itertools import product

from sklearn import svm
import joblib

# Folder that receives one saved model per hyperparameter combination
MODEL_DIR = '/content/drive/MyDrive/Colab Notebooks/SavedFiles/ML_Models/GridSearch_SVM/'
os.makedirs(MODEL_DIR, exist_ok=True)  # joblib.dump cannot write into a folder that does not exist

# One [kernel, C, gamma, accuracy] record per trained model. Collecting plain records and
# building the dataframe once avoids writing mixed string/float rows into a
# pre-allocated numeric table.
grid_records = []

# product() visits the grid in the same order as nested kernel -> C -> gamma loops
for temp_kernel, temp_c, temp_gamma in product(param_kernel, param_C, param_gamma):

    # Create and train a temporary SVM model with the current combination of hyperparameters
    tempsvmModel = svm.SVC(kernel=temp_kernel, C=temp_c, gamma=temp_gamma)
    tempsvmModel.fit(TrainData, TrainLabel)

    # NOTE(review): the models are ranked by their accuracy on the test set, so the
    # reported accuracy of the selected model is optimistic — consider a separate
    # validation split or cross-validation.
    tempAccuracy = tempsvmModel.score(TestData, TestLabel)

    # Save the temporary model to a file named after its hyperparameters
    formatted_C     = f"{temp_c:.2f}"      # C value with two decimal places
    formatted_gamma = f"{temp_gamma:.2f}"  # gamma value with two decimal places
    # NOTE(review): the '.plk' extension looks like a typo for '.pkl'; it is kept so
    # that file names stay compatible with models that were already saved.
    tempsvmModel_name = f'SVM_{temp_kernel}_C{formatted_C}_G{formatted_gamma}.plk'
    joblib.dump(tempsvmModel, MODEL_DIR + tempsvmModel_name)

    # Remember the performance (accuracy) of this model
    grid_records.append([temp_kernel, temp_c, temp_gamma, tempAccuracy])

# Build the results table in one step and display it
Accuracy_df = pd.DataFrame(grid_records, columns=['kernel', 'C', 'gamma', 'Accuracy'])
Accuracy_df

### Confirm the grid search results

In [None]:
# Rank all grid cases from highest to lowest accuracy and renumber the rows,
# so that row position 0 holds the top-ranked case
Accuracy_df_sorted = (
    Accuracy_df
    .sort_values(by='Accuracy', ascending=False)
    .reset_index(drop=True)
)

# Pull the top-ranked row apart by column position: kernel, C, gamma, accuracy
best_case     = Accuracy_df_sorted.iloc[0]
best_kernel   = best_case.iloc[0]
best_C        = best_case.iloc[1]
best_gamma    = best_case.iloc[2]
best_accuracy = best_case.iloc[3]

# Output the best case
best_details = "\nC     : %.2f\ngamma : %.2f\n\nAccuracy: %.2f" % (best_C, best_gamma, best_accuracy)
print("[Best case]\nKernel: " + best_kernel + best_details)

In [None]:
# Summarize accuracy per kernel type: mean and standard deviation over all
# grid cases that share the same kernel
accuracy_by_kernel = Accuracy_df.groupby(['kernel'])['Accuracy']
mean_accuracy_Kernel = accuracy_by_kernel.agg(['mean', 'std']).reset_index()
mean_accuracy_Kernel

In [None]:
# Summarize accuracy per C value: mean and standard deviation over all
# grid cases that share the same C
accuracy_by_C = Accuracy_df.groupby(['C'])['Accuracy']
mean_accuracy_C = accuracy_by_C.agg(['mean', 'std']).reset_index()
mean_accuracy_C

In [None]:
# Summarize accuracy per gamma value: mean and standard deviation over all
# grid cases that share the same gamma
accuracy_by_gamma = Accuracy_df.groupby(['gamma'])['Accuracy']
mean_accuracy_Gamma = accuracy_by_gamma.agg(['mean', 'std']).reset_index()
mean_accuracy_Gamma

### Visualize the performance comparison for the selected hyperparameter

In [None]:
# Set an index to select a hyperparameter
# 0: kernel // 1: C // 2: Gamma
idx = 0

# Pick the summary table that belongs to the selected hyperparameter.
# An explicit lookup table is used instead of exec() on a string-built variable name,
# so a wrong name fails with a clear KeyError and the data flow stays visible.
H_Param  = ['Kernel', 'C', 'Gamma']
Selected = H_Param[idx]
summary_tables = {
    'Kernel': mean_accuracy_Kernel,
    'C'     : mean_accuracy_C,
    'Gamma' : mean_accuracy_Gamma,
}
Result = summary_tables[Selected]

xLabel = Result.iloc[:,0]            # first column holds the hyperparameter values (tick labels)
x_pos = np.arange(Result.shape[0])   # one bar position per hyperparameter value
y_val = Result['mean']               # bar height: mean accuracy
y_err = Result['std']                # error bar: standard deviation of accuracy

# Draw a bar chart to compare the model performance (diagnostic accuracy) for each hyperparameter
fig, ax = plt.subplots(figsize=(10,5))

# Create a bar plot with error bars
ax.bar(x_pos, y_val, yerr=y_err, align='center', alpha=0.5, ecolor='black', capsize=10,
       color = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple'])
ax.set_ylabel('Accuracy (mean)', fontsize=15)
ax.set_title(f"Model performance comparison by '{Selected}'", fontsize=20)  # fixed typo: 'comparsion'
ax.set_xticks(x_pos)
ax.set_xticklabels(xLabel, fontsize=15)
ax.yaxis.grid()

plt.tight_layout()
plt.show()

.

.

.

.

.

# Task

- Load the 90th ranked SVM model and predict the output (Robotic spot-welding condition) for test data.

1. Refer to the provided codes if you need assistance.
2. Regard the 90th row of 'Accuracy_df_sorted' as the 90th ranked case (row positions start at 0, so this is `Accuracy_df_sorted.iloc[89]`).
3. Follow these steps:
    - Retrieve the kernel, C, and gamma values from the 'Accuracy_df_sorted'.
    - Format the C and gamma values with two decimal places.
    - Load the 90th ranked SVM model using the retrieved hyperparameters.
    - Predict the output (Robotic spot-welding condition) for the test data.

In [None]:









import joblib

# The 90th ranked case is the 90th row of the sorted table; row positions start at 0
RANK = 90
ranked_case = Accuracy_df_sorted.iloc[RANK - 1]

# Retrieve the kernel, C, and gamma values of that case
rank_kernel = ranked_case['kernel']
rank_C      = ranked_case['C']
rank_gamma  = ranked_case['gamma']

# Rebuild the file name exactly as it was written during the grid search
# (C and gamma with two decimal places, '.plk' extension)
rank_C_text     = f"{rank_C:.2f}"
rank_gamma_text = f"{rank_gamma:.2f}"
rank_model_name = f'SVM_{rank_kernel}_C{rank_C_text}_G{rank_gamma_text}.plk'

# Load the saved model. joblib files are pickle-based: only load files you created yourself.
RankedModel = joblib.load('/content/drive/MyDrive/Colab Notebooks/SavedFiles/ML_Models/GridSearch_SVM/' + rank_model_name)

# Predict the condition (0: Normal, 1: Abnormal) for the test data
Predicted = RankedModel.predict(TestData)

- You can confirm the performance of the model using 1) a confusion matrix and 2) evaluation metrics.


## [ Confusion Matrix ]
- A table that visualizes the performance of a classification model by displaying the number of true positive (TP), true negative (TN), false positive (FP), and false negative (FN) predictions. The rows represent the true class labels, while the columns represent the predicted class labels. In a binary classification problem:

    - TP: The number of instances where the model correctly predicted the positive class.
    - TN: The number of instances where the model correctly predicted the negative class.
    - FP: The number of instances where the model falsely predicted the positive class (actual negative instances).
    - FN: The number of instances where the model falsely predicted the negative class (actual positive instances).

In [None]:
# Plot the confusion matrix
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix (rows: true labels, columns: predicted labels)
cm = confusion_matrix(TestLabel, Predicted)

# Draw on an explicit Axes so labels and title are attached to this figure only
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap=plt.cm.Blues, cbar=False, square=True, ax=ax)
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
# 'Predicted' comes from the model loaded in the task cell, which is not necessarily
# the best-ranked one, so the title does not claim "Best"
ax.set_title("Confusion Matrix of the Selected SVM Model")
plt.show()

## [ Evaluation metrics (for classification) ]

1. $Accuracy$: The proportion of correctly classified instances out of the total instances. It measures the overall performance of a classification model.

    - $Accuracy: (TP + TN) / (TP + TN + FP + FN)$

2. $Precision$: The proportion of true positive instances among the instances predicted as positive. It measures how well the model correctly identifies positive instances.

    - $Precision: TP / (TP + FP)$

3. $Recall$: The proportion of true positive instances among the actual positive instances. It measures the ability of the model to find all the positive instances.

    - $Recall: TP / (TP + FN)$

4. $F1 Score$: The harmonic mean of precision and recall. It provides a single score that balances both precision and recall, which is especially useful when dealing with imbalanced datasets.

    - $F1 Score: 2 * (Precision * Recall) / (Precision + Recall)$

In [None]:
from sklearn import metrics

# Calculate the evaluation metrics.
# zero_division=0 makes precision/recall/F1 an explicit 0 (without a warning) when the
# model predicts only one class, which can happen for a low-ranked model.
accuracy  = metrics.accuracy_score(TestLabel, Predicted)
precision = metrics.precision_score(TestLabel, Predicted, zero_division=0)
recall    = metrics.recall_score(TestLabel, Predicted, zero_division=0)
f1_score  = metrics.f1_score(TestLabel, Predicted, zero_division=0)

# Print the evaluation metrics. The heading says "Selected" because 'Predicted' comes
# from the model loaded in the task cell, not necessarily the best-ranked one.
print("Selected SVM Model Evaluation:\n")
print(f"Accuracy : {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall   : {recall:.2f}")
print(f"F1 Score : {f1_score:.2f}")