In [1]:
# Import pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import time

# Import modules from Scikit-learn
from sklearn.svm import SVC                            # Import SVM model using guassian
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics                            # import metrics modules for accuracy calculation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from tqdm import tqdm

In [2]:
# Read data
PATH = "../../my_data/identification-dataset/my_custom_data/anblock-error-dataset.csv"
df = pd.read_csv(PATH)

# Drop uncomplete rows
df.dropna(inplace=True)

In [3]:
# Set training data
train_df = df.drop('material', axis=1)

# Extracted features 
X = train_df.drop('encoded_material', axis=1)
y = train_df['encoded_material'] # Labels

In [4]:
# Split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test

- **Radial Basis Function (RBF) Kernel**:
  - Most commonly used kernel, also known as the Gaussian kernel.
  - Measures similarity using a Gaussian-like function in a high-dimensional space.
  - Suitable for various data types due to its versatility.

- **Linear Kernel**:
  - Simplest kernel; computes the dot product between samples in the original space.
  - Works well when data is linearly separable.

- **Polynomial Kernel**:
  - Computes similarity based on polynomial function degrees.
  - Allows for more complex decision boundaries than the linear kernel.
  - Controlled by the `degree` parameter.

- **Sigmoid Kernel**:
  - Based on hyperbolic tangent functions.
  - Useful for non-linear data; works well when data distribution is uncertain.


In [5]:
kernels = ['rbf', 'linear', 'poly', 'sigmoid']

In [6]:
# Create an empty list to store results
def trainer(k, X, y):
    # Randomly split dataset into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test
    
    results = []
    for k in kernels:
        # Create a pipeline object for our model SVM using rbf kernel
        pipe_SVM = make_pipeline(StandardScaler(),
                         SVC(kernel=k,
                         cache_size=2000,       # Default is 200 MB.
                         verbose=0)
                         )
        
        # Measure training time
        start_train = time.time()
        pipe_SVM.fit(X_train, y_train)
        end_train = time.time()
        train_time_per_sample = (end_train - start_train) / len(X_train)

        # Measure test time
        start_test = time.time()
        y_pred = pipe_SVM.predict(X_test)
        end_test = time.time()
        test_time_per_sample = (end_test - start_test) / len(X_test)
        
        # Evaluate the pipeline and store the results
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred, average="macro")
        recall = metrics.recall_score(y_test, y_pred, average="macro")
        f1 = metrics.recall_score(y_test, y_pred, average="macro")


        results.append({
            'kernel': k,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'train_time_per_sample': train_time_per_sample,
            'test_time_per_sample': test_time_per_sample 
        })
    return results

In [7]:
results = []
for x in tqdm(np.arange(10)): # should be 10
    r = trainer(kernels, X, y)
    results.append(r)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [53:06<00:00, 318.60s/it]


In [8]:
# Flatten the list of lists
flattened_results = [item for sublist in results for item in sublist]

# Convert to DataFrame
flattend_results_df = pd.DataFrame(flattened_results)

In [20]:
mean_df = flattend_results_df.groupby('kernel').mean().reset_index()
mean_df

Unnamed: 0,kernel,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,linear,0.853317,0.852342,0.853542,0.853542,0.002092,0.000692
1,poly,0.707178,0.731541,0.706453,0.706453,0.001547,0.000822
2,rbf,0.803253,0.801155,0.803638,0.803638,0.000849,0.002195
3,sigmoid,0.529774,0.543095,0.529517,0.529517,0.001145,0.000981


In [17]:
# Round the columns to the 4th decimal place
mean_df['accuracy'] = mean_df['accuracy'].round(2)
mean_df['precision'] = mean_df['precision'].round(2)
mean_df['recall'] = mean_df['recall'].round(2)
mean_df['f1'] = mean_df['f1'].round(2)

# Scale from seconds to milliseconds
mean_df['train_time_per_sample'] = mean_df['train_time_per_sample'] * 1000 # to microsec. 
mean_df['train_time_per_sample'] = mean_df['train_time_per_sample'].round(2)

mean_df['test_time_per_sample'] = mean_df['test_time_per_sample'] * 1000 # to microsec.
mean_df['test_time_per_sample'] = mean_df['test_time_per_sample'].round(2)

mean_df

Unnamed: 0,kernel,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,linear,0.85,0.85,0.85,0.85,2.09,0.69
1,poly,0.71,0.73,0.71,0.71,1.55,0.82
2,rbf,0.8,0.8,0.8,0.8,0.85,2.19
3,sigmoid,0.53,0.54,0.53,0.53,1.15,0.98


In [21]:
std_df = flattend_results_df.groupby('kernel').std().reset_index()
std_df

Unnamed: 0,kernel,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,linear,0.002919,0.002848,0.002709,0.002709,7.4e-05,3.1e-05
1,poly,0.003054,0.003845,0.002669,0.002669,0.000101,3.4e-05
2,rbf,0.003391,0.003133,0.002892,0.002892,4.6e-05,5.3e-05
3,sigmoid,0.006658,0.01232,0.005837,0.005837,0.000135,5.9e-05


In [22]:
# Round the columns to the 4th decimal place
std_df['accuracy'] = std_df['accuracy'].round(4)
std_df['precision'] = std_df['precision'].round(4)
std_df['recall'] = std_df['recall'].round(4)
std_df['f1'] = std_df['f1'].round(4)

# Scale from seconds to milliseconds
std_df['train_time_per_sample'] = std_df['train_time_per_sample'] * 1000 # to microsec. 
std_df['train_time_per_sample'] = std_df['train_time_per_sample'].round(2)

std_df['test_time_per_sample'] = std_df['test_time_per_sample'] * 1000 # to microsec.
std_df['test_time_per_sample'] = std_df['test_time_per_sample'].round(2)

std_df

Unnamed: 0,kernel,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,linear,0.0029,0.0028,0.0027,0.0027,0.07,0.03
1,poly,0.0031,0.0038,0.0027,0.0027,0.1,0.03
2,rbf,0.0034,0.0031,0.0029,0.0029,0.05,0.05
3,sigmoid,0.0067,0.0123,0.0058,0.0058,0.13,0.06


In [23]:
markdown_table = mean_df.to_markdown(index=False)
markdown_table

'| kernel   |   accuracy |   precision |   recall |       f1 |   train_time_per_sample |   test_time_per_sample |\n|:---------|-----------:|------------:|---------:|---------:|------------------------:|-----------------------:|\n| linear   |   0.853317 |    0.852342 | 0.853542 | 0.853542 |             0.00209208  |            0.000691631 |\n| poly     |   0.707178 |    0.731541 | 0.706453 | 0.706453 |             0.00154747  |            0.000821536 |\n| rbf      |   0.803253 |    0.801155 | 0.803638 | 0.803638 |             0.000848904 |            0.00219455  |\n| sigmoid  |   0.529774 |    0.543095 | 0.529517 | 0.529517 |             0.00114507  |            0.00098072  |'

In [24]:
# Convert DataFrame to LaTeX table
latex_table = mean_df.to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrrrrr}
\toprule
 kernel &  accuracy &  precision &   recall &       f1 &  train\_time\_per\_sample &  test\_time\_per\_sample \\
\midrule
 linear &  0.853317 &   0.852342 & 0.853542 & 0.853542 &               0.002092 &              0.000692 \\
   poly &  0.707178 &   0.731541 & 0.706453 & 0.706453 &               0.001547 &              0.000822 \\
    rbf &  0.803253 &   0.801155 & 0.803638 & 0.803638 &               0.000849 &              0.002195 \\
sigmoid &  0.529774 &   0.543095 & 0.529517 & 0.529517 &               0.001145 &              0.000981 \\
\bottomrule
\end{tabular}



  latex_table = mean_df.to_latex(index=False)
