In [1]:
# Import pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import time

# Import modules from Scikit-learn
from sklearn.svm import SVC                            # Import SVM model using guassian
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics                            # import metrics modules for accuracy calculation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from tqdm import tqdm

In [2]:
# Read data
PATH = "../../my_data/identification-dataset/my_custom_data/anblock-error-dataset.csv"
df = pd.read_csv(PATH)

# Drop uncomplete rows
df.dropna(inplace=True)

In [3]:
# Set training data
train_df = df.drop('material', axis=1)

# Extracted features 
X = train_df.drop('encoded_material', axis=1)
y = train_df['encoded_material'] # Labels

In [4]:
# Split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test

- **Radial Basis Function (RBF) Kernel**:
  - Most commonly used kernel, also known as the Gaussian kernel.
  - Measures similarity using a Gaussian-like function in a high-dimensional space.
  - Suitable for various data types due to its versatility.

- **Linear Kernel**:
  - Simplest kernel; computes the dot product between samples in the original space.
  - Works well when data is linearly separable.

- **Polynomial Kernel**:
  - Computes similarity based on polynomial function degrees.
  - Allows for more complex decision boundaries than the linear kernel.
  - Controlled by the `degree` parameter.

- **Sigmoid Kernel**:
  - Based on hyperbolic tangent functions.
  - Useful for non-linear data; works well when data distribution is uncertain.


In [5]:
kernels = ['rbf', 'linear', 'poly', 'sigmoid']

In [6]:
# Create an empty list to store results
def trainer(kernels):
    results = []
    for k in kernels:
        # Create a pipeline object for our model SVM using rbf kernel
        pipe_SVM = make_pipeline(StandardScaler(),
                         SVC(kernel=k,
                         cache_size=2000,       # Default is 200 MB.
                         verbose=0)
                         )
        
        # Measure training time
        start_train = time.time()
        pipe_SVM.fit(X_train, y_train)
        end_train = time.time()
        train_time_per_sample = (end_train - start_train) / len(X_train)

        # Measure test time
        start_test = time.time()
        y_pred = pipe_SVM.predict(X_test)
        end_test = time.time()
        test_time_per_sample = (end_test - start_test) / len(X_test)
        
        # Evaluate the pipeline and store the results
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred, average="macro")
        recall = metrics.recall_score(y_test, y_pred, average="macro")
        f1 = metrics.recall_score(y_test, y_pred, average="macro")


        results.append({
            'kernel': k,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'train_time_per_sample': train_time_per_sample,
            'test_time_per_sample': test_time_per_sample 
        })
    return results

In [7]:
results = []
for x in tqdm(np.arange(1)): # should be 10
    r = trainer(kernels)
    results.append(r)

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [05:11<00:00, 311.97s/it]


In [8]:
# show one of 10 samples
results_df = pd.DataFrame(results[0])
results_df

Unnamed: 0,kernel,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,rbf,0.806898,0.805651,0.807083,0.807083,0.000893,0.002159
1,linear,0.857679,0.856908,0.857792,0.857792,0.002077,0.000663
2,poly,0.709703,0.735706,0.709111,0.709111,0.001525,0.000786
3,sigmoid,0.527798,0.552461,0.527576,0.527576,0.001038,0.000944


In [9]:
# show one of 10 samples
results_df = pd.DataFrame(results[0])
results_df

Unnamed: 0,kernel,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,rbf,0.806898,0.805651,0.807083,0.807083,0.000893,0.002159
1,linear,0.857679,0.856908,0.857792,0.857792,0.002077,0.000663
2,poly,0.709703,0.735706,0.709111,0.709111,0.001525,0.000786
3,sigmoid,0.527798,0.552461,0.527576,0.527576,0.001038,0.000944


In [10]:
markdown_table = results_df.to_markdown(index=False)

In [11]:
markdown_table

'| kernel   |   accuracy |   precision |   recall |       f1 |   train_time_per_sample |   test_time_per_sample |\n|:---------|-----------:|------------:|---------:|---------:|------------------------:|-----------------------:|\n| rbf      |   0.806898 |    0.805651 | 0.807083 | 0.807083 |             0.000893276 |            0.00215888  |\n| linear   |   0.857679 |    0.856908 | 0.857792 | 0.857792 |             0.00207662  |            0.000662778 |\n| poly     |   0.709703 |    0.735706 | 0.709111 | 0.709111 |             0.0015255   |            0.000785909 |\n| sigmoid  |   0.527798 |    0.552461 | 0.527576 | 0.527576 |             0.00103787  |            0.000943808 |'

| kernel   |   accuracy |   precision |   recall |       f1 |   train_time_per_sample |   test_time_per_sample |
|:---------|-----------:|------------:|---------:|---------:|------------------------:|-----------------------:|
| rbf      |   0.806898 |    0.804855 | 0.806395 | 0.806395 |             0.000968527 |            0.00228607  |
| linear   |   0.856783 |    0.855553 | 0.856039 | 0.856039 |             0.00212661  |            0.000734085 |
| poly     |   0.708303 |    0.736    | 0.707387 | 0.707387 |             0.00161534  |            0.00087245  |
| sigmoid  |   0.527966 |    0.551219 | 0.52752  | 0.52752  |             0.00123145  |            0.00107637  |

In [12]:
# Convert DataFrame to LaTeX table
latex_table = results_df.to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrrrrr}
\toprule
 kernel &  accuracy &  precision &   recall &       f1 &  train\_time\_per\_sample &  test\_time\_per\_sample \\
\midrule
    rbf &  0.806898 &   0.805651 & 0.807083 & 0.807083 &               0.000893 &              0.002159 \\
 linear &  0.857679 &   0.856908 & 0.857792 & 0.857792 &               0.002077 &              0.000663 \\
   poly &  0.709703 &   0.735706 & 0.709111 & 0.709111 &               0.001525 &              0.000786 \\
sigmoid &  0.527798 &   0.552461 & 0.527576 & 0.527576 &               0.001038 &              0.000944 \\
\bottomrule
\end{tabular}



  latex_table = results_df.to_latex(index=False)
