In [1]:
# Import pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import time

# Import modules from Scikit-learn
from sklearn.svm import SVC                            # Import SVM model using guassian
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics                            # import metrics modules for accuracy calculation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from tqdm import tqdm

In [2]:
# Read data
PATH = "../../my_data/identification-dataset/my_custom_data/anblock-error-dataset.csv"
df = pd.read_csv(PATH)

# Drop uncomplete rows
df.dropna(inplace=True)

In [3]:
# Set training data
train_df = df.drop('material', axis=1)

# Extracted features 
X = train_df.drop('encoded_material', axis=1)
y = train_df['encoded_material'] # Labels

In [4]:
# Split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test

In [5]:
# c values
# Number of `C` values to consider
num_c_values = 5

# Generate logarithmically spaced `C` values
c_values = np.logspace(-3, 3, num=num_c_values)
c_values

array([1.00000000e-03, 3.16227766e-02, 1.00000000e+00, 3.16227766e+01,
       1.00000000e+03])

In [6]:
# Create an empty list to store results
def trainer(c_values):
    results = []
    for c in c_values:
        # Create a pipeline object for our model SVM using rbf kernel
        pipe_SVM = make_pipeline(StandardScaler(),
                                 SVC(kernel='rbf',
                                     C=c, 
                                     cache_size=2000,
                                     verbose=0)
                                )
        
        # Measure training time
        start_train = time.time()
        pipe_SVM.fit(X_train, y_train)
        end_train = time.time()
        train_time_per_sample = (end_train - start_train) / len(X_train)

        # Measure test time
        start_test = time.time()
        y_pred = pipe_SVM.predict(X_test)
        end_test = time.time()
        test_time_per_sample = (end_test - start_test) / len(X_test)
        
        # Evaluate the pipeline and store the results
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred, average="macro")
        recall = metrics.recall_score(y_test, y_pred, average="macro")
        f1 = metrics.recall_score(y_test, y_pred, average="macro")


        results.append({
            'c_value': c,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'train_time_per_sample': train_time_per_sample,
            'test_time_per_sample': test_time_per_sample 
        })
    return results

In [7]:
results = []
for x in tqdm(np.arange(10)): # should be 10
    r = trainer(c_values)
    results.append(r)

100%|██████████| 10/10 [1:35:03<00:00, 570.37s/it]


In [12]:
# Flatten the list of lists
flattened_results = [item for sublist in results for item in sublist]

# Convert to DataFrame
flattend_results_df = pd.DataFrame(flattened_results)

In [18]:
mean_df = flattend_results_df.groupby('c_value').mean().reset_index()
mean_df

Unnamed: 0,c_value,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,0.001,0.575612,0.673365,0.573336,0.573336,0.003286,0.004501
1,0.031623,0.731034,0.726448,0.731192,0.731192,0.001393,0.003173
2,1.0,0.805498,0.802608,0.805906,0.805906,0.000854,0.002445
3,31.622777,0.936398,0.936317,0.93648,0.93648,0.000917,0.001584
4,1000.0,0.980012,0.97995,0.980009,0.980009,0.001894,0.000756


In [19]:
# Round the columns to the 4th decimal place
mean_df['accuracy'] = mean_df['accuracy'].round(2)
mean_df['precision'] = mean_df['precision'].round(2)
mean_df['recall'] = mean_df['recall'].round(2)
mean_df['f1'] = mean_df['f1'].round(2)

# Scale from seconds to milliseconds
mean_df['train_time_per_sample'] = mean_df['train_time_per_sample'] * 1000 # to millisec. 
mean_df['train_time_per_sample'] = mean_df['train_time_per_sample'].round(2)

mean_df['test_time_per_sample'] = mean_df['test_time_per_sample'] * 1000 # to millisec.
mean_df['test_time_per_sample'] = mean_df['test_time_per_sample'].round(2)

mean_df

Unnamed: 0,c_value,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,0.001,0.58,0.67,0.57,0.57,3.29,4.5
1,0.031623,0.73,0.73,0.73,0.73,1.39,3.17
2,1.0,0.81,0.8,0.81,0.81,0.85,2.44
3,31.622777,0.94,0.94,0.94,0.94,0.92,1.58
4,1000.0,0.98,0.98,0.98,0.98,1.89,0.76


In [21]:
std_df = flattend_results_df.groupby('c_value').std().reset_index()
std_df

Unnamed: 0,c_value,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,0.001,0.0,0.0,0.0,0.0,0.000188,0.000392
1,0.031623,0.0,0.0,0.0,0.0,7.2e-05,0.000201
2,1.0,0.0,0.0,0.0,0.0,4.3e-05,0.000221
3,31.622777,0.0,0.0,0.0,0.0,4.5e-05,7.5e-05
4,1000.0,0.0,0.0,0.0,0.0,6.3e-05,5.2e-05


In [22]:
# Round the columns to the 4th decimal place
std_df['accuracy'] = std_df['accuracy'].round(4)
std_df['precision'] = std_df['precision'].round(4)
std_df['recall'] = std_df['recall'].round(4)
std_df['f1'] = std_df['f1'].round(4)

# Scale from seconds to milliseconds
std_df['train_time_per_sample'] = std_df['train_time_per_sample'] * 1000 # to millisec. 
std_df['train_time_per_sample'] = std_df['train_time_per_sample'].round(2)

std_df['test_time_per_sample'] = std_df['test_time_per_sample'] * 1000 # to millisec.
std_df['test_time_per_sample'] = std_df['test_time_per_sample'].round(2)

std_df

Unnamed: 0,c_value,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,0.001,0.0,0.0,0.0,0.0,0.19,0.39
1,0.031623,0.0,0.0,0.0,0.0,0.07,0.2
2,1.0,0.0,0.0,0.0,0.0,0.04,0.22
3,31.622777,0.0,0.0,0.0,0.0,0.04,0.08
4,1000.0,0.0,0.0,0.0,0.0,0.06,0.05


In [23]:
# Convert DataFrame to LaTeX table
latex_table = mean_df.to_latex(index=False)
print(latex_table)

\begin{tabular}{rrrrrrr}
\toprule
    c\_value &  accuracy &  precision &  recall &   f1 &  train\_time\_per\_sample &  test\_time\_per\_sample \\
\midrule
   0.001000 &      0.58 &       0.67 &    0.57 & 0.57 &                   3.29 &                  4.50 \\
   0.031623 &      0.73 &       0.73 &    0.73 & 0.73 &                   1.39 &                  3.17 \\
   1.000000 &      0.81 &       0.80 &    0.81 & 0.81 &                   0.85 &                  2.44 \\
  31.622777 &      0.94 &       0.94 &    0.94 & 0.94 &                   0.92 &                  1.58 \\
1000.000000 &      0.98 &       0.98 &    0.98 & 0.98 &                   1.89 &                  0.76 \\
\bottomrule
\end{tabular}



  latex_table = mean_df.to_latex(index=False)
