In [37]:
# Import pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import time

# Import modules from Scikit-learn
from sklearn.svm import SVC                            # Import SVM model using guassian
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics                            # import metrics modules for accuracy calculation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from tqdm import tqdm
import joblib

In [25]:
# Read data
PATH = "../../my_data/identification-dataset/my_custom_data/anblock-error-dataset.csv"
df = pd.read_csv(PATH)

# Drop uncomplete rows
df.dropna(inplace=True)

In [26]:
# Set training data
train_df = df.drop('material', axis=1)

# Extracted features 
X = train_df.drop('encoded_material', axis=1)
y = train_df['encoded_material'] # Labels

In [27]:
# Split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test

In [28]:
# c values
# Number of `C` values to consider
num_c_values = 5

# Generate logarithmically spaced `C` values
c_values = np.logspace(-3, 3, num=num_c_values)
c_values

array([1.00000000e-03, 3.16227766e-02, 1.00000000e+00, 3.16227766e+01,
       1.00000000e+03])

In [29]:
# Create an empty list to store results
def trainer(c_values):
    # Randomly split dataset into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test


    results = []
    for c in c_values:
        # Create a pipeline object for our model SVM using rbf kernel
        pipe_SVM = make_pipeline(StandardScaler(),
                                 SVC(kernel='rbf',
                                     C=c, 
                                     cache_size=2000,
                                     verbose=0)
                                )
        
        # Measure training time
        start_train = time.time()
        pipe_SVM.fit(X_train, y_train)
        end_train = time.time()
        train_time_per_sample = (end_train - start_train) / len(X_train)

        # Measure test time
        start_test = time.time()
        y_pred = pipe_SVM.predict(X_test)
        end_test = time.time()
        test_time_per_sample = (end_test - start_test) / len(X_test)
        
        # Evaluate the pipeline and store the results
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred, average="macro")
        recall = metrics.recall_score(y_test, y_pred, average="macro")
        f1 = metrics.recall_score(y_test, y_pred, average="macro")


        results.append({
            'c_value': c,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'train_time_per_sample': train_time_per_sample,
            'test_time_per_sample': test_time_per_sample 
        })
    return results

In [30]:
results = []
for x in tqdm(np.arange(10)): # should be 10
    r = trainer(c_values)
    results.append(r)

100%|██████████| 10/10 [1:35:01<00:00, 570.13s/it]


In [31]:
# Flatten the list of lists
flattened_results = [item for sublist in results for item in sublist]

# Convert to DataFrame
flattend_results_df = pd.DataFrame(flattened_results)

In [32]:
mean_df = flattend_results_df.groupby('c_value').mean().reset_index()
mean_df

Unnamed: 0,c_value,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,0.001,0.569251,0.662765,0.56662,0.56662,0.003365,0.004155
1,0.031623,0.729103,0.724979,0.72946,0.72946,0.001474,0.00294
2,1.0,0.803387,0.801099,0.803977,0.803977,0.000871,0.002271
3,31.622777,0.933223,0.933585,0.933472,0.933472,0.000927,0.001532
4,1000.0,0.979822,0.979834,0.979882,0.979882,0.002052,0.000744


In [33]:
# Round the columns to the 4th decimal place
mean_df['accuracy'] = mean_df['accuracy'].round(2)
mean_df['precision'] = mean_df['precision'].round(2)
mean_df['recall'] = mean_df['recall'].round(2)
mean_df['f1'] = mean_df['f1'].round(2)

# Scale from seconds to milliseconds
mean_df['train_time_per_sample'] = mean_df['train_time_per_sample'] * 1000 # to millisec. 
mean_df['train_time_per_sample'] = mean_df['train_time_per_sample'].round(2)

mean_df['test_time_per_sample'] = mean_df['test_time_per_sample'] * 1000 # to millisec.
mean_df['test_time_per_sample'] = mean_df['test_time_per_sample'].round(2)

mean_df

Unnamed: 0,c_value,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,0.001,0.57,0.66,0.57,0.57,3.37,4.16
1,0.031623,0.73,0.72,0.73,0.73,1.47,2.94
2,1.0,0.8,0.8,0.8,0.8,0.87,2.27
3,31.622777,0.93,0.93,0.93,0.93,0.93,1.53
4,1000.0,0.98,0.98,0.98,0.98,2.05,0.74


In [34]:
std_df = flattend_results_df.groupby('c_value').std().reset_index()
std_df

Unnamed: 0,c_value,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,0.001,0.014097,0.008966,0.013027,0.013027,0.000186,9.3e-05
1,0.031623,0.002239,0.002479,0.001734,0.001734,0.000114,9.1e-05
2,1.0,0.003419,0.003426,0.002993,0.002993,4.7e-05,7e-05
3,31.622777,0.002736,0.002805,0.002649,0.002649,4.9e-05,5.7e-05
4,1000.0,0.002247,0.0022,0.002195,0.002195,0.000218,7e-05


In [35]:
# Round the columns to the 4th decimal place
std_df['accuracy'] = std_df['accuracy'].round(4)
std_df['precision'] = std_df['precision'].round(4)
std_df['recall'] = std_df['recall'].round(4)
std_df['f1'] = std_df['f1'].round(4)

# Scale from seconds to milliseconds
std_df['train_time_per_sample'] = std_df['train_time_per_sample'] * 1000 # to millisec. 
std_df['train_time_per_sample'] = std_df['train_time_per_sample'].round(2)

std_df['test_time_per_sample'] = std_df['test_time_per_sample'] * 1000 # to millisec.
std_df['test_time_per_sample'] = std_df['test_time_per_sample'].round(2)

std_df

Unnamed: 0,c_value,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,0.001,0.0141,0.009,0.013,0.013,0.19,0.09
1,0.031623,0.0022,0.0025,0.0017,0.0017,0.11,0.09
2,1.0,0.0034,0.0034,0.003,0.003,0.05,0.07
3,31.622777,0.0027,0.0028,0.0026,0.0026,0.05,0.06
4,1000.0,0.0022,0.0022,0.0022,0.0022,0.22,0.07


In [36]:
# Convert DataFrame to LaTeX table
latex_table = mean_df.to_latex(index=False)
print(latex_table)

  latex_table = mean_df.to_latex(index=False)


\begin{tabular}{rrrrrrr}
\toprule
    c\_value &  accuracy &  precision &  recall &   f1 &  train\_time\_per\_sample &  test\_time\_per\_sample \\
\midrule
   0.001000 &      0.57 &       0.66 &    0.57 & 0.57 &                   3.37 &                  4.16 \\
   0.031623 &      0.73 &       0.72 &    0.73 & 0.73 &                   1.47 &                  2.94 \\
   1.000000 &      0.80 &       0.80 &    0.80 & 0.80 &                   0.87 &                  2.27 \\
  31.622777 &      0.93 &       0.93 &    0.93 & 0.93 &                   0.93 &                  1.53 \\
1000.000000 &      0.98 &       0.98 &    0.98 & 0.98 &                   2.05 &                  0.74 \\
\bottomrule
\end{tabular}



In [49]:
# List of columns to combine mean and standard deviation
columns_to_combine = ['accuracy', 'precision', 'recall', 'train_time_per_sample', 'test_time_per_sample']  # You can modify this list as per your requirement

combined_df = pd.DataFrame()
combined_df['c_value'] = mean_df['c_value']
# Combine mean and standard deviation for specified columns
for column in columns_to_combine:
    combined_df[column] = mean_df[column].astype(str) + '$\pm$' + std_df[column].astype(str)

combined_df

Unnamed: 0,c_value,accuracy,precision,recall,train_time_per_sample,test_time_per_sample
0,0.001,0.57$\pm$0.0141,0.66$\pm$0.009,0.57$\pm$0.013,3.37$\pm$0.19,4.16$\pm$0.09
1,0.031623,0.73$\pm$0.0022,0.72$\pm$0.0025,0.73$\pm$0.0017,1.47$\pm$0.11,2.94$\pm$0.09
2,1.0,0.8$\pm$0.0034,0.8$\pm$0.0034,0.8$\pm$0.003,0.87$\pm$0.05,2.27$\pm$0.07
3,31.622777,0.93$\pm$0.0027,0.93$\pm$0.0028,0.93$\pm$0.0026,0.93$\pm$0.05,1.53$\pm$0.06
4,1000.0,0.98$\pm$0.0022,0.98$\pm$0.0022,0.98$\pm$0.0022,2.05$\pm$0.22,0.74$\pm$0.07


In [50]:
latex_table_2 = combined_df.to_latex(index=False, escape=False)
print(latex_table_2)

\begin{tabular}{rlllll}
\toprule
    c_value &        accuracy &       precision &          recall & train_time_per_sample & test_time_per_sample \\
\midrule
   0.001000 & 0.57$\pm$0.0141 &  0.66$\pm$0.009 &  0.57$\pm$0.013 &         3.37$\pm$0.19 &        4.16$\pm$0.09 \\
   0.031623 & 0.73$\pm$0.0022 & 0.72$\pm$0.0025 & 0.73$\pm$0.0017 &         1.47$\pm$0.11 &        2.94$\pm$0.09 \\
   1.000000 &  0.8$\pm$0.0034 &  0.8$\pm$0.0034 &   0.8$\pm$0.003 &         0.87$\pm$0.05 &        2.27$\pm$0.07 \\
  31.622777 & 0.93$\pm$0.0027 & 0.93$\pm$0.0028 & 0.93$\pm$0.0026 &         0.93$\pm$0.05 &        1.53$\pm$0.06 \\
1000.000000 & 0.98$\pm$0.0022 & 0.98$\pm$0.0022 & 0.98$\pm$0.0022 &         2.05$\pm$0.22 &        0.74$\pm$0.07 \\
\bottomrule
\end{tabular}



  latex_table_2 = combined_df.to_latex(index=False, escape=False)
