In [1]:
# Import pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import time

# Import modules from Scikit-learn
from sklearn.svm import SVC                            # Import SVM model using guassian
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics                            # import metrics modules for accuracy calculation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [2]:
# Read data
PATH = "../../my_data/identification-dataset/my_custom_data/anblock-error-dataset.csv"
df = pd.read_csv(PATH)

# Drop uncomplete rows
df.dropna(inplace=True)

In [3]:
# Set training data
train_df = df.drop('material', axis=1)

# Extracted features 
X = train_df.drop('encoded_material', axis=1)
y = train_df['encoded_material'] # Labels

In [4]:
# Split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test

In [5]:
kernels = ['rbf', 'linear', 'poly', 'sigmoid']

In [6]:
# Create an empty list to store results
def trainer(kernels):
    results = []
    for k in kernels:
        # Create a pipeline object for our model SVM using rbf kernel
        pipe_SVM = make_pipeline(StandardScaler(),
                         SVC(kernel=k,
                         cache_size=2000,       # Default is 200 MB.
                         verbose=True)
                         )
        
        # Measure training time
        start_train = time.time()
        pipe_SVM.fit(X_train, y_train)
        end_train = time.time()
        train_time_per_sample = (end_train - start_train) / len(X_train)

        # Measure test time
        start_test = time.time()
        y_pred = pipe_SVM.predict(X_test)
        end_test = time.time()
        test_time_per_sample = (end_test - start_test) / len(X_test)
        
        # Evaluate the pipeline and store the results
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred, average="macro")
        recall = metrics.recall_score(y_test, y_pred, average="macro")
        f1 = metrics.recall_score(y_test, y_pred, average="macro")


        results.append({
            'kernel': k,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'train_time_per_sample': train_time_per_sample,
            'test_time_per_sample': test_time_per_sample 
        })
    return results

In [7]:
results = []
for x in np.arange(1): # should be 10
    r = trainer(kernels)
    results.append(r)

[LibSVM]....
*.*
optimization finished, #iter = 5296
obj = -5861.644221, rho = 1.856063
nSV = 6647, nBSV = 6521
*
optimization finished, #iter = 536
obj = -26.840079, rho = 0.478772
nSV = 205, nBSV = 5
.....
*.*
optimization finished, #iter = 6436
obj = -6575.856860, rho = 1.439287
nSV = 7700, nBSV = 7569
*
optimization finished, #iter = 995
obj = -676.825138, rho = 0.163901
nSV = 877, nBSV = 780
..
*.*
optimization finished, #iter = 3784
obj = -2968.781971, rho = 0.433926
nSV = 3648, nBSV = 3514
*
optimization finished, #iter = 559
obj = -31.220403, rho = 0.360993
nSV = 218, nBSV = 5
....
*.*
optimization finished, #iter = 5548
obj = -5981.670707, rho = -0.704682
nSV = 6597, nBSV = 6473
*
optimization finished, #iter = 895
obj = -506.459409, rho = -0.160701
nSV = 698, nBSV = 590
..
*.*
optimization finished, #iter = 3496
obj = -2820.525942, rho = -0.428197
nSV = 3416, nBSV = 3267
*
optimization finished, #iter = 585
obj = -32.866848, rho = -0.397232
nSV = 209, nBSV = 8
*
optimization 

In [8]:
# show one of 10 samples
results_df = pd.DataFrame(results[0])
results_df

Unnamed: 0,kernel,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,rbf,0.806898,0.804855,0.806395,0.806395,0.000969,0.002286
1,linear,0.856783,0.855553,0.856039,0.856039,0.002127,0.000734
2,poly,0.708303,0.736,0.707387,0.707387,0.001615,0.000872
3,sigmoid,0.527966,0.551219,0.52752,0.52752,0.001231,0.001076


In [9]:
# show one of 10 samples
results_df = pd.DataFrame(results[0])
results_df

Unnamed: 0,kernel,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,rbf,0.806898,0.804855,0.806395,0.806395,0.000969,0.002286
1,linear,0.856783,0.855553,0.856039,0.856039,0.002127,0.000734
2,poly,0.708303,0.736,0.707387,0.707387,0.001615,0.000872
3,sigmoid,0.527966,0.551219,0.52752,0.52752,0.001231,0.001076
