# SVM

In [1]:
import numpy as np
import pandas as pd
import os
import dataloader_functions.dataloader_fulldata_AmountCodeSmells as dataloader
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from joblib import dump, load

In [2]:
# define necessary variables for data setup
variable_list = [
    'CODE_SMELLS',
    'CLASSES',
    'FILES',
    'LINES',
    'NCLOC',
    'PACKAGE',
    'STATEMENTS',
    'FUNCTIONS',
    'COMMENT_LINES',
    'COMPLEXITY',
    'CLASS_COMPLEXITY',
    'FUNCTION_COMPLEXITY',
    'COGNITIVE_COMPLEXITY',
    'LINES_TO_COVER',
    'UNCOVERED_LINES',
    'DUPLICATED_LINES',
    'DUPLICATED_BLOCKS',
    'DUPLICATED_FILES',
    'COMMENT_LINES_DENSITY',
    'DUPLICATED_LINES_DENSITY'
]

label = "CODE_SMELLS"

current_dir = os.getcwd()
model_save_dir = os.path.join(current_dir, '..', '..', 'Data', 'Models', 'CodeSmells')

df = dataloader.load_df(current_dir)
df = dataloader.put_label_in_front(df, label)
df = dataloader.select_variables(df, variable_list)
df = dataloader.scale_predictors(df, label)

# drop the 29 rows that contain NAs
df = df.dropna()
df

Unnamed: 0,CODE_SMELLS,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,COMPLEXITY,CLASS_COMPLEXITY,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY
0,17012,0.707998,0.114798,0.314222,0.571297,0.342736,0.841017,0.593938,-0.346117,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.407604,0.610631,0.166980,-1.710559,0.262477
1,16987,0.707998,0.114798,0.310393,0.565516,0.342736,0.841017,0.593938,-0.346196,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.404699,0.609715,0.157664,-1.710559,0.262477
2,16987,0.707998,0.114798,0.310393,0.565516,0.342736,0.841017,0.593938,-0.346196,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.404699,0.609715,0.157664,-1.710559,0.262477
3,17013,0.707998,0.114798,0.314070,0.571095,0.342736,0.841017,0.593938,-0.346196,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.407604,0.610631,0.166980,-1.710559,0.262477
4,17013,0.707998,0.114798,0.314053,0.571070,0.342736,0.841017,0.593938,-0.346196,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.407604,0.610631,0.166980,-1.710559,0.262477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,268,-0.768491,-0.779031,-0.738544,-0.683567,-0.772243,-0.666976,-0.664161,-0.819923,-0.633137,-0.830480,-0.629689,-0.670437,-0.724777,-0.724777,-0.527839,-0.490534,-0.792615,0.402842,-0.861747
140744,269,-0.768491,-0.779031,-0.738511,-0.683435,-0.772243,-0.666874,-0.664385,-0.820042,-0.633035,-0.830480,-0.629689,-0.670230,-0.724693,-0.724693,-0.527839,-0.490534,-0.792615,0.391100,-0.861747
140745,266,-0.768491,-0.779031,-0.738908,-0.683901,-0.772243,-0.667108,-0.664459,-0.820834,-0.633240,-0.830480,-0.629689,-0.670593,-0.725016,-0.725016,-0.527839,-0.490534,-0.792615,0.391100,-0.855094
140746,266,-0.768491,-0.779031,-0.738908,-0.683901,-0.772243,-0.667108,-0.664459,-0.820834,-0.633240,-0.830480,-0.629689,-0.670593,-0.725016,-0.725016,-0.527839,-0.490534,-0.792615,0.391100,-0.855094


## Train-Test-Split

In [3]:
# train-test-split
X = df.drop(columns = 'CODE_SMELLS')
y = df['CODE_SMELLS']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# svm model base setup
svm_model = SVR()

# parameter grid
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
    'epsilon': [0.01, 0.1, 0.2],
    'gamma': ['scale', 'auto']
}

# 5-fold cross validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid_svm, optimizing MAE
grid_search_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=param_grid_svm,
    scoring='neg_mean_absolute_error',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

# fit the grid search to the training data
grid_search_svm.fit(X_train, y_train)

# extract the best model and its parameters
best_model_svm = grid_search_svm.best_estimator_
best_params_svm = grid_search_svm.best_params_

print('Best SVM Parameters:', best_params_svm)

# predictions for test data using the best model
y_pred_svm = best_model_svm.predict(X_test)

# evaluating the model
mse_svm = mean_squared_error(y_test, y_pred_svm)
r2_svm = r2_score(y_test, y_pred_svm)
mae_svm = mean_absolute_error(y_test, y_pred_svm)

print('SVM Model Evaluation (Tuned):')
print(f'Mean Squared Error: {mse_svm}')
print(f'R-squared: {r2_svm}')
print(f'Mean Absolute Error: {mae_svm}')

filename_joblib_svm = os.path.join(model_save_dir, 'SVM', 'SVM_tunedParams.joblib')
dump(best_model_svm, filename_joblib_svm)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best SVM Parameters: {'C': 10, 'epsilon': 0.2, 'gamma': 'scale', 'kernel': 'linear'}
SVM Model Evaluation (Tuned):
Mean Squared Error: 5980911.645929139
R-squared: 0.9510441897037445
Mean Absolute Error: 1017.2844280368879


['C:\\Users\\carol\\Dropbox\\DataScience\\Semester4\\MasterProjectSonarQube\\Scripts\\Model\\..\\..\\Data\\Models\\CodeSmells\\SVM\\SVM_tunedParams.joblib']