# GAMs for modelling the amount of Code Smells

In [1]:
from pygam import GAM, s, l
import pandas as pd
import numpy as np
import os
import dataloader_functions.dataloader_fulldata_AmountCodeSmells as dataloader
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from joblib import dump, load

In [2]:
# Data setup: target name, modelling columns, output location and the
# prepared data frame that every later cell works on.
label = "CODE_SMELLS"

# Columns kept for modelling; the first entry is the target column.
variable_list = [
    'CODE_SMELLS', 'CLASSES', 'FILES', 'LINES', 'NCLOC',
    'PACKAGE', 'STATEMENTS', 'FUNCTIONS', 'COMMENT_LINES', 'COMPLEXITY',
    'CLASS_COMPLEXITY', 'FUNCTION_COMPLEXITY', 'COGNITIVE_COMPLEXITY',
    'LINES_TO_COVER', 'UNCOVERED_LINES',
    'DUPLICATED_LINES', 'DUPLICATED_BLOCKS', 'DUPLICATED_FILES',
    'COMMENT_LINES_DENSITY', 'DUPLICATED_LINES_DENSITY',
]

# Fitted models are stored three directory levels above the working directory.
current_dir = os.getcwd()
model_save_dir = os.path.join(
    current_dir, os.pardir, os.pardir, os.pardir, 'Data', 'Models', 'CodeSmells'
)

# Each preparation step takes the current frame plus one argument and returns
# the next frame. NOTE(review): what each dataloader helper does exactly is
# defined in the dataloader module - presumably reorder, subset, scale; confirm there.
preparation_steps = (
    (dataloader.put_label_in_front, label),
    (dataloader.select_variables, variable_list),
    (dataloader.scale_predictors, label),
)

df = dataloader.load_df(current_dir)
for step, argument in preparation_steps:
    df = step(df, argument)

# show the prepared frame as the cell output
df

Unnamed: 0,CODE_SMELLS,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,COMPLEXITY,CLASS_COMPLEXITY,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY
0,17012,0.707998,0.114798,0.314222,0.571297,0.342736,0.841017,0.593938,-0.346117,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.407604,0.610631,0.166980,-1.710559,0.262477
1,16987,0.707998,0.114798,0.310393,0.565516,0.342736,0.841017,0.593938,-0.346196,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.404699,0.609715,0.157664,-1.710559,0.262477
2,16987,0.707998,0.114798,0.310393,0.565516,0.342736,0.841017,0.593938,-0.346196,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.404699,0.609715,0.157664,-1.710559,0.262477
3,17013,0.707998,0.114798,0.314070,0.571095,0.342736,0.841017,0.593938,-0.346196,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.407604,0.610631,0.166980,-1.710559,0.262477
4,17013,0.707998,0.114798,0.314053,0.571070,0.342736,0.841017,0.593938,-0.346196,0.821715,1.442628,1.588837,1.352548,0.702365,0.702365,0.407604,0.610631,0.166980,-1.710559,0.262477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140743,268,-0.768491,-0.779031,-0.738544,-0.683567,-0.772243,-0.666976,-0.664161,-0.819923,-0.633137,-0.830480,-0.629689,-0.670437,-0.724777,-0.724777,-0.527839,-0.490534,-0.792615,0.402842,-0.861747
140744,269,-0.768491,-0.779031,-0.738511,-0.683435,-0.772243,-0.666874,-0.664385,-0.820042,-0.633035,-0.830480,-0.629689,-0.670230,-0.724693,-0.724693,-0.527839,-0.490534,-0.792615,0.391100,-0.861747
140745,266,-0.768491,-0.779031,-0.738908,-0.683901,-0.772243,-0.667108,-0.664459,-0.820834,-0.633240,-0.830480,-0.629689,-0.670593,-0.725016,-0.725016,-0.527839,-0.490534,-0.792615,0.391100,-0.855094
140746,266,-0.768491,-0.779031,-0.738908,-0.683901,-0.772243,-0.667108,-0.664459,-0.820834,-0.633240,-0.830480,-0.629689,-0.670593,-0.725016,-0.725016,-0.527839,-0.490534,-0.792615,0.391100,-0.855094


In [3]:
# Remove every row that has at least one missing value
# (29 rows according to the original analysis note).
df = df.dropna(axis=0, how="any")

## GAM with only spline terms
In a first approach, every predictor is modelled with a cubic spline term.

### Train-Test-Split
The data is split into a training set and a test set in a 70/30 ratio. <br>
Lists for tracking prediction errors are initialized.

In [4]:
# train-test-split
# Separate predictors and target. The target column name is taken from
# `label` (set in the data-setup cell) so it is defined in exactly one place.
X = df.drop(columns=label)
y = df[label]

# 70/30 split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# sanity check: the split must not lose or duplicate rows
assert len(X_train) + len(X_test) == len(X)

# lists for tracking errors (one entry is appended per fitted model)
mse_track = []
mae_track = []

### Model

In [5]:
# Initialize the GAM with one spline term per predictor column. The term list
# is built from the number of columns in X_train, so the cell keeps working
# when predictors are added to or removed from the data setup.
n_features = X_train.shape[1]
spline_terms = s(0)
for feature_idx in range(1, n_features):
    spline_terms = spline_terms + s(feature_idx)
gam = GAM(spline_terms)

# fit the GAM model to the training data
gam.fit(X_train, y_train)
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
gam.summary()

# predictions on the test set for evaluation
y_pred = gam.predict(X_test)

# evaluating model performance on the held-out test set
r2 = r2_score(y_test, y_pred)
print(f"R-squared on the test set: {r2}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on the test set: {mse}")
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on the test set: {mae}")

# record the errors so the models can be compared later
mse_track.append(mse)
mae_track.append(mae)

# Persist the fitted model. The target directory is created on demand so the
# dump does not fail on a fresh checkout.
# NOTE(review): "spine" in the file name looks like a typo for "spline"; it is
# left unchanged so that previously saved artifacts keep the same path.
gam_save_dir = os.path.join(model_save_dir, "GAM")
os.makedirs(gam_save_dir, exist_ok=True)
filename_joblib = os.path.join(gam_save_dir, "GAM_spine.joblib")
# trailing semicolon: keep the absolute local path out of the cell output
dump(gam, filename_joblib);

GAM                                                                                                       
Distribution:                        NormalDist Effective DoF:                                    213.0856
Link Function:                     IdentityLink Log Likelihood:                              -1271751.2908
Number of Samples:                        98505 AIC:                                           2543930.753
                                                AICc:                                         2543931.6899
                                                GCV:                                           162021.4863
                                                Scale:                                         161390.7362
                                                Pseudo R-Squared:                                   0.9987
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
s(0)                              [0.

 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  print(gam.summary())


R-squared on the test set: 0.9986916136264455
Mean Squared Error on the test set: 159845.036812013
Mean Average Error on the test set: 256.9336432880163


['C:\\Users\\carol\\Dropbox\\DataScience\\Semester4\\MasterProjectSonarQube\\Scripts\\Model\\amountCodeSmells\\..\\..\\..\\Data\\Models\\CodeSmells\\GAM\\GAM_spine.joblib']

### Result
The GAM with cubic splines (MAE of 256.93 on the test set) performs significantly better than the best linear regression model.

## GAM with linear terms
Some of the predictors have a linear relationship to the response. Next, it is tested whether replacing the spline term with a linear term for these predictors improves the model.

In [6]:
# Print the position -> column name mapping, because the GAM terms
# (s(i) / l(i)) refer to predictors by position.
feature_names = X_train.columns
for position in range(len(feature_names)):
    print(f"{position}: {feature_names[position]}")

0: CLASSES
1: FILES
2: LINES
3: NCLOC
4: PACKAGE
5: STATEMENTS
6: FUNCTIONS
7: COMMENT_LINES
8: COMPLEXITY
9: CLASS_COMPLEXITY
10: FUNCTION_COMPLEXITY
11: COGNITIVE_COMPLEXITY
12: LINES_TO_COVER
13: UNCOVERED_LINES
14: DUPLICATED_LINES
15: DUPLICATED_BLOCKS
16: DUPLICATED_FILES
17: COMMENT_LINES_DENSITY
18: DUPLICATED_LINES_DENSITY


In [7]:
# Initialize a GAM that mixes linear and spline terms. Predictors listed in
# `linear_columns` get a linear term l(i); all others keep a spline term s(i).
# Selecting by column name (instead of positional index) keeps the choice
# readable and independent of the column order.
linear_columns = {
    'CLASSES',
    'LINES',
    'NCLOC',
    'PACKAGE',
    'STATEMENTS',
    'FUNCTIONS',
    'COGNITIVE_COMPLEXITY',
    'UNCOVERED_LINES',
}

# fail early if a name above does not match any predictor column
missing_columns = linear_columns - set(X_train.columns)
if missing_columns:
    raise ValueError(f"Linear-term columns not found in X_train: {sorted(missing_columns)}")

mixed_terms = None
for feature_idx, column in enumerate(X_train.columns):
    term = l(feature_idx) if column in linear_columns else s(feature_idx)
    mixed_terms = term if mixed_terms is None else mixed_terms + term
gam2 = GAM(mixed_terms)

# fit the GAM model to the training data
gam2.fit(X_train, y_train)
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
gam2.summary()

# predictions on the test set for evaluation
y_pred = gam2.predict(X_test)

# evaluating model performance on the held-out test set
r2 = r2_score(y_test, y_pred)
print(f"R-squared on the test set: {r2}")
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error on the test set: {mse}")
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error on the test set: {mae}")

# record the errors so the models can be compared later
mse_track.append(mse)
mae_track.append(mae)

# Persist the fitted model. The target directory is created on demand so the
# dump does not fail on a fresh checkout.
gam_save_dir = os.path.join(model_save_dir, "GAM")
os.makedirs(gam_save_dir, exist_ok=True)
filename_joblib = os.path.join(gam_save_dir, "GAM_mixLinearSpine.joblib")
# trailing semicolon: keep the absolute local path out of the cell output
dump(gam2, filename_joblib);

GAM                                                                                                       
Distribution:                        NormalDist Effective DoF:                                    157.3821
Link Function:                     IdentityLink Log Likelihood:                              -1341462.7435
Number of Samples:                        98505 AIC:                                          2683242.2513
                                                AICc:                                         2683242.7646
                                                GCV:                                           328456.3954
                                                Scale:                                          327511.931
                                                Pseudo R-Squared:                                   0.9973
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
l(0)                              [0.

 
Please do not make inferences based on these values! 

Collaborate on a solution, and stay up to date at: 
github.com/dswah/pyGAM/issues/163 

  print(gam2.summary())


R-squared on the test set: 0.9973203208903333
Mean Squared Error on the test set: 327375.3186265709
Mean Average Error on the test set: 385.51548570373944


['C:\\Users\\carol\\Dropbox\\DataScience\\Semester4\\MasterProjectSonarQube\\Scripts\\Model\\amountCodeSmells\\..\\..\\..\\Data\\Models\\CodeSmells\\GAM\\GAM_mixLinearSpine.joblib']

### Result
A GAM fitted with only cubic spline terms performs much better (test MAE of 256.93) than a GAM that uses linear terms for the variables with a strong linear trend (test MAE of 385.52).