# Table of Contents

1. [Importing Necessary Dependencies](#importing-necessary-dependencies)
2. [Loading the Datasets](#loading-the-datasets)
3. [Defining Helper Functions](#defining-helper-functions)
4. [Data Preprocessing](#data-preprocessing)
5. [Model Training](#model-training)
6. [Model Evaluation](#model-evaluation)

# Importing Necessary Dependencies

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.svm import SVC
from sklearn.utils import class_weight

# Loading the Datasets

In [2]:
# Read the four pre-split parquet files (train/test features, then train/test
# targets) in a fixed order and bind them with a single unpacking assignment.
X_train, X_test, y_train, y_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../data/processed/X_train.parquet',
        '../data/processed/X_test.parquet',
        '../data/processed/y_train.parquet',
        '../data/processed/y_test.parquet',
    )
)

In [3]:
# Remove the "code_snippet" column from both splits.
# The result is rebound instead of mutated with inplace=True, and
# errors="ignore" tolerates the column already being gone, so re-running this
# cell no longer raises KeyError.
X_train = X_train.drop(columns="code_snippet", errors="ignore")
X_test = X_test.drop(columns="code_snippet", errors="ignore")

In [4]:
# Preview the first rows of the training features as a quick sanity check.
X_train.head()

Unnamed: 0,loc,num_args,num_returns,num_variables,num_function_calls,has_decorators,uses_globals,is_recursive,estimated_difficulty,estimated_bugs,...,num_comments,name_length,is_name_well_formed,bad_variable_names_count,max_return_length,comment_density,max_nesting_depth,num_try_blocks,has_large_return,estimated_complexity
0,2,1,1,0,1,False,False,False,0.0,0.0,...,0,1,True,0,28,0.0,1,0,False,1
1,47,3,0,5,8,False,False,False,0.5,0.002113,...,1,20,True,1,0,0.021277,1,0,False,1
2,43,0,0,7,23,False,False,False,1.416667,0.079907,...,0,30,True,0,0,0.0,3,0,False,1
3,8,1,0,5,7,False,False,False,0.5,0.001585,...,0,8,True,0,0,0.0,1,0,False,1
4,12,2,2,0,2,False,False,False,0.0,0.0,...,7,20,True,0,50,0.583333,2,1,False,2


In [5]:
# Summary statistics of the training features before de-duplication
# (describe() reports the numeric columns only by default).
X_train.describe()

Unnamed: 0,loc,num_args,num_returns,num_variables,num_function_calls,estimated_difficulty,estimated_bugs,docstring_length,num_comments,name_length,bad_variable_names_count,max_return_length,comment_density,max_nesting_depth,num_try_blocks,estimated_complexity
count,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0,649538.0
mean,18.205038,2.019047,0.677588,3.30786,6.312721,0.798108,0.01216,1.763364,0.85423,17.575506,0.569522,21.754578,0.031409,1.679046,0.067417,3.052637
std,32.478218,2.30548,1.094247,6.527018,12.87728,1.484346,0.273198,7.629977,2.993329,12.133326,2.055308,159.823869,0.075358,0.957722,0.338455,4.978079
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
25%,4.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,1.0,0.0,1.0
50%,10.0,1.0,0.0,2.0,3.0,0.5,0.000667,0.0,0.0,15.0,0.0,0.0,0.0,1.0,0.0,2.0
75%,21.0,2.0,1.0,4.0,7.0,1.0,0.00517,1.0,0.0,23.0,0.0,22.0,0.0,2.0,0.0,3.0
max,6072.0,374.0,116.0,817.0,2714.0,100.0,187.266816,838.0,241.0,219.0,339.0,57944.0,0.933333,13.0,25.0,440.0


In [6]:
# Drop exact duplicate feature rows from each split; the first occurrence of
# every duplicated row is the one that is kept.
X_train = X_train.drop_duplicates()
X_test = X_test.drop_duplicates()

# Row labels that survived de-duplication, used to realign the targets.
train_keep_indices, test_keep_indices = X_train.index, X_test.index

# Keep only the targets whose rows are still present in the feature frames.
# NOTE(review): this relies on the X and y frames sharing the same index labels - confirm.
y_train = y_train.loc[train_keep_indices]
y_test = y_test.loc[test_keep_indices]

In [7]:
# Re-inspect the summary statistics after duplicate rows were removed.
X_train.describe()

Unnamed: 0,loc,num_args,num_returns,num_variables,num_function_calls,estimated_difficulty,estimated_bugs,docstring_length,num_comments,name_length,bad_variable_names_count,max_return_length,comment_density,max_nesting_depth,num_try_blocks,estimated_complexity
count,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0,518207.0
mean,21.361898,2.118746,0.749316,3.874531,7.408763,0.960117,0.014907,2.109518,1.03988,18.655809,0.688966,24.586957,0.036927,1.811014,0.082517,3.48173
std,35.34819,2.464547,1.181844,7.0936,14.118888,1.596794,0.305756,8.422522,3.298904,12.490121,2.265113,177.484873,0.07981,1.003935,0.374062,5.44773
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
25%,6.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,1.0,0.0,1.0
50%,13.0,2.0,1.0,2.0,4.0,0.5,0.001585,0.0,0.0,16.0,0.0,0.0,0.0,2.0,0.0,2.0
75%,25.0,3.0,1.0,5.0,9.0,1.142857,0.008422,1.0,1.0,24.0,1.0,25.0,0.037037,2.0,0.0,4.0
max,6072.0,374.0,116.0,817.0,2714.0,100.0,187.266816,838.0,241.0,219.0,339.0,57944.0,0.933333,13.0,25.0,440.0


In [8]:
# Inspect the target: row count, number of distinct labels, and the most
# frequent label with its frequency.
y_train.describe()

Unnamed: 0,quality
count,518207
unique,2
top,bad
freq,266580


# Defining Helper Functions

In [9]:
def plot_learning_curve(model, X, y, cv=5, title="Learning Curve"):
	"""Plot training and validation accuracy against the training-set size.

	Runs scikit-learn's ``learning_curve`` with accuracy scoring on ten
	training-set fractions evenly spaced between 10% and 100%, using all
	available cores (``n_jobs=-1``). For both the training and the validation
	scores it draws the mean along axis 1 of the score matrix as a line and a
	translucent band of one standard deviation around that mean.

	Args:
		model: Estimator handed to ``learning_curve``.
		X: Feature data handed to ``learning_curve``.
		y: Target data handed to ``learning_curve``.
		cv: Cross-validation setting forwarded to ``learning_curve``.
		title: Title shown above the figure.

	Returns:
		None. The figure is displayed with ``plt.show()``.
	"""
	sizes, fit_scores, holdout_scores = learning_curve(
		model, X, y, cv=cv, scoring='accuracy',
		train_sizes=np.linspace(0.1, 1.0, 10),
		n_jobs=-1
	)

	curves = (
		('Training Score', fit_scores),
		('Validation Score', holdout_scores),
	)

	plt.figure(figsize=(10, 6))

	# Draw both mean lines first, then both spread bands, in that order.
	for curve_label, scores in curves:
		plt.plot(sizes, np.mean(scores, axis=1), label=curve_label)
	for _, scores in curves:
		center = np.mean(scores, axis=1)
		spread = np.std(scores, axis=1)
		plt.fill_between(sizes, center - spread, center + spread, alpha=0.1)

	plt.title(title)
	plt.xlabel("Training Examples")
	plt.ylabel("Accuracy Score")
	plt.legend()
	plt.grid()
	plt.show()

# Data Preprocessing

## Converting Boolean Columns to Numerical (0, 1)

In [10]:
# Detect the boolean feature columns once, on the training split, and reuse the
# same column list for the test split so both frames are converted identically.
# NOTE: once this cell has run, the frames no longer hold bool columns, so
# executing it a second time would leave bool_cols empty. Use Restart & Run All
# rather than re-running this cell on its own.
bool_cols = X_train.select_dtypes(include='bool').columns

# astype with a per-column mapping returns a new frame. This avoids assigning
# columns into the filtered (de-duplicated) frame, an operation for which pandas
# can emit SettingWithCopyWarning.
X_train = X_train.astype(dict.fromkeys(bool_cols, int))
X_test = X_test.astype(dict.fromkeys(bool_cols, int))
X_train.head()

Unnamed: 0,loc,num_args,num_returns,num_variables,num_function_calls,has_decorators,uses_globals,is_recursive,estimated_difficulty,estimated_bugs,...,num_comments,name_length,is_name_well_formed,bad_variable_names_count,max_return_length,comment_density,max_nesting_depth,num_try_blocks,has_large_return,estimated_complexity
0,2,1,1,0,1,0,0,0,0.0,0.0,...,0,1,1,0,28,0.0,1,0,0,1
1,47,3,0,5,8,0,0,0,0.5,0.002113,...,1,20,1,1,0,0.021277,1,0,0,1
2,43,0,0,7,23,0,0,0,1.416667,0.079907,...,0,30,1,0,0,0.0,3,0,0,1
3,8,1,0,5,7,0,0,0,0.5,0.001585,...,0,8,1,0,0,0.0,1,0,0,1
4,12,2,2,0,2,0,0,0,0.0,0.0,...,7,20,1,0,50,0.583333,2,1,0,2


## Scaling the Data

### Identifying numerical columns (excluding boolean/binary columns)

In [11]:
# Columns stored as int64/float64 are treated as numerical features, except the
# ones listed in bool_cols (flags that were cast from bool to int and would
# otherwise be picked up by the dtype filter as well).
numerical_cols = [
    col
    for col in X_train.select_dtypes(include=['int64', 'float64']).columns
    if col not in bool_cols
]

### Scaling

In [12]:
# Column-wise preprocessing:
#   * 'num'  - numerical columns are scaled with RobustScaler
#   * 'bool' - 0/1 flag columns are forwarded unchanged
# Any column not named in either list is also forwarded unchanged
# (remainder='passthrough').
preprocessor = ColumnTransformer(
    [
        ('num', RobustScaler(), numerical_cols),
        ('bool', 'passthrough', bool_cols),
    ],
    remainder='passthrough',
)

# Fit the scaler on the training split only, then apply the fitted transform to
# the test split.
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

### Converting to DataFrame to maintain column names

In [13]:
# Rebuild the column labels in the order ColumnTransformer emits them: the
# scaled numerical columns, then the pass-through flag columns, then any
# leftover columns kept by remainder='passthrough' (appended on the right in
# their original order). Including the leftovers keeps the label count equal to
# the number of transformed columns; without them, a single unlisted column
# would make the DataFrame constructor fail with a shape mismatch.
feature_names = numerical_cols + list(bool_cols) + [
    col for col in X_train.columns
    if col not in numerical_cols and col not in bool_cols
]

# Wrap the transformed arrays so the column names are available again.
# The resulting frames get a fresh default index.
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_names)

## Encoding the Target Variable

In [14]:
# Learn the label -> integer mapping from the training targets, then encode
# them with that mapping. squeeze() collapses the single-column target frame
# into a 1-D Series before it is handed to the encoder.
label_encoder = LabelEncoder().fit(y_train.squeeze())
y_train_encoded = label_encoder.transform(y_train.squeeze())

## Calculating Class Weights
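This step is optional. Balanced weights do not change the dataset itself; they give samples of under-represented classes more influence during training. The per-sample weights computed here are shown for inspection only, since the model below handles class imbalance through `class_weight='balanced'`.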

In [15]:
# Per-sample 'balanced' weights for the encoded training labels.
# These weights are displayed for inspection only: the training cell does not
# pass them to fit() and sets class_weight='balanced' on the estimator instead.
weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train_encoded,
)

# Distinct encoded class ids present in the training labels.
classes = np.unique(y_train_encoded)

weights

array([1.02971263, 0.97195401, 0.97195401, ..., 0.97195401, 1.02971263,
       1.02971263])

# Model Training

In [None]:
# RBF-kernel support vector classifier with balanced class weighting.
#
# Probability estimates are disabled: per the scikit-learn documentation,
# probability=True makes fit() run an additional internal 5-fold
# cross-validation, and the evaluation below only calls model.predict(), which
# does not depend on those estimates. Set probability=True again if
# predict_proba() is needed.
#
# random_state is kept so that results stay reproducible if probability
# estimates are re-enabled.
model = SVC(
    C=1.0,
    kernel='rbf',
    gamma='scale',
    class_weight='balanced',
    probability=False,
    random_state=42,
    verbose=True
)

# Fit on the scaled training features and the integer-encoded targets.
model.fit(
    X_train_scaled,
    y_train_encoded
)

[LibSVM]

# Model Evaluation

In [None]:
# Predict encoded class ids for the scaled test features, then map them back to
# the original string labels so the report is readable.
y_pred_encoded = model.predict(X=X_test_scaled)
y_pred = label_encoder.inverse_transform(y=y_pred_encoded)

# Per-class precision / recall / F1 on the test split.
# NOTE(review): the stored output of this cell lists three classes, whereas
# y_train.describe() above reports two unique labels - the saved report appears
# to come from a different run; re-execute the notebook to refresh it.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

         bad       0.74      0.79      0.76      6899
        good       0.50      0.78      0.61      4240
    moderate       0.71      0.51      0.60      9923

    accuracy                           0.66     21062
   macro avg       0.65      0.69      0.66     21062
weighted avg       0.68      0.66      0.65     21062

