# Table of Contents

1. [Importing Necessary Dependencies](#importing-necessary-dependencies)
2. [Loading the Datasets](#loading-the-datasets)
3. [Defining Helper Functions](#defining-helper-functions)
4. [Data Preprocessing](#data-preprocessing)
5. [Model Training](#model-training)
6. [Model Evaluation](#model-evaluation)

# Importing Necessary Dependencies

In [None]:
import catboost as cb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.compose import ColumnTransformer
from sklpearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.utils import class_weight

# Loading the Datasets

In [57]:
X_train = pd.read_parquet('../data/processed/X_train.parquet')
X_test = pd.read_parquet('../data/processed/X_test.parquet')
y_train = pd.read_parquet('../data/processed/y_train.parquet')
y_test = pd.read_parquet('../data/processed/y_test.parquet')

In [58]:
X_train.drop(columns="code_snippet", inplace=True)
X_test.drop(columns="code_snippet", inplace=True)

In [59]:
X_train.head()

Unnamed: 0,loc,num_args,num_returns,num_variables,num_function_calls,has_decorators,uses_globals,is_recursive,estimated_difficulty,estimated_bugs,has_docstring,docstring_length,num_comments,name_length,is_name_well_formed,bad_variable_names_count,max_return_length,estimated_complexity
0,2.0,1.0,0,0,1,False,False,False,0.0,0.0,False,0,0.0,46.0,True,0.0,0.0,1.0
1,13.0,1.0,0,7,6,False,False,False,1.25,0.01661,False,0,0.0,13.0,True,0.0,0.0,1.0
2,4.0,1.0,0,0,2,False,False,False,0.0,0.0,False,0,1.0,16.0,True,0.0,0.0,1.0
3,67.0,2.0,2,28,38,False,False,False,4.442308,0.084073,True,3,6.0,14.0,True,11.0,125.0,9.0
4,6.0,2.0,0,0,5,True,False,False,0.0,0.0,False,0,0.0,35.0,False,0.0,0.0,1.0


In [60]:
y_train.head()

Unnamed: 0,quality
0,good
1,bad
2,good
3,bad
4,good


In [61]:
X_train = X_train[~X_train.duplicated()]
X_test = X_test[~X_test.duplicated()]

train_keep_indices = X_train.index
test_keep_indices = X_test.index

# Filter y_train and y_test to match the deduplicated X's
y_train = y_train.loc[train_keep_indices]
y_test = y_test.loc[test_keep_indices]

In [62]:
X_train.describe()

Unnamed: 0,loc,num_args,num_returns,num_variables,num_function_calls,estimated_difficulty,estimated_bugs,docstring_length,num_comments,name_length,bad_variable_names_count,max_return_length,estimated_complexity
count,176967.0,176967.0,176967.0,176967.0,176967.0,176967.0,176967.0,176967.0,176967.0,176967.0,176967.0,176967.0,176967.0
mean,18.513926,1.831053,0.769449,3.499698,7.981974,0.767141,0.007568,2.052371,1.116768,18.922584,0.957958,20.059226,2.541084
std,25.612529,1.66507,1.197201,5.172238,10.305177,1.364084,0.025296,9.490253,3.18502,11.65574,2.129401,62.775364,3.50294
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
25%,5.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,1.0
50%,11.0,1.0,1.0,2.0,5.0,0.5,0.000667,0.0,0.0,17.0,0.0,1.0,1.0
75%,22.0,2.0,1.0,5.0,10.0,1.0,0.00517,0.0,1.0,26.0,1.0,26.0,3.0
max,858.0,55.0,65.0,226.0,503.0,56.875,1.448993,449.0,150.0,295.0,216.0,12370.0,182.0


In [63]:
y_train.describe()

Unnamed: 0,quality
count,176967
unique,2
top,bad
freq,108560


In [64]:
X_train.dtypes

loc                         float64
num_args                    float64
num_returns                   int64
num_variables                 int64
num_function_calls            int64
has_decorators                 bool
uses_globals                   bool
is_recursive                   bool
estimated_difficulty        float64
estimated_bugs              float64
has_docstring                  bool
docstring_length              int64
num_comments                float64
name_length                 float64
is_name_well_formed            bool
bad_variable_names_count    float64
max_return_length           float64
estimated_complexity        float64
dtype: object

In [65]:
y_train.dtypes

quality    category
dtype: object

# Defining Helper Functions

In [66]:
def plot_learning_curve(model, X, y, cv=5, title="Learning Curve"):
	train_sizes, train_scores, val_scores = learning_curve(
		model, X, y, cv=cv, scoring='accuracy',
		train_sizes=np.linspace(0.1, 1.0, 10),
		n_jobs=-1
	)
	
	plt.figure(figsize=(10, 6))
	plt.plot(
		train_sizes, np.mean(train_scores, axis=1), label='Training Score'
	)
	plt.plot(
		train_sizes, np.mean(val_scores, axis=1), label='Validation Score'
	)
	plt.fill_between(
		train_sizes, np.mean(train_scores, axis=1) - np.std(train_scores, axis=1),
		np.mean(train_scores, axis=1) + np.std(train_scores, axis=1), alpha=0.1
	)
	plt.fill_between(
		train_sizes, np.mean(val_scores, axis=1) - np.std(val_scores, axis=1),
		np.mean(val_scores, axis=1) + np.std(val_scores, axis=1), alpha=0.1
	)
	
	plt.title(title)
	plt.xlabel("Training Examples")
	plt.ylabel("Accuracy Score")
	plt.legend()
	plt.grid()
	plt.show()

# Data Preprocessing

## Converting Boolean Columns to Numerical (0, 1)

In [67]:
bool_cols = X_train.select_dtypes(include='bool').columns
X_train[bool_cols] = X_train[bool_cols].astype(int)
X_test[bool_cols] = X_test[bool_cols].astype(int)

## Scaling the Data

### Identifying numerical columns (excluding boolean/binary columns)

In [68]:
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
numerical_cols = [col for col in numerical_cols if col not in bool_cols]

### Scaling

In [69]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_cols),
        ('bool', 'passthrough', bool_cols)  # Skip scaling for boolean columns
    ],
    remainder='passthrough'  # This will handle any other columns if they exist
)

X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

### Converting to DataFrame to maintain column names

In [70]:
feature_names = numerical_cols + list(bool_cols)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_names)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_names)

In [71]:
print("Deduplicated y_train distribution:\n", y_train.value_counts())

Deduplicated y_train distribution:
 quality
bad        108560
good        68407
Name: count, dtype: int64


## Encoding the Target Variable

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train.squeeze())

## Calculating Class Weights
This step is optional, but can help balance the dataset and improve the model's performance.

classes = np.unique(y_train_encoded)
weights = class_weight.compute_sample_weight('balanced', y_train_encoded)
weights

# Model Training

In [72]:
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=12,
    l2_leaf_reg=3,
    loss_function='Logloss',
    auto_class_weights='Balanced',
    verbose=True,
    random_seed=42,
    task_type='GPU'
)

model.fit(
	X_train_scaled,
	y_train
)

0:	learn: 0.6616371	total: 147ms	remaining: 1m 13s
1:	learn: 0.6346384	total: 296ms	remaining: 1m 13s
2:	learn: 0.6102785	total: 424ms	remaining: 1m 10s
3:	learn: 0.5887689	total: 595ms	remaining: 1m 13s
4:	learn: 0.5700277	total: 758ms	remaining: 1m 15s
5:	learn: 0.5533617	total: 904ms	remaining: 1m 14s
6:	learn: 0.5386212	total: 1.07s	remaining: 1m 15s
7:	learn: 0.5255808	total: 1.23s	remaining: 1m 15s
8:	learn: 0.5140577	total: 1.38s	remaining: 1m 15s
9:	learn: 0.5041300	total: 1.53s	remaining: 1m 15s
10:	learn: 0.4951302	total: 1.72s	remaining: 1m 16s
11:	learn: 0.4872632	total: 1.9s	remaining: 1m 17s
12:	learn: 0.4798523	total: 2.12s	remaining: 1m 19s
13:	learn: 0.4732504	total: 2.3s	remaining: 1m 19s
14:	learn: 0.4675256	total: 2.45s	remaining: 1m 19s
15:	learn: 0.4619744	total: 2.6s	remaining: 1m 18s
16:	learn: 0.4572453	total: 2.74s	remaining: 1m 17s
17:	learn: 0.4529320	total: 2.87s	remaining: 1m 16s
18:	learn: 0.4487805	total: 3.02s	remaining: 1m 16s
19:	learn: 0.4449381	tota

<catboost.core.CatBoostClassifier at 0x2ac8b412db0>

# Model Evaluation

In [73]:
y_pred = model.predict(X_test_scaled)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         bad       0.88      0.82      0.85     28032
        good       0.77      0.84      0.80     19579

    accuracy                           0.83     47611
   macro avg       0.82      0.83      0.82     47611
weighted avg       0.83      0.83      0.83     47611



In [74]:
scores = cross_val_score(model, X_train_scaled, y_train, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Mean cross-validation score: {scores.mean()}")

0:	learn: 0.6619056	total: 153ms	remaining: 1m 16s
1:	learn: 0.6348814	total: 315ms	remaining: 1m 18s
2:	learn: 0.6108559	total: 466ms	remaining: 1m 17s
3:	learn: 0.5896642	total: 603ms	remaining: 1m 14s
4:	learn: 0.5715766	total: 784ms	remaining: 1m 17s
5:	learn: 0.5551687	total: 937ms	remaining: 1m 17s
6:	learn: 0.5409863	total: 1.09s	remaining: 1m 17s
7:	learn: 0.5281783	total: 1.23s	remaining: 1m 15s
8:	learn: 0.5163014	total: 1.37s	remaining: 1m 14s
9:	learn: 0.5061099	total: 1.52s	remaining: 1m 14s
10:	learn: 0.4967620	total: 1.66s	remaining: 1m 13s
11:	learn: 0.4884823	total: 1.8s	remaining: 1m 13s
12:	learn: 0.4810675	total: 1.95s	remaining: 1m 13s
13:	learn: 0.4743048	total: 2.07s	remaining: 1m 12s
14:	learn: 0.4686067	total: 2.21s	remaining: 1m 11s
15:	learn: 0.4628332	total: 2.35s	remaining: 1m 11s
16:	learn: 0.4576545	total: 2.51s	remaining: 1m 11s
17:	learn: 0.4529792	total: 2.69s	remaining: 1m 11s
18:	learn: 0.4488018	total: 2.88s	remaining: 1m 12s
19:	learn: 0.4451328	to

In [None]:
plot_learning_curve(model, X_train_scaled, y_train, cv=3, title="CatBoost Learning Curve")

NameError: name 'y_train_encoded' is not defined