In [68]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import accuracy_score,f1_score, make_scorer, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import pickle

In [69]:
import warnings

# Install a global "ignore" filter so no warning is shown by any later cell.
# NOTE(review): this also hides warnings raised by the libraries used below;
# consider narrowing it to specific categories once the noisy ones are known.
warnings.filterwarnings(action='ignore')

In [70]:
# Read the processed dataset CSV into `df` and preview its first rows.
DATASET_CSV = "../data/processed/dataset_processed.csv"
df = pd.read_csv(DATASET_CSV)
df.head()

Unnamed: 0,average_cyclomatic_complexity,average_methods_per_class,avg_line_length,boolean_expression_avg_terms,call_graph_density,classes,classes_with_inheritance,comment_code_mismatch_score,comment_lines,comment_percentage,...,y_GodClass,y_LargeParameterList,y_LazyClass,y_LongMethod,y_MisleadingComments,y_PoorDocumentation,y_SpaghettiCode,y_UnstableModule,y_UntestedCode,y_any_smell
0,0.360756,0.0,0.394464,0.0,0.333333,0.0,0.0,0.0,0.125,0.167,...,0,0,0,0,0,1,0,0,1,1
1,0.166203,0.0,0.369089,0.0,0.0,0.0,0.0,0.0,0.017857,0.095,...,0,1,0,0,0,1,0,0,0,1
2,0.110617,0.0,0.242215,0.0,0.0,0.04878,0.05,0.0,0.0,0.0,...,0,0,1,0,0,1,0,0,0,1
3,0.110617,0.0,0.226067,0.0,0.0,0.02439,0.025,0.0,0.0,0.0,...,0,0,1,0,0,1,0,0,0,1
4,0.110617,0.0,0.222607,0.0,0.0,0.02439,0.025,0.0,0.0,0.0,...,0,0,1,0,0,1,0,0,0,1


In [71]:
# Split the table by position: the first 51 columns hold the model inputs and
# every column after them is a target.
N_LEADING_COLUMNS = 51

X = (
    df.iloc[:, :N_LEADING_COLUMNS]
    # `file_path` identifies the sample and is not a feature; this drop stays
    # strict so a missing column is reported instead of silently ignored.
    .drop(columns=['file_path'])
    # The `df.head()` preview lists an 'Unnamed: 0' column, which looks like a
    # row index written out with the CSV. A row counter is not a feature, so
    # remove it when present. NOTE(review): confirm against the CSV header;
    # errors='ignore' makes this a no-op if the column does not exist.
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
Y = df.iloc[:, N_LEADING_COLUMNS:]


In [72]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [73]:
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier

# Candidate values for the randomized search. The "estimator__" prefix targets
# the classifier wrapped inside the MultiOutputClassifier built below.
# NOTE(review): max_iter, min_samples_leaf and l2_regularization are not
# LGBMClassifier constructor arguments; they appear to be accepted as LightGBM
# parameter aliases (num_iterations / min_data_in_leaf / lambda_l2). Confirm
# against the LightGBM parameter documentation. The key names are kept as-is
# because a later cell looks them up in `best_params`.
param_distributions = dict(
    estimator__max_depth=[3, 5, 7, 9],
    estimator__learning_rate=[0.01, 0.03, 0.05, 0.1],
    estimator__max_iter=[100, 200, 400],
    estimator__min_samples_leaf=[10, 20, 50],
    estimator__l2_regularization=[0.0, 0.1, 1.0],
)

# Seeded LightGBM classifier, wrapped so it can be fit on the multi-column target.
base_model = LGBMClassifier(random_state=42)
model = MultiOutputClassifier(estimator=base_model)

In [75]:
# Candidates are ranked by micro-averaged F1 over all target columns.
scorer = make_scorer(f1_score, average="micro")

# 25 sampled configurations, each evaluated with 3-fold cross-validation on
# the training split, using all available cores.
random_search_settings = dict(
    n_iter=25,
    scoring=scorer,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
search = RandomizedSearchCV(model, param_distributions, **random_search_settings)
search.fit(X_train, y_train)

# Keep the winning configuration and the estimator fitted with it.
best_params = search.best_params_
best_model = search.best_estimator_

Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [76]:
# Summarise the randomized search: best score, mean score over all sampled
# candidates, and the winning parameter set.
random_search_summary = [
    ("Best CV micro-F1:", search.best_score_),
    ("Average CV micro-F1:", search.cv_results_['mean_test_score'].mean()),
    ("Best parameters:", best_params),
]
for label, value in random_search_summary:
    print(label, value)

Best CV micro-F1: 0.9869200621098418
Average CV micro-F1: 0.9833265904556506
Best parameters: {'estimator__min_samples_leaf': 50, 'estimator__max_iter': 200, 'estimator__max_depth': 7, 'estimator__learning_rate': 0.1, 'estimator__l2_regularization': 0.1}


In [77]:
# Score the randomized-search winner on the held-out test split.
Y_pred = best_model.predict(X_test)

test_micro_f1 = f1_score(y_test, Y_pred, average="micro")
print("Test micro-F1:", test_micro_f1)

# Per-target precision/recall/F1, labelled with the target column names.
per_target_report = classification_report(y_test, Y_pred, target_names=Y.columns)
print(per_target_report)

Test micro-F1: 0.9845906471928179
                      precision    recall  f1-score   support

       y_FeatureEnvy       0.99      0.99      0.99       320
  y_FormattingIssues       0.90      0.87      0.89       368
  y_GlobalStateAbuse       1.00      1.00      1.00        49
          y_GodClass       0.92      0.85      0.88        40
y_LargeParameterList       1.00      1.00      1.00       520
         y_LazyClass       1.00      0.99      0.99       137
        y_LongMethod       1.00      1.00      1.00       215
y_MisleadingComments       1.00      1.00      1.00       327
 y_PoorDocumentation       1.00      1.00      1.00       587
     y_SpaghettiCode       1.00      1.00      1.00        31
    y_UnstableModule       1.00      0.95      0.97        20
      y_UntestedCode       1.00      1.00      1.00       319
         y_any_smell       0.98      1.00      0.99       803

           micro avg       0.99      0.98      0.98      3736
           macro avg       0.98   

In [78]:
# Build a small grid centred on the randomized-search winner: one step below,
# the winning value itself, and one step above for each refined parameter.
best_depth = best_params["estimator__max_depth"]
best_lr = best_params["estimator__learning_rate"]
best_leaf = best_params["estimator__min_samples_leaf"]

param_grid = {
    "estimator__max_depth": [best_depth - 1, best_depth, best_depth + 1],
    "estimator__learning_rate": [best_lr * 0.7, best_lr, best_lr * 1.3],
    # The lower neighbour is clamped so it never drops below 5.
    "estimator__min_samples_leaf": [max(5, best_leaf - 5), best_leaf, best_leaf + 5],
}

In [79]:
# Exhaustively evaluate the refinement grid, starting from the estimator that
# won the randomized search, with the same scorer and 3-fold cross-validation.
grid_search_settings = dict(
    scoring=scorer,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search = GridSearchCV(best_model, param_grid, **grid_search_settings)
grid_search.fit(X_train, y_train)

# Estimator fitted with the best configuration found on the grid.
final_model = grid_search.best_estimator_

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [80]:
# Summarise the grid refinement: best cross-validated score and its parameters.
grid_search_summary = [
    ("Refined CV micro-F1:", grid_search.best_score_),
    ("Refined best parameters:", grid_search.best_params_),
]
for label, value in grid_search_summary:
    print(label, value)

Refined CV micro-F1: 0.986974132004998
Refined best parameters: {'estimator__learning_rate': 0.13, 'estimator__max_depth': 7, 'estimator__min_samples_leaf': 45}


In [81]:
# Score the grid-refined estimator on the held-out test split.
Y_pred = final_model.predict(X_test)

final_test_micro_f1 = f1_score(y_test, Y_pred, average="micro")
print("Test micro-F1:", final_test_micro_f1)

# Per-target precision/recall/F1, labelled with the target column names.
final_report = classification_report(y_test, Y_pred, target_names=Y.columns)
print(final_report)

Test micro-F1: 0.9851306095110516
                      precision    recall  f1-score   support

       y_FeatureEnvy       0.99      0.99      0.99       320
  y_FormattingIssues       0.91      0.88      0.89       368
  y_GlobalStateAbuse       1.00      1.00      1.00        49
          y_GodClass       0.89      0.85      0.87        40
y_LargeParameterList       1.00      1.00      1.00       520
         y_LazyClass       1.00      0.99      0.99       137
        y_LongMethod       1.00      1.00      1.00       215
y_MisleadingComments       1.00      1.00      1.00       327
 y_PoorDocumentation       1.00      1.00      1.00       587
     y_SpaghettiCode       1.00      1.00      1.00        31
    y_UnstableModule       1.00      0.95      0.97        20
      y_UntestedCode       1.00      1.00      1.00       319
         y_any_smell       0.99      1.00      0.99       803

           micro avg       0.99      0.98      0.99      3736
           macro avg       0.98   

In [84]:
import os

# Persist the tuned, fitted estimator produced by the grid search.
# `model` is only the unfitted MultiOutputClassifier template that was handed
# to the searches; pickling it would store an estimator that cannot predict.
# `final_model` is the fitted estimator returned by `grid_search`.
os.makedirs('../models', exist_ok=True)
with open('../models/LightGBM.pkl', 'wb') as f:
    pickle.dump(final_model, f)

print("LightGBM model saved.")

LightGBM model saved.
