In [49]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, make_scorer, classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import HistGradientBoostingClassifier

In [50]:
# Load the preprocessed dataset; the path is relative to the notebook's
# working directory.
dataset_csv = "../data/processed/dataset_processed.csv"
df = pd.read_csv(dataset_csv)

In [51]:
# The first 51 columns hold the inputs (the 'file_path' column among them is
# dropped); every column from position 51 onward is a target label.
leading_columns = df.iloc[:, :51]
X = leading_columns.drop(columns=['file_path'])
Y = df.iloc[:, 51:]

# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [52]:
# One gradient-boosting classifier is fitted per target column by the
# multi-output wrapper.
base_model = HistGradientBoostingClassifier(random_state=42)
model = MultiOutputClassifier(estimator=base_model)

# Search space for the randomized search. The "estimator__" prefix routes each
# setting through the MultiOutputClassifier to the wrapped classifier.
param_distributions = {
    "estimator__max_depth": [3, 5, 7, 9],
    "estimator__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "estimator__max_iter": [100, 200, 400],
    "estimator__min_samples_leaf": [10, 20, 50],
    "estimator__l2_regularization": [0.0, 0.1, 1.0],
}

In [53]:
# Micro-averaged F1 pools the decisions of all target columns into one score.
scorer = make_scorer(f1_score, average="micro")

# 25 sampled configurations x 3 folds = 75 fits, run on all available cores.
search_settings = {
    "param_distributions": param_distributions,
    "n_iter": 25,
    "scoring": scorer,
    "cv": 3,
    "n_jobs": -1,
    "random_state": 42,
    "verbose": 1,
}
search = RandomizedSearchCV(model, **search_settings)
search.fit(X_train, Y_train)

# With the default refit=True this is the best configuration retrained on the
# whole training split.
best_model = search.best_estimator_

Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [55]:
# Summarise the search: the winning candidate's score, the mean score over
# every sampled candidate, and the winning hyper-parameters.
candidate_scores = search.cv_results_['mean_test_score']
average_cv_score = candidate_scores.mean()

print("Best CV micro-F1:", search.best_score_)
print("Average CV micro-F1:", average_cv_score)
print("Best parameters:", search.best_params_)

Best CV micro-F1: 0.9772440354280553
Average CV micro-F1: 0.9559409751816804
Best parameters: {'estimator__min_samples_leaf': 10, 'estimator__max_iter': 200, 'estimator__max_depth': 9, 'estimator__learning_rate': 0.1, 'estimator__l2_regularization': 1.0}


In [58]:
# Score the tuned model on the held-out split.
Y_pred = best_model.predict(X_test)
test_micro_f1 = f1_score(Y_test, Y_pred, average="micro")

print("Test micro-F1:", test_micro_f1)

# One classification report per target column; predictions are indexed by
# column position to line up with the matching column of Y_test.
for i, smell_name in enumerate(Y.columns):
    print(f"\nSmell {i}: {smell_name}")
    print(classification_report(Y_test.iloc[:, i], Y_pred[:, i]))


Test micro-F1: 0.9841907824222936

Smell 0: y_FeatureEnvy
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       607
           1       0.99      0.99      0.99       320

    accuracy                           1.00       927
   macro avg       1.00      1.00      1.00       927
weighted avg       1.00      1.00      1.00       927


Smell 1: y_FormattingIssues
              precision    recall  f1-score   support

           0       0.92      0.94      0.93       559
           1       0.90      0.88      0.89       368

    accuracy                           0.91       927
   macro avg       0.91      0.91      0.91       927
weighted avg       0.91      0.91      0.91       927


Smell 2: y_GlobalStateAbuse
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       878
           1       1.00      1.00      1.00        49

    accuracy                           1.00       927
   macro avg

In [59]:
# Persist the tuned, fitted estimator.
# `model` is only the unfitted MultiOutputClassifier template that was handed
# to RandomizedSearchCV (the search fits clones of it), so pickling `model`
# would store an estimator that cannot predict. `best_model` is the refit
# winner of the search and is the object that was evaluated above.
with open('../models/histogram_gb.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print("Histogram Gradient Boosting model saved.")

Histogram Gradient Boosting model saved.
