In [70]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score, hamming_loss
import pickle


In [71]:
# Load dataset
pd.set_option('display.max_columns', None)
df = pd.read_csv('../data/processed/dataset_processed.csv')
df.head()

Unnamed: 0,average_cyclomatic_complexity,average_methods_per_class,avg_line_length,boolean_expression_avg_terms,call_graph_density,classes,classes_with_inheritance,comment_code_mismatch_score,comment_lines,comment_percentage,decision_density,documentation_coverage,external_vs_internal_field_access_ratio,file_age_days,file_path,functions,global_usages_total,globals_declared,halstead_difficulty,halstead_effort,halstead_estimated_bugs,halstead_volume,inter_file_coupling,large_parameter_list_indicator,lazy_class_indicator,lines_added,lines_of_code,long_method_indicator,max_cyclomatic_ratio,max_intra_file_call_depth,max_line_length,max_lines_per_function,max_nesting_level,mean_cyclomatic_ratio,mean_lines_per_function,mean_param_entropy,methods,nesting_variance,num_authors,pep8_violations,percent_lines_over_80,semantic_todo_density,source_lines,test_files_found,test_function_count,test_lines,test_to_source_ratio,todo_fixme_count,todo_fixme_semantic_density,total_imports,unit_test_presence,y_FeatureEnvy,y_FormattingIssues,y_GlobalStateAbuse,y_GodClass,y_LargeParameterList,y_LazyClass,y_LongMethod,y_MisleadingComments,y_PoorDocumentation,y_SpaghettiCode,y_UnstableModule,y_UntestedCode,y_any_smell
0,0.360756,0.0,0.394464,0.0,0.333333,0.0,0.0,0.0,0.125,0.167,0.623314,0.0,0.033846,0.0,awesome-python\sort.py,0.025974,0.0,0.0,0.101876,0.010526,0.026215,0.02621,0.105263,False,False,0.011375,0.011375,False,0.172667,0.125,0.115646,0.005102,0.3,0.254697,0.11852,0.0,0.0,0.12288,0.0,0.002972,0.035704,0.0,0.006111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,0,0,1,0,0,1,1
1,0.166203,0.0,0.369089,0.0,0.0,0.0,0.0,0.0,0.017857,0.095,0.326629,0.0,0.009231,0.0,fastapi\pdm_build.py,0.012987,0.00625,0.038462,0.0,0.0,0.0,0.0,0.017544,False,False,0.002741,0.002741,False,0.166667,0.0,0.0839,0.001749,0.1,0.26096,0.045879,0.613327,0.0,0.0,0.0,0.0,0.0,0.0,0.001944,1.0,1.0,1.0,0.071766,0.0,0.0,0.038961,0.0,0,0,0,0,1,0,0,0,1,0,0,0,1
2,0.110617,0.0,0.242215,0.0,0.0,0.04878,0.05,0.0,0.0,0.0,0.1632,0.0,0.003077,0.0,fastapi\docs_src\additional_responses\tutorial...,0.0,0.00625,0.038462,0.0,0.0,0.0,0.0,0.017544,False,False,0.003015,0.003015,False,0.333333,0.0,0.097506,0.000583,0.1,0.521921,0.015293,0.613327,0.0,0.0,0.0,0.001486,0.130513,0.0,0.001944,0.0,0.0,0.0,0.0,0.0,0.0,0.038961,0.0,0,1,0,0,0,1,0,0,1,0,0,0,1
3,0.110617,0.0,0.226067,0.0,0.0,0.02439,0.025,0.0,0.0,0.0,0.104,0.0,0.003077,0.0,fastapi\docs_src\additional_responses\tutorial...,0.0,0.00625,0.038462,0.021992,0.000387,0.004495,0.004494,0.017544,False,False,0.003837,0.003837,False,0.266667,0.0,0.072562,0.000729,0.1,0.417537,0.019116,0.643482,0.0,0.0,0.0,0.0,0.0,0.0,0.003056,0.0,0.0,0.0,0.0,0.0,0.0,0.038961,0.0,0,0,0,0,0,1,0,0,1,0,0,0,1
4,0.110617,0.0,0.222607,0.0,0.0,0.02439,0.025,0.0,0.0,0.0,0.099429,0.0,0.003077,0.0,fastapi\docs_src\additional_responses\tutorial...,0.0,0.00625,0.038462,0.0,0.0,0.0,0.0,0.017544,False,False,0.004111,0.004111,False,0.266667,0.0,0.073696,0.000729,0.1,0.417537,0.019116,0.643482,0.0,0.0,0.0,0.0,0.0,0.0,0.003194,0.0,0.0,0.0,0.0,0.0,0.0,0.051948,0.0,0,0,0,0,0,1,0,0,1,0,0,0,1


In [72]:
# Separate features and target
X = df.iloc[:, :51].drop(columns=['file_path'])
y = df.iloc[:, 51:]
y.head()

Unnamed: 0,y_FeatureEnvy,y_FormattingIssues,y_GlobalStateAbuse,y_GodClass,y_LargeParameterList,y_LazyClass,y_LongMethod,y_MisleadingComments,y_PoorDocumentation,y_SpaghettiCode,y_UnstableModule,y_UntestedCode,y_any_smell
0,1,0,0,0,0,0,0,0,1,0,0,1,1
1,0,0,0,0,1,0,0,0,1,0,0,0,1
2,0,1,0,0,0,1,0,0,1,0,0,0,1
3,0,0,0,0,0,1,0,0,1,0,0,0,1
4,0,0,0,0,0,1,0,0,1,0,0,0,1


In [73]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [74]:
# Baseline Random Forest (before tuning)
rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

model = MultiOutputClassifier(rf)

# Train the baseline model
model.fit(X_train, y_train)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,n_jobs,

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [75]:
# Hyperparameter tuning
param_dist = {
    "estimator__n_estimators": [100, 200, 300],
    "estimator__max_depth": [None, 20, 40],
    "estimator__min_samples_split": [2, 5],
    "estimator__min_samples_leaf": [1, 2]
}

random_search  = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    scoring="f1_micro",
    n_jobs=-1,
    verbose=2,
    random_state=42
)

random_search.fit(X_train, y_train)

print("Best parameters from Random Search:")
print(random_search.best_params_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters from Random Search:
{'estimator__n_estimators': 300, 'estimator__min_samples_split': 2, 'estimator__min_samples_leaf': 1, 'estimator__max_depth': 40}


In [76]:
# hyper tuning using gridSearch
best_params = random_search.best_params_

grid_param_dist = {
    "estimator__n_estimators": [
        best_params["estimator__n_estimators"] - 100,
        best_params["estimator__n_estimators"],
        best_params["estimator__n_estimators"] + 100
    ],
    "estimator__max_depth": [
        best_params["estimator__max_depth"],
        None if best_params["estimator__max_depth"] is not None else None
    ],
    "estimator__min_samples_split": [
        best_params["estimator__min_samples_split"]
    ],
    "estimator__min_samples_leaf": [
        best_params["estimator__min_samples_leaf"]
    ]
}

# Remove invalid values
for k in grid_param_dist:
    grid_param_dist[k] = list(set(
        v for v in grid_param_dist[k] if v is not None
    ))

grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid_param_dist,
    cv=3,
    scoring="f1_micro",
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

print("Best parameters from Grid Search:")
print(grid_search.best_params_)


Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best parameters from Grid Search:
{'estimator__max_depth': 40, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 300}


In [77]:
# Evaluate the tuned model
Y_pred = best_model.predict(X_test)

print("Final F1-micro:",
      f1_score(y_test, Y_pred, average="micro"))

print("Final Hamming Loss:",
      hamming_loss(y_test, Y_pred))

print("\nPer-label performance:")
print(classification_report(
    y_test,
    Y_pred,
    target_names=y.columns
))


Final F1-micro: 0.9819940876108573
Final Hamming Loss: 0.011119409177661604

Per-label performance:
                      precision    recall  f1-score   support

       y_FeatureEnvy       0.99      0.99      0.99       320
  y_FormattingIssues       0.91      0.85      0.88       368
  y_GlobalStateAbuse       1.00      0.98      0.99        49
          y_GodClass       0.91      0.75      0.82        40
y_LargeParameterList       1.00      1.00      1.00       520
         y_LazyClass       0.99      0.99      0.99       137
        y_LongMethod       1.00      0.99      0.99       215
y_MisleadingComments       1.00      1.00      1.00       327
 y_PoorDocumentation       1.00      1.00      1.00       587
     y_SpaghettiCode       1.00      0.84      0.91        31
    y_UnstableModule       1.00      1.00      1.00        20
      y_UntestedCode       1.00      1.00      1.00       319
         y_any_smell       0.98      1.00      0.99       803

           micro avg       0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [78]:
with open("../models/random_forest.pkl", "wb") as f:
    pickle.dump(best_model, f)

print("Tuned Random Forest model saved successfully!")


Tuned Random Forest model saved successfully!
