# RandomForest for each tag
Random forest is one of the state-of-the-art models for binary classification. The goal is to model each tag with its own binary classifier. To obtain the best predictions, the F1 score is optimised, which balances recall and precision. Accuracy is not suitable because some tags are highly imbalanced. The models are tuned with a cross-validated grid search. This modelling approach does not account for the time-series nature of the data.

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import make_scorer
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from joblib import dump, load

In [2]:
# data import
current_dir = os.getcwd()

# construct path to the project data folder (three levels above the working directory)
data_dir = os.path.join(current_dir, '..', '..', '..', 'Data','Sonar_Issues')

# directory under which the fitted per-tag models are stored
model_save_dir = os.path.join(current_dir, '..', '..', '..', 'Data', 'Models', 'CodeSmellTags')

# load SonarQube measure data
df = pd.read_csv(os.path.join(data_dir, 'measures+tags.csv'), low_memory=False)
# keep only the 'hive' project; .copy() detaches the selection from the full
# frame so that later column assignments on df operate on an independent
# DataFrame instead of raising pandas' SettingWithCopyWarning
df = df[df['PROJECT_ID'] == 'hive'].copy()
df

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY,TAGS
13698,hive,2015-03-03 00:37:22,8477.0,3882.0,1088490.0,743742.0,387,358319.0,62459.0,76113.0,...,2.3,121074.0,437602.0,437602.0,140806,7917,813,9.3,12.9,"brain-overload, unused, antipattern, pitfall"
13699,hive,2015-03-02 18:18:35,8477.0,3882.0,1088466.0,743721.0,387,358306.0,62458.0,76112.0,...,2.3,121067.0,437585.0,437585.0,140806,7917,813,9.3,12.9,"error-handling, design, unused, suspicious"
13700,hive,2015-02-27 23:08:33,8468.0,3872.0,1087272.0,742901.0,387,357917.0,62390.0,76071.0,...,2.3,120954.0,437096.0,437096.0,140709,7913,810,9.3,12.9,"convention, pitfall"
13701,hive,2015-02-27 21:30:05,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,2.3,119218.0,431125.0,431125.0,139347,7774,791,9.3,13.0,pitfall
13702,hive,2015-02-27 21:09:45,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,2.3,119218.0,431125.0,431125.0,139347,7774,791,9.3,13.0,"error-handling, clumsy, design, suspicious, pi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15549,hive,2008-09-18 17:37:59,664.0,399.0,72263.0,51707.0,33,28559.0,4592.0,3235.0,...,2.6,11206.0,33041.0,33041.0,17659,1224,75,5.9,24.4,"error-handling, clumsy, brain-overload, bad-pr..."
15550,hive,2008-09-18 00:09:17,661.0,397.0,71629.0,51241.0,33,28335.0,4538.0,3215.0,...,2.6,11061.0,32889.0,32889.0,17789,1228,74,5.9,24.8,"error-handling, clumsy, brain-overload, design..."
15551,hive,2008-09-17 20:13:00,613.0,358.0,67865.0,48976.0,29,27145.0,4346.0,2985.0,...,2.6,10701.0,31505.0,31505.0,16785,1208,66,5.7,24.7,"convention, design"
15552,hive,2008-09-17 00:28:22,613.0,358.0,67754.0,48873.0,29,27078.0,4340.0,2983.0,...,2.6,10691.0,31428.0,31428.0,16790,1208,66,5.8,24.8,"brain-overload, clumsy"


## Binarise labels

In [3]:
all_tags = ['convention', 'brain-overload','unused','error-handling','bad-practice','pitfall',
            'clumsy','suspicious','design','antipattern','redundant','confusing','performance','obsolete']


def _parse_tags(value):
    """Turn one raw TAGS cell into a list of stripped tag names.

    Accepts the comma-separated string read from the CSV as well as an
    already-parsed list/tuple, so re-running this cell does not fail.
    Any other value (e.g. NaN for a missing entry) yields an empty list.
    Empty fragments such as the one left by a trailing comma are dropped.
    """
    if isinstance(value, str):
        value = value.split(',')
    elif not isinstance(value, (list, tuple)):
        return []
    return [item.strip() for item in value if item.strip()]


# transform TAGS strings to lists of whitespace-stripped tag names
df.loc[:, 'TAGS'] = df['TAGS'].apply(_parse_tags)

# save TAGS as raw_labels to be further processed
raw_labels = df['TAGS']

# initialise mlb with all tag categories; passing classes fixes the column order
mlb = MultiLabelBinarizer(classes=all_tags)
# fit the mlb with the list of lists of raw labels
Y_binarized = mlb.fit_transform(raw_labels)

print(f"MLB classes (order of one-hot columns): {mlb.classes_}")
num_classes = len(mlb.classes_)
print(f"Total number of possible labels: {num_classes}")

# tags_df gets a fresh 0..n-1 index
tags_df = pd.DataFrame(Y_binarized, columns=mlb.classes_)

# create copy of original df with reset index so that the binarised labels get inserted correctly
# (concat aligns on the index, and df still carries the row labels of the full CSV)
df_reset = df.reset_index(drop=True)

# concatenate the new tags_df with original df, replacing the raw TAGS column
df_binary = pd.concat([df_reset.drop('TAGS', axis=1), tags_df], axis=1)
df_binary

MLB classes (order of one-hot columns): ['convention' 'brain-overload' 'unused' 'error-handling' 'bad-practice'
 'pitfall' 'clumsy' 'suspicious' 'design' 'antipattern' 'redundant'
 'confusing' 'performance' 'obsolete']
Total number of possible labels: 14


Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,bad-practice,pitfall,clumsy,suspicious,design,antipattern,redundant,confusing,performance,obsolete
0,hive,2015-03-03 00:37:22,8477.0,3882.0,1088490.0,743742.0,387,358319.0,62459.0,76113.0,...,0,1,0,0,0,1,0,0,0,0
1,hive,2015-03-02 18:18:35,8477.0,3882.0,1088466.0,743721.0,387,358306.0,62458.0,76112.0,...,0,0,0,1,1,0,0,0,0,0
2,hive,2015-02-27 23:08:33,8468.0,3872.0,1087272.0,742901.0,387,357917.0,62390.0,76071.0,...,0,1,0,0,0,0,0,0,0,0
3,hive,2015-02-27 21:30:05,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,0,1,0,0,0,0,0,0,0,0
4,hive,2015-02-27 21:09:45,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,0,1,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1851,hive,2008-09-18 17:37:59,664.0,399.0,72263.0,51707.0,33,28559.0,4592.0,3235.0,...,1,1,1,1,0,1,0,0,0,0
1852,hive,2008-09-18 00:09:17,661.0,397.0,71629.0,51241.0,33,28335.0,4538.0,3215.0,...,1,1,1,1,1,1,0,0,1,1
1853,hive,2008-09-17 20:13:00,613.0,358.0,67865.0,48976.0,29,27145.0,4346.0,2985.0,...,0,0,0,0,1,0,0,0,0,0
1854,hive,2008-09-17 00:28:22,613.0,358.0,67754.0,48873.0,29,27078.0,4340.0,2983.0,...,0,0,1,0,0,0,0,0,0,0


## Scale predictors

In [4]:
# SonarQube measures that are standardised (zero mean, unit variance per column).
columns_to_scale = [
    'CLASSES', 'FILES', 'LINES', 'NCLOC', 'PACKAGE', 'STATEMENTS',
    'FUNCTIONS', 'COMMENT_LINES', 'COMPLEXITY', 'CLASS_COMPLEXITY',
    'FUNCTION_COMPLEXITY', 'COGNITIVE_COMPLEXITY', 'LINES_TO_COVER',
    'UNCOVERED_LINES', 'DUPLICATED_LINES', 'DUPLICATED_BLOCKS',
    'DUPLICATED_FILES', 'COMMENT_LINES_DENSITY', 'DUPLICATED_LINES_DENSITY',
]

# NOTE(review): the scaler is fitted on every row of df_binary, i.e. before the
# train/test split, so test-set statistics flow into the scaling. Tree ensembles
# are presumably insensitive to this per-feature rescaling, but confirm before
# reusing these features with scale-sensitive models.
scaler = StandardScaler()
scaler.fit(df_binary[columns_to_scale])
# overwrite the raw measures in place with their standardised values
df_binary[columns_to_scale] = scaler.transform(df_binary[columns_to_scale])

## Train-Test-Split

In [5]:
# train-test-split
# Columns that identify a row rather than describe the code base.
non_feature_cols = ['PROJECT_ID', 'SQ_ANALYSIS_DATE']
# 'Unnamed: 0' is presumably the row index that was saved with the CSV; it is
# an identifier, not a SonarQube measure, so it must not be used as a predictor.
if 'Unnamed: 0' in df_binary.columns:
    non_feature_cols.append('Unnamed: 0')

# Predictors: everything except the identifiers and the binarised tag columns.
X = df_binary.drop(columns=non_feature_cols + list(all_tags))
# Targets: one 0/1 column per tag, in the order of all_tags.
y = df_binary[all_tags]

# Random 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Prepare lists for saving model metrics
To make the models easier to compare, the test metrics of each model are appended to lists and exported to a CSV file at the end for further analysis.

In [6]:
# One independent list per test metric; every modelling cell appends one value,
# so re-running a modelling cell without re-running this one adds a duplicate.
roc_auc_list, acc_list, prec_list, recall_list, f1_list = [], [], [], [], []

## Modelling

### Convention

In [7]:
tag = 'convention'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# hyperparameter grid for gridsearch (4 * 3 * 2 * 3 * 2 = 144 candidates);
# the cells for the other tags reuse this module-level grid
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_features': ['sqrt', 'log2', 0.5],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5, 10],
    'criterion': ['gini', 'entropy']
}

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: convention
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: convention

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 500}
Best cross-validation ROC AUC: 0.4509
Logistic Regression Model Evaluation for Tag: convention
Accuracy: 0.5349462365591398
ROC-AUC: 0.5577285238302188
Precision: 0.5126582278481012
Recall: 0.4576271186440678
F1 Score: 0.4835820895522388


In [8]:
tag = 'brain-overload'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: brain-overload
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: brain-overload

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 0.5, 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation ROC AUC: 0.4792
Logistic Regression Model Evaluation for Tag: brain-overload
Accuracy: 0.5483870967741935
ROC-AUC: 0.5549945695247601
Precision: 0.48466257668711654
Recall: 0.48466257668711654
F1 Score: 0.48466257668711654


In [9]:
tag = 'unused'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: unused
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: unused

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 500}
Best cross-validation ROC AUC: 0.4103
Logistic Regression Model Evaluation for Tag: unused
Accuracy: 0.5053763440860215
ROC-AUC: 0.4716029486375981
Precision: 0.3900709219858156
Recall: 0.35947712418300654
F1 Score: 0.3741496598639456


In [10]:
tag = 'error-handling'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: error-handling
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: error-handling

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 0.5, 'min_samples_split': 10, 'n_estimators': 200}
Best cross-validation ROC AUC: 0.3827
Logistic Regression Model Evaluation for Tag: error-handling
Accuracy: 0.553763440860215
ROC-AUC: 0.5178526919242274
Precision: 0.40384615384615385
Recall: 0.4632352941176471
F1 Score: 0.4315068493150685


In [11]:
tag = 'bad-practice'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: bad-practice
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: bad-practice

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 300}
Best cross-validation ROC AUC: 0.3462
Logistic Regression Model Evaluation for Tag: bad-practice
Accuracy: 0.5887096774193549
ROC-AUC: 0.5178926088806922
Precision: 0.30578512396694213
Recall: 0.3490566037735849
F1 Score: 0.3259911894273128


In [12]:
tag = 'pitfall'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: pitfall
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: pitfall

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation ROC AUC: 0.2668
Logistic Regression Model Evaluation for Tag: pitfall
Accuracy: 0.6155913978494624
ROC-AUC: 0.5564735482116143
Precision: 0.3619047619047619
Recall: 0.3333333333333333
F1 Score: 0.3470319634703196


In [13]:
tag = 'clumsy'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: clumsy
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: clumsy

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 0.5, 'min_samples_split': 10, 'n_estimators': 300}
Best cross-validation ROC AUC: 0.3429
Logistic Regression Model Evaluation for Tag: clumsy
Accuracy: 0.5672043010752689
ROC-AUC: 0.5335407316741466
Precision: 0.32592592592592595
Recall: 0.38596491228070173
F1 Score: 0.3534136546184739


In [14]:
tag = 'suspicious'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: suspicious
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: suspicious

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 0.5, 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation ROC AUC: 0.2848
Logistic Regression Model Evaluation for Tag: suspicious
Accuracy: 0.6021505376344086
ROC-AUC: 0.5575708061002178
Precision: 0.3283582089552239
Recall: 0.43137254901960786
F1 Score: 0.37288135593220334


In [15]:
tag = 'design'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: design
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: design

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 0.5, 'min_samples_split': 10, 'n_estimators': 200}
Best cross-validation ROC AUC: 0.2738
Logistic Regression Model Evaluation for Tag: design
Accuracy: 0.6290322580645161
ROC-AUC: 0.5333840015200455
Precision: 0.2736842105263158
Recall: 0.2736842105263158
F1 Score: 0.2736842105263158


In [16]:
tag = 'antipattern'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: antipattern
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: antipattern

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 500}
Best cross-validation ROC AUC: 0.2970
Logistic Regression Model Evaluation for Tag: antipattern
Accuracy: 0.6774193548387096
ROC-AUC: 0.5641301907968574
Precision: 0.25274725274725274
Recall: 0.30666666666666664
F1 Score: 0.27710843373493976


In [17]:
tag = 'redundant'

# Binary target for this tag, taken from the multi-label train/test frames.
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup; the fixed seed keeps folds reproducible
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid (defined in the 'convention' cell), optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model (refit is left at its default, so this is the refitted estimator)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## ROC AUC needs scores: take the probability column of the second class (label 1)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## accuracy, precision, recall and f1 need hard labels
y_pred = best_model.predict(X_test)

# evaluation on the held-out test split
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

# Header fixed: it used to say "Logistic Regression" although a random forest is evaluated.
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_save_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_save_dir, exist_ok=True)
filename_joblib = os.path.join(rf_save_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: redundant
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: redundant

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation ROC AUC: 0.1693
Logistic Regression Model Evaluation for Tag: redundant
Accuracy: 0.6908602150537635
ROC-AUC: 0.5776708074534161
Precision: 0.1348314606741573
Recall: 0.24
F1 Score: 0.1726618705035971


In [18]:
tag = 'confusing'

# select this tag's labels from the train / test targets
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup (shuffled, fixed seed)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model and the parameter combination that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## for ROC AUC, use the probability in column 1 (assumed to be the positive class)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## for accuracy, precision, recall, f1, use predict to get hard labels
y_pred = best_model.predict(X_test)

# evaluation
# zero_division=0 reports 0.0 explicitly when a metric is undefined (e.g. the
# model predicts no positive samples) instead of raising UndefinedMetricWarning.
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

# the evaluated model is a RandomForest, so label the report accordingly
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_model_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_model_dir, exist_ok=True)
filename_joblib = os.path.join(rf_model_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
# NOTE: these appends are positional and not idempotent -- re-running this cell
# adds a second entry for the same tag and breaks the alignment with the tag
# order used when the results table is built.
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: confusing
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: confusing

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 500}
Best cross-validation ROC AUC: 0.1304
Logistic Regression Model Evaluation for Tag: confusing
Accuracy: 0.9059139784946236
ROC-AUC: 0.3887532220865554
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


In [19]:
tag = 'performance'

# select this tag's labels from the train / test targets
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup (shuffled, fixed seed)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model and the parameter combination that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## for ROC AUC, use the probability in column 1 (assumed to be the positive class)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## for accuracy, precision, recall, f1, use predict to get hard labels
y_pred = best_model.predict(X_test)

# evaluation
# zero_division=0 reports 0.0 explicitly when a metric is undefined (e.g. the
# model predicts no positive samples) instead of raising UndefinedMetricWarning.
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

# the evaluated model is a RandomForest, so label the report accordingly
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_model_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_model_dir, exist_ok=True)
filename_joblib = os.path.join(rf_model_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
# NOTE: these appends are positional and not idempotent -- re-running this cell
# adds a second entry for the same tag and breaks the alignment with the tag
# order used when the results table is built.
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: performance
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: performance

Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 0.5, 'min_samples_split': 10, 'n_estimators': 300}
Best cross-validation ROC AUC: 0.0886
Logistic Regression Model Evaluation for Tag: performance
Accuracy: 0.8951612903225806
ROC-AUC: 0.4881644518272425
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


In [20]:
tag = 'obsolete'

# select this tag's labels from the train / test targets
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

# handle imbalance with class_weight = 'balanced'
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')

# 5-fold stratified cross validation setup (shuffled, fixed seed)
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimising f1_score
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1',
    cv=kf,
    verbose=2,
    n_jobs=-1,
)

print(f"\nStarting GridSearchCV for tag: {tag}")
grid_search.fit(X_train, y_train_temp)
print(f"GridSearchCV finished for tag: {tag}")

# extract best model and the parameter combination that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("\nBest Parameters:", best_params)
print(f"Best cross-validation F1: {grid_search.best_score_:.4f}")

# predictions for test data
## for ROC AUC, use the probability in column 1 (assumed to be the positive class)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]
## for accuracy, precision, recall, f1, use predict to get hard labels
y_pred = best_model.predict(X_test)

# evaluation
# zero_division=0 reports 0.0 explicitly when a metric is undefined (e.g. the
# model predicts no positive samples) instead of raising UndefinedMetricWarning.
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

# the evaluated model is a RandomForest, so label the report accordingly
print(f"Random Forest Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
rf_model_dir = os.path.join(model_save_dir, "RandomForest")
os.makedirs(rf_model_dir, exist_ok=True)
filename_joblib = os.path.join(rf_model_dir, f"RandomForest_{tag}.joblib")
dump(best_model, filename_joblib)

# save metrics
# NOTE: these appends are positional and not idempotent -- re-running this cell
# adds a second entry for the same tag and breaks the alignment with the tag
# order used when the results table is built.
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)


Starting GridSearchCV for tag: obsolete
Fitting 5 folds for each of 144 candidates, totalling 720 fits
GridSearchCV finished for tag: obsolete

Best Parameters: {'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 10, 'n_estimators': 100}
Best cross-validation ROC AUC: 0.0686
Logistic Regression Model Evaluation for Tag: obsolete
Accuracy: 0.946236559139785
ROC-AUC: 0.6227231329690346
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


## Save model evaluation results

In [21]:
# Tags in the order their per-tag cells appended to the metric lists; row i of
# the results table pairs tag_list[i] with the i-th entry of every metric list.
tag_list = ['convention', 'brain-overload', 'unused',
       'error-handling', 'bad-practice', 'pitfall', 'clumsy', 'suspicious',
       'design', 'antipattern', 'redundant', 'confusing', 'performance',
       'obsolete']

# output column name -> accumulated per-tag metric values
metric_lists = {
    'ROC-AUC': roc_auc_list,
    'Accuracy': acc_list,
    'Precision': prec_list,
    'Recall': recall_list,
    'F1-Score': f1_list
}

# The metric lists are filled by positional appends, so a skipped or re-run
# training cell leaves them out of step with tag_list. Fail early with a message
# that names the offending lists instead of relying on pandas' generic error.
mismatched_lengths = {
    metric_name: len(values)
    for metric_name, values in metric_lists.items()
    if len(values) != len(tag_list)
}
if mismatched_lengths:
    raise ValueError(
        f"Expected {len(tag_list)} entries per metric (one per tag), "
        f"but got {mismatched_lengths}. Restart the kernel and run every "
        "per-tag cell exactly once before saving the results."
    )

data = {'Tag': tag_list, **metric_lists}

df_results = pd.DataFrame(data)

# one row per tag, written without the index column
df_results.to_csv(os.path.join(model_save_dir, 'RandomForest_Evaluation_Results.csv'), index = False)