# XGBoost for each tag
XGBoost is one of the state of the art models for binary classification. The goal is to model each tag through its own model. To ensure the best prediction, F1-Score is optimised for balancing recall and precision. Accuracy is not suitable, since some tags are very unbalanced. The models are optimised with cross-validated gridsearch. This modelling approach doesn't account for the time series aspect.

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from joblib import dump, load

In [2]:
# data import
current_dir = os.getcwd()

# construct path to the project data folder
data_dir = os.path.join(current_dir, '..', '..', '..', 'Data','Sonar_Issues')

model_save_dir = os.path.join(current_dir, '..', '..', '..', 'Data', 'Models', 'CodeSmellTags')

# load SonarQube measure data
df = pd.read_csv(os.path.join(data_dir, 'measures+tags.csv'), low_memory=False)
df = df[df['PROJECT_ID'] == 'hive']
df

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY,TAGS
13698,hive,2015-03-03 00:37:22,8477.0,3882.0,1088490.0,743742.0,387,358319.0,62459.0,76113.0,...,2.3,121074.0,437602.0,437602.0,140806,7917,813,9.3,12.9,"brain-overload, unused, antipattern, pitfall"
13699,hive,2015-03-02 18:18:35,8477.0,3882.0,1088466.0,743721.0,387,358306.0,62458.0,76112.0,...,2.3,121067.0,437585.0,437585.0,140806,7917,813,9.3,12.9,"error-handling, design, unused, suspicious"
13700,hive,2015-02-27 23:08:33,8468.0,3872.0,1087272.0,742901.0,387,357917.0,62390.0,76071.0,...,2.3,120954.0,437096.0,437096.0,140709,7913,810,9.3,12.9,"convention, pitfall"
13701,hive,2015-02-27 21:30:05,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,2.3,119218.0,431125.0,431125.0,139347,7774,791,9.3,13.0,pitfall
13702,hive,2015-02-27 21:09:45,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,2.3,119218.0,431125.0,431125.0,139347,7774,791,9.3,13.0,"error-handling, clumsy, design, suspicious, pi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15549,hive,2008-09-18 17:37:59,664.0,399.0,72263.0,51707.0,33,28559.0,4592.0,3235.0,...,2.6,11206.0,33041.0,33041.0,17659,1224,75,5.9,24.4,"error-handling, clumsy, brain-overload, bad-pr..."
15550,hive,2008-09-18 00:09:17,661.0,397.0,71629.0,51241.0,33,28335.0,4538.0,3215.0,...,2.6,11061.0,32889.0,32889.0,17789,1228,74,5.9,24.8,"error-handling, clumsy, brain-overload, design..."
15551,hive,2008-09-17 20:13:00,613.0,358.0,67865.0,48976.0,29,27145.0,4346.0,2985.0,...,2.6,10701.0,31505.0,31505.0,16785,1208,66,5.7,24.7,"convention, design"
15552,hive,2008-09-17 00:28:22,613.0,358.0,67754.0,48873.0,29,27078.0,4340.0,2983.0,...,2.6,10691.0,31428.0,31428.0,16790,1208,66,5.8,24.8,"brain-overload, clumsy"


## Prepare labels

In [3]:
all_tags = ['convention', 'brain-overload','unused','error-handling','bad-practice','pitfall',
            'clumsy','suspicious','design','antipattern','redundant','confusing','performance','obsolete']

# transform TAGS strings to lists
df.loc[:, 'TAGS'] = df['TAGS'].str.split(',')
# remove whitespaces
df.loc[:, 'TAGS'] = df['TAGS'].apply(lambda x: [item.strip() for item in x])

# save TAGS as raw_labels to be further processed
raw_labels = df['TAGS']

# initialise mlb with all tag categories
mlb = MultiLabelBinarizer(classes=all_tags)
# fit the mlb with the list of lists of raw labels
Y_binarized = mlb.fit_transform(raw_labels)

print(f"MLB classes (order of one-hot columns): {mlb.classes_}")
num_classes = len(mlb.classes_)
print(f"Total number of possible labels: {num_classes}")

tags_df = pd.DataFrame(Y_binarized, columns=mlb.classes_)

# create copy of original df with reset index so that the binarised labels get inserted correctly
df_reset = df.reset_index(drop=True)

# concatenate the new tags_df with  original df
df_binary = pd.concat([df_reset.drop('TAGS', axis=1), tags_df], axis=1)
df_binary 

MLB classes (order of one-hot columns): ['convention' 'brain-overload' 'unused' 'error-handling' 'bad-practice'
 'pitfall' 'clumsy' 'suspicious' 'design' 'antipattern' 'redundant'
 'confusing' 'performance' 'obsolete']
Total number of possible labels: 14


Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,bad-practice,pitfall,clumsy,suspicious,design,antipattern,redundant,confusing,performance,obsolete
0,hive,2015-03-03 00:37:22,8477.0,3882.0,1088490.0,743742.0,387,358319.0,62459.0,76113.0,...,0,1,0,0,0,1,0,0,0,0
1,hive,2015-03-02 18:18:35,8477.0,3882.0,1088466.0,743721.0,387,358306.0,62458.0,76112.0,...,0,0,0,1,1,0,0,0,0,0
2,hive,2015-02-27 23:08:33,8468.0,3872.0,1087272.0,742901.0,387,357917.0,62390.0,76071.0,...,0,1,0,0,0,0,0,0,0,0
3,hive,2015-02-27 21:30:05,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,0,1,0,0,0,0,0,0,0,0
4,hive,2015-02-27 21:09:45,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,0,1,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1851,hive,2008-09-18 17:37:59,664.0,399.0,72263.0,51707.0,33,28559.0,4592.0,3235.0,...,1,1,1,1,0,1,0,0,0,0
1852,hive,2008-09-18 00:09:17,661.0,397.0,71629.0,51241.0,33,28335.0,4538.0,3215.0,...,1,1,1,1,1,1,0,0,1,1
1853,hive,2008-09-17 20:13:00,613.0,358.0,67865.0,48976.0,29,27145.0,4346.0,2985.0,...,0,0,0,0,1,0,0,0,0,0
1854,hive,2008-09-17 00:28:22,613.0,358.0,67754.0,48873.0,29,27078.0,4340.0,2983.0,...,0,0,1,0,0,0,0,0,0,0


In [4]:
df_binary.columns

Index(['PROJECT_ID', 'SQ_ANALYSIS_DATE', 'CLASSES', 'FILES', 'LINES', 'NCLOC',
       'PACKAGE', 'STATEMENTS', 'FUNCTIONS', 'COMMENT_LINES', 'COMPLEXITY',
       'CLASS_COMPLEXITY', 'FUNCTION_COMPLEXITY', 'COGNITIVE_COMPLEXITY',
       'LINES_TO_COVER', 'UNCOVERED_LINES', 'DUPLICATED_LINES',
       'DUPLICATED_BLOCKS', 'DUPLICATED_FILES', 'COMMENT_LINES_DENSITY',
       'DUPLICATED_LINES_DENSITY', 'convention', 'brain-overload', 'unused',
       'error-handling', 'bad-practice', 'pitfall', 'clumsy', 'suspicious',
       'design', 'antipattern', 'redundant', 'confusing', 'performance',
       'obsolete'],
      dtype='object')

## Scale predictors

In [5]:
columns_to_scale = ['CLASSES', 'FILES', 'LINES', 'NCLOC',
       'PACKAGE', 'STATEMENTS', 'FUNCTIONS', 'COMMENT_LINES', 'COMPLEXITY',
       'CLASS_COMPLEXITY', 'FUNCTION_COMPLEXITY', 'COGNITIVE_COMPLEXITY',
       'LINES_TO_COVER', 'UNCOVERED_LINES', 'DUPLICATED_LINES',
       'DUPLICATED_BLOCKS', 'DUPLICATED_FILES', 'COMMENT_LINES_DENSITY',
       'DUPLICATED_LINES_DENSITY']
scaler = StandardScaler()
df_binary[columns_to_scale] = scaler.fit_transform(df_binary[columns_to_scale])

## Train-Test-Split

In [6]:
# train-test-split
X = df_binary.drop(columns = ['PROJECT_ID', 'SQ_ANALYSIS_DATE', 'convention', 'brain-overload', 'unused',
       'error-handling', 'bad-practice', 'pitfall', 'clumsy', 'suspicious',
       'design', 'antipattern', 'redundant', 'confusing', 'performance',
       'obsolete'])
y = df_binary[['convention', 'brain-overload', 'unused',
       'error-handling', 'bad-practice', 'pitfall', 'clumsy', 'suspicious',
       'design', 'antipattern', 'redundant', 'confusing', 'performance',
       'obsolete']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Prepare lists for saving model metrics
To make comparability of the multiple models easier, the test metrics of each model are saved into a list and exported into a csv for further analysis at the end.

In [7]:
roc_auc_list = []
acc_list = []
prec_list = []
recall_list = []
f1_list = []

## Modelling

### Convention

In [8]:
tag = 'convention'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 1.3481012658227849, 'subsample': 0.8}
XGBoost Model Evaluation for Tag: convention
Accuracy: 0.510752688172043
ROC-AUC: 0.5098652759669708
Precision: 0.4860335195530726
Recall: 0.4915254237288136
F1 Score: 0.4887640449438202


### Brain-overload

In [9]:
tag = 'brain-overload'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 200, 'scale_pos_weight': 1.3296703296703296, 'subsample': 0.9}
XGBoost Model Evaluation for Tag: brain-overload
Accuracy: 0.5564516129032258
ROC-AUC: 0.5499016643672763
Precision: 0.49390243902439024
Recall: 0.49693251533742333
F1 Score: 0.4954128440366973


### Unused

In [10]:
tag = 'unused'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 50, 'scale_pos_weight': 1.8704061895551258, 'subsample': 0.9}
XGBoost Model Evaluation for Tag: unused
Accuracy: 0.5161290322580645
ROC-AUC: 0.5063121138866505
Precision: 0.41818181818181815
Recall: 0.45098039215686275
F1 Score: 0.43396226415094336


### Error-handling

In [11]:
tag = 'error-handling'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 50, 'scale_pos_weight': 1.853846153846154, 'subsample': 0.8}
XGBoost Model Evaluation for Tag: error-handling
Accuracy: 0.5349462365591398
ROC-AUC: 0.5259845463609173
Precision: 0.391812865497076
Recall: 0.49264705882352944
F1 Score: 0.4364820846905537


### Bad practice

In [12]:
tag = 'bad-practice'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 50, 'scale_pos_weight': 2.5845410628019323, 'subsample': 0.9}
XGBoost Model Evaluation for Tag: bad-practice
Accuracy: 0.5510752688172043
ROC-AUC: 0.5442261313661512
Precision: 0.3236994219653179
Recall: 0.5283018867924528
F1 Score: 0.4014336917562724


### Pitfall

In [13]:
tag = 'pitfall'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 2.71, 'subsample': 0.9}
XGBoost Model Evaluation for Tag: pitfall
Accuracy: 0.5672043010752689
ROC-AUC: 0.5337617299061608
Precision: 0.3422818791946309
Recall: 0.4473684210526316
F1 Score: 0.38783269961977185


### Clumsy

In [14]:
tag = 'clumsy'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'scale_pos_weight': 2.728643216080402, 'subsample': 0.7}
XGBoost Model Evaluation for Tag: clumsy
Accuracy: 0.5268817204301075
ROC-AUC: 0.5071399428804569
Precision: 0.3132530120481928
Recall: 0.45614035087719296
F1 Score: 0.37142857142857144


### Suspicious

In [15]:
tag = 'suspicious'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 50, 'scale_pos_weight': 2.9363395225464193, 'subsample': 0.9}
XGBoost Model Evaluation for Tag: suspicious
Accuracy: 0.5161290322580645
ROC-AUC: 0.5385620915032681
Precision: 0.30303030303030304
Recall: 0.5882352941176471
F1 Score: 0.39999999999999997


### Design

In [16]:
tag = 'design'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 50, 'scale_pos_weight': 3.264367816091954, 'subsample': 0.9}
XGBoost Model Evaluation for Tag: design
Accuracy: 0.5806451612903226
ROC-AUC: 0.5385901577047312
Precision: 0.2925170068027211
Recall: 0.45263157894736844
F1 Score: 0.35537190082644626


### Antipattern

In [17]:
tag = 'antipattern'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 50, 'scale_pos_weight': 3.6666666666666665, 'subsample': 0.8}
XGBoost Model Evaluation for Tag: antipattern
Accuracy: 0.5967741935483871
ROC-AUC: 0.5182491582491583
Precision: 0.21804511278195488
Recall: 0.38666666666666666
F1 Score: 0.2788461538461538


### Redundant

In [18]:
tag = 'redundant'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 50, 'scale_pos_weight': 8.275, 'subsample': 0.9}
XGBoost Model Evaluation for Tag: redundant
Accuracy: 0.5860215053763441
ROC-AUC: 0.5581366459627329
Precision: 0.16666666666666666
Recall: 0.52
F1 Score: 0.2524271844660194


### Confusing

In [19]:
tag = 'confusing'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 100, 'scale_pos_weight': 17.78481012658228, 'subsample': 0.7}
XGBoost Model Evaluation for Tag: confusing
Accuracy: 0.8387096774193549
ROC-AUC: 0.48921448921448923
Precision: 0.046511627906976744
Recall: 0.09523809523809523
F1 Score: 0.0625


### Performance

In [20]:
tag = 'performance'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 200, 'scale_pos_weight': 19.61111111111111, 'subsample': 0.8}
XGBoost Model Evaluation for Tag: performance
Accuracy: 0.8602150537634409
ROC-AUC: 0.4815199335548173
Precision: 0.038461538461538464
Recall: 0.03571428571428571
F1 Score: 0.03703703703703704


### Obsolete

In [21]:
tag = 'obsolete'

# weighting for balancing the two classes
positive_count = np.sum(y_train[tag] == 1)
negative_count = np.sum(y_train[tag] == 0)
calculated_scale_pos_weight = negative_count / positive_count

xgb_model = XGBClassifier(objective='binary:logistic', random_state=42)
y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [4, 5, 6],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.8, 0.9],
    'scale_pos_weight': [calculated_scale_pos_weight]
}

# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid, optimizing auc
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)

# evaluation
# roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
roc_auc = roc_auc_score(y_test_temp, y_pred)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred)
recall = recall_score(y_test_temp, y_pred)
f1 = f1_score(y_test_temp, y_pred)

print(f"XGBoost Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

filename_joblib = os.path.join(model_save_dir, "XGBoost", f"XGBoost_{tag}.joblib")
dump(best_model, filename_joblib)

roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'scale_pos_weight': 38.05263157894737, 'subsample': 0.7}
XGBoost Model Evaluation for Tag: obsolete
Accuracy: 0.8682795698924731
ROC-AUC: 0.4412568306010929
Precision: 0.0
Recall: 0.0
F1 Score: 0.0


## Save Model Evaluation results

In [22]:
tag_list = ['convention', 'brain-overload', 'unused',
       'error-handling', 'bad-practice', 'pitfall', 'clumsy', 'suspicious',
       'design', 'antipattern', 'redundant', 'confusing', 'performance',
       'obsolete']
data = {
    'Tag': tag_list,
    'ROC-AUC': roc_auc_list,
    'Accuracy': acc_list,
    'Precision': prec_list,
    'Recall': recall_list,
    'F1-Score': f1_list
}

df_results = pd.DataFrame(data)

df_results.to_csv(os.path.join(model_save_dir, 'XGBoost_Evaluation_Results.csv'), index = False)