# Logistic Regression for each tag
Logistic Regression is one of the state of the art models for binary classification. The goal is to model each tag through its own model. To ensure the best prediction, F1-Score is optimised for balancing recall and precision. Accuracy is not suitable, since some tags are very unbalanced. The models are optimised with cross-validated gridsearch. This modelling approach doesn't account for the time series aspect.

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics import make_scorer
import warnings
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from joblib import dump, load

In [2]:
# data import
current_dir = os.getcwd()

# construct path to the project data folder
data_dir = os.path.join(current_dir, '..', '..', '..', 'Data','Sonar_Issues')

model_save_dir = os.path.join(current_dir, '..', '..', '..', 'Data', 'Models', 'CodeSmellTags')

# load SonarQube measure data
df = pd.read_csv(os.path.join(data_dir, 'measures+tags.csv'), low_memory=False)
df = df[df['PROJECT_ID'] == 'hive']
df

Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,FUNCTION_COMPLEXITY,COGNITIVE_COMPLEXITY,LINES_TO_COVER,UNCOVERED_LINES,DUPLICATED_LINES,DUPLICATED_BLOCKS,DUPLICATED_FILES,COMMENT_LINES_DENSITY,DUPLICATED_LINES_DENSITY,TAGS
13698,hive,2015-03-03 00:37:22,8477.0,3882.0,1088490.0,743742.0,387,358319.0,62459.0,76113.0,...,2.3,121074.0,437602.0,437602.0,140806,7917,813,9.3,12.9,"brain-overload, unused, antipattern, pitfall"
13699,hive,2015-03-02 18:18:35,8477.0,3882.0,1088466.0,743721.0,387,358306.0,62458.0,76112.0,...,2.3,121067.0,437585.0,437585.0,140806,7917,813,9.3,12.9,"error-handling, design, unused, suspicious"
13700,hive,2015-02-27 23:08:33,8468.0,3872.0,1087272.0,742901.0,387,357917.0,62390.0,76071.0,...,2.3,120954.0,437096.0,437096.0,140709,7913,810,9.3,12.9,"convention, pitfall"
13701,hive,2015-02-27 21:30:05,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,2.3,119218.0,431125.0,431125.0,139347,7774,791,9.3,13.0,pitfall
13702,hive,2015-02-27 21:09:45,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,2.3,119218.0,431125.0,431125.0,139347,7774,791,9.3,13.0,"error-handling, clumsy, design, suspicious, pi..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15549,hive,2008-09-18 17:37:59,664.0,399.0,72263.0,51707.0,33,28559.0,4592.0,3235.0,...,2.6,11206.0,33041.0,33041.0,17659,1224,75,5.9,24.4,"error-handling, clumsy, brain-overload, bad-pr..."
15550,hive,2008-09-18 00:09:17,661.0,397.0,71629.0,51241.0,33,28335.0,4538.0,3215.0,...,2.6,11061.0,32889.0,32889.0,17789,1228,74,5.9,24.8,"error-handling, clumsy, brain-overload, design..."
15551,hive,2008-09-17 20:13:00,613.0,358.0,67865.0,48976.0,29,27145.0,4346.0,2985.0,...,2.6,10701.0,31505.0,31505.0,16785,1208,66,5.7,24.7,"convention, design"
15552,hive,2008-09-17 00:28:22,613.0,358.0,67754.0,48873.0,29,27078.0,4340.0,2983.0,...,2.6,10691.0,31428.0,31428.0,16790,1208,66,5.8,24.8,"brain-overload, clumsy"


## Prepare labels

In [3]:
all_tags = ['convention', 'brain-overload','unused','error-handling','bad-practice','pitfall',
            'clumsy','suspicious','design','antipattern','redundant','confusing','performance','obsolete']

# transform TAGS strings to lists
df.loc[:, 'TAGS'] = df['TAGS'].str.split(',')
# remove whitespaces
df.loc[:, 'TAGS'] = df['TAGS'].apply(lambda x: [item.strip() for item in x])

# save TAGS as raw_labels to be further processed
raw_labels = df['TAGS']

# initialise mlb with all tag categories
mlb = MultiLabelBinarizer(classes=all_tags)
# fit the mlb with the list of lists of raw labels
Y_binarized = mlb.fit_transform(raw_labels)

print(f"MLB classes (order of one-hot columns): {mlb.classes_}")
num_classes = len(mlb.classes_)
print(f"Total number of possible labels: {num_classes}")

tags_df = pd.DataFrame(Y_binarized, columns=mlb.classes_)

# create copy of original df with reset index so that the binarised labels get inserted correctly
df_reset = df.reset_index(drop=True)

# concatenate the new tags_df with  original df
df_binary = pd.concat([df_reset.drop('TAGS', axis=1), tags_df], axis=1)
df_binary 

MLB classes (order of one-hot columns): ['convention' 'brain-overload' 'unused' 'error-handling' 'bad-practice'
 'pitfall' 'clumsy' 'suspicious' 'design' 'antipattern' 'redundant'
 'confusing' 'performance' 'obsolete']
Total number of possible labels: 14


Unnamed: 0,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,LINES,NCLOC,PACKAGE,STATEMENTS,FUNCTIONS,COMMENT_LINES,...,bad-practice,pitfall,clumsy,suspicious,design,antipattern,redundant,confusing,performance,obsolete
0,hive,2015-03-03 00:37:22,8477.0,3882.0,1088490.0,743742.0,387,358319.0,62459.0,76113.0,...,0,1,0,0,0,1,0,0,0,0
1,hive,2015-03-02 18:18:35,8477.0,3882.0,1088466.0,743721.0,387,358306.0,62458.0,76112.0,...,0,0,0,1,1,0,0,0,0,0
2,hive,2015-02-27 23:08:33,8468.0,3872.0,1087272.0,742901.0,387,357917.0,62390.0,76071.0,...,0,1,0,0,0,0,0,0,0,0
3,hive,2015-02-27 21:30:05,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,0,1,0,0,0,0,0,0,0,0
4,hive,2015-02-27 21:09:45,8327.0,3789.0,1071783.0,731599.0,364,352969.0,61412.0,75080.0,...,0,1,1,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1851,hive,2008-09-18 17:37:59,664.0,399.0,72263.0,51707.0,33,28559.0,4592.0,3235.0,...,1,1,1,1,0,1,0,0,0,0
1852,hive,2008-09-18 00:09:17,661.0,397.0,71629.0,51241.0,33,28335.0,4538.0,3215.0,...,1,1,1,1,1,1,0,0,1,1
1853,hive,2008-09-17 20:13:00,613.0,358.0,67865.0,48976.0,29,27145.0,4346.0,2985.0,...,0,0,0,0,1,0,0,0,0,0
1854,hive,2008-09-17 00:28:22,613.0,358.0,67754.0,48873.0,29,27078.0,4340.0,2983.0,...,0,0,1,0,0,0,0,0,0,0


In [4]:
df_binary.columns

Index(['PROJECT_ID', 'SQ_ANALYSIS_DATE', 'CLASSES', 'FILES', 'LINES', 'NCLOC',
       'PACKAGE', 'STATEMENTS', 'FUNCTIONS', 'COMMENT_LINES', 'COMPLEXITY',
       'CLASS_COMPLEXITY', 'FUNCTION_COMPLEXITY', 'COGNITIVE_COMPLEXITY',
       'LINES_TO_COVER', 'UNCOVERED_LINES', 'DUPLICATED_LINES',
       'DUPLICATED_BLOCKS', 'DUPLICATED_FILES', 'COMMENT_LINES_DENSITY',
       'DUPLICATED_LINES_DENSITY', 'convention', 'brain-overload', 'unused',
       'error-handling', 'bad-practice', 'pitfall', 'clumsy', 'suspicious',
       'design', 'antipattern', 'redundant', 'confusing', 'performance',
       'obsolete'],
      dtype='object')

## Scale predictors

In [5]:
columns_to_scale = ['CLASSES', 'FILES', 'LINES', 'NCLOC',
       'PACKAGE', 'STATEMENTS', 'FUNCTIONS', 'COMMENT_LINES', 'COMPLEXITY',
       'CLASS_COMPLEXITY', 'FUNCTION_COMPLEXITY', 'COGNITIVE_COMPLEXITY',
       'LINES_TO_COVER', 'UNCOVERED_LINES', 'DUPLICATED_LINES',
       'DUPLICATED_BLOCKS', 'DUPLICATED_FILES', 'COMMENT_LINES_DENSITY',
       'DUPLICATED_LINES_DENSITY']
scaler = StandardScaler()
df_binary[columns_to_scale] = scaler.fit_transform(df_binary[columns_to_scale])

## Train-Test-Split

In [6]:
# train-test-split
X = df_binary.drop(columns = ['PROJECT_ID', 'SQ_ANALYSIS_DATE', 'convention', 'brain-overload', 'unused',
       'error-handling', 'bad-practice', 'pitfall', 'clumsy', 'suspicious',
       'design', 'antipattern', 'redundant', 'confusing', 'performance',
       'obsolete'])
y = df_binary[['convention', 'brain-overload', 'unused',
       'error-handling', 'bad-practice', 'pitfall', 'clumsy', 'suspicious',
       'design', 'antipattern', 'redundant', 'confusing', 'performance',
       'obsolete']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Prepare lists for saving model metrics
To make comparability of the multiple models easier, the test metrics of each model are saved into a list and exported into a csv for further analysis at the end.

In [7]:
roc_auc_list = []
acc_list = []
prec_list = []
recall_list = []
f1_list = []

## Modelling

### Convention

In [8]:
tag = 'convention'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

def custom_f1_with_threshold_tuning(y_true, y_pred_proba):
    thresholds = np.linspace(0.01, 0.99, 50)
    best_f1 = 0
    for t in thresholds:
        y_pred_at_t = (y_pred_proba >= t).astype(int)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            current_f1 = f1_score(y_true, y_pred_at_t, zero_division=0)
        if current_f1 > best_f1:
            best_f1 = current_f1
    return best_f1

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2'}
Logistic Regression Model Evaluation for Tag: convention
Accuracy: 0.5456989247311828
ROC-AUC: 0.5888019701579024
Precision: 0.5212765957446809
Recall: 0.5536723163841808
F1 Score: 0.5369863013698629


### Brain-overload

In [9]:
tag = 'brain-overload'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2'}
Logistic Regression Model Evaluation for Tag: brain-overload
Accuracy: 0.5403225806451613
ROC-AUC: 0.509906948072915
Precision: 0.47530864197530864
Recall: 0.4723926380368098
F1 Score: 0.4738461538461538


### Unused

In [10]:
tag = 'unused'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 0.01, 'class_weight': 'balanced', 'penalty': 'l2'}
Logistic Regression Model Evaluation for Tag: unused
Accuracy: 0.5188172043010753
ROC-AUC: 0.5313815023726385
Precision: 0.4336734693877551
Recall: 0.5555555555555556
F1 Score: 0.4871060171919771


### Error-handling

In [11]:
tag = 'error-handling'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}
Logistic Regression Model Evaluation for Tag: error-handling
Accuracy: 0.5080645161290323
ROC-AUC: 0.4895314057826521
Precision: 0.36257309941520466
Recall: 0.45588235294117646
F1 Score: 0.4039087947882736


### Bad-practice

In [12]:
tag = 'bad-practice'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 10, 'class_weight': 'balanced', 'penalty': 'l1'}
Logistic Regression Model Evaluation for Tag: bad-practice
Accuracy: 0.5887096774193549
ROC-AUC: 0.546389558802667
Precision: 0.3284671532846715
Recall: 0.42452830188679247
F1 Score: 0.37037037037037035


### Pitfall

In [13]:
tag = 'pitfall'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}
Logistic Regression Model Evaluation for Tag: pitfall
Accuracy: 0.5698924731182796
ROC-AUC: 0.5613695090439276
Precision: 0.36470588235294116
Recall: 0.543859649122807
F1 Score: 0.4366197183098592


### Clumsy

In [14]:
tag = 'clumsy'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}
Logistic Regression Model Evaluation for Tag: clumsy
Accuracy: 0.5161290322580645
ROC-AUC: 0.5069019447844418
Precision: 0.2962962962962963
Recall: 0.42105263157894735
F1 Score: 0.34782608695652173


### Suspicious

In [15]:
tag = 'suspicious'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Logistic Regression Model Evaluation for Tag: suspicious
Accuracy: 0.5241935483870968
ROC-AUC: 0.580319535221496
Precision: 0.3271889400921659
Recall: 0.696078431372549
F1 Score: 0.44514106583072105


### Design

In [16]:
tag = 'design'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Logistic Regression Model Evaluation for Tag: design
Accuracy: 0.6532258064516129
ROC-AUC: 0.5350560516815505
Precision: 0.3333333333333333
Recall: 0.35789473684210527
F1 Score: 0.3451776649746193


### Antipattern

In [17]:
tag = 'antipattern'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}
Logistic Regression Model Evaluation for Tag: antipattern
Accuracy: 0.5591397849462365
ROC-AUC: 0.552615039281706
Precision: 0.2236024844720497
Recall: 0.48
F1 Score: 0.3050847457627119


### Redundant

In [18]:
tag = 'redundant'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 10, 'class_weight': 'balanced', 'penalty': 'l2'}
Logistic Regression Model Evaluation for Tag: redundant
Accuracy: 0.5295698924731183
ROC-AUC: 0.5988198757763975
Precision: 0.1657754010695187
Recall: 0.62
F1 Score: 0.2616033755274262


### Confusing

In [19]:
tag = 'confusing'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits




Best Parameters: {'C': 1, 'class_weight': 'balanced', 'penalty': 'l1'}
Logistic Regression Model Evaluation for Tag: confusing
Accuracy: 0.4596774193548387
ROC-AUC: 0.5972052638719305
Precision: 0.0673076923076923
Recall: 0.6666666666666666
F1 Score: 0.1222707423580786


### Performance

In [20]:
tag = 'performance'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}
Logistic Regression Model Evaluation for Tag: performance
Accuracy: 0.6182795698924731
ROC-AUC: 0.5174418604651163
Precision: 0.09285714285714286
Recall: 0.4642857142857143
F1 Score: 0.15476190476190477


### Obsolete

In [21]:
tag = 'obsolete'

logreg_model = LogisticRegression(random_state=42, solver='liblinear', max_iter=1000)

y_train_temp = y_train[tag]
y_test_temp = y_test[tag]

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # inverse of regularization strength
    'penalty': ['l1', 'l2'], # regularization type
    'class_weight': ['balanced']
}

# create the scorer, indicating it needs probabilities
f1_tuned_threshold = make_scorer(custom_f1_with_threshold_tuning, needs_proba=True)
    
# 5-fold stratified cross validation setup (stratified k fold to balance classes in folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# grid search on param_grid
grid_search = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid,
    scoring=f1_tuned_threshold,
    cv=skf,
    verbose=2,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train_temp)

# extract best model
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# predictions for test data
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# evaluation
roc_auc = roc_auc_score(y_test_temp, y_pred_proba)
acc = accuracy_score(y_test_temp, y_pred)
prec = precision_score(y_test_temp, y_pred, zero_division=0)
recall = recall_score(y_test_temp, y_pred, zero_division=0)
f1 = f1_score(y_test_temp, y_pred, zero_division=0)

print(f"Logistic Regression Model Evaluation for Tag: {tag}")
print(f"Accuracy: {acc}")
print(f"ROC-AUC: {roc_auc}")
print(f"Precision: {prec}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# save model
filename_joblib = os.path.join(model_save_dir, "LogisticRegression", f"LogisticRegression_{tag}.joblib")
os.makedirs(os.path.dirname(filename_joblib), exist_ok=True)
dump(best_model, filename_joblib)

# save metrics
roc_auc_list.append(roc_auc)
acc_list.append(acc)
prec_list.append(prec)
recall_list.append(recall)
f1_list.append(f1)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Parameters: {'C': 100, 'class_weight': 'balanced', 'penalty': 'l1'}
Logistic Regression Model Evaluation for Tag: obsolete
Accuracy: 0.6639784946236559
ROC-AUC: 0.598816029143898
Precision: 0.024
Recall: 0.5
F1 Score: 0.04580152671755725




## Save Model Evaluation results

In [22]:
tag_list = ['convention', 'brain-overload', 'unused',
       'error-handling', 'bad-practice', 'pitfall', 'clumsy', 'suspicious',
       'design', 'antipattern', 'redundant', 'confusing', 'performance',
       'obsolete']
data = {
    'Tag': tag_list,
    'ROC-AUC': roc_auc_list,
    'Accuracy': acc_list,
    'Precision': prec_list,
    'Recall': recall_list,
    'F1-Score': f1_list
}

df_results = pd.DataFrame(data)

df_results.to_csv(os.path.join(model_save_dir, 'LogisticRegression_Evaluation_Results.csv'), index = False)