In [1]:
import pandas as pd
code_bugs_df = pd.read_csv('Code_bugs_cleaned.csv')

In [2]:
code_bugs_df.head()

Unnamed: 0,Priority,Status,Resolution,Time_to_Update
0,Major,Closed,Invalid,0
1,Critical,Closed,Workaround,2
2,Blocker,Closed,Fixed,0
3,Trivial,Closed,Not A Bug,15
4,Major,Closed,Duplicate,0


In [3]:
code_bugs_df.isna().sum().sort_values()

Priority          0
Status            0
Resolution        0
Time_to_Update    0
dtype: int64

In [4]:
def simplify_resolution(val):
    if val == 'Fixed':
        return 'Fixed'
    else:
        return 'Other'

code_bugs_df['Resolution'] = code_bugs_df['Resolution'].apply(simplify_resolution)


In [5]:
def simplify_priority(val):
    if val in ['Blocker', 'Critical']:
        return 'High'
    elif val in ['Major']:
        return 'Medium'
    else:
        return 'low'
code_bugs_df['Priority'] = code_bugs_df['Priority'].apply(simplify_priority)

In [6]:
def simplify_Status(val):
    if val in ['Resolved', 'Closed']:
        return 'Resolved'
    else:
        return 'Open'
code_bugs_df['Status'] = code_bugs_df['Status'].apply(simplify_Status)

In [7]:
code_bugs_df.head(10)

Unnamed: 0,Priority,Status,Resolution,Time_to_Update
0,Medium,Resolved,Other,0
1,High,Resolved,Other,2
2,High,Resolved,Fixed,0
3,low,Resolved,Other,15
4,Medium,Resolved,Other,0
5,Medium,Resolved,Other,0
6,Medium,Resolved,Other,1
7,low,Resolved,Other,2
8,Medium,Resolved,Other,0
9,Medium,Resolved,Other,0


In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
code_bugs_df['Priority'] = le.fit_transform(code_bugs_df['Priority'])
code_bugs_df['Status'] = le.fit_transform(code_bugs_df['Status'])
code_bugs_df['Resolution'] = le.fit_transform(code_bugs_df['Resolution'])  # Target

In [9]:
code_bugs_df.head(10)

Unnamed: 0,Priority,Status,Resolution,Time_to_Update
0,1,0,1,0
1,0,0,1,2
2,0,0,0,0
3,2,0,1,15
4,1,0,1,0
5,1,0,1,0
6,1,0,1,1
7,2,0,1,2
8,1,0,1,0
9,1,0,1,0


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
X = code_bugs_df.drop(columns=['Resolution'])
y = code_bugs_df['Resolution']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.91      0.57      0.70       119
           1       0.11      0.46      0.17        13

    accuracy                           0.56       132
   macro avg       0.51      0.52      0.44       132
weighted avg       0.83      0.56      0.65       132

[[68 51]
 [ 7  6]]
ROC AUC Score: 0.5164835164835165


In [11]:
code_bugs_df_encoded = pd.get_dummies(code_bugs_df, columns=['Priority', 'Status'], drop_first=True)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X = code_bugs_df_encoded.drop(columns=['Resolution'])
y = code_bugs_df_encoded['Resolution']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]  # Get probabilities for the positive class
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("Predicted Probabilities:\n", y_pred_proba[0])
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))

              precision    recall  f1-score   support

           0       0.92      0.86      0.89       119
           1       0.19      0.31      0.24        13

    accuracy                           0.80       132
   macro avg       0.55      0.58      0.56       132
weighted avg       0.85      0.80      0.82       132

[[102  17]
 [  9   4]]
Predicted Probabilities:
 0.48150251247674375
ROC AUC Score: 0.54169360051713


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
X = code_bugs_df_encoded.drop(columns=['Resolution'])
y = code_bugs_df_encoded['Resolution']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       178
           1       0.00      0.00      0.00        20

    accuracy                           0.90       198
   macro avg       0.45      0.50      0.47       198
weighted avg       0.81      0.90      0.85       198

[[178   0]
 [ 20   0]]
ROC AUC Score: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
from xgboost import XGBClassifier
X = code_bugs_df_encoded.drop(columns=['Resolution'])
y = code_bugs_df_encoded['Resolution']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',scale_pos_weight=1)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)    
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.91      0.97      0.94       119
           1       0.25      0.08      0.12        13

    accuracy                           0.89       132
   macro avg       0.58      0.53      0.53       132
weighted avg       0.84      0.89      0.86       132

[[116   3]
 [ 12   1]]
ROC AUC Score: 0.5258564964447316


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced')
log_reg.fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89       119
           1       0.20      0.31      0.24        13

    accuracy                           0.81       132
   macro avg       0.56      0.59      0.57       132
weighted avg       0.85      0.81      0.83       132

[[103  16]
 [  9   4]]
ROC AUC Score: 0.5866192630898514


In [16]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
print("Best Parameters:", grid_search.best_params_)
best_param = grid_search.best_estimator_

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [17]:
from xgboost import XGBClassifier
best_param.fit(X_train_scaled, y_train)
y_pred = best_param.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       119
           1       0.00      0.00      0.00        13

    accuracy                           0.90       132
   macro avg       0.45      0.50      0.47       132
weighted avg       0.81      0.90      0.85       132

[[119   0]
 [ 13   0]]
ROC AUC Score: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
log_reg.fit(X_resampled, y_resampled)
y_pred = log_reg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score after SMOTE:", roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.82      0.87       119
           1       0.19      0.38      0.25        13

    accuracy                           0.77       132
   macro avg       0.55      0.60      0.56       132
weighted avg       0.85      0.77      0.81       132

[[97 22]
 [ 8  5]]
ROC AUC Score after SMOTE: 0.5998707175177763


In [19]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
rf.fit(X_resampled, y_resampled)
y_pred = rf.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score after SMOTE:", roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.63      0.75       119
           1       0.12      0.46      0.19        13

    accuracy                           0.61       132
   macro avg       0.52      0.55      0.47       132
weighted avg       0.84      0.61      0.69       132

[[75 44]
 [ 7  6]]
ROC AUC Score after SMOTE: 0.5458952811893989


In [20]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_scaled, y_train)
knn.fit(X_resampled, y_resampled)
y_pred = knn.predict(X_test_scaled)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print("ROC AUC Score after SMOTE:", roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.90      0.91       119
           1       0.20      0.23      0.21        13

    accuracy                           0.83       132
   macro avg       0.56      0.56      0.56       132
weighted avg       0.84      0.83      0.84       132

[[107  12]
 [ 10   3]]
ROC AUC Score after SMOTE: 0.5649644473173885
