In [None]:
import pandas as pd
import numpy as np
import ast
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, hamming_loss, f1_score, multilabel_confusion_matrix,ConfusionMatrixDisplay
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
import fasttext
from transformers import BertTokenizer
import gensim
import gensim.downloader

# Input spreadsheets: the user stories to classify and the keyword-label table
# (read without a header row, so its columns are addressed by position).
dataset = pd.read_excel("Synthetic User Stories.xlsx")  # Change dataset here

labels = pd.read_excel("Keyword labelled.xlsx", header=None)
# Column 2 is lower-cased so it can be matched case-insensitively later on.
labels[2] = labels[2].map(lambda task_name: task_name.lower())
labels

In [None]:
# For every row of `labels`, gather the lower-cased text cells found from
# position 3 onward; non-string cells (e.g. empty ones) are skipped.
categories_column = [
    [cell.lower() for cell in record[3:] if isinstance(cell, str)]
    for row_label, record in labels.iterrows()
]
labels["Categories array"] = categories_column
labels[[2, "Categories array"]]

In [None]:
# Resolve each story's "Machine Learning Task" to its list of categories.
# A dictionary is built once from `labels` so every story is a constant-time
# lookup; when a task name is listed more than once, its first row wins.
task_to_categories = {}
for task_name, categories in zip(labels[2], labels["Categories array"]):
    task_to_categories.setdefault(task_name, categories)

# Task names are compared in lower case, like column 2 of `labels`.
task_names = dataset["Machine Learning Task"].str.lower()

# Fail early, naming the offending tasks, when a story has no label row.
unknown_tasks = [task for task in task_names.unique() if task not in task_to_categories]
if unknown_tasks:
    raise KeyError(f"No keyword labels found for task(s): {unknown_tasks}")

target = [task_to_categories[task] for task in task_names]
dataset["Target"] = target
dataset[["User Story","Target"]]

In [None]:
# Cast categories into lists: each Target cell is rendered to its string form
# and parsed back with ast.literal_eval, so every row ends up with a list object.
# NOTE(review): the cells assigned to Target above already look like Python lists,
# in which case this round trip only produces equal copies — confirm it is still needed.
dataset['Target'] = dataset['Target'].apply(lambda x: ast.literal_eval(str(x)))
dataset['Target']

In [None]:
# Encode the per-story category lists as a binary indicator matrix `y`
# (one row per story, one column per entry of `multilabel.classes_`).
multilabel = MultiLabelBinarizer()
y = multilabel.fit_transform(dataset['Target'])
# Display the encoded matrix with the class names as column headers.
pd.DataFrame(y, columns=multilabel.classes_)

In [None]:
from IPython.display import display, HTML
# Count how many stories share each exact combination of categories; the lists
# are converted to tuples because value_counts needs hashable values.
display(HTML(dataset.Target.apply(tuple).value_counts().to_frame().to_html()))

In [None]:
# Row labels of the results table: the four fold-averaged metrics first,
# followed by the same four metrics for each of the ten folds.
metric_names = ['F1', 'Precision', 'Recall', 'Hammer-Loss']
indexes = [f'{metric}-mean' for metric in metric_names]
indexes += [f'{metric}-fold{fold}' for fold in range(1, 11) for metric in metric_names]

# Column labels: every problem-transformation strategy paired with every base
# classifier, written as "<strategy> <classifier>".
strategy_names = ['BinaryRelevance', 'ClassifierChain', 'LabelPowerset']
classifier_names = ['LogisticRegression', 'RandomForestClassifier', 'GaussianNB',
                    'LinearSVC', 'KNeighborsClassifier', 'DecisionTreeClassifier']
column = [f'{strategy} {classifier}'
          for strategy in strategy_names
          for classifier in classifier_names]

# Empty table; build_model fills one column per evaluated combination.
results = pd.DataFrame(index=indexes, columns=column)
results.head()

In [None]:
# Per-fold prediction tables. Each one has a 'Real Label' column holding the
# true indicator row of every story, plus one (initially empty) column per
# "<strategy> <classifier>" combination.
prediction_columns = ['Real Label'] + [
    f'{strategy} {classifier}'
    for strategy in ('BinaryRelevance', 'ClassifierChain', 'LabelPowerset')
    for classifier in ('LogisticRegression', 'RandomForestClassifier', 'GaussianNB',
                       'LinearSVC', 'KNeighborsClassifier', 'DecisionTreeClassifier')
]
df_pred1 = pd.DataFrame(columns=prediction_columns)

# Rows are added one at a time, each storing the label row as a Python list.
for row_number, label_row in enumerate(y):
  df_pred1.loc[row_number, 'Real Label'] = list(label_row)


# Folds 2-10 start as independent deep copies of the fold-1 table.
(df_pred2, df_pred3, df_pred4, df_pred5, df_pred6,
 df_pred7, df_pred8, df_pred9, df_pred10) = (df_pred1.copy(deep=True) for _ in range(9))
df_pred1.head()

In [None]:
def plot_confusion_matrix(cm):
  """Draw one confusion matrix per label on a shared grid with a single colorbar.

  The grid is four panels wide and gets as many rows as the number of labels
  requires, so the function works for any label count (seven labels give the
  2x4 layout with one unused panel removed).

  Args:
    cm: sequence of per-label confusion matrices; ``cm[i]`` is handed to
      ConfusionMatrixDisplay and titled with ``multilabel.classes_[i]``.

  Raises:
    ValueError: if ``cm`` contains no matrices.
  """
  n_labels = len(cm)
  if n_labels == 0:
    raise ValueError("plot_confusion_matrix needs at least one confusion matrix")

  n_cols = 4
  n_rows = -(-n_labels // n_cols)  # ceiling division
  # 7.5 inches of height per row keeps the 25x15 figure for the two-row case.
  f, axes = plt.subplots(n_rows, n_cols, figsize=(25, 7.5 * n_rows), squeeze=False)
  axes = axes.ravel()
  class_names = list(multilabel.classes_)
  for i in range(n_labels):
    disp = ConfusionMatrixDisplay(cm[i])
    disp.plot(ax=axes[i], values_format='.4g',cmap='Blues')
    disp.ax_.set_title(class_names[i])
    # Drop the per-panel colorbar; one shared colorbar is added below.
    disp.im_.colorbar.remove()

  # Remove the panels of the last row that have no label to show.
  for unused_ax in axes[n_labels:]:
    f.delaxes(unused_ax)
  plt.subplots_adjust(wspace=0.25, hspace=0.10)
  f.colorbar(disp.im_, ax=axes)
  plt.show()

In [None]:
def build_model(model, mlb_estimator, X, y, random_state=None):
  """Cross-validate one multi-label classifier and record its scores and predictions.

  The base estimator is wrapped in the given problem-transformation strategy and
  evaluated with shuffled 10-fold cross-validation. Besides returning the
  per-fold scores, the function writes into notebook-level frames:

  * ``results``: the mean and per-fold F1 / precision / recall / Hamming loss go
    into the column named "<strategy class name> <estimator class name>".
  * ``df_pred1`` .. ``df_pred10``: the predictions of fold k go into the same
    column of the k-th frame, on the rows of the samples that made up that
    fold's test split, so they sit next to the matching 'Real Label'. Rows
    outside the test split are NaN.

  Args:
    model: base estimator instance to wrap.
    mlb_estimator: callable wrapping ``model`` (e.g. BinaryRelevance,
      ClassifierChain, LabelPowerset); the wrapped classifier's ``predict``
      must return an object exposing ``toarray()``.
    X: pandas DataFrame of features, one row per sample.
    y: label indicator array, row-aligned with ``X`` and indexable by an
      array of row positions.
    random_state: optional seed forwarded to KFold so the shuffled split can
      be reproduced; the default (None) gives a different split on each call.

  Returns:
    Tuple ``(prec_scores, rec_scores, f_scores, ham_scores, conf_matrix)``:
    four arrays holding one micro-averaged score per fold, and a list with
    each fold's ``multilabel_confusion_matrix`` output.
  """
  # Fixed at ten: `results` has rows, and the notebook has prediction frames,
  # for exactly ten folds.
  n_folds = 10
  fold_frames = [df_pred1, df_pred2, df_pred3, df_pred4, df_pred5,
                 df_pred6, df_pred7, df_pred8, df_pred9, df_pred10]

  clf = mlb_estimator(model)
  kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

  prec_scores = np.zeros(n_folds)
  rec_scores = np.zeros(n_folds)
  f_scores = np.zeros(n_folds)
  ham_scores = np.zeros(n_folds)
  conf_matrix = []
  fold_predictions = []
  name = str(type(clf).__name__) + " " + str(type(model).__name__)
  n_samples = len(X)

  for idx, (train_index, test_index) in enumerate(kf.split(X, y)):
    print("Training on fold " + str(idx+1) + "/10...")
    # KFold yields row positions, so select positionally; this keeps X and y
    # aligned whatever the index labels of X are.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train.values, y_train)
    clf_pred = clf.predict(X_test.values)

    # Full-length column for this fold: the predicted label list on each test
    # sample's own row, NaN on every other row.
    fold_column = np.empty(n_samples, dtype=object)
    fold_column[:] = np.nan
    for row, predicted in zip(test_index, clf_pred.toarray()):
      fold_column[row] = list(predicted)
    fold_predictions.append(fold_column)

    conf_matrix.append(multilabel_confusion_matrix(y_test, clf_pred))
    prec_scores[idx] = precision_score(y_test, clf_pred, average='micro')
    rec_scores[idx] = recall_score(y_test, clf_pred, average='micro')
    f_scores[idx] = f1_score(y_test, clf_pred, average='micro')
    ham_scores[idx] = hamming_loss(y_test, clf_pred)

  # Single-step .loc[row, column] assignment writes into `results` itself
  # rather than into a temporary row object.
  results.loc['F1-mean', name] = np.mean(f_scores)
  results.loc['Precision-mean', name] = np.mean(prec_scores)
  results.loc['Recall-mean', name] = np.mean(rec_scores)
  results.loc['Hammer-Loss-mean', name] = np.mean(ham_scores)

  for fold in range(n_folds):
    fold_label = str(fold+1)
    results.loc["F1-fold" + fold_label, name] = f_scores[fold]
    results.loc["Precision-fold" + fold_label, name] = prec_scores[fold]
    results.loc["Recall-fold" + fold_label, name] = rec_scores[fold]
    results.loc["Hammer-Loss-fold" + fold_label, name] = ham_scores[fold]

  # Whole-column assignment also clears anything left from an earlier run of
  # the same strategy/estimator pair.
  for fold_frame, fold_column in zip(fold_frames, fold_predictions):
    fold_frame[name] = fold_column

  return prec_scores, rec_scores, f_scores, ham_scores, conf_matrix

In [None]:
def getTrainSetFastText():
    """Return one fastText sentence vector per user story as a DataFrame.

    The model is loaded from ``fasttext_model.bin`` and applied to every entry
    of ``dataset['User Story']``. Column labels are converted to strings.
    """
    ft_model = fasttext.load_model("fasttext_model.bin")
    sentence_vectors = [ft_model.get_sentence_vector(story)
                        for story in dataset['User Story']]
    features = pd.DataFrame(sentence_vectors)
    features.columns = features.columns.astype(str)
    return features

def getTrainSetTFIDF():
    """Return TF-IDF features of the user stories as a DataFrame.

    Term counts are limited to 100 features (``max_features=100``) and then
    re-weighted with TfidfTransformer. Column labels are converted to strings.
    """
    # NOTE(review): vocabulary and IDF weights are fitted on every story,
    # including those later used as test folds — confirm this is intended.
    vectorizer = CountVectorizer(max_features=100)
    term_counts = vectorizer.fit_transform(dataset['User Story']).toarray()
    tfidf_weights = TfidfTransformer().fit_transform(term_counts).toarray()
    features = pd.DataFrame(tfidf_weights)
    features.columns = features.columns.astype(str)
    return features

def getTrainSetBERT():
    """Return the BERT tokenizer's ``input_ids`` of each user story as a DataFrame.

    Stories are tokenized with the 'bert-base-uncased' tokenizer, padded to a
    common length and truncated to at most 100 tokens. Column labels are
    converted to strings.
    """
    # NOTE(review): the features are the token ids themselves; no BERT model is
    # run to produce embeddings — confirm this is the intended representation.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    stories = dataset['User Story'].tolist()
    encoded = tokenizer(stories, padding=True, truncation=True, max_length=100)
    features = pd.DataFrame(list(encoded['input_ids']))
    features.columns = features.columns.astype(str)
    return features

def getTrainSetWord2Vec():
    """Return averaged word2vec features of each user story as a DataFrame.

    Each story is split on whitespace; the first 100 components of the vector
    of every word found in the model are averaged. A story with no known word
    gets 100 zeros. Column labels are converted to strings.
    """
    # word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')
    # word2vec_vectors.save('word2vec_model.bin')
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format('word2vec-google-news-300.bin', binary=True)

    story_vectors = []
    for story in dataset['User Story']:
        # Only the leading 100 components of each word vector are kept.
        known_vectors = [w2v_model[token][:100]
                         for token in story.split()
                         if token in w2v_model]
        story_vectors.append(
            sum(known_vectors) / len(known_vectors) if known_vectors else [0] * 100
        )

    features = pd.DataFrame(story_vectors)
    features.columns = features.columns.astype(str)
    return features

def getTrainSetGlove():
    """Return averaged GloVe features of each user story as a DataFrame.

    Vectors come from gensim's 'glove-wiki-gigaword-100' download. Each story
    is split on whitespace and the vectors of the words found in the model are
    averaged; a story with no known word gets 100 zeros. Column labels are
    converted to strings.
    """
    glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')

    story_vectors = []
    for story in dataset['User Story']:
        # NOTE(review): tokens are looked up exactly as split (original casing,
        # punctuation attached) — confirm this matches the model's vocabulary.
        known_vectors = [glove_vectors[token]
                         for token in story.split()
                         if token in glove_vectors]
        story_vectors.append(
            sum(known_vectors) / len(known_vectors) if known_vectors else [0] * 100
        )

    features = pd.DataFrame(story_vectors)
    features.columns = features.columns.astype(str)
    return features

In [None]:
# Feature matrix shared by every model cell below. Call one of the other
# getTrainSet* helpers here to evaluate a different text representation.
X = getTrainSetBERT()

### **BinaryRelevance**

#### RandomForest

In [None]:
# Binary Relevance with a RandomForestClassifier base estimator.
model = RandomForestClassifier()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, BinaryRelevance, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### LogisticRegression

In [None]:
# Binary Relevance with a LogisticRegression base estimator.
model = LogisticRegression()

prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, BinaryRelevance, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### Gaussian Naive Bayes

In [None]:
# Binary Relevance with a GaussianNB base estimator.
model = GaussianNB()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, BinaryRelevance, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### k-NearestNeighbors

In [None]:
# Binary Relevance with a KNeighborsClassifier base estimator.
model = KNeighborsClassifier()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, BinaryRelevance, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### Decision Tree

In [None]:
# Binary Relevance with a DecisionTreeClassifier base estimator.
model = DecisionTreeClassifier()

prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, BinaryRelevance, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### Support Vector Machine

In [None]:
# Binary Relevance with a LinearSVC base estimator.
model = LinearSVC()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, BinaryRelevance, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

### **ClassifierChain**

#### RandomForest

In [None]:
# Classifier Chain with a RandomForestClassifier base estimator.
model = RandomForestClassifier()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, ClassifierChain, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### LogisticRegression

In [None]:
# Classifier Chain with a LogisticRegression base estimator.
model = LogisticRegression()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, ClassifierChain, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### Gaussian Naive Bayes

In [None]:
# Classifier Chain with a GaussianNB base estimator.
model = GaussianNB()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, ClassifierChain, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### k-NearestNeighbors

In [None]:
# Classifier Chain with a KNeighborsClassifier base estimator.
model = KNeighborsClassifier()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, ClassifierChain, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### Decision Tree

In [None]:
# Classifier Chain with a DecisionTreeClassifier base estimator.
model = DecisionTreeClassifier()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, ClassifierChain, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### Support Vector Machine

In [None]:
# Classifier Chain with a LinearSVC base estimator.
model = LinearSVC()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, ClassifierChain, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

### **LabelPowerset**

#### RandomForest

In [None]:
# Label Powerset with a RandomForestClassifier base estimator.
model = RandomForestClassifier()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, LabelPowerset, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### LogisticRegression

In [None]:
# Label Powerset with a LogisticRegression base estimator.
model = LogisticRegression()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, LabelPowerset, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### Gaussian Naive Bayes

In [None]:
# Label Powerset with a GaussianNB base estimator.
model = GaussianNB()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, LabelPowerset, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### k-NearestNeighbors

In [None]:
# Label Powerset with a KNeighborsClassifier base estimator.
model = KNeighborsClassifier()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, LabelPowerset, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### Decision Tree

In [None]:
# Label Powerset with a DecisionTreeClassifier base estimator.
model = DecisionTreeClassifier()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, LabelPowerset, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

#### Support Vector Machine

In [None]:
# Label Powerset with a LinearSVC base estimator.
model = LinearSVC()
prec_score, rec_score, f_score, ham_loss, conf_matrix = build_model(
    model, LabelPowerset, X, y
)

# Report each metric averaged over the folds.
print("=" * 50)
print("Precision:", np.mean(prec_score))
print("Recall Score:", np.mean(rec_score))
print("F1 Score:", np.mean(f_score))
print("Hamming Loss:", np.mean(ham_loss))
# Plot the element-wise mean of the per-fold confusion matrices.
plot_confusion_matrix(np.mean(conf_matrix, axis=0))

## Multi-label Classifiers results

In [None]:
# Save the scores table as the 'MultiLabelClf BERT' sheet of the shared results
# workbook. The workbook is opened in append mode, and a sheet with the same
# name left by an earlier run is replaced.
# NOTE(review): append mode presumably needs resultsMULTILABELS.xlsx (and the
# resultsMultilabel folder) to exist already — confirm before a first run.
with pd.ExcelWriter('./resultsMultilabel/resultsMULTILABELS.xlsx', mode='a',if_sheet_exists='replace') as writer:
    results.to_excel(writer, sheet_name='MultiLabelClf BERT')

results.head()

In [None]:
# Write the ten per-fold prediction tables to a fresh workbook, one sheet per
# fold ('Fold 1' .. 'Fold 10'). A single writer in mode='w' creates or
# overwrites the file once and saves it when the block exits.
fold_frames = [df_pred1, df_pred2, df_pred3, df_pred4, df_pred5,
               df_pred6, df_pred7, df_pred8, df_pred9, df_pred10]
with pd.ExcelWriter('./resultsMultilabel/multilabel-clf-bert.xlsx', engine='openpyxl', mode='w') as writer:
  for fold_number, fold_frame in enumerate(fold_frames, start=1):
    fold_frame.to_excel(writer, sheet_name=f'Fold {fold_number}')