**This notebook provides a proof-of-concept implementation of different NLP approaches for automatically identifying and classifying design decisions in natural language software architecture documentation.**


# Install necessary libraries

In [None]:
!pip install anytree
!python -m pip install --upgrade pip --user
!python -m pip install --upgrade simpletransformers --user
!pip install transformers
!pip install tensorboardX
!pip install torch
!pip install nltk
!pip install prettytable
import pandas as pd
import numpy as np
import nltk
nltk.download('averaged_perceptron_tagger')

# Configure the classification process for your interest

To evaluate different model combinations, **adjust the configuration below**. You can choose which text preprocessing techniques are applied, on which level of the classification scheme the classification is done, and whether to perform single-label or multi-label classification.

The **text corpus (txt and csv files)** must be stored in the configured directory.

Please provide the necessary **python modules** 'classification_scheme' and 'SAD_classification_dataframe_operations' in your runtime environment.

In [None]:
from SAD_classification_dataframe_operations import LevelOfClassification
from SAD_classification_dataframe_operations import Config

# Select the folder where you store the corpus (txt files as lines and csv for classification)
config_data = Config(
    root_folder = '../dataset/'
)

# Configure the classification process with respect to your research question
config_classification = Config(
    # Level of the classification scheme to classify on: LEVEL_0 and LEAVES exist
    # (for special customizing go to module 'SAD_classification_dataframe_operations')
    level_of_classification = LevelOfClassification.LEAVES,
    # Label name handed to the label preprocessing as `label_unrelated`
    label_unrelated = 'none of these',
    # Text preprocessing (flags are forwarded to TextPreprocessor.preprocess)
    do_lowercasing = True,
    do_cleanup = False,
    do_lemmatization = True,
    do_stop_word_removal  = False,
    # Drop rows whose design_decision is 0 before classification (skipped with a warning on LEVEL_0)
    filter_out_non_design_decisions = True,
    # Option for multilabel (one-vs-rest) classification
    do_multilabel_classification = False,
    # k-fold cross-validation (parameters of RepeatedKFold)
    k_splits = 5,
    n_repeats = 3,
    random_state = 1, # for reproducibility
    # BERT specific options: training epochs, train/eval batch size,
    # and the cut-off applied to multi-label model outputs
    epochs = 16,
    batch_size = 8,
    threshold = 0.5
)

# Reproduce classification scheme in hierarchical order

The final classification scheme can be loaded from the python module 'classification_scheme'.

In [None]:
from classification_scheme import ClassificationSchemeBuilder

# Build the classification scheme once; it is passed to the label preprocessing further below.
builder = ClassificationSchemeBuilder()
classification_scheme = builder.build_classification_scheme()

# Load the corpus from txt and csv files

Generate the pandas.DataFrame from raw data of the corpus. Pandas.DataFrame allows easy adjustment of text and labels.

In [None]:
import os
import glob
from csv import reader

def generate_data_frame(root_folder: str):
    """
    Generate a pandas.DataFrame from the source files stored in the root folder.

    Every ``*.txt`` file holds the text lines and the ``*.csv`` file with the same
    lexicographic position holds the annotations (``;``-separated, with the columns
    'design_decision', 'classification' and 'alternative_classification').
    A lexicographic ordering of the files must therefore result in the same order
    for each txt file and its corresponding csv file.

    Parameters:
        root_folder: directory that contains the txt and csv files.

    Returns:
        A DataFrame with the columns 'line', 'design_decision', 'classification'
        and 'alternative_classification'. Lines that are empty after removing the
        line break are skipped; both label columns are stripped of surrounding
        whitespace.

    Raises:
        AssertionError: if the number of txt and csv files differs, or if a txt
            file and its csv file do not have the same number of rows.
    """
    # Define the shape of the resulting DataFrame
    column_names = ['line', 'design_decision', 'classification', 'alternative_classification']

    # Get all csv (classification) and txt (source) files from the root folder
    # and order them to match the txt files with the corresponding csv files.
    csv_files = sorted(glob.glob(os.path.join(root_folder, "*.csv")))
    txt_files = sorted(glob.glob(os.path.join(root_folder, "*.txt")))
    assert len(csv_files) == len(txt_files)

    # Collect plain records and build the DataFrame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and row-wise appending is quadratic.
    records = []
    for csv_file, txt_file in zip(csv_files, txt_files):
        classifications = pd.read_csv(csv_file, sep=';')
        # The context manager closes the file handle even if an assertion fails below
        with open(txt_file, 'r') as file:
            lines = file.readlines()
        assert len(classifications) == len(lines)

        for row, raw_line in enumerate(lines):
            # Delete the not-meaningful line break and skip empty lines
            line = raw_line.replace("\n", "")
            if line == '':
                continue

            # Extract whether the current line provides a design decision and how it is classified
            records.append({
                'line': line,
                'design_decision': classifications['design_decision'].iloc[row],
                'classification': classifications['classification'].iloc[row].strip(),
                'alternative_classification': classifications['alternative_classification'].iloc[row].strip(),
            })

    # Passing `columns` keeps the expected shape even when no record was collected
    return pd.DataFrame(records, columns=column_names)

In [None]:
# Generate a pandas.DataFrame to be able to run text and label preprocessing
corpus_raw = generate_data_frame(config_data.root_folder)

# Inspect the data frame



Use these cells to get familiar with the representation of the text corpus and labels.

In [None]:
# Render the raw corpus (one row per non-empty text line) as a rich table.
display(corpus_raw)

Count the number of lines with and without design decisions in the corpus.

In [None]:
# Frequency of each distinct value in the 'design_decision' column.
print(corpus_raw['design_decision'].value_counts())

Count the absolute frequency of each class from the scheme.

In [None]:
# Frequency of each label, first for the primary and then for the alternative classification.
print(corpus_raw['classification'].value_counts())
print(corpus_raw['alternative_classification'].value_counts())

# Text preprocessing

The following code preprocesses the text corpus. All preprocessing techniques selected in the configuration are applied. Optionally, you can also filter out all lines which do not include a design decision. You can inspect the changes in the data frame afterwards.

In [None]:
# Apply all in the configuration selected text preprocessing methods to the data frame
from SAD_classification_dataframe_operations import TextPreprocessor

# Work on a copy so that corpus_raw stays available unchanged for inspection.
corpus_preprocessed = corpus_raw.copy()
text_preprocessor = TextPreprocessor()


def _preprocess_with_configured_steps(raw_line):
    """Run the preprocessing steps selected in config_classification on a single line."""
    return text_preprocessor.preprocess(
        raw_line,
        config_classification.do_cleanup,
        config_classification.do_lowercasing,
        config_classification.do_stop_word_removal,
        do_lemmatization=config_classification.do_lemmatization,
    )


corpus_preprocessed['line'] = corpus_preprocessed['line'].map(_preprocess_with_configured_steps)
display(corpus_preprocessed[['line']])

In [None]:
# Filter out lines, which do not include a design decision (not possible on level 0).
# Do this if your classifier should not care about lines which do not even have a design decision.
import warnings

if not config_classification.filter_out_non_design_decisions:
    # Filtering was not requested: keep every line.
    pass
elif config_classification.level_of_classification == LevelOfClassification.LEVEL_0:
    warnings.warn("Cannot filter out lines without design decisions on level 0.")
else:
    # Remove every row that is annotated with design_decision == 0.
    rows_without_decision = corpus_preprocessed[corpus_preprocessed.design_decision == 0].index
    corpus_preprocessed = corpus_preprocessed.drop(rows_without_decision)
    display(corpus_preprocessed)

# Adjust labels to fit classifiers

This section does the preprocessing for the labels. First, all labels are transformed to the classes on the selected level. Afterwards, these classes are mapped to an integer value and all labels will be set to this integer value. Integer values for labels are mandatory for some APIs. You can also map them back to their class name by using *matching_category_integer_label*.

In [None]:
# Replace all labels with the corresponding (parent) classes on the selected level of classification.
from SAD_classification_dataframe_operations import LabelPreprocessor

label_preprocessor = LabelPreprocessor()

# Unify the primary label column first and the alternative label column second,
# feeding the result of the first pass into the second one.
corpus_unified_classes = corpus_preprocessed
for label_column in ('classification', 'alternative_classification'):
    corpus_unified_classes = label_preprocessor.transform_df_unified_labels(
        df=corpus_unified_classes,
        column_name=label_column,
        classification_scheme=classification_scheme,
        level_of_classification=config_classification.level_of_classification,
        label_unrelated=config_classification.label_unrelated,
    )
display(corpus_unified_classes)

In [None]:
# Generate a dictionary which represents the mapping between each selected class and an integer label.
# The class names are taken from the value of the configured LevelOfClassification member.
categories = config_classification.level_of_classification.value
matching_category_integer_label = label_preprocessor.generate_dictionary_for_integer_labels(
    categories=categories, label_unrelated=config_classification.label_unrelated)
print(matching_category_integer_label)

In [None]:
# Swap the string labels of both label columns for their integers from the dictionary
# generated above: primary column first, alternative column second.
corpus_with_integer_labels = corpus_unified_classes
for label_column in ('classification', 'alternative_classification'):
    corpus_with_integer_labels = label_preprocessor.transform_df_integer_labels(
        df=corpus_with_integer_labels,
        column_name=label_column,
        dictionary_for_integer_labels=matching_category_integer_label,
    )
display(corpus_with_integer_labels)

In [None]:
# Inspect the number of data points per (integer-encoded) class,
# first for the primary and then for the alternative classification.
print(corpus_with_integer_labels['classification'].value_counts())
print(corpus_with_integer_labels['alternative_classification'].value_counts())

# Generate a multi-label corpus

If multi-label classification is enabled in the configuration, the following code will be executed. It creates a corpus for multi-label classification.

In [None]:
# Generate a data frame with an extra 'multilabel' column built from the two label columns.
# Only executed when multi-label classification is enabled in the configuration.
# NOTE(review): the next cell assigns corpus_multilabel again — confirm which variant is intended.
if config_classification.do_multilabel_classification:
    corpus_multilabel = label_preprocessor.transform_df_multi_label(df=corpus_with_integer_labels, first_column_name='classification', 
                                                                    second_column_name='alternative_classification', new_column_name='multilabel')
    display(corpus_multilabel)

In [None]:
# Build the multi-label column by hand: every row gets a list that holds the primary
# label and, if it adds information, the alternative label.
if config_classification.do_multilabel_classification:
    corpus_multilabel = corpus_with_integer_labels.copy()
    # Assign the whole column at once. Writing to the `row` yielded by iterrows() is not
    # guaranteed to reach the DataFrame (it may be a copy), which can leave the column as "".
    # The alternative label is skipped when it is 0 or repeats the primary label.
    corpus_multilabel['multilabel'] = [
        [primary] if (alternative == 0 or alternative == primary) else [primary, alternative]
        for primary, alternative in zip(
            corpus_multilabel['classification'],
            corpus_multilabel['alternative_classification'],
        )
    ]

    display(corpus_multilabel)

# Extract lines and labels from DataFrame

Input for all classification algorithms are lines and labels so we extract them from the pandas.DataFrame

In [None]:
# Extract the lines and labels which will serve as input for the training and evaluation phase.
# Text lines are the classifier input; the primary label column, cast to int, is the target.
lines = corpus_with_integer_labels['line']
labels = corpus_with_integer_labels['classification'].astype('int')

# Text classification using "classic" machine learning

## Define vectorizers and classifiers from sklearn

We evaluate different vectorizer-classifier combinations, which are listed here. The representations are Bag-of-Words (BoW), character bi- and trigrams, and tf-idf; the classifiers are Logistic Regression, Multinomial Naive Bayes, Decision Tree, Random Forest and Support Vector Machine. All of them can also be used as OneVsRest classifiers for multi-label classification.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# List all classifiers you want to evaluate with sklearn.Pipeline
classifiers = [
    MultinomialNB(),
    LogisticRegression(max_iter=500), # raised iteration limit for the solver (no fine-tuning but resolves a warning)
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier()
]

# List all vectorizers you want to evaluate with sklearn.Pipeline
vectorizers = [
    CountVectorizer(), # BoW
    CountVectorizer(analyzer='char_wb', ngram_range=(2, 2)), # character 2-gram vectorizer ('char_wb' analyzer)
    CountVectorizer(analyzer='char_wb', ngram_range=(3, 3)), # character 3-gram vectorizer ('char_wb' analyzer)
    TfidfVectorizer() # Tf-idf
]

In [None]:
# Adjust classifiers to multilabel (one-vs-rest) and select lines and labels from the multilabel data frame
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

if config_classification.do_multilabel_classification:
    # Switch the classifier input to the multi-label corpus and turn the per-row label
    # lists into a binary indicator matrix.
    lines = corpus_multilabel['line']
    labels = MultiLabelBinarizer().fit_transform(corpus_multilabel['multilabel'])

    # Wrap every classifier exactly once. The isinstance guard keeps this cell idempotent:
    # re-running it must not nest OneVsRestClassifier wrappers inside each other.
    for i in range(0, len(classifiers)):
        if not isinstance(classifiers[i], OneVsRestClassifier):
            classifiers[i] = OneVsRestClassifier(classifiers[i])

## Define scoring metrics

Use statistical measures (accuracy, precision, recall and F1-score) to evaluate the performance of the classifiers.

In [None]:
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# With exactly two classes the scorers use average='binary', otherwise average='weighted'.
averaging_mode = 'binary' if len(matching_category_integer_label) == 2 else 'weighted'

scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score, average=averaging_mode, zero_division=0),
    'recall': make_scorer(recall_score, average=averaging_mode, zero_division=0),
    'f1_score': make_scorer(f1_score, average=averaging_mode, zero_division=0),
}

## Do k-fold cross validation over all vectorizers and classifiers

The k-fold cross-validation is used to evaluate the performance of each vectorizer-classifier combination. K-fold cross-validation estimates the performance on unseen data points and reduces the bias due to the selection of the training and test data sets. The average results from the scoring metrics will be printed to the console.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

# Repeated k-fold cross validation (results will be averaged over all repeats)
cv = RepeatedKFold(
    n_splits=config_classification.k_splits,
    n_repeats=config_classification.n_repeats,
    random_state=config_classification.random_state,
)

# One score matrix per metric: the first index follows `classifiers`,
# the second index follows `vectorizers`.
matrix_width = len(classifiers)
matrix_height = len(vectorizers)
accuracies, precisions, recalls, f1_scores = (
    [[0] * matrix_height for _ in range(matrix_width)] for _ in range(4)
)

# Run the evaluation
for i, classifier in enumerate(classifiers):
    for j, vectorizer in enumerate(vectorizers):
        print('Do cross-validation with classifier', type(classifier).__name__, 'and vectorizer', type(vectorizer).__name__, '(ngram_range:', vectorizer.ngram_range, ')')
        line_classifier = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier),
        ])

        # Save and print the fold average achieved with this vectorizer-classifier-combination
        cross_val_scores = cross_validate(line_classifier, lines, labels, scoring=scoring, cv=cv, n_jobs=-1)
        for score_matrix, score_key in (
            (accuracies, 'test_accuracy'),
            (precisions, 'test_precision'),
            (recalls, 'test_recall'),
            (f1_scores, 'test_f1_score'),
        ):
            fold_scores = cross_val_scores[score_key]
            score_matrix[i][j] = sum(fold_scores / len(fold_scores))
        print('Accuracy:', round(accuracies[i][j], 4), 'F1-Score:', round(f1_scores[i][j], 4))

## Show scoring results from k-fold cross-validation

In [None]:
from SAD_classification_dataframe_operations import ResultPresenter
# Print, per metric, the value ResultPresenter.calculate_average_over_matrix returns
# for the classifier x vectorizer score matrix filled by the cross-validation above.
result_presenter = ResultPresenter()
print('Average accuracy:', result_presenter.calculate_average_over_matrix(accuracies))
print('Average precision:', result_presenter.calculate_average_over_matrix(precisions))
print('Average recall:', result_presenter.calculate_average_over_matrix(recalls))
print('Average F1-score:', result_presenter.calculate_average_over_matrix(f1_scores))

In [None]:
# Accuracy for every classifier/vectorizer combination as a table.
print(result_presenter.present_results_in_table('Accuracy', classifiers, vectorizers, accuracies))

In [None]:
# F1-score for every classifier/vectorizer combination as a table.
print(result_presenter.present_results_in_table('F1_Score', classifiers, vectorizers, f1_scores))

In [None]:
# Precision for every classifier/vectorizer combination as a table.
print(result_presenter.present_results_in_table('Precision', classifiers, vectorizers, precisions))

In [None]:
# Recall for every classifier/vectorizer combination as a table.
print(result_presenter.present_results_in_table('Recall', classifiers, vectorizers, recalls))

## Text classification with BERT

Prepare the dataset and check whether a GPU is available.

In [None]:
# BERT with simpletransformers takes a pandas.DataFrame as input; the columns are renamed
# to 'text' and 'labels' for it.
# The frame is built in one chain so that
#   * the integer cast is actually kept (its result used to be discarded), and
#   * no in-place rename happens on a slice of corpus_with_integer_labels
#     (which triggers pandas' SettingWithCopyWarning).
corpus_for_transformer = (
    corpus_with_integer_labels[['line', 'classification']]
    .rename(columns={"line": "text", "classification": "labels"})
    .astype({"labels": "int"})
)
num_labels = len(matching_category_integer_label)
print(corpus_for_transformer)

In [None]:
# Detect whether a CUDA GPU is available and select the matching torch device.
# This cell only inspects the runtime; switching the runtime to a GPU has to be
# done in the notebook environment itself.

import torch
cuda_available = torch.cuda.is_available()
if cuda_available:
    # Report the name of the GPU that torch currently uses
    curr_device = torch.cuda.current_device()
    print(torch.cuda.get_device_name(curr_device))
device = torch.device("cuda" if cuda_available else "cpu")
# Bare expression: shown as the cell output
device

In [None]:
# Define different f1 score averaging methods to use during evaluation

from sklearn.metrics import f1_score, precision_score, recall_score

def f1_multiclassMicro(labels, preds):
    """Return sklearn's f1_score for true `labels` and predicted `preds` with average='micro'."""
    return f1_score(labels, preds, average='micro', zero_division=0)
def f1_multiclassMacro(labels, preds):
    """Return sklearn's f1_score for true `labels` and predicted `preds` with average='macro'."""
    return f1_score(labels, preds, average='macro', zero_division=0)
def f1_multiclassWeighted(labels, preds):
    """Return sklearn's f1_score for true `labels` and predicted `preds` with average='weighted'."""
    return f1_score(labels, preds, average='weighted', zero_division=0)
def precision_multiclassMicro(labels, preds):
    """Return sklearn's precision_score for true `labels` and predicted `preds` with average='micro'."""
    return precision_score(labels, preds, average='micro', zero_division=0)
def precision_multiclassMacro(labels, preds):
    """Return sklearn's precision_score for true `labels` and predicted `preds` with average='macro'."""
    return precision_score(labels, preds, average='macro', zero_division=0)
def precision_multiclassWeighted(labels, preds):
    """Return sklearn's precision_score for true `labels` and predicted `preds` with average='weighted'."""
    return precision_score(labels, preds, average='weighted', zero_division=0)
def recall_multiclassMicro(labels, preds):
    """Return sklearn's recall_score for true `labels` and predicted `preds` with average='micro'."""
    return recall_score(labels, preds, average='micro', zero_division=0)
def recall_multiclassMacro(labels, preds):
    """Return sklearn's recall_score for true `labels` and predicted `preds` with average='macro'."""
    return recall_score(labels, preds, average='macro', zero_division=0)
def recall_multiclassWeighted(labels, preds):
    """Return sklearn's recall_score for true `labels` and predicted `preds` with average='weighted'."""
    return recall_score(labels, preds, average='weighted', zero_division=0)

Perform a k-fold cross-validation using BERT. The BERT pretrained model *bert-base-uncased* and the classifier *BertForSequenceClassification* will be loaded using library *simpletransformers*. The accuracy and (weighted) f1 scores will be saved for further evaluation. You can inspect them with the next cell.

In [None]:
# Do k-fold cross-validation with BERT
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score

if not config_classification.do_multilabel_classification:
    # prepare cross validation
    kf = RepeatedKFold(n_splits=config_classification.k_splits, n_repeats=config_classification.n_repeats, random_state=config_classification.random_state)

    # One list per metric; every fold appends one value
    accuracies_bert, f1_scores_bert_binary, f1_scores_bert_weighted, f1_scores_bert_micro, f1_scores_bert_macro = [], [], [], [], []
    precision_scores_bert_binary, precision_scores_bert_weighted, precision_scores_bert_micro, precision_scores_bert_macro = [], [], [], []
    recall_scores_bert_binary, recall_scores_bert_weighted, recall_scores_bert_micro, recall_scores_bert_macro = [], [], [], []

    # Predictions and true labels of all folds, used for the overall classification report
    overall_flat_predictions, overall_flat_true_labels = [], []

    # The binary-averaged metrics are only collected for a two-class problem
    is_binary = len(matching_category_integer_label) == 2

    # run k-fold cross-validation for BERT model
    for train_index, val_index in kf.split(corpus_for_transformer):
        train_df = corpus_for_transformer.iloc[train_index]
        val_df = corpus_for_transformer.iloc[val_index]

        # A fresh model is created for every fold
        model_args = ClassificationArgs()
        model_args.num_train_epochs = config_classification.epochs
        model_args.train_batch_size = config_classification.batch_size
        model_args.eval_batch_size = config_classification.batch_size
        # use_cuda defaults to True in simpletransformers and fails on machines without CUDA,
        # so the availability detected in the device cell is passed explicitly
        model = ClassificationModel('bert', 'bert-base-uncased', num_labels=num_labels, args=model_args, use_cuda=cuda_available)
        model.train_model(train_df, args = {'overwrite_output_dir':True, 'save_eval_checkpoints':False, 'save_model_every_epoch':False})

        result, model_outputs, wrong_predictions = model.eval_model(val_df)

        # The predicted class is the index of the largest model output
        predictions = [np.argmax(y_pred) for y_pred in model_outputs]
        true_labels = val_df['labels'].tolist()[:len(predictions)]
        overall_flat_predictions.extend(predictions)
        overall_flat_true_labels.extend(true_labels)

        print(classification_report(true_labels, predictions, digits=3))
        accuracies_bert.append(accuracy_score(true_labels, predictions))

        # if classification is binary, the binary-averaged scores from sklearn are recorded as well
        if is_binary:
            f1_scores_bert_binary.append(f1_score(true_labels, predictions, average='binary', zero_division=0))
            precision_scores_bert_binary.append(precision_score(true_labels, predictions, average='binary', zero_division=0))
            recall_scores_bert_binary.append(recall_score(true_labels, predictions, average='binary', zero_division=0))

        # weighted / micro / macro averages are recorded for binary and multi-class alike
        f1_scores_bert_weighted.append(f1_multiclassWeighted(true_labels, predictions))
        f1_scores_bert_micro.append(f1_multiclassMicro(true_labels, predictions))
        f1_scores_bert_macro.append(f1_multiclassMacro(true_labels, predictions))
        precision_scores_bert_weighted.append(precision_multiclassWeighted(true_labels, predictions))
        precision_scores_bert_micro.append(precision_multiclassMicro(true_labels, predictions))
        # macro precision must come from the precision helper (the F1 helper was used here by mistake)
        precision_scores_bert_macro.append(precision_multiclassMacro(true_labels, predictions))
        recall_scores_bert_weighted.append(recall_multiclassWeighted(true_labels, predictions))
        recall_scores_bert_micro.append(recall_multiclassMicro(true_labels, predictions))
        recall_scores_bert_macro.append(recall_multiclassMacro(true_labels, predictions))



Show scoring results of non multi-label classification:

In [None]:
# Print the per-fold scores, their averages over all folds, and an overall
# classification report across the predictions of every fold.
if not config_classification.do_multilabel_classification:
    print("Accuracies:", accuracies_bert)
    if len(matching_category_integer_label) == 2:
        print("Binary Precision:", precision_scores_bert_binary)
        print("Binary Recall:", recall_scores_bert_binary)
        print("Binary F1-Scores:", f1_scores_bert_binary)
    print("Weighted Precision:", precision_scores_bert_weighted)
    print("Weighted Recall:", recall_scores_bert_weighted)
    print("Weighted F1-Scores:", f1_scores_bert_weighted)
    print("Micro Precision:", precision_scores_bert_micro)
    print("Micro Recall:", recall_scores_bert_micro)
    print("Micro F1-Scores:", f1_scores_bert_micro)
    print("Macro Precision:", precision_scores_bert_macro)
    print("Macro Recall:", recall_scores_bert_macro)
    print("Macro F1-Scores:", f1_scores_bert_macro)
    
    print(f"Average accuracy: {sum(accuracies_bert) / len(accuracies_bert)}")
    if len(matching_category_integer_label) == 2:
        print(f"Average Binary Precision: {sum(precision_scores_bert_binary) / len(precision_scores_bert_binary)}")
        print(f"Average Binary Recall: {sum(recall_scores_bert_binary) / len(recall_scores_bert_binary)}")
        print(f"Average Binary F1-Scores: {sum(f1_scores_bert_binary) / len(f1_scores_bert_binary)}")
    print(f"Average weighted Precision: {sum(precision_scores_bert_weighted) / len(precision_scores_bert_weighted)}")
    print(f"Average weighted Recall: {sum(recall_scores_bert_weighted) / len(recall_scores_bert_weighted)}")
    print(f"Average weighted F1-Score: {sum(f1_scores_bert_weighted) / len(f1_scores_bert_weighted)}")
    print(f"Average micro Precision: {sum(precision_scores_bert_micro) / len(precision_scores_bert_micro)}")
    print(f"Average micro Recall: {sum(recall_scores_bert_micro) / len(recall_scores_bert_micro)}")
    print(f"Average micro F1-Score: {sum(f1_scores_bert_micro) / len(f1_scores_bert_micro)}")
    print(f"Average macro Precision: {sum(precision_scores_bert_macro) / len(precision_scores_bert_macro)}")
    print(f"Average macro Recall: {sum(recall_scores_bert_macro) / len(recall_scores_bert_macro)}")
    print(f"Average macro F1-Score: {sum(f1_scores_bert_macro) / len(f1_scores_bert_macro)}")

    # Derive the report's class names from a copy of the dictionary keys. Removing the entry
    # from matching_category_integer_label itself would change its length, and with it the
    # binary check and num_labels, whenever earlier cells are run again.
    target_names = list(matching_category_integer_label.keys())
    if config_classification.level_of_classification == LevelOfClassification.LEAVES:
        # On leaf level the label for unrelated lines is left out of the report.
        # NOTE(review): assumes the dictionary key equals config_classification.label_unrelated
        # ('none of these' in the default configuration) — confirm against LabelPreprocessor.
        target_names = [name for name in target_names if name != config_classification.label_unrelated]
    print(classification_report(overall_flat_true_labels, overall_flat_predictions, target_names=target_names, digits = 3))



## Multi-label Classification using BERT

For multi-label classification the corpus has to be transformed.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
if config_classification.do_multilabel_classification:
    # Build the frame in one chain instead of renaming a slice in place (avoids pandas'
    # SettingWithCopyWarning); the index is reset so that it runs from 0 to n-1.
    corpus_for_transformer_multilabel = (
        corpus_multilabel[['line', 'multilabel']]
        .rename(columns={"line": "text", "multilabel": "labels"})
        .reset_index(drop=True)
    )

    # Turn the per-row label lists into binary indicator vectors (a single fit_transform is enough)
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(corpus_for_transformer_multilabel['labels'])

    # Replace the whole column at once with one plain list of 0/1 values per row; assigning
    # arrays cell by cell through .loc is fragile across pandas versions.
    corpus_for_transformer_multilabel['labels'] = labels.tolist()

    # One output per class known to the binarizer (no need to look at a particular row)
    num_labels = len(mlb.classes_)
    print(corpus_for_transformer_multilabel)

Perform a k-fold cross-validation using BERT for multi-label classification. The BERT pretrained model *bert-base-uncased* will be loaded as a *MultiLabelClassificationModel* using the library *simpletransformers*. The accuracy and the weighted, micro and macro averaged scores will be saved for further evaluation. You can inspect them with the next cell.

In [None]:
# Do k-fold cross-validation with BERT
from simpletransformers.classification import MultiLabelClassificationArgs, MultiLabelClassificationModel
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score, classification_report

if config_classification.do_multilabel_classification:
    # prepare cross validation
    kf = RepeatedKFold(n_splits=config_classification.k_splits, n_repeats=config_classification.n_repeats, random_state=config_classification.random_state)

    # One list per metric; every fold appends one value
    accuracies_bert, f1_scores_bert_weighted, f1_scores_bert_micro, f1_scores_bert_macro = [], [], [], []
    precision_scores_bert_weighted, precision_scores_bert_micro, precision_scores_bert_macro = [], [], []
    recall_scores_bert_weighted, recall_scores_bert_micro, recall_scores_bert_macro = [], [], []

    # Predictions and true labels of all folds, used for the overall classification report
    overall_flat_predictions, overall_flat_true_labels = [], []

    # run k-fold cross-validation for BERT model
    for train_index, val_index in kf.split(corpus_for_transformer_multilabel):
        train_df = corpus_for_transformer_multilabel.iloc[train_index]
        val_df = corpus_for_transformer_multilabel.iloc[val_index]

        # A fresh model is created for every fold
        model_args = MultiLabelClassificationArgs()
        model_args.num_train_epochs = config_classification.epochs
        model_args.train_batch_size = config_classification.batch_size
        model_args.eval_batch_size = config_classification.batch_size
        model_args.threshold = config_classification.threshold
        # use_cuda defaults to True in simpletransformers and fails on machines without CUDA,
        # so the availability detected in the device cell is passed explicitly
        model = MultiLabelClassificationModel('bert', 'bert-base-uncased', num_labels=num_labels, args=model_args, use_cuda=cuda_available)
        model.train_model(train_df, args = {'overwrite_output_dir':True, 'save_eval_checkpoints':False, 'save_model_every_epoch':False})

        result, model_outputs, wrong_predictions = model.eval_model(val_df)

        # An output at or above the configured threshold counts as a predicted label
        predictions = np.array([
            [1 if output >= config_classification.threshold else 0 for output in y_pred]
            for y_pred in model_outputs
        ])
        true_labels = np.array(val_df['labels'].tolist())

        overall_flat_predictions.extend(predictions)
        overall_flat_true_labels.extend(true_labels)
        accuracies_bert.append(accuracy_score(true_labels, predictions))
        f1_scores_bert_weighted.append(f1_multiclassWeighted(true_labels, predictions))
        f1_scores_bert_micro.append(f1_multiclassMicro(true_labels, predictions))
        f1_scores_bert_macro.append(f1_multiclassMacro(true_labels, predictions))
        precision_scores_bert_weighted.append(precision_multiclassWeighted(true_labels, predictions))
        precision_scores_bert_micro.append(precision_multiclassMicro(true_labels, predictions))
        # macro precision must come from the precision helper (the F1 helper was used here by mistake)
        precision_scores_bert_macro.append(precision_multiclassMacro(true_labels, predictions))
        recall_scores_bert_weighted.append(recall_multiclassWeighted(true_labels, predictions))
        recall_scores_bert_micro.append(recall_multiclassMicro(true_labels, predictions))
        recall_scores_bert_macro.append(recall_multiclassMacro(true_labels, predictions))

        print(classification_report(true_labels, predictions, digits=3))

Show results of multi-label classification

In [None]:
# Print the per-fold scores, their averages over all folds, and an overall
# classification report across the predictions of every fold.
if config_classification.do_multilabel_classification:
    print("Accuracies:", accuracies_bert)
    print("Weighted Precision:", precision_scores_bert_weighted)
    print("Weighted Recall:", recall_scores_bert_weighted)
    print("Weighted F1-Scores:", f1_scores_bert_weighted)
    print("Micro Precision:", precision_scores_bert_micro)
    print("Micro Recall:", recall_scores_bert_micro)
    print("Micro F1-Scores:", f1_scores_bert_micro)
    print("Macro Precision:", precision_scores_bert_macro)
    print("Macro Recall:", recall_scores_bert_macro)
    print("Macro F1-Scores:", f1_scores_bert_macro)
    
    print(f"Average accuracy: {sum(accuracies_bert) / len(accuracies_bert)}")
    print(f"Average weighted Precision: {sum(precision_scores_bert_weighted) / len(precision_scores_bert_weighted)}")
    print(f"Average weighted Recall: {sum(recall_scores_bert_weighted) / len(recall_scores_bert_weighted)}")
    print(f"Average weighted F1-Score: {sum(f1_scores_bert_weighted) / len(f1_scores_bert_weighted)}")
    print(f"Average micro Precision: {sum(precision_scores_bert_micro) / len(precision_scores_bert_micro)}")
    print(f"Average micro Recall: {sum(recall_scores_bert_micro) / len(recall_scores_bert_micro)}")
    print(f"Average micro F1-Score: {sum(f1_scores_bert_micro) / len(f1_scores_bert_micro)}")
    print(f"Average macro Precision: {sum(precision_scores_bert_macro) / len(precision_scores_bert_macro)}")
    print(f"Average macro Recall: {sum(recall_scores_bert_macro) / len(recall_scores_bert_macro)}")
    print(f"Average macro F1-Score: {sum(f1_scores_bert_macro) / len(f1_scores_bert_macro)}")

    # Leave the label for unrelated lines out of the report's class names without removing it
    # from matching_category_integer_label: mutating the shared dictionary would change the
    # results of earlier cells whenever they are run again.
    # NOTE(review): assumes the dictionary key equals config_classification.label_unrelated
    # ('none of these' in the default configuration) — confirm against LabelPreprocessor.
    target_names = [
        name for name in matching_category_integer_label
        if name != config_classification.label_unrelated
    ]
    print(classification_report(overall_flat_true_labels, overall_flat_predictions, target_names=target_names, digits = 3))
