In [2]:
#Imports
import os
import json
import pandas as pd
import IPython.display as dp

### Results

In [None]:
#Path to results directory
base_dir = "../results"

#Output list
rows = []

#Walking through all results subfolders
for root, _, files in os.walk(base_dir):
    for file in files:
        if file.endswith(".json"):
            file_path = os.path.join(root, file)

            #Loading JSON files
            with open(file_path) as f:
                data = json.load(f)

            #Extracting metrics
            accuracy = round(data.get("accuracy", 0) * 100, 2)
            macro = data.get("macro avg", {})
            weighted = data.get("weighted avg", {})

            macro_precision = round(macro.get("precision", 0), 4)
            macro_recall = round(macro.get("recall", 0), 4)
            macro_f1 = round(macro.get("f1-score", 0), 4)

            weighted_precision = round(weighted.get("precision", 0), 4)
            weighted_recall = round(weighted.get("recall", 0), 4)
            weighted_f1 = round(weighted.get("f1-score", 0), 4)

            #Parse model name, feature type, and split from the path
            parts = root.split(os.sep)
            model = parts[-2] if len(parts) >= 2 else "Unknown"
            feature_type = parts[-1] if model != parts[-1] else "Default"
            split_info = file.replace(".json", "").replace(f"{model}_results_", "")

            #Appending the rows
            rows.append({
                "Model": model.upper(),
                "Feature Type": feature_type.replace("_", " ").title(),
                "Split": split_info,
                "Accuracy (%)": accuracy,
                "Macro Precision": macro_precision,
                "Macro Recall": macro_recall,
                "Macro F1": macro_f1,
                "Weighted Precision": weighted_precision,
                "Weighted Recall": weighted_recall,
                "Weighted F1": weighted_f1
            })

#Creating DataFrame
df = pd.DataFrame(rows)
df = df.sort_values(by=["Model", "Feature Type", "Split"])

#Displaying
dp.display(dp.HTML(df.to_html(index=False)))

#Saving to CSV
df.to_csv("summary_results_metrics.csv", index=False)


Model,Feature Type,Split,Accuracy (%),Macro Precision,Macro Recall,Macro F1,Weighted Precision,Weighted Recall,Weighted F1
LOGREG,Raw Text,logreg_tfidf_results_80_20,75.07,0.7245,0.7566,0.7321,0.7844,0.7507,0.7587
RESULTS,Bilstm,bilstm_results_70_30,83.85,0.8043,0.832,0.8154,0.8519,0.8385,0.8424
RESULTS,Bilstm,bilstm_results_75_25,84.45,0.8125,0.8323,0.8206,0.8544,0.8445,0.8478
RESULTS,Bilstm,bilstm_results_80_20,84.07,0.8037,0.8327,0.816,0.8513,0.8407,0.8439
RESULTS,Logreg,logreg_results_5fold,57.4,0.3358,0.3347,0.2985,0.0,0.0,0.435
RESULTS,Logreg,logreg_results_70_30,46.36,0.3405,0.3392,0.3276,0.4248,0.4636,0.4333
RESULTS,Logreg,logreg_results_75_25,50.66,0.3324,0.3326,0.3037,0.4183,0.5066,0.4342
RESULTS,Logreg,logreg_results_80_20,54.0,0.3367,0.3342,0.2833,0.4215,0.54,0.4324
RESULTS,Rf,rf_results_5fold,57.0,0.353,0.3334,0.2423,0.0,0.0,0.4139
RESULTS,Rf,rf_results_70_30,56.98,0.3951,0.3334,0.2424,0.4609,0.5698,0.414


### Results Summary Table
In the last notebook, we gathered the results of all the models, which we considered during all the project into one comparison table. Traditional machine learning based on both reduced PCA-based embeddings and raw text features, and neural networks deep learning based on BiLSTM model.

As the table indicates, raw text features were always outperforming the reduced PCA BERT embeddings trained model in terms of all metrics. Random Forest and SVM are leading models among traditional ones and their performance was competitive when employing TF-IDF features, and Logistic Regression demonstrated similar stability by class. But the BiLSTM model was different, as it produced the best overall outcomes, due to the opportunity to use both sequential and contextual information.

Such comparison confirms choices made at the stage of experimental work and proves the idea of working with BiLSTM model in the ultimate evaluation fairly well. It also demonstrates the fact that selection of feature representation can have a dramatic effect on model performance a topic that will be explained in detail in the thesis.