In [None]:
import pandas as pd
from pathlib import Path
from sklearn.metrics import classification_report, f1_score

# Resolve the notebook's working directory to an absolute path and step one
# level up; result files are read from, and CSV summaries written to, that
# parent directory.
working_dir = Path().resolve()
current_path = working_dir.parent
print(current_path)

# Transformer-based and LSTM-based models.

In [4]:
# Dataset identifiers used to build the result folder and file names.
# Every (train, test) pair is scored.
train_sets = ['pe', 'we', 'abam', 'mix1']
test_sets = list(train_sets)  # same four identifiers, kept as a separate list

# Model identifiers; underscores become hyphens when the file names are built.
models = [
    'bert',
    'bert_crf',
    'distilbert',
    'distilbert_crf',
    'glove_bilstmcrf',
    'distilbert_bilstmcrf',
    'bert_bilstmcrf',
]

# Accumulates one row of scores per (train, test, model) combination.
df_info = []

def parse_file(file_path):
    """Read a whitespace-separated prediction file into three parallel lists.

    Every non-blank line must hold exactly three whitespace-separated fields,
    unpacked in order as token, label and prediction. Blank or
    whitespace-only lines are skipped.

    Args:
        file_path: Location of the results file (str or path-like).

    Returns:
        A ``(tokens, labels, predictions)`` tuple of equally long lists.
        All three lists are empty when the file does not exist, when it
        cannot be decoded as UTF-8, or when any non-blank line does not have
        exactly three fields. A warning is printed in each of those cases.
    """
    tokens, labels, predictions = [], [], []
    try:
        with open(file_path, encoding='utf-8') as nf:
            # Stream the file line by line instead of loading it all at once.
            for line in nf:
                fields = line.split()
                if not fields:
                    continue  # blank / whitespace-only line
                tk, lb, pr = fields  # raises ValueError unless exactly 3 fields
                tokens.append(tk)
                labels.append(lb)
                predictions.append(pr)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except ValueError:
        print(f"Invalid file format: {file_path}")
        # Discard the rows read before the bad line: returning them would let
        # callers compute a score on a silently truncated file.
        return [], [], []

    return tokens, labels, predictions

# Number of result files per configuration; must match the run0..run9 columns
# built when the DataFrame is assembled.
N_RUNS = 10

# Score every (train set, test set, model) combination: one row per
# combination, holding the macro-F1 of each of its runs.
for train in train_sets:
    for test in test_sets:
        for m in models:
            # Results live in "<train>_<model>"; file names use hyphens where
            # the model identifier uses underscores.
            folder_dir = f"{train}_{m}"
            model_name = m.replace("_", "-")
            results_dir = current_path / 'results' / 'BIO' / folder_dir

            row = [train, test, m]
            for i in range(N_RUNS):
                file_name = f"results-{train}-{test}-{model_name}-{i}.txt"
                file_path = results_dir / file_name
                tokens, labels, predictions = parse_file(file_path)
                if labels:
                    fmacro = f1_score(labels, predictions, average='macro')
                else:
                    # Nothing was parsed (parse_file already printed why).
                    # Record the run as missing explicitly rather than relying
                    # on how f1_score treats empty input; pandas' row-wise
                    # mean/std skip NaN by default.
                    fmacro = float('nan')
                row.append(fmacro)

            df_info.append(row)

In [None]:
# One row per (train, test, model) combination with its ten per-run scores,
# followed by the mean and standard deviation across those runs.
run_columns = [f'run{i}' for i in range(10)]
results_dataframe = pd.DataFrame(df_info, columns=['train', 'test', 'model'] + run_columns)
run_scores = results_dataframe.loc[:, 'run0':'run9']
results_dataframe['average'] = run_scores.mean(axis=1)
results_dataframe['std_deviation'] = run_scores.std(axis=1)
results_dataframe.head()

In [6]:
# Persist the summary table next to the results folder. The path is built with
# pathlib's "/" operator instead of string concatenation with a hard-coded
# separator.
results_dataframe.to_csv(current_path / "data.csv", index=False)

# Multi-task learning (MTL) models

In [7]:
# NOTE(review): this duplicates the parse_file defined earlier in the notebook;
# consider keeping a single definition.
def parse_file(file_path):
    """Read a whitespace-separated prediction file into three parallel lists.

    Every non-blank line must hold exactly three whitespace-separated fields,
    unpacked in order as token, label and prediction. Blank or
    whitespace-only lines are skipped.

    Args:
        file_path: Location of the results file (str or path-like).

    Returns:
        A ``(tokens, labels, predictions)`` tuple of equally long lists.
        All three lists are empty when the file does not exist, when it
        cannot be decoded as UTF-8, or when any non-blank line does not have
        exactly three fields. A warning is printed in each of those cases.
    """
    tokens, labels, predictions = [], [], []
    try:
        with open(file_path, encoding='utf-8') as nf:
            # Stream the file line by line instead of loading it all at once.
            for line in nf:
                fields = line.split()
                if not fields:
                    continue  # blank / whitespace-only line
                tk, lb, pr = fields  # raises ValueError unless exactly 3 fields
                tokens.append(tk)
                labels.append(lb)
                predictions.append(pr)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except ValueError:
        print(f"Invalid file format: {file_path}")
        # Discard the rows read before the bad line: returning them would let
        # callers compute a score on a silently truncated file.
        return [], [], []

    return tokens, labels, predictions

# Evaluation-set identifiers and result-folder suffixes ("mtl_<cue>").
test_sets = ['pe', 'we', 'abam']
folder_cues = ['equal', 'dwa', 'dwaT3', 'dwaT5']

# Number of result files per configuration; must match the run0..run9 columns
# built when the DataFrame is assembled.
N_RUNS = 10
# The result file names carry the suffixes CRF0..CRF2.
N_CRF = 3

# Accumulates one row of scores per (folder cue, test set, CRF index).
df_info = []

for folder_cue in folder_cues:
    folder_dir = f"mtl_{folder_cue}"
    results_dir = current_path / 'results' / 'BIO' / folder_dir
    # Every "dwa*" folder uses "dwa" in its file names; anything else "equal".
    weighting = 'dwa' if folder_cue.startswith('dwa') else 'equal'
    for test in test_sets:
        for i in range(N_CRF):
            row = [test, weighting, folder_cue, i]
            for j in range(N_RUNS):
                file_name = f"results-mtl-{weighting}-{test}-bert-{j}-CRF{i}.txt"
                file_path = results_dir / file_name
                tokens, labels, predictions = parse_file(file_path)
                if labels:
                    fmacro = f1_score(labels, predictions, average='macro')
                else:
                    # Nothing was parsed (parse_file already printed why).
                    # Record the run as missing explicitly rather than relying
                    # on how f1_score treats empty input; pandas' row-wise
                    # mean/std skip NaN by default.
                    fmacro = float('nan')
                row.append(fmacro)
            df_info.append(row)

In [None]:
# One row per (test set, weighting, folder cue, CRF index) with its ten
# per-run scores, followed by the mean and standard deviation across the runs.
run_columns = [f'run{i}' for i in range(10)]
results_dataframe = pd.DataFrame(
    df_info,
    columns=['test', 'weighting', 'weighting_cue', 'crf'] + run_columns,
)
run_scores = results_dataframe.loc[:, 'run0':'run9']
results_dataframe['average'] = run_scores.mean(axis=1)
results_dataframe['std_deviation'] = run_scores.std(axis=1)
results_dataframe.head()

In [9]:
# Persist the MTL summary table next to the results folder. The path is built
# with pathlib's "/" operator instead of string concatenation with a
# hard-coded separator.
results_dataframe.to_csv(current_path / "data_mtl.csv", index=False)