In [None]:
import pandas as pd

# CSV export of the raw-data sheet; its "Result Path" column is read further down.
DATA_FILE = "Fine-tuning LLMs blog post - Raw Data - Sheet1.csv"

# header=2: the row at zero-based index 2 supplies the column names and the
# rows above it are discarded.
data_file = pd.read_csv(filepath_or_buffer=DATA_FILE, header=2)

In [None]:
import numpy as np

# Directory that every "Result Path" entry is resolved against.
ROOT_DIR = "../data/"

# One entry per sheet row: the result path, or None where the cell is empty.
# `Series.replace(np.nan, None)` is avoided on purpose: older pandas releases
# ignored the explicit None and forward-filled instead, which silently turned
# missing paths into copies of the previous row. Mapping with `pd.isna` gives
# the same list on every pandas version.
results = [
    None if pd.isna(entry) else entry
    for entry in data_file["Result Path"].tolist()
]

In [None]:
def get_latency(raw_output_file):
    """Return the average elapsed time per text for one run.

    Reads the JSON document at ``raw_output_file`` and divides its
    ``"total_time_elapsed"`` value by the number of entries under ``"texts"``.

    Parameters
    ----------
    raw_output_file : str or path-like
        Path of the JSON file to read. It must provide a sized ``"texts"``
        collection and a numeric ``"total_time_elapsed"``.

    Returns
    -------
    float
        ``total_time_elapsed / len(texts)``, in the unit used by the file
        (presumably seconds -- confirm against the code that writes it).
        ``nan`` when ``"texts"`` is empty, since a per-item latency is
        undefined for a run without items.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the file is not valid JSON.
    KeyError
        If ``"texts"`` or ``"total_time_elapsed"`` is missing.
    """
    import json

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(raw_output_file, encoding="utf-8") as f:
        raw_output = json.load(f)

    items = len(raw_output["texts"])
    lag = raw_output["total_time_elapsed"]

    # Avoid ZeroDivisionError for runs that produced no texts.
    if items == 0:
        return float("nan")

    return lag / items

In [None]:
def get_values(label : str, df : pd.DataFrame) -> pd.DataFrame:
    """Select the rows of ``df`` whose first column equals ``label``.

    Parameters
    ----------
    label : str
        Value to look for in the first (positional) column.
    df : pd.DataFrame
        Frame to filter.

    Returns
    -------
    pd.DataFrame
        The matching rows with all columns and their original index;
        empty when nothing matches.
    """
    first_column = df.iloc[:, 0]
    is_match = first_column == label
    return df[is_match]

def parse_classification_report(df : pd.DataFrame) -> dict:
    """Convert a tabular classification report into a nested dict.

    The first column of ``df`` holds the row labels; the columns
    ``precision``, ``recall``, ``f1-score`` and ``support`` hold the metrics.

    Rows are read in a single pass by position instead of re-filtering the
    whole frame once per label. If a label occurs more than once, the first
    row with that label is the one that is kept.

    Parameters
    ----------
    df : pd.DataFrame
        Report table as described above.

    Returns
    -------
    dict
        ``{label: {'precision', 'recall', 'f1', 'support'}}`` for every
        label, except ``'accuracy'``, which maps directly to a single number.
        Keys appear in row order.

    Raises
    ------
    KeyError
        If one of the four metric columns is missing.
    """
    metric_columns = ['precision', 'recall', 'f1-score', 'support']

    labels = df.iloc[:, 0].to_list()
    metrics = df[metric_columns]
    precision_values = metrics['precision'].to_list()

    report = {}
    for position, label in enumerate(labels):
        # Keep the first occurrence of a repeated label.
        if label in report:
            continue

        if label == 'accuracy':
            # The accuracy row carries one scalar; it is read from the
            # 'precision' column (presumably the report writer repeats it in
            # every column -- confirm against the code that writes the CSV).
            report[label] = precision_values[position]
        else:
            precision, recall, f1, support = metrics.iloc[position].to_list()
            report[label] = {
                'precision' : precision,
                'recall' : recall,
                'f1' : f1,
                'support' : support
            }

    return report

def collect_average_metrics(results : dict) -> dict:
    """Average the summary metrics over all parsed reports in ``results``.

    Parameters
    ----------
    results : dict
        Maps a name to a parsed report. Every report must provide an
        ``'accuracy'`` number plus ``'macro avg'`` and ``'weighted avg'``
        dicts containing ``'precision'``, ``'recall'`` and ``'f1'``.

    Returns
    -------
    dict
        ``'accuracy'`` first, then one ``"<metric> - <average>"`` entry per
        combination (all macro-average entries before the weighted ones).
        Every value is the arithmetic mean across reports as a Python float.
    """
    reports = list(results.values())

    summary = {
        'accuracy': float(np.mean([report['accuracy'] for report in reports]))
    }

    for avg_name in ('macro avg', 'weighted avg'):
        per_report = [report[avg_name] for report in reports]
        for metric_name in ('precision', 'recall', 'f1'):
            column = [entry[metric_name] for entry in per_report]
            summary[metric_name + " - " + avg_name] = float(np.mean(column))

    return summary

In [None]:
import os
import pandas as pd
import glob
import numpy as np
import re

# Maps every result path that could be loaded to its averaged metrics
# (with "latency" inserted as the second entry).
evaluations = {}

for result in results:
    # Sheet rows without a result path have nothing to load.
    if result is None: continue

    path = os.path.join(ROOT_DIR, result)

    # glob returns matches in arbitrary order; sort so that re-runs process
    # the reports in the same sequence.
    classification_reports = sorted(glob.glob( os.path.join(path, "evaluation_*.csv") ))

    if not classification_reports:
        print(f"Could not find results at {path}.")
        continue

    # The file name is fixed, so no glob is needed. A missing file used to
    # abort the whole cell with an IndexError; keep the run's metrics and
    # record an undefined latency instead.
    raw_output_file = os.path.join(path, "raw_output.json")
    if os.path.isfile(raw_output_file):
        latency = get_latency(raw_output_file)
    else:
        print(f"Could not find raw output at {raw_output_file}; latency set to NaN.")
        latency = float("nan")

    evaluation = {}
    for report in classification_reports:
        # Match the file name only, anchored and with a literal ".csv":
        # searching the full path would pick up a directory that happens to
        # contain "evaluation_" and yield a wrong label name.
        label_name = re.fullmatch(r"evaluation_(.*)\.csv", os.path.basename(report)).group(1)

        data = pd.read_csv(report)
        data = parse_classification_report(data)
        evaluation[label_name] = data

    avg_metrics = collect_average_metrics(evaluation)

    # Insert latency at position 1 of the dict
    avg_metrics = list(avg_metrics.items())
    avg_metrics.insert(1, ("latency", latency))
    avg_metrics = dict(avg_metrics)

    evaluations[result] = avg_metrics

In [None]:
# Building the frame from `evaluations` puts one result path per column;
# flip it so that every result path is a row and every metric a column.
result = pd.DataFrame(evaluations).T
result