# Results

In [None]:
import sys
# Two relative locations of the source directory are added, presumably so the
# import below resolves from more than one working directory -- TODO confirm.
# NOTE(review): the paths are appended, so a module with the same name found
# earlier on sys.path would take precedence over the one in src/.
sys.path.append("../src/")
sys.path.append("src/")
from evaluate import EvaluationResult, ModelResponse

In [None]:
import pandas as pd

# CSV export of the raw-results spreadsheet.
DATA_FILE = "Fine-tuning LLMs blog post - Raw Data - Results.csv"

# header=2: the column names are read from row index 2 (zero-based).
data_file = pd.read_csv(DATA_FILE, header=2)

In [None]:
import numpy as np
# Base directory that the relative result paths are joined onto.
ROOT_DIR = "../data/"

# One entry per spreadsheet row that has a result path. Rows without one are
# dropped, so the list holds no NaN/None placeholders and needs no further
# NaN replacement.
results = data_file["Result Path"].dropna().tolist()

In [None]:
def get_values(label : str, df : pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` whose first column equals ``label``.

    Args:
        label: Value to look for in the first column.
        df: Frame to filter; it is not modified.

    Returns:
        The matching rows (possibly none), with their original index.
    """
    first_column = df.iloc[:, 0]
    matches = first_column == label
    return df[matches]

def parse_classification_report(df : pd.DataFrame) -> dict:
    """Convert a classification-report table into a nested dict.

    The first column of ``df`` holds the row labels; the columns
    ``precision``, ``recall``, ``f1-score`` and ``support`` hold the values.

    Args:
        df: The classification report as a DataFrame.

    Returns:
        A dict keyed by row label. The ``accuracy`` entry is a single number
        (read from the ``precision`` column); every other entry is a dict
        with the keys ``precision``, ``recall``, ``f1`` and ``support``,
        taken from the first row carrying that label.
    """
    source_columns = ['precision', 'recall', 'f1-score', 'support']
    output_keys = ['precision', 'recall', 'f1', 'support']

    parsed = {}

    for row_label in df.iloc[:, 0].to_list():
        rows = get_values(row_label, df)

        # The accuracy row carries one number rather than per-metric values.
        if row_label == 'accuracy':
            parsed[row_label] = rows['precision'].to_list()[0]
            continue

        first_row = rows[source_columns].iloc[0].to_list()
        parsed[row_label] = dict(zip(output_keys, first_row))

    return parsed

def collect_average_metrics(results : dict) -> dict:
    """Average the summary metrics over all parsed reports.

    Args:
        results: Maps a name to a parsed report. Each report needs an
            ``accuracy`` number plus ``macro avg`` and ``weighted avg``
            dicts containing ``precision``, ``recall`` and ``f1``.

    Returns:
        A dict with the mean ``accuracy`` followed by one
        ``"<metric> - <average>"`` entry per metric/average pair, all as
        plain floats.
    """
    reports = list(results.values())

    summary = {
        'accuracy': float(np.mean([report['accuracy'] for report in reports])),
    }

    for avg_name in ('macro avg', 'weighted avg'):
        for metric_name in ('precision', 'recall', 'f1'):
            per_report = [report[avg_name][metric_name] for report in reports]
            summary[metric_name + " - " + avg_name] = float(np.mean(per_report))

    return summary

In [None]:
import os
import pandas as pd
import glob
import numpy as np
import re
import json

# Summary metrics keyed by result path; populated by the loop over `results`.
evaluations = {}

def parse_raw_output(raw_output_file : str) -> EvaluationResult:
    """Load a raw output JSON file and upgrade it to the current schema.

    The file format of EvaluationResults changed after data collection
    started, so outdated dicts are updated to match the new standard before
    they are parsed, to prevent an Exception:

    * ``total_tokens_per_response`` and ``total_tokens`` are set to ``None``.
    * A non-empty ``name`` entry in ``config`` is renamed to
      ``technique_name``.
    * Responses stored as plain strings are wrapped in a ``ModelResponse``
      dict with zeroed token counts and latency.

    Args:
        raw_output_file: Path to the raw output JSON file.

    Returns:
        The ``EvaluationResult`` built from the upgraded dict.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(raw_output_file, "r") as f:
        data = json.load(f)

    data["total_tokens_per_response"] = None
    data["total_tokens"] = None

    config = data["config"]
    if config.get("name"):
        config["technique_name"] = config.pop("name")

    # Old files store each response as a bare string; new ones store a dict.
    data["llm_responses"] = [
        ModelResponse(text=response, prompt_tokens=0, completion_tokens=0, total_tokens=0, latency=0).to_dict()
        if isinstance(response, str)
        else response
        for response in data["llm_responses"]
    ]

    return EvaluationResult.from_dict(data)

In [None]:
# Build one summary entry per result path: averaged classification metrics
# plus latency, sample count, prompt settings and token usage.
for result in results:
    if result is None: continue

    path = os.path.join(ROOT_DIR, result)

    classification_reports = glob.glob( os.path.join(path, "evaluation_*.csv") )

    if not classification_reports:
        print(f"Could not find results at {path}.")
        continue

    # Skip the run (instead of failing with an IndexError) when the raw
    # output file is missing.
    raw_output_file = os.path.join(path, "raw_output.json")
    if not os.path.isfile(raw_output_file):
        print(f"Could not find raw output at {raw_output_file}.")
        continue

    eval_result = parse_raw_output(raw_output_file)

    num_samples = int(len(eval_result.texts))
    total_latency = eval_result.total_time_elapsed

    # Average time per sample; NaN when there are no samples to divide by.
    latency = total_latency / num_samples if num_samples else float("nan")

    evaluation = {}
    for report in classification_reports:
        # Derive the label name from the file name only, so an "evaluation_"
        # elsewhere in the directory path cannot leak into it. The glob
        # pattern above guarantees the "evaluation_" prefix and ".csv" suffix.
        file_name = os.path.basename(report)
        label_name = file_name[len("evaluation_"):-len(".csv")]

        data = pd.read_csv(report)
        data = parse_classification_report(data)
        evaluation[label_name] = data

    averaged = collect_average_metrics(evaluation)
    accuracy = averaged.pop("accuracy")
    responses = eval_result.llm_responses

    # Key order: token usage, prompt settings, sample count, accuracy,
    # latency, then the remaining averaged metrics.
    avg_metrics = {
        "prompt_tokens": sum(i.prompt_tokens for i in responses),
        "completion_tokens": sum(i.completion_tokens for i in responses),
        "total_tokens": sum(i.total_tokens for i in responses),
        "prompt": eval_result.config.prompt,
        "max tokens": int(eval_result.config.max_tokens),
        "samples": num_samples,
        "accuracy": accuracy,
        "latency": latency,
    }
    avg_metrics.update(averaged)

    evaluations[result] = avg_metrics

In [None]:
# Limit displayed column contents to 10 characters so long values (such as
# the prompt text) are truncated when the table is rendered.
pd.set_option('display.max_colwidth', 10)

In [None]:
# One row per result path, one column per collected metric.
result = pd.DataFrame(evaluations).T
result

# Few-shot prompt generator

In [None]:
import os
# Expose only CUDA device index 1 to this process. The variable is set before
# the project modules are imported -- presumably because they (or their
# dependencies) initialise CUDA on import; TODO confirm.
os.environ["CUDA_VISIBLE_DEVICES"]="1"
import sys
sys.path.append("src/")
sys.path.append("../src/")
import evaluate as ev
import preprocess as pre

In [None]:
# List every result path with its index, to pick which run to load.
# The loop variable is deliberately not called `result`, so running this cell
# does not overwrite the `result` DataFrame built in the Results section.
for index, result_path in enumerate(results):
    print(f"{index} - {result_path}")

In [None]:
# Index into the listing of result paths printed above.
file = results[10]
import json
with open( os.path.join(ROOT_DIR, file, "raw_output.json") ) as f:
    data = json.load(f)

# Update data structure from previous version
# The technique used to be stored under "name"; rename it only when that key
# actually exists, so a file with neither key does not fail here with a
# KeyError.
if not data['config'].get("technique_name") and "name" in data['config']:
    data['config']['technique_name'] = data['config'].pop('name')
# These fields used to be flat lists; wrap them in a dict keyed by the
# "NatureTitle" label.
for key in ['label_names', 'labels_pred', 'labels_true']:
    if isinstance(data[key], list):
        data[key] = {"NatureTitle":data[key]}

data = ev.EvaluationResult.from_dict(data)

In [None]:
# NOTE(review): the meaning of the positional arguments (3, 1, 1) is not
# visible in this file -- check get_few_shot_examples and consider passing
# them by keyword.
examples = data.get_few_shot_examples(3,1,1)

print(examples)

In [None]:
# Presumably the CSV column holding the input text and the columns holding
# the target labels -- verify against pre.load_dataset.
input_features = "Final Narrative"
output_labels = ["NatureTitle", "Part of Body Title"]

# Preprocess the dataset into a form usable for supervised finetuning
# NOTE(review): test_size=0 presumably means no rows are held out for testing
# -- confirm against pre.load_dataset. In the cells visible here only
# `label_names` is used afterwards.

dataset, label_names = pre.load_dataset(
    os.path.join(ROOT_DIR, "osha/datasets/imbalanced_multiclass_train.csv"),
    input_features,
    output_labels,
    test_size=0)

In [None]:
# Build the prompt from the label names and the few-shot examples.
prompt = ev.create_prompt("OSHA injury report", label_names, examples=examples)

# Print the prompt on one line, with newlines and double quotes
# backslash-escaped (presumably for pasting into a quoted string).
print(prompt.translate(str.maketrans({"\n": "\\n", '"': '\\"'})))

In [None]:
# Show the same prompt unescaped, with its real line breaks.
print(prompt)