# Load the OSHA Injuries Dataset

In [1]:
# Import modules for LLM finetuning and evaluation
import finetune as ft
import evaluate as ev

In [2]:
import pandas as pd
from datasets import Dataset

# Load the OSHA injuries CSV. low_memory=False makes pandas parse the file in
# one pass instead of in chunks.
data = pd.read_csv("dataset/January2015toJuly2024.csv", low_memory=False)

# Trim surrounding whitespace from the injury-nature labels; anything that is
# not a string (e.g. a missing value) is passed through unchanged.
data["NatureTitle"] = data["NatureTitle"].map(
    lambda title: title.strip() if isinstance(title, str) else title
)

In [20]:
# Column-width limit used when pandas renders text cells.
# NOTE(review): 0 is presumably meant to disable truncation of the long
# narrative column; pandas documents None as the "unlimited" value - confirm.
pd.options.display.max_colwidth = 0

In [None]:
# Preview the first rows (DataFrame.head default) of the columns of interest.
d = data.loc[
    :, ["Employer", "Final Narrative", "NatureTitle", "Part of Body Title"]
].head()

# Left-align both the body cells and the header cells of the rendered table.
# Kept as the final expression so the notebook displays the styled frame.
d.style.set_properties(**{'text-align': 'left'}).set_table_styles(
    [{'selector': 'th', 'props': [('text-align', 'left')]}]
)

In [None]:
# Number of distinct injury-nature labels, ignoring missing values and
# surrounding whitespace.
len({title.strip() for title in data["NatureTitle"].dropna()})

In [None]:
# Total number of rows loaded from the CSV.
data.shape[0]

In [None]:
# Scratch check: the smallest class count in `top_classes`, repeated 10 times.
# NOTE(review): `top_classes` is first assigned in a later cell of this
# notebook, so this cell fails with NameError on a top-to-bottom run, and its
# value depends on which of the later cells was executed last.
[min(top_classes.to_list())] * 10

In [None]:
from matplotlib import pyplot as plt

# Number of records per injury-nature label (whitespace-trimmed, missing
# values dropped). value_counts() orders the classes from most to least
# frequent.
top_classes = pd.Series(
    [x.strip() for x in data["NatureTitle"].dropna()]
).value_counts()

# One horizontal bar per class, covering every class in the dataset.
plt.barh(
    top_classes.keys(),
    top_classes.to_list()
)
plt.gca().invert_yaxis()  # put the most frequent class at the top
plt.xscale('log')  # log scale so low-count classes remain visible
# Tick labels are drawn at a tiny font size; presumably there are too many
# classes to label legibly (plt.yticks([]) would hide them altogether).
plt.yticks(fontsize=1, rotation=0)
plt.ylabel("Class Label")
plt.xlabel("Number of samples")

In [77]:
import matplotlib as mpl
# Raise the default figure resolution for every figure created afterwards.
mpl.rcParams.update({'figure.dpi': 300})

In [None]:
from matplotlib import pyplot as plt
# Record counts of the (up to) ten most frequent injury-nature labels.
top_classes = pd.Series([x.strip() for x in data["NatureTitle"].dropna()])
top_classes = top_classes.value_counts().iloc[0:10]

# Bars for the full number of samples available in each class.
plt.barh(
    top_classes.keys(),
    top_classes.to_list()
)
# Overlay bars at the smallest of the plotted class counts, labelled
# "Balanced" in the legend. The list length follows the number of classes
# actually selected, so the plot still works when fewer than ten classes
# exist.
plt.barh(
    top_classes.keys(),
    [min(top_classes.to_list())] * len(top_classes)
)
plt.gca().invert_yaxis()  # most frequent class at the top
plt.legend(["Total", "Balanced"])
plt.xlabel("Number of samples")

In [None]:
# Build the dataset from the DataFrame.
# NOTE(review): presumably "Final Narrative" is the input text column and
# "NatureTitle" the label column - confirm against ft.create_dataset_from_dataframe.
dataset = ft.create_dataset_from_dataframe(
    data,
    "Final Narrative",
    "NatureTitle",
)

In [None]:
# Peek at the first ten class names of the train split's label feature.
dataset["train"].features["label"].names[:10]

In [None]:
# Restrict the dataset via the project helper with n=10.
# NOTE(review): presumably keeps only the 10 most frequent classes - confirm
# against ft.select_top_n_classes.
dataset = ft.select_top_n_classes(dataset, n=10)

In [None]:
# Undersample via the project helper with ratio=1.
# NOTE(review): presumably ratio=1 equalises the class sizes - confirm
# against ft.undersample_dataset.
dataset = ft.undersample_dataset(dataset, ratio=1)

In [None]:
# Final preprocessing step; the helper returns the processed dataset together
# with `label_names`.
# NOTE(review): the exact transformations are defined in ft.preprocess_dataset
# and are not visible here.
dataset, label_names = ft.preprocess_dataset(dataset)

# Model output analysis

In [None]:
# Load the fine-tuned model's answers into a DataFrame.
output = pd.read_csv("output/osha/fine-tuned/answers.csv")

In [None]:
n = 15 # number of top classes

# The n most frequent true labels define the classes shown in the chart.
true_counts = output["True Label"].value_counts().iloc[0:n]
labels = true_counts.keys()

# Count predictions for exactly those classes, in the same order, so each
# predicted bar lines up with its true bar. Classes that were never predicted
# get a zero-length bar; predicted labels outside `labels` are not drawn.
predicted_counts = (
    output["Predicted Label"].value_counts().reindex(labels, fill_value=0)
)

# Semi-transparent bars so the two overlapping distributions stay visible.
plt.barh(
    labels,
    true_counts.to_list(),
    alpha=0.5
)

plt.barh(
    labels,
    predicted_counts.to_list(),
    alpha=0.5
)

plt.gca().invert_yaxis()  # most frequent true class at the top
plt.legend(labels=["True", "Predicted"])