# Model Performance Analysis

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import os
import json

## Load Rollouts from json

In [27]:
rollout_path = "../assets/rollouts/"
rollout_files = glob.glob(os.path.join(rollout_path, "rollout_results_*.json"))

# Load all rollout files
results = {}
for file in rollout_files:
    with open(file, "r") as f:
        # Get the filename without the path and extension e.g "merged_task_1_lora"
        filename = os.path.basename(file)
        filename = filename.split(".")[0]
        filename = "_".join(filename.split("_")[2:])
        # Load the json file
        rollout = json.load(f)
        results[filename] = rollout

print(sorted(results.keys()))


['merged_task1_fpft', 'merged_task1_lora', 'merged_task2_fpft', 'merged_task2_lora', 'merged_task3_fpft', 'merged_task3_lora', 'merged_task4_fpft', 'merged_task4_lora', 'task1_fpft', 'task1_lora', 'task2_fpft', 'task2_lora', 'task3_fpft', 'task3_lora', 'task4_fpft', 'task4_lora']


## Turn into pd.DataFrame

In [34]:
rollout = results["merged_task2_lora"]

df = pd.DataFrame(rollout)
print(df.shape)
df.head()


(20, 6)


Unnamed: 0,timestamp,success,duration,steps_completed,total_steps,score
0,1765203000.0,False,25.679944,2,4,0.5
1,1765203000.0,False,8.374424,0,4,0.0
2,1765203000.0,False,21.421537,2,4,0.5
3,1765203000.0,False,19.943922,2,4,0.5
4,1765203000.0,False,15.807475,2,4,0.5


## Calculate Scores

---

just to make sure

In [35]:
for i in range(len(df)):
    df.loc[i, "score"] = df.loc[i, "steps_completed"] / df.loc[i, "total_steps"]

df.tail()

Unnamed: 0,timestamp,success,duration,steps_completed,total_steps,score
15,1765204000.0,False,20.820444,2,4,0.5
16,1765204000.0,False,19.53765,2,4,0.5
17,1765204000.0,False,26.210658,0,4,0.0
18,1765204000.0,False,37.813264,2,4,0.5
19,1765204000.0,True,32.153528,4,4,1.0


## Calculate Avg Success Rate

In [36]:
avg_success_rate = df["success"].mean()
print(f"Avg Success Rate: {avg_success_rate}")

Avg Success Rate: 0.2


## Calculate Avg Task Progress

In [None]:
avg_task_progress = df["score"].mean()
print(f"Avg Task Progress: {avg_task_progress}")

Avg Task Progress: 0.5375


## Combine all results into a single DataFrame

----

In [None]:
df = pd.DataFrame(columns=["success_rate", "avg_task_progress"])
for filename, rollout in results.items():

    
    # Create a temp DataFrame from the rollout
    temp = pd.DataFrame(rollout)
    # Recalculate the score
    for i in range(len(temp)):
        temp.loc[i, "score"] = temp.loc[i, "steps_completed"] / temp.loc[i, "total_steps"]
    # Calculate the success rate
    success_rate = temp["success"].mean()
    # Calculate the avg task progress
    avg_task_progress = temp["score"].mean()
    # Add the filename, success rate, and avg task progress to the main DataFrame as a row
    df.loc[filename] = [success_rate, avg_task_progress]

df

Unnamed: 0,success_rate,avg_task_progress
merged_task4_lora,0.55,0.75
task4_fpft,0.7,0.866667
merged_task2_lora,0.2,0.5375
merged_task4_fpft,0.8,0.933333
task1_lora,0.45,0.816667
merged_task2_fpft,0.15,0.5625
task3_fpft,0.9,0.925
merged_task3_fpft,0.8,0.8
task1_fpft,0.75,0.916667
merged_task1_lora,0.5,0.8
