In [None]:
!pip install mlflow

In [26]:
!pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [27]:
import mlflow
import pandas as pd
from tabulate import tabulate

# Set the tracking URI to the MLFlow Server
mlflow.set_tracking_uri('http://127.0.0.1:5000')

# Retrieve experiments
experiments = mlflow.search_experiments()

# Initialize a list to collect all run data
all_runs = []

# Retrieve runs from each experiment and store them
for experiment in experiments:
    experiment_id = experiment.experiment_id

    if experiment.name == "base_4o" or experiment.name == "DUP":
        runs = mlflow.search_runs(experiment_ids=[experiment_id])

        for _, run in runs.iterrows():
            if 'params.accuracy' in run:
                try:
                    accuracy = float(run['params.accuracy'])
                except ValueError:
                    accuracy = None
                run_data = {
                    'experiment_id': experiment_id,
                    'experiment_name': experiment.name,
                    'run_id': run.run_id,
                    'run_name': run['tags.mlflow.runName'],
                    'accuracy': accuracy
                }
                all_runs.append(run_data)
            else:
                print(f"No accuracy metric found for run {run.run_id} in experiment: {experiment.name}")

# Convert runs to a DataFrame for easier manipulation
runs_df = pd.DataFrame(all_runs)

# Function to extract and standardize the subject name
def extract_subject_name(run_name):
    if 'base_4o' in run_name:
        return run_name.split('base_4o_')[1].rsplit('_', 1)[0].replace('_test', '')
    elif 'majority_vote' in run_name:
        return run_name.split('majority_vote_')[1].rsplit('_', 1)[0]
    elif '4o_' in run_name:
        return run_name.split('4o_')[1].rsplit('_', 1)[0].replace('_test.csv', '')
    return run_name

# Apply the function to standardize subject names
runs_df['subject'] = runs_df['run_name'].apply(extract_subject_name)

# Ensure the accuracy column is numeric
runs_df['accuracy'] = pd.to_numeric(runs_df['accuracy'], errors='coerce')

# Filter for relevant experiments
base_4o_df = runs_df[runs_df['experiment_name'] == 'base_4o'][['subject', 'accuracy']]
dup_majority_vote_df = runs_df[(runs_df['experiment_name'] == 'DUP') & runs_df['run_name'].str.startswith('majority_vote_')][['subject', 'accuracy']]
dup_4o_df = runs_df[(runs_df['experiment_name'] == 'DUP') & runs_df['run_name'].str.startswith('4o_')][['subject', 'accuracy']]

# Rename accuracy columns for merging
base_4o_df.rename(columns={'accuracy': 'base_4o'}, inplace=True)
dup_majority_vote_df.rename(columns={'accuracy': '1_layer_dup+majority_vote'}, inplace=True)
dup_4o_df.rename(columns={'accuracy': '1_layer_DUP'}, inplace=True)

# Perform the merging step-by-step
merged_df = pd.merge(base_4o_df, dup_majority_vote_df, on='subject', how='outer')
merged_df = pd.merge(merged_df, dup_4o_df, on='subject', how='outer')

# Replace NaN values with '-' for better readability
merged_df.fillna('-', inplace=True)

# Ensure the subjects are sorted alphabetically for consistency
merged_df.sort_values(by='subject', inplace=True)

# Adjust pandas display settings to show all rows
pd.set_option('display.max_rows', None)

# Display the merged DataFrame using tabulate
print("\nMerged DataFrame:")
print(tabulate(merged_df, headers='keys', tablefmt='psql'))



Merged DataFrame:
+----+-------------------------------------+-----------+-----------------------------+---------------+
|    | subject                             |   base_4o | 1_layer_dup+majority_vote   |   1_layer_DUP |
|----+-------------------------------------+-----------+-----------------------------+---------------|
|  0 | abstract_algebra                    |  0.69     | 0.71                        |      0.66     |
|  1 | anatomy                             |  0.896296 | -                           |      0.881481 |
|  2 | astronomy                           |  0.927632 | -                           |      0.927632 |
|  3 | business_ethics                     |  0.8      | -                           |      0.82     |
|  4 | clinical_knowledge                  |  0.909434 | -                           |      0.879245 |
|  5 | college_biology                     |  0.958333 | -                           |      0.958333 |
|  6 | college_chemistry                   |  0.69    

  merged_df.fillna('-', inplace=True)


In [29]:
# Create a DataFrame from the sample data
df = pd.DataFrame(merged_df)

# Function to find the best performance and corresponding condition
def find_best_performance(row):
    conditions = ['base_4o', '1_layer_dup+majority_vote', '1_layer_DUP']
    best_value = -float('inf')
    best_condition = None
    for condition in conditions:
        value = row[condition]
        if value != '-' and value > best_value:
            best_value = value
            best_condition = condition
    return pd.Series({'best_performance': best_value, 'best_condition': best_condition})

# Apply the function to each row
best_df = df.apply(find_best_performance, axis=1)

# Add the subject column to the best_df
best_df['subject'] = df['subject']

# Reorder the columns for better readability
best_df = best_df[['subject', 'best_performance', 'best_condition']]

# Display the result using tabulate
print("\nBest Performance DataFrame:")
print(tabulate(best_df, headers='keys', tablefmt='psql'))


Best Performance DataFrame:
+----+-------------------------------------+--------------------+---------------------------+
|    | subject                             |   best_performance | best_condition            |
|----+-------------------------------------+--------------------+---------------------------|
|  0 | abstract_algebra                    |           0.71     | 1_layer_dup+majority_vote |
|  1 | anatomy                             |           0.896296 | base_4o                   |
|  2 | astronomy                           |           0.927632 | base_4o                   |
|  3 | business_ethics                     |           0.82     | 1_layer_DUP               |
|  4 | clinical_knowledge                  |           0.909434 | base_4o                   |
|  5 | college_biology                     |           0.958333 | base_4o                   |
|  6 | college_chemistry                   |           0.7      | 1_layer_dup+majority_vote |
|  7 | college_computer_science

In [31]:
# Calculate the average of the best performance scores
average_best_performance = best_df['best_performance'].mean()

# Print the result
print(f"\nAverage of the best performance scores: {average_best_performance:.6f}")

# Calculate the average of the best performance scores
average_4o = merged_df['base_4o'].mean()

# Print the result
print(f"\nAverage of the 4o scores: {average_4o:.6f}")


Average of the best performance scores: 0.869192

Average of the 4o scores: 0.865110
