# Plotting of Experiment Results

Instructions:
- Replace your AWS credentials
- Specify the name of your S3 bucket
- Print available experiment clusters to download locally
- Pick your experiment cluster to plot
- Pick which experiments in the cluster to plot
- Make your plots
- Analyze!

## Imports

In [1]:
import matplotlib.pyplot as plt
import os
import boto3
import numpy as np

## Parsing functions

In [2]:
def parse_logfile(logfile):
    """Parse a training log into per-stream curves, the LR schedule and train time.

    Each usable line is whitespace-separated ``key:value`` fields. The first
    field carries the step, the second is ``<stream>:<value>``. When more than
    two fields are present, the third is read as the learning rate and the
    sixth (if present and numeric) as the step time in milliseconds.

    A job can crash and restart, which rewinds and re-logs older steps, so
    values are stored in dictionaries keyed by step and later entries
    overwrite earlier ones.

    Args:
        logfile: Path of the log file to read.

    Returns:
        A 3-tuple ``(streams_xy, (lr_steps, lr_values), total_hours)``:
        ``streams_xy`` maps each stream name to a ``(steps, values)`` pair of
        tuples sorted by step; ``lr_steps`` / ``lr_values`` are NumPy arrays
        sorted by step; ``total_hours`` is the sum of all logged step times
        converted to hours. Step times are summed over every logged line, so
        steps that were re-logged after a restart are counted again.

    Raises:
        OSError: If ``logfile`` cannot be opened.
    """
    streams = {}  # stream:str -> {step: val}
    lrs = {}  # step -> learning rate
    total_training_time = 0
    with open(logfile, "r") as f:
        for line in f:
            parts = line.split()
            try:
                step = int(parts[0].split(":")[1])
                stream, raw_val = parts[1].split(":")[:2]
                val = float(raw_val)
            except (IndexError, ValueError):
                # blank line or a line cut short (e.g. by a crash mid-write)
                continue
            streams.setdefault(stream, {})[step] = val

            if len(parts) > 2:
                try:
                    lrs[step] = float(parts[2].split(":")[1])
                except (IndexError, ValueError):
                    # no parseable learning-rate field on this line
                    continue

                try:
                    step_time_ms = float(parts[5].split(":")[1])
                except (IndexError, ValueError):
                    # the time field is optional
                    continue
                total_training_time += step_time_ms / 1000  # convert from ms to s

    # Re-represent every stream as a (steps, values) pair sorted by step.
    # The pair is materialised as tuples so it can be read more than once;
    # a bare zip object would be exhausted after the first unpacking.
    streams_xy = {}
    for name, by_step in streams.items():
        streams_xy[name] = tuple(zip(*sorted(by_step.items())))

    # convert total train time from s to hours
    total_training_time /= 3600

    lr_steps = sorted(lrs)
    lr_values = [lrs[s] for s in lr_steps]
    return streams_xy, (np.array(lr_steps), np.array(lr_values)), total_training_time

# optional function that smooths out the loss some
def smooth_moving_average(signal, window_size):
    """Smooth a 1D array with a centred moving average of ``window_size`` points.

    The input is edge-padded before averaging, so the result has the same
    length as ``signal``. Windows smaller than 3 return ``signal`` unchanged.

    Raises:
        ValueError: If ``signal`` is not 1D or is shorter than ``window_size``.
    """
    if signal.ndim != 1:
        raise ValueError("smooth_moving_average only accepts 1D arrays.")
    if signal.size < window_size:
        raise ValueError("Input vector needs to be bigger than window size.")
    if window_size < 3:
        return signal

    # Repeat the edge values so the 'valid' convolution keeps the input length.
    pad_left = window_size // 2
    pad_right = window_size - 1 - pad_left
    padded = np.pad(signal, (pad_left, pad_right), mode='edge')

    # Uniform kernel: every sample in the window gets the same weight.
    kernel = np.full(window_size, 1.0 / window_size)
    return np.convolve(padded, kernel, mode='valid')


In [3]:
def extract_learning_rate(file_path):
    """Return the learning rate recorded in a '|'-separated parameter file.

    The first line containing 'learning rate (LR)' is split on '|' and its
    third field is converted to float. Any error (missing file, unparsable
    value, too few fields) is printed and ``None`` is returned; ``None`` is
    also returned when no line matches.
    """
    marker = 'learning rate (LR)'
    try:
        with open(file_path, 'r') as handle:
            # Only the first matching line is ever considered.
            first_match = next((row for row in handle if marker in row), None)
            if first_match is not None:
                fields = first_match.split('|')
                return float(fields[2].strip())
    except FileNotFoundError:
        print(f"The file {file_path} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return None

def extract_parameters(file_path):
    """Return the parameter count recorded in a '|'-separated parameter file.

    The first line containing 'num_parameters' is split on '|' and its third
    field is converted to int. Any error (missing file, unparsable value, too
    few fields) is printed and ``None`` is returned; ``None`` is also returned
    when no line matches.
    """
    count = None
    try:
        with open(file_path, 'r') as handle:
            for row in handle:
                if 'num_parameters' not in row:
                    continue
                # Stop at the first matching line.
                count = int(row.split('|')[2].strip())
                break
    except FileNotFoundError:
        print(f"The file {file_path} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return count

## AWS Credentials

In [4]:
# TODO: Replace with your IAM User credentials.
# These values are passed to boto3.Session when the session is created.
# NOTE(review): avoid committing this notebook with real keys filled in.
aws_access_key = ""
aws_secret_key = ""
aws_region = ""  # Region that your S3 bucket is in; normally just us-east-1

## Getting your logs from the S3 Bucket

In [23]:
# Create a boto3 session with your credentials.
# Placeholders that were left as empty strings are passed as None so that
# boto3 falls back to its own default credential / region resolution instead
# of receiving an empty value (in particular an empty region name).
session = boto3.Session(
    aws_access_key_id=aws_access_key or None,
    aws_secret_access_key=aws_secret_key or None,
    region_name=aws_region or None
)

# Create an S3 client and an S3 resource from this session
s3_client = session.client('s3')
s3 = session.resource('s3')

In [None]:
# TODO: Specify your bucket.
# Name of the S3 bucket that is opened with s3.Bucket(bucket_name) when listing clusters.
bucket_name = ""

# Print Available Experiment Clusters

In [None]:
# Handle to the S3 bucket named by bucket_name
bucket = s3.Bucket(bucket_name)

# Key prefix under which the experiment clusters are looked up
experiments_folder = "experiments/"

# Unique top-level folder names: for every key under the prefix, keep the
# path component that follows the prefix, but only when the key continues
# with a '/' after it (i.e. something lives inside that folder).
folders = {
    top_level
    for top_level, separator, _ in (
        obj.key[len(experiments_folder):].partition('/')
        for obj in bucket.objects.filter(Prefix=experiments_folder)
    )
    if separator
}

# Print the cluster names in alphabetical order
print("Experiment Clusters that you can plot:")
for cluster in sorted(folders):
    print('\t-', cluster)

## Getting your experiment logs

To download your logs, set the **cluster_name** variable in the next cell to the experiment cluster you want to download, then run the download cell below it. The cluster is saved locally in a folder called `downloaded_logs` in the current working directory.

In [None]:
# TODO: Specify which experiment cluster you want to download and then plot.
# Must be one of the cluster names printed by the listing cell above.
cluster_name = ""

In [None]:
# Define the local directory to save files
local_directory = os.path.join(os.getcwd(), "downloaded_logs")
os.makedirs(local_directory, exist_ok=True)

# Download files in the specified subfolder
cluster_prefix = f"{experiments_folder}{cluster_name}/"
downloaded_count = 0
for obj in bucket.objects.filter(Prefix=cluster_prefix):
    # Keys ending in '/' are folder placeholders, not files; trying to
    # download one would target a directory path on disk.
    if obj.key.endswith('/'):
        continue

    # Mirror the key layout (minus the experiments/ prefix) under local_directory
    relative_path = os.path.relpath(obj.key, experiments_folder)
    local_file_path = os.path.join(local_directory, relative_path)

    # Create local directories if they do not exist
    os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

    # Download the file
    print(f"Downloading {obj.key} to {local_file_path}")
    bucket.download_file(obj.key, local_file_path)
    downloaded_count += 1

if downloaded_count == 0:
    # Usually means cluster_name is empty or misspelled
    print(f"Warning: no files found under '{cluster_prefix}'. Check cluster_name.")

print("Download complete.")

# Printing the individual experiments within your cluster

In [None]:
# Folder of the chosen cluster, relative to the current working directory
path = os.path.join('downloaded_logs', cluster_name)

print("Available individual experiments to plot")

# Keep only the entries that are directories; each one is an experiment
experiment_dirs = [
    name for name in os.listdir(path)
    if os.path.isdir(os.path.join(path, name))
]
for name in experiment_dirs:
    print('\t-', name)

Specify which individual experiments from the cluster you would like to plot. Leave the variable **experiments_to_plot** as an empty list if you want to plot all experiments in the cluster.

In [113]:
# TODO: Specify which experiments you want to plot (folder names inside the cluster).
# Leave the list empty if you want to plot all experiments in the cluster.
experiments_to_plot = []

## Plot your experiment

In [114]:
def plot_experiment(cluster_name, to_plot, plot_val_loss=True, plot_train_loss=False, draw_min_train_loss=False, draw_min_val_loss=False, label_by_lr=True):
    """Plot loss, HellaSwag accuracy and learning-rate curves for a cluster.

    For every experiment folder under ``<local_directory>/<cluster_name>`` this
    reads ``main.log`` (via ``parse_logfile``) and ``run_parameters.txt`` (via
    ``extract_learning_rate`` / ``extract_parameters``), draws the curves on a
    shared three-panel figure and prints a one-line summary per experiment.

    Args:
        cluster_name: Name of the downloaded cluster folder.
        to_plot: Experiment folder names to plot; an empty list plots every
            folder in the cluster.
        plot_val_loss: Draw the validation-loss ("tel") curve.
        plot_train_loss: Draw the training-loss ("trl") curve.
        draw_min_train_loss: Draw a dashed blue line at the minimum train loss.
        draw_min_val_loss: Draw a dashed red line at the minimum val loss.
        label_by_lr: Label curves with the learning rate. When False, or when
            no learning rate could be read, the experiment folder name is used.

    Raises:
        OSError: If the cluster folder or an experiment's ``main.log`` cannot
            be read.
    """
    cluster_path = os.path.join(local_directory, cluster_name)

    # Entries stay bare folder names here; they are joined onto cluster_path
    # exactly once, inside the loop below.
    if not to_plot:
        # plot all experiments in the cluster
        directory_list = os.listdir(cluster_path)
    else:
        directory_list = list(to_plot)

    # One figure with three side-by-side panels shared by all experiments
    _, (loss_ax, eval_ax, lr_ax) = plt.subplots(1, 3, figsize=(16, 4))

    for item in directory_list:
        item_path = os.path.join(cluster_path, item)
        if not os.path.isdir(item_path):
            # Stray files are skipped silently when plotting everything, but
            # an explicitly requested experiment that is missing is reported.
            if to_plot:
                print(f"Skipping '{item}': no such experiment folder in {cluster_path}")
            continue

        logfile = os.path.join(item_path, "main.log")
        param_file = os.path.join(item_path, "run_parameters.txt")

        learn_rate = extract_learning_rate(param_file)
        num_params = extract_parameters(param_file)
        if num_params is not None:
            num_params /= 1e6  # convert to millions

        if label_by_lr and learn_rate is not None:
            name = f'LR: {learn_rate}'
        else:
            name = item

        streams, lr_data, train_time = parse_logfile(logfile)
        steps, lr_val = lr_data

        # Panel 1: losses, both train and val. A stream that never appeared
        # in the log is treated as empty instead of raising KeyError.
        train_steps, train_loss = streams["trl"] if "trl" in streams else ((), ())
        val_steps, val_loss = streams["tel"] if "tel" in streams else ((), ())

        # Optional: smooth the training loss before plotting, e.g.
        # smooth_moving_average(np.array(train_loss), 21)

        if draw_min_train_loss and len(train_loss) > 0:
            loss_ax.axhline(min(train_loss), color='b', linestyle='--')
        if draw_min_val_loss and len(val_loss) > 0:
            loss_ax.axhline(min(val_loss), color='r', linestyle='--')

        if plot_train_loss:
            loss_ax.plot(train_steps, train_loss, label=f'({name}) train loss')
        if plot_val_loss:
            loss_ax.plot(val_steps, val_loss, label=f'({name}) val loss')

        # Summary line: only include the pieces that are actually available
        summary = []
        if num_params is not None:
            summary.append(f"Params: {num_params:.2f}m")
        if learn_rate is not None:
            summary.append(f"LR : {learn_rate:.4f}")
        if len(train_loss) > 0:
            summary.append(f"Min Train Loss: {min(train_loss):.4f}")
        if len(val_loss) > 0:
            summary.append(f"Min Val Loss : {min(val_loss):.4f}")
        summary.append(f"Train Time : {train_time:.2f} hrs")
        print(" | ".join(summary))

        # Panel 2: HellaSwag eval, only when the log contains it
        if "eval" in streams:
            eval_steps, eval_acc = streams["eval"]
            eval_ax.plot(eval_steps, eval_acc, label=f"({name})")

        # Panel 3: learning rate schedule
        lr_ax.plot(steps, lr_val)

    loss_ax.set_xlabel("steps")
    loss_ax.set_ylabel("loss")
    loss_ax.set_yscale('log')
    loss_ax.set_title("Loss")

    eval_ax.set_xlabel("steps")
    eval_ax.set_ylabel("accuracy")
    eval_ax.set_title("HellaSwag eval")

    lr_ax.set_title("Learning Rate Schedule")
    lr_ax.set_xlabel("steps")
    lr_ax.set_ylabel("learning rate")

    # Only draw a legend where there is something labelled to show
    for ax in (loss_ax, eval_ax):
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()

    plt.show()

The **plot_experiment** function takes the following inputs:
  - name of a cluster to plot (You must have downloaded the cluster logs in order for the plotting to work)
  - a list containing the experiments you want to plot (leave empty if you want to plot them all)
  - two booleans for if you want to plot validation loss or training loss 
  - a boolean for if you want to label the plots by the learning rate they used
    - if set to False, then it will default to using the experiment name (folder name)

It will print the minimum training loss and minimum validation loss achieved for each experiment, along with the total number of parameters and learning rate used.

In [None]:
# Plot only the training loss of the selected experiments and mark its minimum
# with a dashed line; flip the flags below to show validation loss instead.
plot_experiment(cluster_name, experiments_to_plot, 
                plot_val_loss=False, 
                plot_train_loss=True, 
                draw_min_train_loss=True,
                draw_min_val_loss=False)

# Analysis

Do any of your analysis here, or keep tables of models you've already tested!

| Example | of | a | table |
| --- | --- | --- | --- |
| this | is | a | row |
| and | this | is | also |