#### Analyze data

This section plots individual training runs, compares multiple runs, and compares input and sampled distributions.

##### Graphing training runs

In [None]:
import matplotlib.pyplot as plt
import json
import glob
import os
import datetime

# List the run folders (under trained_models/) to plot; for each folder the
# most recent train_<M>-<D>-<Y>-<h>-<m>-<s>.jsonl log is used.
train_models = ['dataset_3_1_1']
train_models = ['trained_models/' + trained_model_folder for trained_model_folder in train_models]


def _find_latest_train_log(folder):
    """Return the newest timestamped ``train_*.jsonl`` path in *folder*.

    The timestamp is read from the file name, which is expected to look like
    ``train_<month>-<day>-<year>-<hour>-<minute>-<second>.jsonl``. Files whose
    names do not follow that pattern are ignored. Returns ``None`` when no
    usable log exists.
    """
    latest_file = None
    latest_time = None
    for file in glob.glob(os.path.join(folder, "train_*.jsonl")):
        stem = os.path.basename(file).replace("train_", "").replace(".jsonl", "")
        try:
            month, day, year, hour, minute, second = map(int, stem.split("-"))
            file_time = datetime.datetime(year, month, day, hour, minute, second)
        except ValueError:
            # Wrong number of fields, non-numeric field, or impossible date:
            # not one of our timestamped logs, so skip it instead of aborting.
            continue
        if latest_time is None or file_time > latest_time:
            latest_time = file_time
            latest_file = file
    return latest_file


plt.figure(figsize=(15, 6))

for trained_model_folder in train_models:
    latest_file = _find_latest_train_log(trained_model_folder)
    if latest_file is None:
        # Keep plotting the remaining runs rather than failing on open(None).
        print(f"No timestamped train_*.jsonl log found in {trained_model_folder}; skipping")
        continue

    # Each entry is [iteration, training loss, validation loss].
    train_data_x = []

    with open(latest_file, 'r') as data_file:
        for line in data_file:
            if not line.strip():
                continue  # tolerate blank lines in the JSONL log
            jdata = json.loads(line)
            # Only "Training progress" records that carry an iteration are plotted.
            if jdata.get('message') == "Training progress" and 'iter' in jdata:
                train_data_x.append([jdata['iter'], jdata['train_loss'], jdata['val_loss']])

    x = [point[0] for point in train_data_x]
    tl = [point[1] for point in train_data_x]
    vl = [point[2] for point in train_data_x]

    # Prefix the labels with the run name so several runs stay distinguishable
    # in the shared legend.
    run_name = os.path.basename(trained_model_folder)
    plt.plot(x, tl, linestyle='-', label=f'{run_name} - Training Loss')
    plt.plot(x, vl, linestyle='-', label=f'{run_name} - Validation Loss')

# Add titles and labels
plt.title('Training Progress for Multiple Datasets')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()


##### Graphing sampling (distributions)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import datetime

# Name of the sub-folder of generated_samples/ whose newest run is analysed.
output_dir_name = 'dataset_1_1'


def get_latest_folder():
    """Return the most recent timestamped entry in generated_samples/<output_dir_name>.

    Entry names are expected to be ``<month>-<day>-<year>-<hour>-<minute>-<second>``
    (all integers). Entries that do not follow that pattern are ignored, so a
    stray file or folder cannot abort the scan.

    Returns:
        The path of the newest entry, or ``None`` when no entry has a valid
        timestamp name.
    """
    latest_time = None
    latest_file = None
    for file in glob.glob(os.path.join('generated_samples', output_dir_name, "*")):
        filename = os.path.basename(file)
        try:
            month, day, year, hour, minute, second = map(int, filename.split("-"))
            file_time = datetime.datetime(year, month, day, hour, minute, second)
        except ValueError:
            # Wrong field count, non-numeric field, or impossible date.
            continue

        # Compare as datetimes: the month-first names do not sort chronologically as text.
        if latest_time is None or file_time > latest_time:
            latest_time = file_time
            latest_file = file
    return latest_file

# Resolve the newest run folder once so both CSVs come from the same run.
latest_folder = get_latest_folder()
if latest_folder is None:
    # Fail with a clear message instead of trying to open "None/<file>.csv".
    raise FileNotFoundError(
        f"No timestamped run folder found in {os.path.join('generated_samples', output_dir_name)}"
    )

input_dist_path = os.path.join(latest_folder, 'input_leading_particles.csv')
sample_dist_path = os.path.join(latest_folder, 'sampled_leading_particles.csv')

# Column names assigned to the header-less, space-separated CSV files.
columns = ["num_particles", "pdgid", "e", "px", "py", "pz"]

# Per-column histogram settings: "min"/"max" give the value range, "bins" the
# number of bins, and "ymin"/"ymax" the y-axis limits.
# NOTE(review): the px/py/pz ranges start at 0, so any negative values in
# those columns are left out of the plots - confirm this is intended.
bin_settings = {
    "num_particles": {"min": 5, "max": 20, "bins": 15, "ymin": 0, "ymax": 2000},
    "pdgid": {"min": -300, "max": 0, "bins": 10, "ymin": 0, "ymax": 10000},
    "e": {"min": 0, "max": 35000, "bins": 35, "ymin": 0, "ymax": 10000},
    "px": {"min": 0, "max": 35000, "bins": 20, "ymin": 0, "ymax": 2000},
    "py": {"min": 0, "max": 35000, "bins": 20, "ymin": 0, "ymax": 2000},
    "pz": {"min": 0, "max": 35000, "bins": 20, "ymin": 0, "ymax": 2000},
}

# df1 holds the input leading particles, df2 the sampled ones.
df1 = pd.read_csv(input_dist_path, sep=" ", names=columns, engine="python")
df2 = pd.read_csv(sample_dist_path, sep=" ", names=columns, engine="python")

# One figure per configured column: input distribution on the left panel,
# sampled distribution on the right, both with identical binning and y-limits.
for column, settings in bin_settings.items():
    min_val, max_val = settings["min"], settings["max"]
    bins = settings["bins"]
    ymin, ymax = settings["ymin"], settings["ymax"]

    # Keep only the values inside [min_val, max_val] (both ends inclusive).
    input_values = df1[column]
    sampled_values = df2[column]
    filtered_data1 = input_values[(input_values >= min_val) & (input_values <= max_val)]
    filtered_data2 = sampled_values[(sampled_values >= min_val) & (sampled_values <= max_val)]

    panels = (
        (filtered_data1, "blue", f"Histogram of {column} - Input leading particle"),
        (filtered_data2, "orange", f"Histogram of {column} - Sampled leading particle"),
    )

    plt.figure(figsize=(14, 6))
    for position, (values, colour, panel_title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.hist(
            values,
            bins=bins,
            range=(min_val, max_val),
            edgecolor="black",
            alpha=0.7,
            color=colour,
        )
        plt.title(panel_title)
        plt.xlabel(column)
        plt.ylabel("Frequency")
        plt.ylim(ymin, ymax)  # same y-axis limits on both panels
        plt.grid(axis="y", linestyle="--", alpha=0.7)

    # Render the finished figure for this column.
    plt.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
import glob
import datetime

# Name of the sub-folder of generated_samples/ whose newest run is analysed.
output_dir_name = 'dataset_3_1_1'


def get_latest_folder():
    """Return the most recent timestamped entry in generated_samples/<output_dir_name>.

    Entry names are expected to be ``<month>-<day>-<year>-<hour>-<minute>-<second>``
    (all integers). Entries that do not follow that pattern are ignored, so a
    stray file or folder cannot abort the scan.

    Returns:
        The path of the newest entry, or ``None`` when no entry has a valid
        timestamp name.
    """
    latest_time = None
    latest_file = None
    for file in glob.glob(os.path.join('generated_samples', output_dir_name, "*")):
        filename = os.path.basename(file)
        try:
            month, day, year, hour, minute, second = map(int, filename.split("-"))
            file_time = datetime.datetime(year, month, day, hour, minute, second)
        except ValueError:
            # Wrong field count, non-numeric field, or impossible date.
            continue

        # Compare as datetimes: the month-first names do not sort chronologically as text.
        if latest_time is None or file_time > latest_time:
            latest_time = file_time
            latest_file = file
    return latest_file

# Resolve the newest run folder once so both CSVs come from the same run.
latest_folder = get_latest_folder()
if latest_folder is None:
    # Fail with a clear message instead of trying to open "None/<file>.csv".
    raise FileNotFoundError(
        f"No timestamped run folder found in {os.path.join('generated_samples', output_dir_name)}"
    )

input_dist_path = os.path.join(latest_folder, 'input_leading_particles.csv')
sample_dist_path = os.path.join(latest_folder, 'sampled_leading_particles.csv')

# Column names assigned to the header-less, space-separated CSV files.
columns = ["num_particles", "pdgid", "e", "px", "py", "pz"]

# Per-column histogram settings: "min"/"max" give the value range, "bins" the
# number of bins, and "ymin"/"ymax" the y-axis limits.
# NOTE(review): the px/py/pz ranges start at 0, so any negative values in
# those columns are left out of the plots - confirm this is intended.
bin_settings = {
    "num_particles": {"min": 5, "max": 20, "bins": 15, "ymin": 0, "ymax": 2000},
    "pdgid": {"min": -300, "max": 0, "bins": 10, "ymin": 0, "ymax": 10000},
    "e": {"min": 0, "max": 35000, "bins": 35, "ymin": 0, "ymax": 10000},
    "px": {"min": 0, "max": 35000, "bins": 20, "ymin": 0, "ymax": 2000},
    "py": {"min": 0, "max": 35000, "bins": 20, "ymin": 0, "ymax": 2000},
    "pz": {"min": 0, "max": 35000, "bins": 20, "ymin": 0, "ymax": 2000},
}

# df1 holds the input leading particles, df2 the sampled ones.
df1 = pd.read_csv(input_dist_path, sep=" ", names=columns, engine="python")
df2 = pd.read_csv(sample_dist_path, sep=" ", names=columns, engine="python")

# One figure per configured column, with the input and sampled histograms
# drawn on the same axes so they can be compared directly.
for column, settings in bin_settings.items():
    min_val, max_val = settings["min"], settings["max"]
    bins = settings["bins"]
    ymin, ymax = settings["ymin"], settings["ymax"]

    # Keep only the values inside [min_val, max_val] (both ends inclusive).
    input_values = df1[column]
    sampled_values = df2[column]
    filtered_data1 = input_values[(input_values >= min_val) & (input_values <= max_val)]
    filtered_data2 = sampled_values[(sampled_values >= min_val) & (sampled_values <= max_val)]

    plt.figure(figsize=(21, 6))
    # NOTE(review): only the first cell of a 1x2 grid is used, so the right
    # half of the figure stays empty - confirm this layout is intended.
    plt.subplot(1, 2, 1)

    # Input is drawn first; the sampled histogram is layered on top of it.
    overlays = (
        (filtered_data1, "blue", "Input"),
        (filtered_data2, "orange", "Sampled"),
    )
    for values, colour, legend_label in overlays:
        plt.hist(
            values,
            bins=bins,
            range=(min_val, max_val),
            edgecolor="black",
            alpha=0.7,
            color=colour,
            label=legend_label,
        )

    plt.title(f"Histogram of {column}")
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.ylim(ymin, ymax)
    plt.legend()
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()