# Preview Evaluation - Perplexity

This is a notebook for a preview comparison of the generations obtained when using Perplexity as the metric for the Datamodels collections.  
Note that this notebook considers only ONE generation of 50 samples.

In [1]:
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
from utils.calculate_metric import calculate_agg_metric
import os
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


## Calculate Metrics and save them

In [2]:
# Experiment seeds; each one selects a per-seed folder under `datamodels_path`
# and the matching `experiments_<seed>` questions file in the cells below.
seeds = [7270, 860]
# Root folder holding the per-seed `generations/` and `retrieval/` subfolders.
datamodels_path  = "../dmrc_pipelines"

In [3]:
# Score every generation file of every seed and save the aggregated metric as
# result_<seed>_<generation file stem>.feather in the working directory.
# NOTE(review): the notebook is titled "Perplexity" but the metric computed
# here is "rouge_l" -- presumably "perplexity" names one of the generation
# runs being compared; confirm.
for s in seeds:
    questions_path = f"../../datamodels_training_window_size/experiments_{s}/questions.feather"
    generations_dir = f"{datamodels_path}/{s}/generations"
    # os.listdir returns entries in arbitrary order and may contain stray
    # entries (e.g. checkpoint folders); sort for reproducible output and keep
    # only the JSON generation files.
    for file in sorted(os.listdir(generations_dir)):
        if not file.endswith(".json"):
            continue
        print(file)
        calculate_agg_metric(
            metrics=["rouge_l"],
            generation_path=f"{generations_dir}/{file}",
            reference_path=questions_path,
            # The loader cell parses seed and run type back out of this name.
            saving_path=f"result_{s}_{file.split('.')[0]}.feather"
        )




perplexity_generations.json
rag_generations.json
datamodels_generations.json
baseline_7270_baseline_generations.json
perplexity_generations.json
rag_generations.json
datamodels_generations.json
baseline_860_baseline_generations.json


## Load Data

In [9]:
### Perplexity results
# Gather every result_<seed>_<run_type>_*.feather file from the working
# directory into one frame, tagging each row with the seed and run type parsed
# from the file name.

_perplexity_results = []
# Sort for a reproducible concatenation order (os.listdir order is arbitrary).
for f in sorted(os.listdir(".")):
    # Only files written by the metric cell follow the result_* naming; any
    # other feather file in the folder would break the name parsing below.
    if not (f.startswith("result_") and f.endswith(".feather")):
        continue
    processed_namefile = f.split(".")[0][len("result_"):]
    # Name layout: <seed>_<run_type>_<rest>; only the first two tokens are used.
    seed, run_type = processed_namefile.split("_")[:2]
    _perplexity_results.append(
        pl.read_ipc(f).with_columns(
            pl.lit(seed).alias("seed"),
            pl.lit(run_type).alias("run_type"),
        )
    )

if not _perplexity_results:
    raise FileNotFoundError(
        "No result_*.feather files found in the working directory; "
        "run the metric calculation cell first."
    )

perplexity_results = pl.concat(_perplexity_results)
# Average of the per-file `mean` column for each (seed, run_type, metric).
perplexity_results.group_by("seed", "run_type", "metric").agg(pl.col("mean").mean()).sort("seed", "run_type")

seed,run_type,metric,mean
str,str,str,f64
"""7270""","""baseline""","""rouge_l""",0.286462
"""7270""","""datamodels""","""rouge_l""",0.328384
"""7270""","""perplexity""","""rouge_l""",0.009194
"""7270""","""rag""","""rouge_l""",0.255143
"""860""","""baseline""","""rouge_l""",0.165779
"""860""","""datamodels""","""rouge_l""",0.341708
"""860""","""perplexity""","""rouge_l""",0.002676
"""860""","""rag""","""rouge_l""",0.202825


In [7]:
def _load_json(path):
    """Parse the JSON file at `path`, closing the file handle afterwards."""
    with open(path) as fh:
        return json.load(fh)


# Per seed (keyed by str(seed)): a list of three 50x16 nested lists, in the
# order [RAG, perplexity datamodels, rouge datamodels].
indexes = {}
for s in seeds:
    retrieval_dir = f"{datamodels_path}/{s}/retrieval"
    rag_retrieval = _load_json(f"{retrieval_dir}/rag_retrieval_indexes.json")
    per_datamodels_retrieval = _load_json(f"{retrieval_dir}/nomralized_no_top_p_{s}_indexes.json")
    rou_datamodels_retrieval = _load_json(f"{retrieval_dir}/size_2000_indexes.json")
    dicts = [rag_retrieval, per_datamodels_retrieval, rou_datamodels_retrieval]
    processed_dicts = []
    for d in dicts:
        # Stack the rows stored under keys "0".."49", in key order, into one array.
        all_rows = np.array([d[str(key)] for key in range(50)])
        # Keep columns 34..49 (16 entries per row).
        # NOTE(review): the previous comment said "first 16 elements", which
        # would be [:, :16]; the slice is kept as written -- confirm which
        # window is intended.
        selected = all_rows[:, 34:50]
        # Store plain nested lists so the comparison cell needs no numpy.
        processed_dicts.append(selected.tolist())
    indexes[str(s)] = processed_dicts

In [11]:


def count_matches_no_numpy(list_a, list_b, num_rows=50, num_cols=16):
    """Count entries of `list_a` that also appear in the same row of `list_b`.

    For each row index below `num_rows`, the first `num_cols` entries of
    `list_a[row]` are checked for membership in the whole of `list_b[row]`.
    Repeated values in `list_a` are counted once per occurrence.

    Args:
        list_a: Sequence of rows whose entries are looked up.
        list_b: Sequence of rows searched for those entries.
        num_rows: Number of rows to compare (default 50, the previously
            hard-coded value).
        num_cols: Number of entries per `list_a` row to check (default 16,
            the previously hard-coded value).

    Returns:
        Total number of matching entries, summed over all compared rows.

    Raises:
        IndexError: If either input has fewer than `num_rows` rows, or a
            `list_a` row has fewer than `num_cols` entries.
    """
    return sum(
        1
        for row in range(num_rows)
        for col in range(num_cols)
        if list_a[row][col] in list_b[row]
    )

def count_matches_no_numpy_3(list_a, list_b, list_c, num_rows=50, num_cols=16):
    """Count entries of `list_a` found in the same row of both other inputs.

    For each row index below `num_rows`, the first `num_cols` entries of
    `list_a[row]` are counted when they appear in both `list_b[row]` and
    `list_c[row]`. Repeated values in `list_a` are counted once per occurrence.

    Args:
        list_a: Sequence of rows whose entries are looked up.
        list_b: First sequence of rows searched for those entries.
        list_c: Second sequence of rows searched for those entries.
        num_rows: Number of rows to compare (default 50, the previously
            hard-coded value).
        num_cols: Number of entries per `list_a` row to check (default 16,
            the previously hard-coded value).

    Returns:
        Total number of entries shared by all three inputs, summed over rows.

    Raises:
        IndexError: If an input has fewer than `num_rows` rows, or a `list_a`
            row has fewer than `num_cols` entries.
    """
    total = 0
    for row in range(num_rows):
        row_a, row_b, row_c = list_a[row], list_b[row], list_c[row]
        for col in range(num_cols):
            value = row_a[col]
            if value in row_b and value in row_c:
                total += 1
    return total


# For each seed, report how many retrieved indices the strategies share.
for s in seeds:
    print(f"Seed: {s}")
    per_seed = indexes[str(s)]
    # Stored order: RAG, perplexity datamodels, rouge datamodels.
    rag_idx, perplexity_idx, rouge_idx = per_seed[0], per_seed[1], per_seed[2]
    print(f"Intersection between RAG and Perplexity: {count_matches_no_numpy(rag_idx, perplexity_idx)}")
    print(f"Intersection between RAG and Rouge: {count_matches_no_numpy(rag_idx, rouge_idx)}")
    print(f"Intersection between Perplexity and Rouge: {count_matches_no_numpy(perplexity_idx, rouge_idx)}")
    print(f"Intersection between RAG, Perplexity and Rouge: {count_matches_no_numpy_3(rag_idx, perplexity_idx, rouge_idx)}")

Seed: 7270
Intersection between RAG and Perplexity: 125
Intersection between RAG and Rouge: 140
Intersection between Perplexity and Rouge: 121
Intersection between RAG, Perplexity and Rouge: 21
Seed: 860
Intersection between RAG and Perplexity: 138
Intersection between RAG and Rouge: 136
Intersection between Perplexity and Rouge: 137
Intersection between RAG, Perplexity and Rouge: 31
