In [20]:
# Import necessary libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open('pc_usr_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Flatten the nested structure into one record per (context, response) pair.
# Only the fields used by the analysis are read from each item.
records = []
for item in data:
    context = item["context"]
    for response in item["responses"]:
        overall_scores = response["Overall"]
        # Mean of the "Overall" ratings, kept on its original scale (no rescaling).
        # An empty rating list becomes NaN rather than raising ZeroDivisionError;
        # pandas' corr() leaves NaN pairs out of the computation.
        if overall_scores:
            overall_avg = sum(overall_scores) / len(overall_scores)
        else:
            overall_avg = float("nan")
        records.append(
            {
                "context": context,
                "response": response["response"],
                "model": response["model"],
                "overall_evaluation": overall_avg,
            }
        )

# Naming the columns keeps the frame's schema stable even when `records` is
# empty, so a later merge on these columns still finds its keys.
df = pd.DataFrame(
    records,
    columns=["context", "response", "model", "overall_evaluation"],
)
df

Unnamed: 0,context,response,model,overall_evaluation
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,3.666667
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,2.000000
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,3.000000
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,2.000000
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,5.000000
...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,4.333333
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,3.000000
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,2.333333
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,2.333333


In [21]:
import json
import pandas as pd
import re

# Load the JSON file.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform default.
with open("evaluations.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# One row per (context, response) pair; `data` maps each context string to the
# list of responses evaluated for it.
rows = []
for context, responses in data.items():
    for response_data in responses:
        evaluation_full = response_data["evaluation"]

        # The score is taken to be the first integer or decimal number that
        # appears in the evaluation text.
        match = re.search(r"\d+\.\d+|\d+", evaluation_full)
        # re.search returns None when the text contains no number at all;
        # record that as a missing score (NaN) instead of failing with an
        # AttributeError on `None.group()`. pandas' corr() leaves NaN pairs out.
        # Check df_responses["overall_evaluation"].isna().sum() to see how many
        # evaluations could not be parsed.
        evaluation_score = float(match.group()) if match else float("nan")

        rows.append(
            {
                "context": context,
                "response": response_data["response"],
                "model": response_data["model"],
                "overall_evaluation": evaluation_score,
            }
        )

# Naming the columns keeps the frame's schema stable even when `rows` is empty,
# so a later merge on these columns still finds its keys.
df_responses = pd.DataFrame(
    rows,
    columns=["context", "response", "model", "overall_evaluation"],
)
# Display the DataFrame
df_responses


Unnamed: 0,context,response,model,overall_evaluation
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,2.0
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,1.0
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,2.0
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,1.0
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,3.0
...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,3.0
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,1.0
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,1.0
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,1.0


In [22]:
# Inner-join the two score tables on (context, response, model): only rows
# present in both frames survive. The shared `overall_evaluation` column is
# disambiguated with `_original` (from `df`) and `_gpt4` (from `df_responses`).
merged_df = df.merge(
    df_responses,
    how="inner",
    on=["context", "response", "model"],
    suffixes=("_original", "_gpt4"),
)
merged_df

Unnamed: 0,context,response,model,overall_evaluation_original,overall_evaluation_gpt4
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,3.666667,2.0
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,2.000000,1.0
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,3.000000,2.0
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,2.000000,1.0
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,5.000000,3.0
...,...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,4.333333,3.0
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,3.000000,1.0
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,2.333333,1.0
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,2.333333,1.0


In [23]:
# 2x2 Pearson (linear) correlation matrix between the two score columns of
# `merged_df`.
correlation_matrix_pearson = (
    merged_df
    .loc[:, ["overall_evaluation_original", "overall_evaluation_gpt4"]]
    .corr(method="pearson")
)
correlation_matrix_pearson

Unnamed: 0,overall_evaluation_original,overall_evaluation_gpt4
overall_evaluation_original,1.0,0.553122
overall_evaluation_gpt4,0.553122,1.0


In [24]:
# 2x2 Spearman (rank) correlation matrix between the two score columns of
# `merged_df`.
correlation_matrix_spearman = (
    merged_df
    .loc[:, ["overall_evaluation_original", "overall_evaluation_gpt4"]]
    .corr(method="spearman")
)
correlation_matrix_spearman

Unnamed: 0,overall_evaluation_original,overall_evaluation_gpt4
overall_evaluation_original,1.0,0.547106
overall_evaluation_gpt4,0.547106,1.0
