In [4]:
# Import necessary libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
# JSON text is UTF-8 by specification, so the encoding is named explicitly
# rather than inherited from the platform's locale default (which differs
# between machines and can mis-decode or reject non-ASCII characters).
with open('pc_usr_data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Extract data into a DataFrame for easier analysis

# Endpoints of the linear rescaling applied to every "Overall" rating:
# a raw value in RAW_SCALE is mapped onto TARGET_SCALE.
# NOTE(review): RAW_SCALE starts at 0, so a raw rating of 1 maps to 1.4, not 1.
# If the raw ratings actually start at 1, (1, 5) may be the intended source
# range -- confirm before changing, because every downstream number shifts.
RAW_SCALE = (0, 5)
TARGET_SCALE = (1, 3)


def rescale_rating(raw_score):
    """Map a raw score linearly from RAW_SCALE onto TARGET_SCALE.

    The result is rounded to two decimals. np.interp clamps inputs that fall
    outside RAW_SCALE to the nearest TARGET_SCALE endpoint.
    """
    return round(np.interp(raw_score, RAW_SCALE, TARGET_SCALE), 2)


# One record per (context, response) pair found in the loaded JSON.
records = []
for item in data:
    context = item["context"]
    for response in item["responses"]:
        ratings = response["Overall"]
        record = {
            "context": context,
            "response": response["response"],
            "model": response["model"],
            # Mean of the raw ratings, rescaled afterwards. An empty ratings
            # list raises ZeroDivisionError here.
            "overall_evaluation": rescale_rating(sum(ratings) / len(ratings)),
        }
        # One "evaluationN" column per entry of the "Overall" list, numbered
        # from 1. With three entries this yields evaluation1..evaluation3;
        # rows with fewer entries get NaN in the missing columns once the
        # DataFrame is built, instead of failing with an IndexError.
        for position, rating in enumerate(ratings, start=1):
            record["evaluation" + str(position)] = rescale_rating(rating)
        records.append(record)

df = pd.DataFrame(records)
df

Unnamed: 0,context,response,model,overall_evaluation,evaluation1,evaluation2,evaluation3
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,2.47,2.2,2.6,2.6
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,1.80,1.8,1.8,1.8
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,2.20,2.2,2.2,2.2
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,1.80,1.8,1.8,1.8
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,3.00,3.0,3.0,3.0
...,...,...,...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,2.73,2.6,3.0,2.6
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,2.20,1.8,2.6,2.2
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,1.93,2.2,1.8,1.8
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,1.93,2.2,1.8,1.8


In [5]:
import json
import pandas as pd
import re

# Load the JSON file
# JSON text is UTF-8 by specification, so the encoding is named explicitly
# rather than inherited from the platform's locale default.
with open("evaluations.json", "r", encoding="utf-8") as file:
    data = json.load(file)

# Prepare a list to store extracted information
rows = []

# Matches the first decimal or integer number in a string (e.g. "2.5" or "3").
# Compiled once here instead of on every loop iteration.
SCORE_PATTERN = re.compile(r"\d+\.\d+|\d+")

# Iterate over each context and its responses
for context, responses in data.items():
    for response_data in responses:
        # Extract relevant fields
        response_text = response_data["response"]
        model = response_data["model"]
        evaluation_full = response_data["evaluation"]

        # The score is taken to be the first number in the evaluation text.
        # NOTE(review): this assumes no other digits precede the score in the
        # text -- confirm against the format of the "evaluation" field.
        match = SCORE_PATTERN.search(evaluation_full)
        if match is None:
            # Fail with the offending record instead of an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(
                "No numeric score found in evaluation for model {!r}, "
                "response {!r}: {!r}".format(model, response_text, evaluation_full)
            )
        evaluation_score = float(match.group())

        # Add a row to the list
        rows.append(
            {
                "context": context,
                "response": response_text,
                "model": model,
                "overall_evaluation": evaluation_score,
            }
        )

# Create a DataFrame
df_responses = pd.DataFrame(rows)
# Display the DataFrame
df_responses


Unnamed: 0,context,response,model,overall_evaluation
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,2.0
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,1.0
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,2.0
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,1.0
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,3.0
...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,3.0
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,1.0
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,1.0
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,1.0


In [6]:
# Inner-join the two score tables on the columns they share. Both frames carry
# an "overall_evaluation" column, so the suffixes tell them apart:
# "_original" for the left frame (df) and "_gpt4" for the right (df_responses).
# Rows whose key triple is missing from either frame are dropped by the inner join.
join_keys = ["context", "response", "model"]
merged_df = pd.merge(
    left=df,
    right=df_responses,
    how="inner",
    on=join_keys,
    suffixes=("_original", "_gpt4"),
)
merged_df

Unnamed: 0,context,response,model,overall_evaluation_original,evaluation1,evaluation2,evaluation3,overall_evaluation_gpt4
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,2.47,2.2,2.6,2.6,2.0
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,1.80,1.8,1.8,1.8,1.0
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,2.20,2.2,2.2,2.2,2.0
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,1.80,1.8,1.8,1.8,1.0
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,3.00,3.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,2.73,2.6,3.0,2.6,3.0
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,2.20,1.8,2.6,2.2,1.0
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,1.93,2.2,1.8,1.8,1.0
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,1.93,2.2,1.8,1.8,1.0


In [25]:
# Pearson correlation between the GPT-4 score and each "original" score column.
# The full 5x5 matrix is computed, then only the GPT-4 column is kept, with the
# GPT-4 row itself (the trivial self-correlation) left out.
gpt4_column = "overall_evaluation_gpt4"
original_columns = ["evaluation1", "evaluation2", "evaluation3", "overall_evaluation_original"]
correlation_matrix_pearson = (
    merged_df[[gpt4_column] + original_columns]
    .corr(method="pearson")
    .loc[original_columns, [gpt4_column]]
)
correlation_matrix_pearson

Unnamed: 0,overall_evaluation_gpt4
evaluation1,0.484899
evaluation2,0.517789
evaluation3,0.449234
overall_evaluation_original,0.552852


In [24]:
# Spearman rank correlation between the GPT-4 score and each "original" score
# column. The full 5x5 matrix is computed, then only the GPT-4 column is kept,
# with the GPT-4 row itself (the trivial self-correlation) left out.
gpt4_column = "overall_evaluation_gpt4"
original_columns = ["evaluation1", "evaluation2", "evaluation3", "overall_evaluation_original"]
correlation_matrix_spearman = (
    merged_df[[gpt4_column] + original_columns]
    .corr(method="spearman")
    .loc[original_columns, [gpt4_column]]
)
correlation_matrix_spearman

Unnamed: 0,overall_evaluation_gpt4
evaluation1,0.478191
evaluation2,0.510401
evaluation3,0.437072
overall_evaluation_original,0.547106


In [16]:
from sklearn.metrics import cohen_kappa_score

# Round both score columns to whole numbers on a copy, so merged_df keeps its
# unrounded values, then measure agreement between the two rounded columns.
merged_df_1 = merged_df.copy()
for score_column in ("overall_evaluation_gpt4", "overall_evaluation_original"):
    merged_df_1[score_column] = merged_df_1[score_column].apply(round)

cohen_value = cohen_kappa_score(
    merged_df_1["overall_evaluation_gpt4"],
    merged_df_1["overall_evaluation_original"],
)
print("COHEN_VALUE : ", cohen_value, sep="")

COHEN_VALUE : 0.12595901718947267


In [17]:
from scipy.stats import kendalltau

# Kendall's tau between the GPT-4 score and the averaged "original" score,
# using the unrounded columns of merged_df.
gpt4_scores = merged_df["overall_evaluation_gpt4"]
original_scores = merged_df["overall_evaluation_original"]
tau, p_value = kendalltau(gpt4_scores, original_scores)

print("TAU Value: ", tau, sep="")
print("P-Value: ", p_value, sep="")

TAU Value: 0.44593338154183865
P-Value: 9.053601299113695e-22
