Dataset Originale

In [None]:
# Import necessary libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
with open('pc_usr_data.json', 'r') as file:
    data = json.load(file)

# Extract data into a DataFrame for easier analysis
records = []
for item in data:
    context = item["context"]
    fact = item["fact"]
    for response in item["responses"]:
        overall_avg = sum(response["Overall"]) / len(response["Overall"])
        #Normalize between 1 and 3
        overall_avg = round(np.interp(overall_avg, (0, 5), (1, 3)), 2)
        # overall_avg = round(overall_avg, 2)
        record = {
            "context": context,
            # "fact": fact,
            "response": response["response"],
            "model": response["model"],
            "overall_evaluation": overall_avg,
            "evaluation1": round(np.interp(response["Overall"][0], (0, 5), (1, 3)), 2),
            "evaluation2": round(np.interp(response["Overall"][1], (0, 5), (1, 3)), 2),
            "evaluation3": round(np.interp(response["Overall"][2], (0, 5), (1, 3)), 2),
            # "evaluation1": round(response["Overall"][0], 2),
            # "evaluation2": round(response["Overall"][1], 2),
            # "evaluation3": round(response["Overall"][2], 2),
        }
        records.append(record)
df = pd.DataFrame(records)
df

Unnamed: 0,context,response,model,overall_evaluation,evaluation1,evaluation2,evaluation3
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,3.67,3,4,4
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,2.00,2,2,2
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,3.00,3,3,3
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,2.00,2,2,2
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,5.00,5,5,5
...,...,...,...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,4.33,4,5,4
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,3.00,2,4,3
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,2.33,3,2,2
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,2.33,3,2,2


Dataset con Chain of thought

In [8]:
import json
import pandas as pd
import re

# Load the JSON file
with open("evaluations_justified.json", "r") as file:
    data = json.load(file)

# Prepare a list to store extracted information
rows = []

# Iterate over each context and its responses
for context, responses in data.items():
    for response_data in responses:
        # Extract relevant fields
        response_text = response_data["response"]
        model = response_data["model"]
        evaluation_full = response_data["evaluation"]

        # Find the first numeric score anywhere in the string
        match = re.search(r"(\d+(?:\.\d+)?)", evaluation_full)
        if match:
            evaluation_score = float(match.group(1))
            # The explanation is everything after the matched number
            explanation = evaluation_full[match.end() :].strip()
        else:
            evaluation_score = None
            explanation = ""

        # Optionally remove common prefixes from the explanation
        explanation = re.sub(
            r"^(?:\*?\*?explanation:|eval(?:uation)?\s*details:)\s*",
            "",
            explanation,
            flags=re.IGNORECASE,
        )

        # Add a row to the list
        rows.append(
            {
                "context": context,
                "response": response_text,
                "model": model,
                "overall_evaluation": evaluation_score,
                "explanation": explanation,
            }
        )

# Create a DataFrame
df_responses_explained = pd.DataFrame(rows)
# Display the DataFrame
df_responses_explained

Unnamed: 0,context,response,model,overall_evaluation,explanation
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,2.0,The response is mostly appropriate and maintai...
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,1.0,**\nThe given response is unrelated to the pre...
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,2.0,The response from the artist is mostly appropr...
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,1.0,(Unsatisfactory)\n\n**Explanation:**\nThe give...
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,3.0,**\nThe overall quality of the response is exc...
...,...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,3.0,- Explanation: The response is highly relevant...
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,1.0,(Unsatisfactory)\n\n**Explanation:**\nThe prop...
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,1.0,(Unsatisfactory)\n\n- Explanation: The respons...
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,1.0,(Unsatisfactory)\n\nExplanation: The response ...


Dataset con 20 valutazioni per response

In [9]:
import json
import pandas as pd
import numpy as np


def compute_mean(scores):
    """
    Compute the mean of a list of scores, ignoring non-numeric values.
    """
    numeric_scores = []
    for score in scores:
        try:
            # Convert score to float; if this fails, the score is non-numeric and will be ignored.
            numeric_scores.append(float(score))
        except (ValueError, TypeError):
            continue
    return np.mean(numeric_scores) if numeric_scores else np.nan


# Open and load the JSON file
with open("evaluations.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Prepare a list to collect our flattened records
records = []

# Iterate over each conversation and its associated list of responses
for conversation, responses in data.items():
    for entry in responses:
        evaluations = entry.get("evaluation", [])
        mean_eval = compute_mean(evaluations)
        record = {
            "context": conversation,
            "response": entry.get("response", ""),
            "model": entry.get("model", ""),
            "overall_evaluation": mean_eval,
        }
        records.append(record)

df_responses = pd.DataFrame(records)
df_responses

Unnamed: 0,context,response,model,overall_evaluation
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,2.00
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,1.00
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,2.00
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,1.00
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,2.30
...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,3.00
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,1.25
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,1.00
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,1.00


Dataset Unificato

In [10]:
merged_df = pd.merge(df, df_responses, on=["context", "response", "model"], how="inner", suffixes=('_original', '_gpt4'))
merged_df

Unnamed: 0,context,response,model,overall_evaluation_original,evaluation1,evaluation2,evaluation3,overall_evaluation_gpt4
0,hi there how are you doing this evening ?\nhi ...,ha ha i'm so shy\n,Original Ground Truth,3.67,3,4,4,2.00
1,hi there how are you doing this evening ?\nhi ...,i know what you mean spend most nights cuddlin...,KV-MemNN,2.00,2,2,2,1.00
2,hi there how are you doing this evening ?\nhi ...,i am a little shy but i am a little shy\n,Seq2Seq,3.00,3,3,3,2.00
3,hi there how are you doing this evening ?\nhi ...,i am not . i am a student .\n,Language Model,2.00,2,2,2,1.00
4,hi there how are you doing this evening ?\nhi ...,i only feel comfortable sharing my music with ...,New Human Generated,5.00,5,5,5,2.30
...,...,...,...,...,...,...,...,...
295,"hi , i produce rap music for artists , i mostl...",we should have them meet at a cat park so they...,Original Ground Truth,4.33,4,5,4,3.00
296,"hi , i produce rap music for artists , i mostl...",my drummer has a cat named system . our fav ba...,KV-MemNN,3.00,2,4,3,1.25
297,"hi , i produce rap music for artists , i mostl...",i do not know what that means .\n,Seq2Seq,2.33,3,2,2,1.00
298,"hi , i produce rap music for artists , i mostl...",i do not know how to play the guitar .\n,Language Model,2.33,3,2,2,1.00


Correlazione di Pearson

In [11]:
correlation_matrix_pearson = merged_df[["overall_evaluation_gpt4", "evaluation1", "evaluation2", "evaluation3", "overall_evaluation_original"]].corr(method="pearson")
correlation_matrix_pearson = correlation_matrix_pearson.iloc[1:, [0]]
correlation_matrix_pearson

Unnamed: 0,overall_evaluation_gpt4
evaluation1,0.498179
evaluation2,0.526585
evaluation3,0.469865
overall_evaluation_original,0.568712


Correlazione di Spearman

In [12]:
correlation_matrix_spearman = merged_df[["overall_evaluation_gpt4", "evaluation1", "evaluation2", "evaluation3", "overall_evaluation_original"]].corr(method="spearman")
correlation_matrix_spearman = correlation_matrix_spearman.iloc[1:, [0]]
correlation_matrix_spearman

Unnamed: 0,overall_evaluation_gpt4
evaluation1,0.515271
evaluation2,0.538239
evaluation3,0.466399
overall_evaluation_original,0.582577


Valore di Cohen

In [13]:
from sklearn.metrics import cohen_kappa_score

merged_df_1 = merged_df.copy()
merged_df_1["overall_evaluation_gpt4"] = merged_df_1["overall_evaluation_gpt4"].apply(lambda x: round(x))
merged_df_1["overall_evaluation_original"] = merged_df_1["overall_evaluation_original"].apply(lambda x: round(x))

cohen_value = cohen_kappa_score(merged_df_1["overall_evaluation_gpt4"], merged_df_1["overall_evaluation_original"])
print("COHEN_VALUE : " + str(cohen_value))

COHEN_VALUE : -0.08304611436765708


Valore di Tau

In [14]:
from scipy.stats import kendalltau

tau, p_value = kendalltau(merged_df["overall_evaluation_gpt4"], merged_df["overall_evaluation_original"])
print("TAU Value: " + str(tau))
print("P-Value: " + str(p_value))

TAU Value: 0.4481028215893109
P-Value: 1.190379557410349e-24


In [5]:
from tqdm import tqdm
import json
import pandas as pd

# Load the JSON file
with open("fed_data_2.json", "r") as file:
    fed_data = json.load(file)

records = []

for dialog_id, example in enumerate(tqdm(fed_data)):
    conversation = example["context"]
    response = example.get("response")
    system = example['system']
    conversation = conversation.split("\n")
    conversation = [s.replace("User: ", "").replace("System: ", "").strip() for s in conversation]
    if response is None:
        # this is a conversation data point, not a turn data point
        continue
 
    response = response.replace("User: ", "").replace("System: ", "").strip()
    full_conversation = " ".join(conversation) + " " + response
    mean_annotation = sum(example["annotations"]["Overall"]) / len(example["annotations"]["Overall"])
    
    record = {
        "context": conversation,
        "response": response,
        "system": system,
        "conversation": full_conversation,
        "overall_evaluation": mean_annotation,
    }
    records.append(record)


df_responses_fed = pd.DataFrame(records)
df_responses_fed  
    

100%|██████████| 500/500 [00:00<00:00, 102355.02it/s]


Unnamed: 0,context,response,system,conversation,overall_evaluation
0,"[Hi!, Hi! What's up?, Nothing much, how about ...","It's probably boring, isn't it?",Meena,"Hi! Hi! What's up? Nothing much, how about you...",2.6
1,"[Hi!, Hi! What's up?, Nothing much, how about ...",What is the meeting about?,Meena,"Hi! Hi! What's up? Nothing much, how about you...",3.4
2,"[Hi!, Hi! What's up?, Nothing much, how about ...",How pleasant?,Meena,"Hi! Hi! What's up? Nothing much, how about you...",2.2
3,"[Hi!, Hey! How are you today?, good, I'm glad ...",I listen to them through iTunes.,Meena,Hi! Hey! How are you today? good I'm glad to h...,3.6
4,"[Hi!, Hey! How are you today?, good, I'm glad ...","Yeah, I just don't have the money for Spotify.",Meena,Hi! Hey! How are you today? good I'm glad to h...,3.4
...,...,...,...,...,...
370,"[Hi!, Hello. How's your week coming along?, It...",Mint chocolate chips. I think your croissants ...,Human,Hi! Hello. How's your week coming along? It's ...,4.0
371,"[Hi!, Hello. How's your week coming along?, It...",Yes. When my top favorite food looks like: coo...,Human,Hi! Hello. How's your week coming along? It's ...,4.0
372,"[Hi, Hi there., how are you this morning?, Ah ...","Yes, I have taken the form of a girl from Leeds.",Mitsuku,Hi Hi there. how are you this morning? Ah earl...,2.8
373,"[Hi, Hi there., how are you this morning?, Ah ...",Clare said he do not understand how a robot ea...,Mitsuku,Hi Hi there. how are you this morning? Ah earl...,1.8
