In [46]:
import pandas as pd
import os
import plotly.express as px
import plotly.subplots as sp
import textwrap
import ast

In [47]:
def get_json_files(folder):
    return [f for f in os.listdir(folder) if f.endswith(".json")]

In [48]:
def calculate_accuracy(df):
    df[df['chatbot_response_clean'] == df['label']].shape[0]
    return df[df['chatbot_response_clean'] == df['label']].shape[0] / df.shape[0]


In [49]:
def calculate_metrics(df):
    # Calculate true positives, false positives, false negatives
    df['true_positives'] = df.apply(lambda row: 1 if row['chatbot_response_clean'] == row['label'] and row['label'] == 1 else 0, axis=1)
    df['false_positives'] = df.apply(lambda row: 1 if row['chatbot_response_clean'] != row['label'] and row['label'] == 0 else 0, axis=1)
    df['false_negatives'] = df.apply(lambda row: 1 if row['chatbot_response_clean'] != row['label'] and row['label'] == 1 else 0, axis=1)
    # Calculate precision and recall
    precision = df['true_positives'].sum() / (df['true_positives'].sum() + df['false_positives'].sum())
    recall = df['true_positives'].sum() / (df['true_positives'].sum() + df['false_negatives'].sum())
    # Calculate F1 score
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    return precision, recall, f1

In [50]:
def calculate_results(files, folder, model_name):
    results_cg = []

    for file in files:
        df = pd.read_json(os.path.join(folder, file))
        precision, recall, f1 = calculate_metrics(df)
        accuracy = calculate_accuracy(df)
        # get the chatbot_question of the 0th row
        question = df.iloc[0]['chatbot_question']
        results_cg.append({
            "model": model_name,
            "question": question,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1
        })
    return results_cg

In [51]:
def print_promt(index, row, model):
    if model == "CHAT-GPT":
        question_text = ""
        
        for message in eval(row['question_label']):
            question_text += message.get("content", "")
    else:
        question_text = row['question_label']

        
    wrapped_text = textwrap.fill(question_text, width=120, subsequent_indent='    ')  # Wrap text at a maximum width of 60 characters
    print(f"Prompt for index {index}:\n    {wrapped_text}\n")

In [52]:
def create_chart(results):
    # plot the results
    df = pd.DataFrame(results)
    # Convert the 'question' column to a string representation for display
    df['question_label'] = df['question'].apply(str)
    # Convert the index to a string representation for display
    df['index_label'] = df.index.astype(str)

    # Create separate bar charts for accuracy and F1 score
    fig_acc = px.bar(df, x='index_label', y='accuracy', labels={'index_label': 'Index', 'accuracy': 'Accuracy'})
    fig_prec = px.bar(df, x='index_label', y='precision', labels={'index_label': 'Index', 'precision': 'Precision'})
    fig_recall = px.bar(df, x='index_label', y='recall', labels={'index_label': 'Index', 'recall': 'Recall'})
    fig_f1 = px.bar(df, x='index_label', y='f1', labels={'index_label': 'Index', 'f1': 'F1 Score'})

    # Combine the two charts into a single plot
    fig = sp.make_subplots(rows=2, cols=2, subplot_titles=(f'Precision {results[0].get("model")}', f'Recall {results[0].get("model")}', f'Accuracy {results[0].get("model")}', f'F1 Score {results[0].get("model")}'))
    fig.add_trace(fig_acc.data[0], row=2, col=1)
    fig.add_trace(fig_prec.data[0], row=1, col=1)
    fig.add_trace(fig_recall.data[0], row=1, col=2)
    fig.add_trace(fig_f1.data[0], row=2, col=2)



    # Customize the chart
    fig.update_layout(
        title=f'Results {results[0].get("model")}',
        xaxis_tickangle=-45,
        yaxis=dict(range=[0, 1]),  # Set the range for the first y-axis
        yaxis2=dict(range=[0, 1]),  # Set the range for the second y-axis
        yaxis3=dict(range=[0, 1]),  # Set the range for the third y-axis
        yaxis4=dict(range=[0, 1]),  # Set the range for the fourth y-axis
    )

    # Show the chart
    fig.show()

    # Print the question and their index
    for index, row in df.iterrows():
        print_promt(index, row, results[0].get("model"))

In [53]:
def create_models_chart(results, title):
    # plot the results
    df = pd.DataFrame(results)
    # Convert the 'question' column to a string representation for display
    df['question_label'] = df['question'].apply(lambda x: str(x))
    # Convert the index to a string representation for display
    df['index_label'] = df.index.astype(str)

    # Create separate bar charts for accuracy and F1 score
    fig_acc = px.bar(df, x='model', y='accuracy', labels={'model': 'Index', 'accuracy': 'Accuracy'})
    fig_prec = px.bar(df, x='model', y='precision', labels={'index_label': 'Index', 'precision': 'Precision'})
    fig_recall = px.bar(df, x='model', y='recall', labels={'index_label': 'Index', 'recall': 'Recall'})
    fig_f1 = px.bar(df, x='model', y='f1', labels={'index_label': 'Index', 'f1': 'F1 Score'})

    # Combine the two charts into a single plot
    fig = sp.make_subplots(rows=2, cols=2, subplot_titles=(f'Precision', f'Recall', f'Accuracy', f'F1 Score'))
    fig.add_trace(fig_acc.data[0], row=2, col=1)
    fig.add_trace(fig_prec.data[0], row=1, col=1)
    fig.add_trace(fig_recall.data[0], row=1, col=2)
    fig.add_trace(fig_f1.data[0], row=2, col=2)



    # Customize the chart
    fig.update_layout(
        title=title,
        xaxis_tickangle=-45,
        yaxis=dict(range=[0, 1]),  # Set the range for the first y-axis
        yaxis2=dict(range=[0, 1]),  # Set the range for the second y-axis
        yaxis3=dict(range=[0, 1]),  # Set the range for the third y-axis
        yaxis4=dict(range=[0, 1]),  # Set the range for the fourth y-axis
    )

    # Show the chart
    fig.show()

    # Print the question and their index
    for index, row in df.iterrows():
        print_promt(index, row, results[0].get("model"))

## Analyze ChatGPT3.5

### Random Sample v2

In [54]:
folder = "data/results/chat_gpt/new_random_sample"
files = get_json_files(folder)


results_cg = calculate_results(files, folder, model_name="CHAT-GPT")
create_chart(results_cg)

Prompt for index 0:
    Do the following two product decriptions match. Answer with yes or no. product 1: Maxxis Maxxis Minion DHR2 29 x 2.3
    Folding Bead, product 2: Maxxis Minion DHR II 3C MaxxTerra/DD TR 29\" Tire - 29 x 2.3\" (Folding Bead)



### Normal benchmark

In [55]:
folder = "data/results/chat_gpt/medium"
files = get_json_files(folder)


results_cg = calculate_results(files, folder, model_name="CHAT-GPT")
create_chart(results_cg)

Prompt for index 0:
    You are taking part in a benchmark that tries to measure your performance at entity matching. You should determine if
    two products are the exact same. Two products are only the same if all attributes match! Only answer yes or
    no.product 1: Pen Drive SanDisk 64GB Cruzer Glide USB 2.0 it costs: 9,86 EUR, product 2: SanDisk 64GB
    SDIX30N-064G-GN6NN USB 3.0 Flash Drive 35.85 USD

Prompt for index 1:
    You are taking part in a benchmark that tries to measure your performance at entity matching. You should determine if
    two products are the exact same. Two products are only the same if all attributes match! Only answer yes or
    no.product 1: Pen Drive SanDisk 64GB Cruzer Glide USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash
    Drive

Prompt for index 2:
    You are a helpful assistant that tries to tell if two products are the same. Two products are only the same if all
    attributes match! Only answer yes or no.product 1: Pen Dri

### Hard benchmark

In [56]:
folder = "data/results/chat_gpt/hard/"
files = get_json_files(folder)

# exclude 
files = [f for f in files if f != "2023-04-24 16:08:16.804316_simple_promt.json"]

results_cg = calculate_results(files, folder, model_name="CHAT-GPT")
create_chart(results_cg)




Prompt for index 0:
    You are a helpful assistant that tries to tell if two products are the same. Two products are only the same if all
    attributes match! Only answer yes or no.product 1: Brother HL-L6200DW Wireless High Speed Mono Laser 2-sided
    printing 250 sheet Network 46PPM, product 2: Brother HL-L6300DW Laser Printer - Monochrome - DuplexThe twoPlease
    answer yes or no otherwise I will not be able to understand you.product 1: Brother HL-L6200DW Wireless High Speed
    Mono Laser 2-sided printing 250 sheet Network 46PPM, product 2: Brother HL-L6300DW Laser Printer - Monochrome -
    DuplexNo.Very good, thank you!product 1: Corsair Vengeance LPX RAM 16GB 2400MHz DDR4 UDIMM 288 Pin Desktop Memory,
    product 2: 16GB Corsair Vengeance LPX schwarz DDR4-2400 DIMM CL14 SingleNo.Thats incorrect, they are the
    same!product 1: TAG Heuer Monaco Chronograph Calibre 11 Automatic, product 2: TAG Heuer Men's Special Edition Heuer
    Monaco Watch

Prompt for index 1:
    Are the

### Whole val split

In [57]:
folder = "data/results/chat_gpt/"
files = get_json_files(folder)

# exclude 
files = [f for f in files if f != "2023-04-24 16:08:16.804316_simple_promt.json"]

results_cg = calculate_results(files, folder, model_name="CHAT-GPT")
create_chart(results_cg)




Prompt for index 0:
    Do the following two product decriptions match. Answer with yes or no. product 1: Corsair Carbide 275R Tempered Glass
    Blanca, product 2: Logitech C310 Webcam - 1 Megapixel - USB 2.0



## Analyze ChatGPT4

In [58]:
folder = "data/results/chat_gpt_4"
files = get_json_files(folder)

# exclude 
files = [f for f in files if f != "2023-04-24 16:08:16.804316_simple_promt.json"]

results_cg = calculate_results(files, folder, model_name="CHAT-GPT4")
create_chart(results_cg)




Prompt for index 0:
    Do the following two product decriptions match. Answer with yes or no. product 1: Pen Drive SanDisk 64GB Cruzer Glide
    USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive



## Analyse GPT4All

### Random sample 1000

In [59]:
folder = "data/results/chat_gpt/random_sample_1000"
files = get_json_files(folder)

results_cg = calculate_results(files, folder, model_name="GPT4All")
create_chart(results_cg)


Prompt for index 0:
    [{'role': 'system', 'content': 'Do the following two product decriptions match. Answer with yes or no.'}, {'role':
    'user', 'content': ' product 1: POKEMON POKEMON SWSH2 REBEL CLASH 3PK BLISTER - Rayquaza, product 2: Pokemon S&S2:
    Rebel Clash BD'}]



### Random smaple V2

In [60]:
folder = "data/results/gpt4all/new_random_sampel"
files = get_json_files(folder)

results_cg = calculate_results(files, folder, model_name="GPT4All")
create_chart(results_cg)


Prompt for index 0:
    Do the following two product decriptions match. Your answer must include yes or no. product 1: Maxxis Maxxis Minion DHR2
    29 x 2.3 Folding Bead, product 2: Maxxis Minion DHR II 3C MaxxTerra/DD TR 29\" Tire - 29 x 2.3\" (Folding Bead)

Prompt for index 1:
    Do the following two product decriptions match. Answer with yes or no. product 1: Maxxis Maxxis Minion DHR2 29 x 2.3
    Folding Bead, product 2: Maxxis Minion DHR II 3C MaxxTerra/DD TR 29\" Tire - 29 x 2.3\" (Folding Bead)



### Medium Difficulty

In [61]:
folder = "data/results/gpt4all/curated"
files = get_json_files(folder)

results_cg = calculate_results(files, folder, model_name="GPT4All")
create_chart(results_cg)


Prompt for index 0:
    Do the following two product decriptions match. Answer with yes or no. Otherwise I can understand you. product 1: Pen
    Drive SanDisk 64GB Cruzer Glide USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive

Prompt for index 1:
    You are a helpful assistant that tries to tell if two products are the same. Two products are only the same if all
    attributes match! Only answer yes or no. product 1: Pen Drive SanDisk 64GB Cruzer Glide USB 2.0, product 2: SanDisk
    64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive

Prompt for index 2:
    Do the following two product decriptions match. Your answer must include yes or no. product 1: Pen Drive SanDisk 64GB
    Cruzer Glide USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive

Prompt for index 3:
    Do the following two product decriptions match. Answer with yes or no. product 1: Pen Drive SanDisk 64GB Cruzer Glide
    USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Fla

### Other runs

In [62]:
folder = "data/results/gpt4all"
files = get_json_files(folder)

results_cg = calculate_results(files, folder, model_name="GPT4All")
create_chart(results_cg)


Prompt for index 0:
    Are these two products the same? product 1: TAG Heuer Monaco Chronograph Calibre 11 Automatic, product 2: TAG Heuer
    Men's Special Edition Heuer Monaco Watch

Prompt for index 1:
    Are these two products the same? product 1: Brother HL-L6200DW Wireless High Speed Mono Laser 2-sided printing 250 sheet
    Network 46PPM, product 2: Brother HL-L6300DW Laser Printer - Monochrome - Duplex



## Analyze Aleph Alpha

In [63]:
folder = "data/results/aleph_alpha"
files = get_json_files(folder)

results_cg = calculate_results(files, folder, model_name="Aleph Alpha")
create_chart(results_cg)



Prompt for index 0:
    Are these two products the same? product 1: TAG Heuer Monaco Chronograph Calibre 11 Automatic, product 2: TAG Heuer
    Men's Special Edition Heuer Monaco Watch

Prompt for index 1:
    Do the following two product decriptions match. Answer with yes or no. product 1: TAG Heuer Monaco Chronograph Calibre
    11 Automatic, product 2: TAG Heuer Men's Special Edition Heuer Monaco Watch

Prompt for index 2:
    Do the following two product decriptions match. Answer with yes or no. product 1: Pen Drive SanDisk 64GB Cruzer Glide
    USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive



## Comparing the 3 Models on hard benchmark

In [64]:
files = [{
    "name": "gpt4all",
    "file": "data/results/gpt4all/2023-04-24 16:27:14.136349_simple_promt.json"
},
    {
    "name": "CHAT-GPT",
    "file": "data/results/chat_gpt/hard/2023-04-24 17:20:03.458940_simple_promt.json"},
    {
    "name": "aleph_alpha",
    "file": "data/results/aleph_alpha/2023-04-24 22:42:34.718717_simple_promt.json"}
]

results = []

for file in files:
    df = pd.read_json(file["file"])
    accuracy = calculate_accuracy(df)
    # get the chatbot_question of the 0th row
    question = df.iloc[0]['chatbot_question']
    precision, recall, f1 = calculate_metrics(df)
    results.append({
        "model": file["name"],
        "question": question,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

# plot the results
create_models_chart(results, "Comparison of Models on hard benchmark")

Prompt for index 0:
    Are these two products the same? product 1: TAG Heuer Monaco Chronograph Calibre 11 Automatic, product 2: TAG Heuer
    Men's Special Edition Heuer Monaco Watch

Prompt for index 1:
    [{'role': 'system', 'content': 'Are these two products the same?'}, {'role': 'user', 'content': "product 1: TAG Heuer
    Monaco Chronograph Calibre 11 Automatic, product 2: TAG Heuer Men's Special Edition Heuer Monaco Watch"}]

Prompt for index 2:
    Are these two products the same? product 1: TAG Heuer Monaco Chronograph Calibre 11 Automatic, product 2: TAG Heuer
    Men's Special Edition Heuer Monaco Watch



## Model comparison on random sample

In [65]:
files = [{
    "name": "gpt4all",
    "file": "data/results/gpt4all/new_random_sampel/2023-05-10 11:45:25.579616_simple_promt.json"
},
    {
    "name": "CHAT-GPT",
    "file": "data/results/chat_gpt/new_random_sample/2023-05-08 11:18:53.559359_simple_promt.json"},
    {
    "name": "aleph_alpha",
    "file": "data/results/aleph_alpha/random_sample_v2/2023-05-09 10:03:42.833719_simple_promt.json"}
]

results = []

for file in files:
    df = pd.read_json(file["file"])
    accuracy = calculate_accuracy(df)
    # get the chatbot_question of the 0th row
    question = df.iloc[0]['chatbot_question']
    precision, recall, f1 = calculate_metrics(df)
    results.append({
        "model": file["name"],
        "question": question,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

# plot the results
create_models_chart(results, "Comparison of Models on random benchmark")

Prompt for index 0:
    Do the following two product decriptions match. Answer with yes or no. product 1: Maxxis Maxxis Minion DHR2 29 x 2.3
    Folding Bead, product 2: Maxxis Minion DHR II 3C MaxxTerra/DD TR 29\" Tire - 29 x 2.3\" (Folding Bead)

Prompt for index 1:
    [{'role': 'system', 'content': 'Do the following two product decriptions match. Answer with yes or no.'}, {'role':
    'user', 'content': ' product 1: Maxxis Maxxis Minion DHR2 29 x 2.3 Folding Bead, product 2: Maxxis Minion DHR II 3C
    MaxxTerra/DD TR 29\\" Tire - 29 x 2.3\\" (Folding Bead)'}]

Prompt for index 2:
    Do the following two product decriptions match. Answer with yes or no. product 1: Maxxis Maxxis Minion DHR2 29 x 2.3
    Folding Bead, product 2: Maxxis Minion DHR II 3C MaxxTerra/DD TR 29\" Tire - 29 x 2.3\" (Folding Bead)



### Model comparison on Medium

In [66]:
files = [{
    "name": "gpt4all",
    "file": "data/results/gpt4all/curated/2023-05-09 17:00:49.040867_simple_promt.json"
},
    {
    "name": "CHAT-GPT",
    "file": "data/results/chat_gpt/medium/2023-04-27 16:59:49.307413_simple_promt.json"},
    {
    "name": "aleph_alpha",
    "file": "data/results/aleph_alpha/2023-05-09 11:40:59.377552_simple_promt_medium.json"}
]

results = []

for file in files:
    df = pd.read_json(file["file"])
    accuracy = calculate_accuracy(df)
    # get the chatbot_question of the 0th row
    question = df.iloc[0]['chatbot_question']
    precision, recall, f1 = calculate_metrics(df)
    results.append({
        "model": file["name"],
        "question": question,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

# plot the results
create_models_chart(results, "Comparison of Models on medium benchmark")

Prompt for index 0:
    Do the following two product decriptions match. Answer with yes or no. product 1: Pen Drive SanDisk 64GB Cruzer Glide
    USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive

Prompt for index 1:
    [{'role': 'system', 'content': 'Do the following two product decriptions match. Answer with yes or no.'}, {'role':
    'user', 'content': ' product 1: Pen Drive SanDisk 64GB Cruzer Glide USB 2.0, product 2: SanDisk 64GB
    SDIX30N-064G-GN6NN USB 3.0 Flash Drive'}]

Prompt for index 2:
    Do the following two product decriptions match. Answer with yes or no. product 1: Pen Drive SanDisk 64GB Cruzer Glide
    USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive



### Model comparison on hard benchmark

In [67]:
files = [{
    "name": "gpt4all",
    "file": "data/results/gpt4all/hard/2023-05-09 19:33:42.255141_simple_promt.json"
},
    {
    "name": "CHAT-GPT",
    "file": "data/results/chat_gpt/hard/2023-05-09 11:02:42.363451_simple_promt_hard.json"},
    {
    "name": "aleph_alpha",
    "file": "data/results/aleph_alpha/2023-05-09 16:09:07.988052_simple_promt_hard.json"}
]

results = []

for file in files:
    df = pd.read_json(file["file"])
    accuracy = calculate_accuracy(df)
    # get the chatbot_question of the 0th row
    question = df.iloc[0]['chatbot_question']
    precision, recall, f1 = calculate_metrics(df)
    results.append({
        "model": file["name"],
        "question": question,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

# plot the results
create_models_chart(results, "Comparison of Models on hard benchmark")

Prompt for index 0:
    Do the following two product decriptions match. Answer with yes or no. product 1: TAG Heuer Monaco Chronograph Calibre
    11 Automatic, product 2: TAG Heuer Men's Special Edition Heuer Monaco Watch

Prompt for index 1:
    [{'role': 'system', 'content': 'Do the following two product decriptions match. Answer with yes or no.'}, {'role':
    'user', 'content': " product 1: TAG Heuer Monaco Chronograph Calibre 11 Automatic, product 2: TAG Heuer Men's
    Special Edition Heuer Monaco Watch"}]

Prompt for index 2:
    Do the following two product decriptions match. Answer with yes or no. product 1: TAG Heuer Monaco Chronograph Calibre
    11 Automatic, product 2: TAG Heuer Men's Special Edition Heuer Monaco Watch



## Comparging gpt4all models

In [68]:
files = [{
    "name": "gpt4allv1",
    "file": "data/results/gpt4all/new_random_sampel/2023-05-08 16:09:45.331542_simple_promt.json"
},
    {
    "name": "gpt4allv2",
    "file": "data/results/gpt4all/new_model_random_sampel/2023-05-09 12:22:52.561259_simple_promt.json"},
]

results = []

for file in files:
    df = pd.read_json(file["file"])
    accuracy = calculate_accuracy(df)
    # get the chatbot_question of the 0th row
    question = df.iloc[0]['chatbot_question']
    precision, recall, f1 = calculate_metrics(df)
    results.append({
        "model": file["name"],
        "question": question,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

# plot the results
create_models_chart(results, "Comparison of Models on random benchmark")

Prompt for index 0:
    Do the following two product decriptions match. Your answer must include yes or no. product 1: Maxxis Maxxis Minion DHR2
    29 x 2.3 Folding Bead, product 2: Maxxis Minion DHR II 3C MaxxTerra/DD TR 29\" Tire - 29 x 2.3\" (Folding Bead)

Prompt for index 1:
    Do the following two product decriptions match. Your answer must include yes or no. product 1: Maxxis Maxxis Minion DHR2
    29 x 2.3 Folding Bead, product 2: Maxxis Minion DHR II 3C MaxxTerra/DD TR 29\" Tire - 29 x 2.3\" (Folding Bead)



In [69]:
df = pd.read_json("data/results/aleph_alpha/2023-05-09 16:09:07.988052_simple_promt_hard.json")
# show all -1 chatbot_answers_clean
df[df['chatbot_response_clean'] == -1]

Unnamed: 0.1,Unnamed: 0,id_left,brand_left,title_left,description_left,price_left,priceCurrency_left,specTableContent_left,cluster_id_left,id_right,...,label,is_hard_negative,roberta-base_logits,roberta-base_prediction,rsupcon-base_logits,rsupcon-base_prediction,chatbot_response,chatbot_response_raw,chatbot_question,chatbot_response_clean
0,1840,98621092,,TAG Heuer Monaco Chronograph Calibre 11 Automatic,Design inspired by the watch worn by Steve McQ...,"£4,195.00",,,556904,2004031,...,1,False,"[3.7184028625, -4.1772656441]",0,[1.0],1,", Calibre 11 Automatic.","{'completions': [{'completion': ', Calibre 11 ...",Do the following two product decriptions match...,-1
1,710,47887901,,Apple AirPods (2nd Generation) Bluetooth Earbu...,Staples.com: Apple AirPods MRXJ2AM/A In the Ea...,199,USD,,1994510,49097388,...,0,True,"[-1.7439045906000001, 1.717010498]",1,[0.0],0,\n\nA:\n\n,"{'completions': [{'completion': ' A: ', 'fin...",Do the following two product decriptions match...,-1
3,3869,435472,,Evans EMAD Clear Bass Drum Head 20 Inch,"20"" drum head made using a single ply of 10mil...",46.06,USD,,800940,55131100,...,1,False,"[3.2268047333, -3.6520571709]",0,[1.0],1,Drum Head 20 Inch.\n,{'completions': [{'completion': ' Drum Head 20...,Do the following two product decriptions match...,-1
4,1787,88641225,,"8GB (2x4GB) HyperX Fury Blue DDR3, 1600MHz, CL...","8GB (2x4GB) HyperX Fury Blue DDR3, 1600MHz, CL...",74.99,CAD,,288081,86250508,...,0,True,"[-2.2421159744, 2.2641017437]",1,[0.0],0,- 1.5V -,"{'completions': [{'completion': ' - 1.5V -', '...",Do the following two product decriptions match...,-1
5,2038,13519118,,"3M - Privacy Filter 19\""\"" WideS","3M Privacy Filter 19\""\"" WideS (PF319W) - Type...",2020.00,DKK,,3303908,1533183,...,0,True,"[-4.3640899658, 4.6543402672]",1,[0.0],0,.\n\nA:\n,"{'completions': [{'completion': '. A: ', 'fin...",Do the following two product decriptions match...,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,3392,51060838,,Monochrome Ribbon Zebra ZXP Series 1 Load-N-Go...,Monochrome Ribbon Black monochrome ribbon Clea...,0.00,USD,,1542544,21556335,...,0,True,"[-4.1408891678, 4.4325938225]",1,[0.0],0,.\n\nA:\n,"{'completions': [{'completion': '. A: ', 'fin...",Do the following two product decriptions match...,-1
95,2820,31581824,SHIMANO,SPD-SL SH 11 - tacchette,The Shimano SM-SH11 cleats serve as a connecti...,15.33,EUR,,532781,75286119,...,0,True,"[-4.3649616241, 4.6570587158]",1,[0.0],0,",CN-HG901-","{'completions': [{'completion': ',CN-HG901-', ...",Do the following two product decriptions match...,-1
96,4223,6002122,,LOGITECH - STEREO HEADSET H151 ANALOG - EMEAIN...,LOGITECH STEREO HEADSET H151 ANALOG - EMEAIN A...,,,,122031,31273059,...,0,True,"[-4.3757534027, 4.6587324142]",1,[0.0],0,- EMEAIN AC,{'completions': [{'completion': ' - EMEAIN AC'...,Do the following two product decriptions match...,-1
97,2073,393407,Audemars Piguet,Audemars Piguet Royal Oak Chronograph,Audemars Piguet Royal Oak,36560,USD,,607611,98604797,...,0,True,"[-4.1834125519, 4.471534729]",1,[0.0],0,.\n\nA:\n,"{'completions': [{'completion': '. A: ', 'fin...",Do the following two product decriptions match...,-1


### Medium Benchmark with GPT4

In [70]:
files = [{
    "name": "gpt4all",
    "file": "data/results/gpt4all/curated/2023-05-09 17:00:49.040867_simple_promt.json"
},
    {
    "name": "CHAT-GPT3.5",
    "file": "data/results/chat_gpt/medium/2023-04-27 16:59:49.307413_simple_promt.json"},
    {
    "name": "CHAT-GPT4",
    "file": "data/results/chat_gpt_4/Gpt4.json"
    },
    {
    "name": "aleph_alpha",
    "file": "data/results/aleph_alpha/2023-05-09 11:40:59.377552_simple_promt_medium.json"},
]

results = []

for file in files:
    df = pd.read_json(file["file"])
    accuracy = calculate_accuracy(df)
    # get the chatbot_question of the 0th row
    question = df.iloc[0]['chatbot_question']
    precision, recall, f1 = calculate_metrics(df)
    results.append({
        "model": file["name"],
        "question": question,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

# plot the results
create_models_chart(results, "Comparison of Models on medium benchmark")

Prompt for index 0:
    Do the following two product decriptions match. Answer with yes or no. product 1: Pen Drive SanDisk 64GB Cruzer Glide
    USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive

Prompt for index 1:
    [{'role': 'system', 'content': 'Do the following two product decriptions match. Answer with yes or no.'}, {'role':
    'user', 'content': ' product 1: Pen Drive SanDisk 64GB Cruzer Glide USB 2.0, product 2: SanDisk 64GB
    SDIX30N-064G-GN6NN USB 3.0 Flash Drive'}]

Prompt for index 2:
    Do the following two product decriptions match. Answer with yes or no. product 1: Pen Drive SanDisk 64GB Cruzer Glide
    USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive

Prompt for index 3:
    Do the following two product decriptions match. Answer with yes or no. product 1: Pen Drive SanDisk 64GB Cruzer Glide
    USB 2.0, product 2: SanDisk 64GB SDIX30N-064G-GN6NN USB 3.0 Flash Drive



### Full Dataset gpt3.5