In [None]:
# Get info of One Model evals (without fine-tuned G-Retriever)

import os
import json

# Directory with the per-run evaluation files of the single-model experiments.
path = "Model Outpus & Evaluation/One Model Evaluation/"

# Maps every evaluation file name to its parsed records (one JSON object per line).
files_to_data = dict()

for f_name in sorted(os.listdir(path)):
    # Only "*.jsonl" files are evaluation results. Anything else that may sit in
    # the directory (e.g. .DS_Store, .ipynb_checkpoints) would fail to open or parse.
    if not f_name.endswith(".jsonl"):
        continue

    with open(os.path.join(path, f_name), 'r') as f:
        # Blank lines (such as a trailing newline at the end of the file) are skipped.
        files_to_data[f_name] = [json.loads(line) for line in f if line.strip()]

# Every file name ends in "<run>.jsonl" with a one-character run number, so dropping
# the last 7 characters leaves the experiment prefix shared by all of its runs.
# sorted() keeps the order reproducible: the iteration order of a set of strings
# changes between kernel restarts, and this order later decides the column order
# of the per-model DataFrames.
unique_files = sorted({f_name[:-7] for f_name in files_to_data})

In [None]:
import pandas as pd
import re

# One frame per model family. Each column holds the per-question scores of one
# experiment and is named after the first number found in the experiment prefix.
rag_df, g_retr_df, ft_df = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

f_name_to_scores = {}

for f_name in unique_files:
    # The first run defines how many questions are read; runs are numbered 1..3.
    n_questions = len(files_to_data[f_name + '1.jsonl'])
    run_names = [f"{f_name}{run}.jsonl" for run in range(1, 4)]

    # Per question: mean of the integer scores of the three runs.
    scores_list = [
        sum(int(files_to_data[run_name][q_id]['score']) for run_name in run_names) / 3
        for q_id in range(n_questions)
    ]
    f_name_to_scores[f_name] = scores_list

    count = re.findall(r"\d+", f_name)[0]

    # "FT" is checked before "RAG"; every remaining prefix goes to g_retr_df.
    target_df = ft_df if "FT" in f_name else (rag_df if "RAG" in f_name else g_retr_df)
    target_df[str(count)] = scores_list

In [None]:
# Question category of every row, in row order.
# NOTE(review): the block sizes are hard-coded (325 rows for RAG / G-Retriever,
# 130 rows for the fine-tuned model) -- confirm they match the evaluation files.
# A mismatch with the frame length raises ValueError on assignment.
full_categories = ['syntax'] * 140 + ['dependencies'] * 135 + ['meta'] * 50
ft_categories = ['syntax'] * 56 + ['dependencies'] * 54 + ['meta'] * 20

# Score columns are named with digit strings; order them numerically.
# Filtering on isdigit() keeps this cell re-runnable: on a second run the frames
# already carry the 'category' / 'type' columns, which int() cannot parse.
sorted_vals = sorted((col for col in g_retr_df.columns if str(col).isdigit()), key=int)

# rag_df is aligned to the G-Retriever columns so both frames share one layout.
rag_df = rag_df.reindex(sorted_vals, axis=1)
g_retr_df = g_retr_df.reindex(sorted_vals, axis=1)

# The fine-tuned frame gets the same numeric ordering; otherwise its column order
# is whatever order the experiment prefixes happened to be iterated in.
ft_sorted_vals = sorted((col for col in ft_df.columns if str(col).isdigit()), key=int)
ft_df = ft_df.reindex(ft_sorted_vals, axis=1)

rag_df['category'] = full_categories
rag_df['type'] = ['RAG'] * len(full_categories)

g_retr_df['category'] = full_categories
g_retr_df['type'] = ['G-Retriever'] * len(full_categories)

ft_df['category'] = ft_categories

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(font_scale=1.2)

# Two panels side by side: G-Retriever on the left, RAG on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
left_panel, right_panel = ax

ax1 = sns.barplot(data=g_retr_df, errorbar=('ci', 100), ax=left_panel)
ax2 = sns.barplot(data=rag_df, errorbar=('ci', 100), ax=right_panel)

# Write each bar's value in the middle of the bar.
for axis in (ax1, ax2):
    for bars in axis.containers:
        axis.bar_label(bars, label_type='center', fontsize=11)

# Only the left panel carries the y-axis label.
ax1.set_ylabel("Average score", fontsize=13)
ax1.set_xlabel("(a) G-Retriever", fontsize=13)
ax2.set_xlabel("(b) RAG", fontsize=13)

In [None]:
# Stack both model frames, then reshape to long form: every score column becomes
# ('variable', 'value') rows next to the 'type' and 'category' identifiers.
stacked_scores = pd.concat([rag_df, g_retr_df], axis=0)
df_full = stacked_scores.melt(id_vars=['type', 'category'])

print(df_full)

# Split violins: one half per model type for every score column.
fig, ax = plt.subplots(figsize=(20, 6))
sns.violinplot(
    data=df_full,
    x='variable',
    y='value',
    hue="type",
    split=True,
    inner='quart',
    gridsize=50,
    ax=ax,
)
ax.set_ylabel("Average score", fontsize=13)
ax.set_xlabel("Context window length", fontsize=13)
# Place the legend outside the axes, to the left of the plot.
sns.move_legend(ax, "upper left", bbox_to_anchor=(-0.14,1), fontsize=11)


In [None]:
# Mean of every score column per (model type, question category) pair.
mean_by_type_and_category = (
    pd.concat([rag_df, g_retr_df])
    .groupby(['type', 'category'])
    .mean()
)

ax = sns.heatmap(mean_by_type_and_category, annot=True)
ax.set_ylabel("LLM type and question category", fontsize=13)
ax.set_xlabel("Context length", fontsize=13)

# Last expression of the cell: also show the table behind the heatmap.
mean_by_type_and_category

In [None]:
# Bar chart of the ft_df score columns, with each bar's value written at its centre.
ax1 = sns.barplot(data=ft_df, errorbar=('ci', 100))
for bars in ax1.containers:
    ax1.bar_label(bars, label_type='center')

ax1.set_ylabel("Average score", fontsize=13)
ax1.set_xlabel("Context length", fontsize=13)

In [None]:
# Mean of every ft_df score column per question category, drawn as an annotated heatmap.
ft_mean_by_category = ft_df.groupby('category').mean()

ax = sns.heatmap(data=ft_mean_by_category, annot=True)
ax.set_ylabel("Question category", fontsize=13)
ax.set_xlabel("Context length", fontsize=13)

# Model vs Model evaluation

In [None]:
from collections import Counter
import json
import os
import pandas as pd
import seaborn as sns

path = "Model Outpus & Evaluation/Model vs Model Evaluation"
file_names = sorted(os.listdir(path))

# Verdict tallies per file. The Counter (a dict subclass) is stored as-is so that
# a verdict which never occurs in a file reads as 0 instead of raising KeyError.
f_name_to_scores = dict()
for fname in file_names:
    with open(f"{path}/{fname}") as f:
        f_name_to_scores[fname] = Counter(json.loads(line)['verdict'] for line in f)

# Verdict strings as they appear in the evaluation files.
A_BETTER = "Assistant A's answer is better"
B_BETTER = "Assistant B's answer is better"
BOTH_GOOD = "Both assistants are good"
BOTH_BAD = "Both assistants are bad"

# Collecting the data in the format:
# <RAG is better>, <G-Retriever is better>, <Both assistants are good>, <Both assistants are bad>
rows = []

# The first 14 sorted files are read as 7 consecutive pairs.
# NOTE(review): presumably each pair holds the same comparison with the assistant
# order swapped (G-Retriever as Assistant A in the first file, as Assistant B in
# the second) -- verify against the file names.
for i in range(0, 14, 2):
    # Second whitespace-separated token of the file name labels the pair.
    # (Not called `type`, which would shadow the builtin for the rest of the notebook.)
    pair_label = file_names[i].split()[1]

    g_retr_vs_rag = f_name_to_scores[file_names[i]]
    rag_vs_g_retr = f_name_to_scores[file_names[i+1]]

    # Both files of a pair are assumed to hold the same number of verdicts.
    num_scores = sum(g_retr_vs_rag.values())

    # Percentages over the verdicts of both orderings together.
    g_retr   = 100 * (g_retr_vs_rag[A_BETTER] + rag_vs_g_retr[B_BETTER]) / (2*num_scores)
    rag      = 100 * (g_retr_vs_rag[B_BETTER] + rag_vs_g_retr[A_BETTER]) / (2*num_scores)
    tie_good = 100 * (g_retr_vs_rag[BOTH_GOOD] + rag_vs_g_retr[BOTH_GOOD]) / (2*num_scores)
    tie_bad  = 100 * (g_retr_vs_rag[BOTH_BAD] + rag_vs_g_retr[BOTH_BAD]) / (2*num_scores)

    rows.append((pair_label, rag, g_retr, tie_good, tie_bad))

# Build the frame once from the collected rows, then reshape to long form for seaborn.
results_df = pd.DataFrame(rows, columns=["type", "RAG", "G-Retriever", "Tie good", "Tie bad"])
results_df = results_df.melt('type')

print(results_df)

f, ax = plt.subplots(1, 1, figsize=(30, 6))
sns.barplot(results_df, x="type", y="value", hue="variable", ax=ax)

for con in ax.containers:
    ax.bar_label(con, fontsize=11.5)

plt.setp(ax.get_legend().get_texts(), fontsize='15') # for legend text
plt.setp(ax.get_legend().get_title(), fontsize='18') # for legend title

ax.set_ylabel("Average % winrate", fontsize=15)
ax.set_xlabel("Context length", fontsize=15)

In [None]:
descriptions = [
    "(a) - comparison with RAG, context length = 128",
    "(b) - comparison with RAG, context length = 512",
    "(c) - comparison with G-Retriever, context length = 128",
    "(d) - comparison with G-Retriever, context length = 512",
]

# Verdict strings as they appear in the evaluation files.
A_BETTER = "Assistant A's answer is better"
B_BETTER = "Assistant B's answer is better"
BOTH_GOOD = "Both assistants are good"
BOTH_BAD = "Both assistants are bad"

rows = []

# One pair of files per description, starting at sorted index 14
# (pairs at 14/15, 16/17, 18/19 and 20/21).
# NOTE(review): presumably the fine-tuned G-Retriever is Assistant B in the first
# file of a pair and Assistant A in the second -- verify against the file names.
for pair_idx, description in enumerate(descriptions):
    i = 14 + 2 * pair_idx

    ft_as_b = f_name_to_scores[file_names[i]]
    ft_as_a = f_name_to_scores[file_names[i+1]]

    # Both files of a pair are assumed to hold the same number of verdicts.
    num_scores = sum(ft_as_a.values())

    # .get(..., 0): a verdict that never occurs in a file counts as zero
    # instead of raising KeyError.
    g_retr   = 100 * (ft_as_a.get(A_BETTER, 0)  + ft_as_b.get(B_BETTER, 0))  / (2*num_scores)
    other    = 100 * (ft_as_a.get(B_BETTER, 0)  + ft_as_b.get(A_BETTER, 0))  / (2*num_scores)
    tie_good = 100 * (ft_as_a.get(BOTH_GOOD, 0) + ft_as_b.get(BOTH_GOOD, 0)) / (2*num_scores)
    tie_bad  = 100 * (ft_as_a.get(BOTH_BAD, 0)  + ft_as_b.get(BOTH_BAD, 0))  / (2*num_scores)

    rows.append((description, other, g_retr, tie_good, tie_bad))

# Build the frame once from the collected rows, then reshape to long form for seaborn.
results_ft_df = pd.DataFrame(
    rows, columns=["type", "Other model", "Finetuned G-Retriever", "Tie good", "Tie bad"]
)
results_ft_df = results_ft_df.melt('type')

f, ax = plt.subplots(1, 1, figsize=(30, 6))
sns.barplot(results_ft_df, x="type", y="value", hue="variable", ax=ax)
plt.setp(ax.get_legend().get_texts(), fontsize='12') # for legend text
plt.setp(ax.get_legend().get_title(), fontsize='15') # for legend title
ax.set_ylabel("Average % winrate", fontsize=15)
ax.set_xlabel("")

for con in ax.containers:
    ax.bar_label(con, fontsize=13)
sns.move_legend(ax, "upper left", bbox_to_anchor=(-0.14, 1.05))

In [None]:
# Verdict strings as they appear in the evaluation files.
A_BETTER = "Assistant A's answer is better"
B_BETTER = "Assistant B's answer is better"
BOTH_GOOD = "Both assistants are good"
BOTH_BAD = "Both assistants are bad"

# The last two sorted files form the final pair.
# NOTE(review): presumably the "context length = 512" model is Assistant B in the
# second-to-last file and Assistant A in the last one -- verify against the file names.
second_to_last = f_name_to_scores[file_names[-2]]
last = f_name_to_scores[file_names[-1]]

# Both files are assumed to hold the same number of verdicts.
num_scores = sum(last.values())

# .get(..., 0): a verdict that never occurs in a file counts as zero
# instead of raising KeyError.
wins_512 = 100 * (last.get(A_BETTER, 0)  + second_to_last.get(B_BETTER, 0))  / (2*num_scores)
wins_128 = 100 * (last.get(B_BETTER, 0)  + second_to_last.get(A_BETTER, 0))  / (2*num_scores)
tie_good = 100 * (last.get(BOTH_GOOD, 0) + second_to_last.get(BOTH_GOOD, 0)) / (2*num_scores)
tie_bad  = 100 * (last.get(BOTH_BAD, 0)  + second_to_last.get(BOTH_BAD, 0))  / (2*num_scores)

# Single row with an empty label, reshaped to long form for seaborn.
results_ft_df = pd.DataFrame(
    [("", wins_128, wins_512, tie_good, tie_bad)],
    columns=["type", "Context length = 128", "Context length = 512", "Tie good", "Tie bad"],
)
results_ft_df = results_ft_df.melt('type')

ax = sns.barplot(results_ft_df, x="type", y="value", hue="variable")
ax.set_xlabel("")
ax.set_ylabel("Average % winrate")

for con in ax.containers:
    ax.bar_label(con)

# Model vs Model with question categories (Syntax, Dependencies, and Meta)

In [None]:
from collections import Counter
import json
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

path = "Model Outpus & Evaluation/Model vs Model Evaluation"
file_names = sorted(os.listdir(path))

# Raw verdict list per file, in file order, so it can be sliced by question category.
f_name_to_scores = dict()
for fname in file_names:
    with open(f"{path}/{fname}") as f:
        f_name_to_scores[fname] = [json.loads(line)['verdict'] for line in f]

# Verdict strings as they appear in the evaluation files.
A_BETTER = "Assistant A's answer is better"
B_BETTER = "Assistant B's answer is better"
BOTH_GOOD = "Both assistants are good"
BOTH_BAD = "Both assistants are bad"

# Collecting the data in the format:
# <RAG is better>, <G-Retriever is better>, <Both assistants are good>, <Both assistants are bad>
categories = ["Syntax", "Dependencies", "Meta"]
# NOTE(review): hard-coded row ranges of the three categories (325 questions in
# total) -- confirm they match the question order in the evaluation files.
slices = [slice(0, 140), slice(140, 275), slice(275, 325)]

rows = []

# The first 14 sorted files are read as 7 consecutive pairs.
# NOTE(review): presumably G-Retriever is Assistant A in the first file of a pair
# and Assistant B in the second -- verify against the file names.
for i in range(0, 14, 2):
    # Second whitespace-separated token of the file name, as a number.
    # (Not called `type`, which would shadow the builtin for the rest of the notebook.)
    pair_label = int(file_names[i].split()[1])

    for cat, sli in zip(categories, slices):
        # Counter lookups return 0 for a verdict that does not occur in the slice.
        # A plain dict would raise KeyError, which is likely on the small per-category slices.
        g_retr_vs_rag = Counter(f_name_to_scores[file_names[i]][sli])
        rag_vs_g_retr = Counter(f_name_to_scores[file_names[i+1]][sli])

        # Both files of a pair are assumed to hold the same number of verdicts.
        num_scores = sum(g_retr_vs_rag.values())

        g_retr   = 100 * (g_retr_vs_rag[A_BETTER] + rag_vs_g_retr[B_BETTER]) / (2*num_scores)
        rag      = 100 * (g_retr_vs_rag[B_BETTER] + rag_vs_g_retr[A_BETTER]) / (2*num_scores)
        tie_good = 100 * (g_retr_vs_rag[BOTH_GOOD] + rag_vs_g_retr[BOTH_GOOD]) / (2*num_scores)
        tie_bad  = 100 * (g_retr_vs_rag[BOTH_BAD] + rag_vs_g_retr[BOTH_BAD]) / (2*num_scores)

        rows.append((pair_label, cat, rag, g_retr, tie_good, tie_bad))

# Build the frame once from the collected rows.
results_df = pd.DataFrame(
    rows, columns=["Type", "Category", "RAG", "G-Retriever", "Tie good", "Tie bad"]
)

print(results_df)

# One heatmap per question category: rows are the 'Type' values, columns the outcomes.
pl, ax = plt.subplots(1, 3, figsize=(20, 6))

for i, cat in enumerate(categories):
    sns.heatmap(
        results_df.loc[results_df.Category == cat].set_index('Type').drop(['Category'], axis=1),
        annot=True, ax=ax[i])
    ax[i].set_ylabel(cat)

In [None]:
descriptions = [
    "(a) - comparison with RAG,\ncontext length = 128",
    "(b) - comparison with RAG,\ncontext length = 512",
    "(c) - comparison with G-Retriever,\ncontext length = 128",
    "(d) - comparison with G-Retriever,\ncontext length = 512",
]

categories = ["Syntax", "Dependencies", "Meta"]
# NOTE(review): hard-coded row ranges of the three categories (130 questions in
# total) -- confirm they match the question order in the evaluation files.
slices = [slice(0, 56), slice(56, 110), slice(110, 130)]

# Verdict strings as they appear in the evaluation files.
A_BETTER = "Assistant A's answer is better"
B_BETTER = "Assistant B's answer is better"
BOTH_GOOD = "Both assistants are good"
BOTH_BAD = "Both assistants are bad"

rows = []

# One pair of files per description, starting at sorted index 14
# (pairs at 14/15, 16/17, 18/19 and 20/21).
# NOTE(review): presumably the fine-tuned G-Retriever is Assistant B in the first
# file of a pair and Assistant A in the second -- verify against the file names.
for pair_idx, description in enumerate(descriptions):
    i = 14 + 2 * pair_idx

    for cat, sli in zip(categories, slices):
        # Counter lookups return 0 for any verdict missing from the slice, so all
        # four outcomes are handled the same way and none can raise KeyError.
        ft_as_b = Counter(f_name_to_scores[file_names[i]][sli])
        ft_as_a = Counter(f_name_to_scores[file_names[i+1]][sli])

        # Both files of a pair are assumed to hold the same number of verdicts.
        num_scores = sum(ft_as_a.values())

        g_retr   = 100 * (ft_as_a[A_BETTER]  + ft_as_b[B_BETTER])  / (2*num_scores)
        other    = 100 * (ft_as_a[B_BETTER]  + ft_as_b[A_BETTER])  / (2*num_scores)
        tie_good = 100 * (ft_as_a[BOTH_GOOD] + ft_as_b[BOTH_GOOD]) / (2*num_scores)
        tie_bad  = 100 * (ft_as_a[BOTH_BAD]  + ft_as_b[BOTH_BAD])  / (2*num_scores)

        rows.append((description, cat, other, g_retr, tie_good, tie_bad))

# Build the frame once from the collected rows.
results_ft_df = pd.DataFrame(
    rows,
    columns=["Type", "Category", "Other model", "Finetuned G-Retriever", "Tie good", "Tie bad"],
)

# One heatmap per question category; only the first panel keeps its row labels.
p, ax = plt.subplots(1, 3, figsize=(20, 6))

for i, cat in enumerate(categories):
    df_to_plot = results_ft_df.loc[results_ft_df.Category == cat].drop(["Category"], axis=1).set_index("Type")
    if i == 0:
        sns.heatmap(df_to_plot, ax=ax[i], annot=True)
    else:
        sns.heatmap(df_to_plot, ax=ax[i], annot=True, yticklabels=[])
    ax[i].set_ylabel(cat)

In [None]:
# Verdict strings as they appear in the evaluation files.
A_BETTER = "Assistant A's answer is better"
B_BETTER = "Assistant B's answer is better"
BOTH_GOOD = "Both assistants are good"
BOTH_BAD = "Both assistants are bad"

rows = []

# The last two sorted files form the final pair; `categories` and `slices` are the
# per-category row ranges defined in the previous cell.
# NOTE(review): presumably the "context length = 512" model is Assistant B in the
# second-to-last file and Assistant A in the last one -- verify against the file names.
for cat, sli in zip(categories, slices):
    # Counter lookups return 0 for a verdict missing from the slice (no KeyError).
    second_to_last = Counter(f_name_to_scores[file_names[-2]][sli])
    last = Counter(f_name_to_scores[file_names[-1]][sli])

    # Both files are assumed to hold the same number of verdicts.
    num_scores = sum(last.values())

    wins_512 = 100 * (last[A_BETTER]  + second_to_last[B_BETTER])  / (2*num_scores)
    wins_128 = 100 * (last[B_BETTER]  + second_to_last[A_BETTER])  / (2*num_scores)
    tie_good = 100 * (last[BOTH_GOOD] + second_to_last[BOTH_GOOD]) / (2*num_scores)
    tie_bad  = 100 * (last[BOTH_BAD]  + second_to_last[BOTH_BAD])  / (2*num_scores)

    rows.append(("", cat, wins_128, wins_512, tie_good, tie_bad))

# Build the frame once from the collected rows.
results_ft_df = pd.DataFrame(
    rows,
    columns=["Type", "Category", "Context length = 128", "Context length = 512", "Tie good", "Tie bad"],
)

p, ax = plt.subplots(1, 1, figsize=(12, 6))
# Index by category and drop the empty 'Type' label so only the four outcome
# columns are drawn.
results_ft_df = results_ft_df.set_index('Category').drop(["Type"], axis=1)
sns.heatmap(results_ft_df, annot=True, ax=ax)