In [None]:
!pip install plotly
!pip install pandas

In [None]:
import json
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import seaborn as sns

### Load results

In [None]:
# One-off cleanup, kept disabled: loads output/results_gpt4_rag.json, drops every
# entry whose context_length is 28214 and writes the filtered list back to the
# same file.
# NOTE(review): this path is lower-case, while load_results('GPT4_RAG') reads
# output/results_GPT4_RAG.json — confirm which file is meant on case-sensitive
# filesystems.
# with open('output/results_gpt4_rag.json', 'r') as file:
#     results = json.load(file)
# print(len(results))
# results = [el for el in results if int(el['context_length']) != 28214]

# with open(f'output/results_gpt4_rag.json', 'w') as f:
#     json.dump(results, f)

In [None]:
def load_results(
    experiment_name: str,
    min_score: float = 3,
    max_score: float = 10,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Load one experiment's results and average its scores per context length.

    Reads ``output/results_{experiment_name}.json`` into a DataFrame, zeroes
    every raw score that is not strictly greater than ``min_score``, divides
    the scores by ``max_score`` and averages them per ``context_length``.

    Args:
        experiment_name: Suffix of the results file name.
        min_score: Raw scores less than or equal to this value are replaced
            by 0 before scaling.
        max_score: Divisor applied to the (thresholded) raw scores.

    Returns:
        A ``(result, average)`` tuple. ``result`` holds one row per record of
        the JSON file with the rescaled ``score`` column. ``average`` is a
        single-row frame (index ``'average_score'``) with one column per
        distinct ``context_length``.

    Raises:
        FileNotFoundError: If the results file does not exist.
        KeyError: If the records lack a ``score`` or ``context_length`` field.
    """
    with open(f'output/results_{experiment_name}.json', 'r') as file:
        records = json.load(file)

    result = pd.DataFrame(records)
    raw_score = result['score']
    # Vectorised form of "x if x > min_score else 0"; NaN compares False and
    # therefore also becomes 0, exactly like the element-wise version.
    result['score'] = raw_score.where(raw_score > min_score, 0) / max_score

    average = (
        result.groupby('context_length')['score']
        .mean()
        .rename('average_score')
        .to_frame()
        .T
    )
    return result, average

# Per-record scores and per-context-length averages for both RAG experiments.
(result_os_rag, average_os_rag) = load_results(experiment_name='Zephyr-7b_RAG')
(result_gpt_rag, average_gpt_rag) = load_results(experiment_name='GPT4_RAG')

In [None]:
# Long-context baseline table, divided by 10 like the scores in load_results.
table_long_context = pd.read_csv('original_results/gpt4.csv', index_col=0) / 10

# mean(axis=0) yields an unnamed Series (one value per CSV column), so name it
# directly; renaming a 'score' column afterwards would silently do nothing and
# leave the row labelled 0.
average_long_context = (
    table_long_context.mean(axis=0).rename('average_score').to_frame().T
)
average_long_context

### Concatenate results

In [None]:
# One row per averaged series, labelled by position in the concat order.
# NOTE(review): 'Mistral7b_RAG' labels the series loaded from the
# 'Zephyr-7b_RAG' experiment — confirm the label is the intended one.
model_labels = ['Long Context', 'Mistral7b_RAG', 'GPT_RAG']
stacked = pd.concat([average_long_context, average_os_rag, average_gpt_rag], axis=0)
stacked.index = model_labels

# Reshape to long form: one (model, context_length, average_score) row each.
comb = stacked.T.unstack().reset_index()
comb.columns = ['model', 'context_length', 'average_score']
comb = comb.astype({'context_length': int})

### Plot

In [None]:
# Build two figures — one for the lines, one for the point markers — and merge
# the marker traces into the line figure.
plot_title = 'Accuracy of Retrieval - RAG vs Long-Context GPT4'
plot_kwargs = dict(color="model", x='context_length', y='average_score', title=plot_title, width=1000)
fig1 = px.scatter(comb, **plot_kwargs)
fig2 = px.line(comb, **plot_kwargs).add_traces(fig1.data)

# Every model now has a line trace and a marker trace with the same name; hide
# the marker copies from the legend so each model is listed only once.
fig2.update_traces(showlegend=False, selector=dict(mode="markers"))

fig2.update_layout(
    yaxis_range=[0.6, 1.05],
    xaxis=dict(range=[0, 130000]),
    yaxis_tickformat=',.0%',
    font=dict(family="Arial", size=18, color="grey"),
    # paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(1,1,1,0.1)',
)
# Last expression of the cell: returns the figure so the notebook renders it.
fig2.update_xaxes(tickvals=list(range(10000, 130000, 10000)))

### Heatmap

In [None]:
# Custom three-stop colormap running from #F0496E (low) through #EBB839 to
# #0CD79F (high). Palettes can be picked at https://coolors.co/.
cmap = LinearSegmentedColormap.from_list("custom_cmap", ["#F0496E", "#EBB839", "#0CD79F"])

# Mean score per (depth_percent, context_length): depths become the rows and
# context lengths the columns of the heatmap.
pivot_table = result_gpt_rag.groupby(['depth_percent', 'context_length'])['score'].mean().unstack()

# Explicit figure/axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(17.5, 8))  # Can adjust these dimensions as needed
sns.heatmap(
    pivot_table,
    # annot=True,
    fmt="g",
    cmap=cmap,
    # Pin the colour scale to [0, 1] so it does not depend on the data range.
    vmin=0,
    vmax=1,
    cbar_kws={'label': 'Score'},
    linecolor='grey',
    linewidths=0.5,
    ax=ax,
)

# NOTE(review): the title mentions the 128K long-context test, but the data
# plotted here comes from result_gpt_rag — confirm the wording.
ax.set_title('Pressure Testing GPT-4 128K Context\nFact Retrieval Across Context Lengths ("Needle In A HayStack")')
ax.set_xlabel('Token Limit')
ax.set_ylabel('Depth Percent')
plt.setp(ax.get_xticklabels(), rotation=45)  # Rotated to prevent overlap
plt.setp(ax.get_yticklabels(), rotation=0)  # Keep the y-axis labels horizontal
fig.tight_layout()  # Fits everything neatly into the figure area

plt.show()