In [None]:
!pip install transformers
!pip install faiss-gpu
!pip install load_dotenv
!pip install tiktoken
!pip install langchain
!pip install sentence-transformers
!pip install openai
!pip install accelerate

In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import dotenv
import os
import numpy as np
import openai

# Helper functions from the `scripts` and `embeddings` modules (not installed by
# the pip cell, so presumably local to this repository).
from scripts import generate_context, retrieve_relevant_excerpts
from embeddings import retrieve_relevant_excerpts_quickly

# Load environment variables; later cells read OPENAI_API_KEY and
# HUGGINGFACEHUB_API_TOKEN through os.getenv.
dotenv.load_dotenv()

In [None]:
# Candidate (needle, question) couples: the needle is the sentence hidden in the
# haystack, the question is what the model is asked in order to retrieve it.
needle_question_couples = [
    (
        "\nThe best thing to do in San Francisco is eat a sandwich and sit in Dolores Park on a sunny day.\n",
        "What is the most fun thing to do in San Francisco?",
    ),
    (
        "\nThe most inspiring thing to do near the Hugging Face office in Paris is to visit the Louvre museum.\n",
        "What is the most inspiring thing to do near the Hugging Face office in Paris?",
    ),
]

# The first couple is the one used by the rest of the notebook.
needle, question = needle_question_couples[0]

# 0. Test retrieval

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

# Build one sample haystack for the retrieval checks below. The arguments are
# (needle, context_length, depth_percent), in the same order as the calls in the
# experiment loops further down.
context = generate_context(needle, 100000, 40)

# Embedding model used for retrieval: BAAI/bge-large-en-v1.5 on the CUDA device,
# with embedding normalization switched off.
hf_embedding = HuggingFaceEmbeddings(
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": False},
    model_name="BAAI/bge-large-en-v1.5",
)

### On instance

In [None]:
# Retrieval through `retrieve_relevant_excerpts`; print the size of the result
# followed by its last 300 items/characters as a quick sanity check.
documents = retrieve_relevant_excerpts(context, question, hf_embedding)
print(len(documents), documents[-300:], sep="\n")

### Text Embedding Inference

In [None]:
documents = await retrieve_relevant_excerpts_quickly(context, question, hf_embedding)
print(len(documents))
print(documents[-300:])

# 1. Calculations

In [None]:
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from scripts import result_exists, evaluate_response
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
import json
from tqdm.notebook import tqdm

# Before each run the loops check whether this context length, depth percent and
# version number have already been evaluated, and skip the run if so.
# Bump the version number to run the same grid again; leave it at 1 while testing.
results_version = 1

# Context lengths, one per experiment iteration.
# Make sure the largest value is within the bounds of your model's limits.
context_lengths = np.linspace(1000, 128000, num=15).round().astype(int)

# Document depths (in percent) at which the random statement (needle) is placed.
# Suggestion: try other distributions (like a sigmoid) to test non-evenly spaced intervals.
document_depth_percents = np.linspace(0, 100, num=15).round().astype(int)

# GPT-4 chat model at temperature 0, handed to `evaluate_response` in the experiment
# loops to score each answer. The key is read from the OPENAI_API_KEY environment
# variable, with a placeholder string as fallback.
evaluation_model = ChatOpenAI(
    model="gpt-4",
    temperature=0,
    openai_api_key=os.getenv("OPENAI_API_KEY", "YourAPIKey"),
)

### Mistral-7B + RAG

In [None]:
from huggingface_hub.inference_api import InferenceApi

# Inference client for the model under test. The token is read from the
# HUGGINGFACEHUB_API_TOKEN environment variable, with a placeholder string as fallback.
model_id = "HuggingFaceH4/zephyr-7b-beta"
client = InferenceApi(
    token=os.getenv("HUGGINGFACEHUB_API_TOKEN", "YourHuggingFaceToken"),
    repo_id=model_id,
)

# Label logged with every result; it is also part of the results file name.
model_to_test_description = "embeddings"

### Test client

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Tokenizer of the model under test: used here to render the chat prompt and, in
# the experiment loop, to count prompt tokens.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Chat turns with {question} / {context} placeholders; they are filled in with
# str.format once the chat template has been rendered.
messages = [
    {
        "role": "system",
        "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct.",
    },
    {
        "role": "user", "content": """
        You will have to answer this question based only on the context: {question}
        Here is the context: {context}
        """
    },
    {
        "role": "user", "content": """
        Answer the question: {question}
        Don't give information outside the document or repeat your findings.
        """
    }
]

# With tokenize=False the template is rendered to a plain string, so no tensor type
# is requested (this is the same call as in the experiment loop).
messages_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Only the first 1000 characters of the context are sent for this quick client check.
full_prompt = messages_chat.format(question=question, context=context[:1000])
output = client(full_prompt)

# Drop the first len(full_prompt) characters of the generated text (presumably the
# echoed prompt) so that only the completion is displayed as the cell output.
output[0]['generated_text'][len(full_prompt):]

In [None]:
# Run through each iteration of context_lengths and depths
for depth_percent in tqdm(document_depth_percents):
    for context_length in context_lengths:
        # Load results from file. 
        try:
            with open(f'output/results_{model_to_test_description}.json', 'r') as f:
                results = json.load(f)
        except FileNotFoundError:
            results = []
            pass

        # Checks to see if you've already checked a length/percent/version.
        # This helps if the program stop running and you want to restart later
        if result_exists(results, context_length, depth_percent, results_version, model_to_test_description):
            continue

        # Go generate the required length context and place your needle statement in
        context = generate_context(needle, context_length, depth_percent)

        context = await retrieve_relevant_excerpts_quickly(context, question, hf_embedding)

        # Go see if the model can answer the question to pull out your random fact
        
        messages = [
            {
                "role": "system",
                "content": "You are a helpful AI bot that answers questions for a user. Keep your response short and direct.",
            },
            {"role": "user", "content": """
            You will have to answer this question based only on the context: {question}
            Here is the context: {context}
            """},
            {"role": "user", "content": """
            Answer the question: {question}
            Don't give information outside the document or repeat your findings.
            """}
        ]

        messages_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        full_prompt = messages_chat.format(question=question, context=context)
        assert len(tokenizer.encode(full_prompt)) < 4096, "Your prompt is too long. Try a shorter context length or a smaller document depth."
        output = client(full_prompt)

        response = output[0]['generated_text'][len(full_prompt):]

        # Compare the reponse to the actual needle you placed
        score = evaluate_response(response, needle, question, evaluation_model)

        results.append({
            # 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
            'model' : model_to_test_description,
            'context_length' : int(context_length),
            'depth_percent' : int(depth_percent),
            'version' : results_version,
            'needle' : needle,
            'model_response' : response,
            'score' : score
        })

        print (f"Result #: {len(results)}/{len(context_lengths) * len(document_depth_percents)}")
        print (f"Context: {context_length} tokens")
        print (f"Depth: {depth_percent}%")
        print (f"Score: {score}")
        print (f"Response: {response}\n")

        # Save results to a JSON file each run
        with open(f'output/results_{model_to_test_description}.json', 'w') as f:
            json.dump(results, f)

### GPT + RAG

In [None]:
model_to_test = ChatOpenAI(model='gpt-4', temperature=0, openai_api_key = os.getenv('OPENAI_API_KEY', 'YourAPIKey'))

model_to_test_description = 'gpt_rag'

# Run through each iteration of context_lengths and depths
for depth_percent in tqdm(document_depth_percents):
    for context_length in context_lengths:
        # Load results from file. 
        try:
            with open(f'output/results_{model_to_test_description}.json', 'r') as f:
                results = json.load(f)
        except FileNotFoundError:
            results = []
            pass

        # Checks to see if you've already checked a length/percent/version.
        # This helps if the program stop running and you want to restart later
        if result_exists(results, context_length, depth_percent, results_version, model_to_test_description):
            continue

        # Go generate the required length context and place your needle statement in
        context = generate_context(needle, context_length, depth_percent)

        context = await retrieve_relevant_excerpts_quickly(context, question, hf_embedding)

        # Prepare your message to send to the model you're going to evaluate
        messages = [
            SystemMessage(
                content="You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
            ),
            HumanMessage(
                # This is the PG essays with your needle/random statement placed in it
                # This is your haystack with a needle placed in it.
                content=f"CONTEXT:\n{context}",
            ),
            HumanMessage(
                # This is the question you'll ask to the model to tr≠≠y and retrieve your random statement/needle.
                content=f"{question} - Don't give information outside the document or repeat your findings"
            ),
        ]

        # Go see if the model can answer the question to pull out your random fact
        response = model_to_test(messages)

        # Compare the reponse to the actual needle you placed
        score = evaluate_response(response, needle, question, evaluation_model)

        results.append({
            # 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
            'model' : model_to_test_description,
            'context_length' : int(context_length),
            'depth_percent' : int(depth_percent),
            'version' : results_version,
            'needle' : needle,
            'model_response' : response.content,
            'score' : score
        })

        print (f"Result #: {len(results)}/{len(context_lengths) * len(document_depth_percents)}")
        print (f"Context: {context_length} tokens")
        print (f"Depth: {depth_percent}%")
        print (f"Score: {score}")
        print (f"Response: {response.content}\n")

        # Save results to a JSON file each run
        with open(f'output/results_{model_to_test_description}.json', 'w') as f:
            json.dump(results, f)

### GPT Long Context

In [None]:
# Needed for the rate-limit pause at the end of each run
import time

# The code will check to see if a context_length, depth percent and version number have already been checked yet
# Change the version # if you would like to run the results multiple times.
# If you're just testing, then leave as version=1
results_version = 1

# This will produce a list of context lengths for each experiment iteration. Make sure the max context length is within the bounds of your model's limits.
context_lengths = np.round(np.linspace(1000, 128000, num=15, endpoint=True)).astype(int)

# This will produce a list of document depths to place your random statement (needle) at.
# Suggestion: Try out different distributions (like a sigmoid) to test non-evenly spaced intervals
document_depth_percents = np.round(np.linspace(0, 100, num=15, endpoint=True)).astype(int)

# The model we are testing. As of now it's set up for chat models with OpenAI
model_to_test = ChatOpenAI(model='gpt-4-1106-preview', temperature=0, openai_api_key = os.getenv('OPENAI_API_KEY', 'YourAPIKey'))


# This will get logged on your results
model_to_test_description = 'gpt4'

evaluation_model  = ChatOpenAI(model="gpt-4", temperature=0, openai_api_key = os.getenv('OPENAI_API_KEY', 'YourAPIKey'))

# Single path used for both loading and saving, so that a restarted sweep actually
# finds the results written by the previous one.
results_path = 'output/results.json'

# Make sure the output directory exists before any model call is made; otherwise the
# first save would raise FileNotFoundError and the freshly computed result would be lost.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# Run through each iteration of context_lengths and depths
for context_length in context_lengths:
    for depth_percent in document_depth_percents:
        # Load results from file; a missing file just means nothing has been saved yet.
        try:
            with open(results_path, 'r') as f:
                results = json.load(f)
        except FileNotFoundError:
            results = []

        # Checks to see if you've already checked a length/percent/version.
        # This helps if the program stops running and you want to restart later
        if result_exists(results, context_length, depth_percent, results_version, model_to_test_description):
            continue

        # Go generate the required length context and place your needle statement in
        context = generate_context(needle, context_length, depth_percent)

        # Prepare your message to send to the model you're going to evaluate
        messages = [
            SystemMessage(
                content="You are a helpful AI bot that answers questions for a user. Keep your response short and direct"
            ),
            HumanMessage(
                # This is your haystack with a needle (random statement) placed in it.
                content=context
            ),
            HumanMessage(
                # Ask the same `question` that is handed to the evaluator below, so the prompt
                # stays in sync with the selected needle/question couple.
                content=f"{question} - Don't give information outside the document or repeat your findings"
            ),
        ]

        # Go see if the model can answer the question to pull out your random fact
        response = model_to_test(messages)

        # Compare the response to the actual needle you placed
        score = evaluate_response(response, needle, question, evaluation_model)

        results.append({
            # 'context' : context, # Uncomment this line if you'd like to save the context the model was asked to retrieve from. Warning: This will become very large.
            'model' : model_to_test_description,
            'context_length' : int(context_length),
            'depth_percent' : int(depth_percent),
            'version' : results_version,
            'needle' : needle,
            'model_response' : response.content,
            'score' : score
        })

        print(f"Result #: {len(results)}/{len(context_lengths) * len(document_depth_percents)}")
        print(f"Context: {context_length} tokens")
        print(f"Depth: {depth_percent}%")
        print(f"Score: {score}")
        print(f"Response: {response.content}\n")

        # Save results to a JSON file each run
        with open(results_path, 'w') as f:
            json.dump(results, f)

        # Optional. Sleep for a bit to stay under the rate limit
        # Rate limit is 150K tokens/min so it's set at 120K for some cushion
        sleep_time = (context_length / 120000)*60
        # print (f"Sleeping: {sleep_time}\n")
        time.sleep(sleep_time)

# 2. Evaluate results

In [None]:
import json
import pandas as pd


# Load the saved GPT + RAG results.
with open("output/results_gpt_rag.json", "r") as file:
    results = json.load(file)

# One row per depth, one column per context length; scores are divided by 10.
table_rag = (
    pd.DataFrame(results)
    .pivot_table(index="depth_percent", columns="context_length", values="score")
    .div(10)
)

# Keep values of at least 0.3 and replace every other cell by 0.
mask = table_rag.ge(0.3)
table_rag = table_rag.where(mask, 0)

# Long-context results are read from a CSV file whose first column is the index.
table_long_context = pd.read_csv("original_results/gpt4.csv", index_col=0)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

import matplotlib
import matplotlib as mpl

def display_table(table):
    """Show a table as an image with the value of every cell written on top.

    Args:
        table: pandas DataFrame. Its columns label the x axis, its index labels
            the y axis, and each cell value is drawn as white text over the cell.
    """
    figure, axes = plt.subplots()
    axes.imshow(table)

    # One tick per column / row, labelled with the matching column / index entry.
    n_rows, n_cols = len(table.index), len(table.columns)
    axes.set_xticks(np.arange(n_cols), labels=table.columns)
    axes.set_yticks(np.arange(n_rows), labels=table.index)

    # Rotate the x tick labels by 45 degrees, right-aligned on their anchor.
    plt.setp(axes.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Annotate each cell, row by row, with its value centred in the cell.
    for (row, col), value in np.ndenumerate(table.values):
        axes.text(col, row, value, ha="center", va="center", color="w")

    figure.tight_layout()
    plt.show()

# Show the RAG table first, then the long-context table.
display_table(table_rag)
display_table(table_long_context)