Source: https://www.pinecone.io/learn/openai-gen-qa/

In [None]:
%pip install -qU openai pinecone-client datasets cohere tiktoken --upgrade

In [None]:
# Ask for the OpenAI secret key interactively so it is never hardcoded in the notebook
import getpass

OPENAI_API_KEY = getpass.getpass(prompt='Please enter your openai key: ')

In [3]:
import openai

# Get API key from top-right dropdown on OpenAI website
# The key collected with getpass is set on the openai module so that the
# module-level calls used below (openai.chat..., openai.embeddings...) can use it.
openai.api_key = OPENAI_API_KEY

In [4]:
query = "who was the 12th person on the moon and when did they land?"

# Baseline: send the question to gpt-3.5-turbo as-is, with no retrieved context
res = openai.chat.completions.create(
    messages=[{"role": "user", "content": query}],
    model="gpt-3.5-turbo",
)

# Display the text of the first (and only requested) completion
res.choices[0].message.content


"The 12th person to walk on the Moon was Eugene Cernan, who landed during NASA's Apollo 17 mission. Apollo 17 was launched on December 7, 1972, and the lunar module landed on the Moon's surface on December 11, 1972."

In [5]:
# First let's make it simpler to get answers
def complete(prompt):
    """Send `prompt` to gpt-3.5-turbo as a single user message.

    Args:
        prompt: The full text to submit as the user message.

    Returns:
        The message content of the first choice in the chat completion.
    """
    user_message = {"role": "user", "content": prompt}
    chat_result = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[user_message],
    )
    first_choice = chat_result.choices[0]
    return first_choice.message.content

# Adjacent string literals are joined into one question string
query = (
    "Which training method should I use for sentence transformers when "
    "I only have pairs of related sentences?"
)

# Ask the model without any retrieved context
complete(query)

'When you only have pairs of related sentences, one of the suitable training methods for sentence transformers is the Siamese network architecture. This architecture includes two identical models that share the same weights and are fed with both sentences of the pair. The output embeddings of the sentences are compared using a similarity metric, such as cosine similarity, to measure their semantic similarity.\n\nHere is a basic outline of how you can use the Siamese network architecture:\n\n1. Prepare your data: Collect pairs of related sentences and label them as either similar (1) or dissimilar (0).\n\n2. Preprocess your sentences: Clean and tokenize your sentences, and convert them into numerical representations (e.g., word embeddings or word indices).\n\n3. Build your Siamese network: Create two identical models (e.g., using an LSTM or a transformer-based model). Each model takes one sentence of the pair as input and produces sentence embeddings.\n\n4. Define your loss function: Th

In [6]:
embed_model = "text-embedding-ada-002"

# Embed a small batch of example texts in a single request
res = openai.embeddings.create(
    model=embed_model,
    input=[
        "Sample document text goes here",
        "there will be several phrases in each batch",
    ],
)

In [7]:
# Vector embeddings for each document
res.data

[Embedding(embedding=[-0.003053202060982585, 0.01168035063892603, -0.0051088896580040455, -0.02714855968952179, -0.01637810282409191, 0.03229789063334465, -0.016216345131397247, -0.0010809211526066065, -0.02581404708325863, -0.0065950509160757065, 0.020031161606311798, 0.016661182045936584, -0.009173085913062096, 0.023374181240797043, -0.010150380432605743, 0.013520359992980957, 0.025247890502214432, -0.01691729947924614, 0.012125188484787941, -0.01637810282409191, -0.0042495448142290115, -0.006436661817133427, -0.004404563922435045, 0.020839957520365715, -0.010527817532420158, -0.0037137181498110294, 0.013661899603903294, -0.026339763775467873, -0.00038122900878079236, -0.0021466773469001055, 0.005819955840706825, -0.01004928071051836, -0.028173033148050308, -0.016229825094342232, -0.004273134749382734, 0.00740047637373209, -0.0028796480037271976, -0.031489092856645584, 0.023845979943871498, -0.03337628394365311, -0.000293820135993883, 0.013008123263716698, 0.007110658101737499, -0.00

In [8]:
# We have created two vectors (one for each sentence input)
len(res.data)

2

In [9]:
# We have created two 1536-dimensional vectors
len(res.data[0].embedding), len(res.data[1].embedding)

(1536, 1536)

In [10]:
# We can also get the vector for a single sentence
res.data[0].embedding

[-0.003053202060982585,
 0.01168035063892603,
 -0.0051088896580040455,
 -0.02714855968952179,
 -0.01637810282409191,
 0.03229789063334465,
 -0.016216345131397247,
 -0.0010809211526066065,
 -0.02581404708325863,
 -0.0065950509160757065,
 0.020031161606311798,
 0.016661182045936584,
 -0.009173085913062096,
 0.023374181240797043,
 -0.010150380432605743,
 0.013520359992980957,
 0.025247890502214432,
 -0.01691729947924614,
 0.012125188484787941,
 -0.01637810282409191,
 -0.0042495448142290115,
 -0.006436661817133427,
 -0.004404563922435045,
 0.020839957520365715,
 -0.010527817532420158,
 -0.0037137181498110294,
 0.013661899603903294,
 -0.026339763775467873,
 -0.00038122900878079236,
 -0.0021466773469001055,
 0.005819955840706825,
 -0.01004928071051836,
 -0.028173033148050308,
 -0.016229825094342232,
 -0.004273134749382734,
 0.00740047637373209,
 -0.0028796480037271976,
 -0.031489092856645584,
 0.023845979943871498,
 -0.03337628394365311,
 -0.000293820135993883,
 0.013008123263716698,
 0.0071

In [11]:
from datasets import load_dataset

# Load the 'train' split of the 'jamescalam/youtube-transcriptions' dataset
data = load_dataset('jamescalam/youtube-transcriptions', split='train')
# Display the dataset summary (feature names and number of rows)
data

Dataset({
    features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],
    num_rows: 208619
})

In [12]:
# Inspect the first row: one transcript sentence with its video metadata and start/end times
data[0]

{'title': 'Training and Testing an Italian BERT - Transformers From Scratch #4',
 'published': '2021-07-06 13:00:03 UTC',
 'url': 'https://youtu.be/35Pdoyi6ZoQ',
 'video_id': '35Pdoyi6ZoQ',
 'channel_id': 'UCv83tO5cePwHMt1952IVVHw',
 'id': '35Pdoyi6ZoQ-t0.0',
 'text': 'Hi, welcome to the video.',
 'start': 0.0,
 'end': 9.36}

In [13]:
from tqdm.auto import tqdm

new_data = []

window = 20  # number of sentences to combine
stride = 4  # number of sentences to 'stride' over, used to create overlap

# Slide a window of up to `window` rows over the transcript, advancing `stride`
# rows at a time, and merge each window into a single record.
for i in tqdm(range(0, len(data), stride)):
    # exclusive end of the window, clipped to the dataset length so the final
    # row can be included and the slice below is never empty
    i_end = min(len(data), i + window)
    # slice once per window: a dict mapping each column name to a list of values
    chunk = data[i:i_end]
    if chunk['title'][0] != chunk['title'][-1]:
        # in this case we skip this entry as the window spans two videos
        continue
    # create the new merged dataset; every field is read from rows inside the
    # window, so 'end' is the end time of the last sentence joined into 'text'
    new_data.append({
        'start': chunk['start'][0],
        'end': chunk['end'][-1],
        'title': chunk['title'][0],
        'text': ' '.join(chunk['text']),
        'id': chunk['id'][0],
        'url': chunk['url'][0],
        'published': chunk['published'][0],
        'channel_id': chunk['channel_id'][0]
    })

  0%|          | 0/52155 [00:00<?, ?it/s]

In [None]:
# Inspect the first merged window produced by the loop above
new_data[0]

In [None]:
from pinecone import Pinecone, PodSpec
import os

# Prompt for the Pinecone API key (get API key at app.pinecone.io);
# `getpass` was imported in an earlier cell.
PINECONE_API_KEY = getpass.getpass("Please enter your pinecone key: ")

# Export the key as an environment variable; this cell does not open a connection itself.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [None]:
index_name = "employee-handbook"
environment = "us-west4-gcp"
pc = Pinecone()  # This reads the PINECONE_API_KEY env var

# Only create the index when it is missing, so re-running this cell is harmless
index_exists = index_name in pc.list_indexes().names()
if not index_exists:
    # NOTE(review): metadata_config indexes a "batch" field, but the records
    # upserted later in this notebook carry no "batch" key — confirm this is intended.
    pod_spec = PodSpec(
        environment=environment,
        pod_type="p1.x1",
        pods=1,
        metadata_config={"indexed": ["batch"]},
    )
    pc.create_index(
        name=index_name,
        dimension=1536,  # Using the same vector dimensions as text-embedding-ada-002
        metric="cosine",
        spec=pod_spec,
    )

In [None]:
# Connect to Index:
index = pc.Index(name=index_name)

In [None]:
# Describe the Index:
description = pc.describe_index(name=index_name)
print(description)

In [None]:
from tqdm.auto import tqdm
import datetime
from time import sleep

batch_size = 100  # how many embeddings we create and insert at once


def _embed_with_retry(texts, max_attempts=10, initial_delay=5, max_delay=60):
    """Create embeddings for `texts`, retrying only on transient OpenAI errors.

    Args:
        texts: List of strings to embed in one request.
        max_attempts: Total number of requests to try before giving up.
        initial_delay: Seconds to wait after the first failure.
        max_delay: Upper bound, in seconds, for the wait between attempts.

    Returns:
        The response of `openai.embeddings.create` for `texts`.

    Raises:
        openai.RateLimitError, openai.APIConnectionError: if the last attempt
            still fails. Any other exception (bad key, invalid input, Ctrl-C)
            propagates immediately instead of being retried forever.
    """
    delay = initial_delay
    for attempt in range(1, max_attempts + 1):
        try:
            return openai.embeddings.create(input=texts, model=embed_model)
        except (openai.RateLimitError, openai.APIConnectionError):
            if attempt == max_attempts:
                raise
            sleep(delay)
            # back off progressively, but never wait longer than max_delay
            delay = min(delay * 2, max_delay)


# metadata fields stored alongside each vector (the 'id' is used as the vector id instead)
metadata_keys = ('start', 'end', 'title', 'text', 'url', 'published', 'channel_id')

for i in tqdm(range(0, len(new_data), batch_size)):
    # find end of batch
    i_end = min(len(new_data), i+batch_size)
    meta_batch = new_data[i:i_end]
    # get ids
    ids_batch = [x['id'] for x in meta_batch]
    # get texts to encode
    texts = [x['text'] for x in meta_batch]
    # create embeddings (bounded retry on rate-limit / connection errors)
    res = _embed_with_retry(texts)
    embeds = [record.embedding for record in res.data]
    # cleanup metadata
    meta_batch = [{key: x[key] for key in metadata_keys} for x in meta_batch]
    to_upsert = list(zip(ids_batch, embeds, meta_batch))

    # Upsert to Pinecone
    index.upsert(vectors=to_upsert)

In [None]:
# embed the question with the same model that was used for the stored chunks
res = openai.embeddings.create(
    input=[query],
    model=embed_model
)

# the query vector (first and only embedding in the response)
xq = res.data[0].embedding

# retrieve the 2 closest matches from Pinecone, with their metadata
# (note: `res` is rebound here from the embeddings response to the query response)
res = index.query(vector=xq, top_k=2, include_metadata=True)

In [None]:
# Show the raw Pinecone query response (the matches and their metadata)
res

In [None]:
limit = 3750  # maximum number of characters of joined context placed in the prompt


def _build_prompt(query, contexts, max_chars):
    """Build the prompt from the longest prefix of `contexts` that fits.

    Contexts are taken in order and joined with a separator; a context is
    dropped (together with all that follow it) as soon as including it would
    bring the joined text to `max_chars` characters or more.
    """
    separator = "\n\n---\n\n"
    prompt_start = (
        "Answer the question based on the context below.\n\n"
        "Context:\n"
    )
    prompt_end = f"\n\nQuestion: {query}\nAnswer:"

    kept = []
    joined_len = 0  # length of separator.join(kept)
    for context in contexts:
        candidate_len = joined_len + len(context) + (len(separator) if kept else 0)
        if candidate_len >= max_chars:
            break
        kept.append(context)
        joined_len = candidate_len

    # with no usable context the prompt still contains the question
    return prompt_start + separator.join(kept) + prompt_end


def retrieve(query):
    """Build a context-augmented prompt for `query`.

    Embeds the query with `embed_model`, fetches the top 3 matches from the
    Pinecone `index`, and places their 'text' metadata into the prompt, in
    match order, while the joined context stays under `limit` characters.

    Args:
        query: The question to answer.

    Returns:
        The prompt string: instruction, retrieved context (possibly empty when
        nothing matches or fits), then the question.
    """
    res = openai.embeddings.create(
        input=[query],
        model=embed_model
    )

    # the query vector
    xq = res.data[0].embedding

    # get relevant contexts
    res = index.query(vector=xq, top_k=3, include_metadata=True)
    contexts = [
        x['metadata']['text'] for x in res['matches']
    ]

    # build our prompt with the retrieved contexts included
    return _build_prompt(query, contexts, limit)

In [None]:
# First we retrieve relevant items from Pinecone
query_with_contexts = retrieve(query)
query_with_contexts

In [None]:
print(query_with_contexts)

In [None]:
# Then we complete the context-infused query
complete(query_with_contexts)