# We're going to build a question-and-answer bot

It allows you to search through YouTube transcripts using natural language.

In [1]:
pip install --quiet openai datasets lancedb


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0[0m[39;49m -> [0m[32;49m23.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Download the data
700 videos and 208619 sentences

In [2]:
from datasets import load_dataset

# Load the 'train' split of the 'jamescalam/youtube-transcriptions' dataset.
data = load_dataset('jamescalam/youtube-transcriptions', split='train')
# Leave the dataset as the cell's last expression so the notebook displays its summary.
data

Downloading readme:   0%|          | 0.00/2.13k [00:00<?, ?B/s]

Downloading and preparing dataset json/jamescalam--youtube-transcriptions to /Users/jsc/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-08d889f6a5386b9b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/79.8M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /Users/jsc/.cache/huggingface/datasets/jamescalam___json/jamescalam--youtube-transcriptions-08d889f6a5386b9b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


Dataset({
    features: ['title', 'published', 'url', 'video_id', 'channel_id', 'id', 'text', 'start', 'end'],
    num_rows: 208619
})

## Prepare context

Create rolling context windows of 20 sentences each, starting a new window every 4 sentences

In [3]:
from lancedb.context import contextualize

# Convert the dataset to a pandas DataFrame and contextualize its "text" column,
# grouped by video title.
# Presumably `window(20)` combines 20 consecutive rows into one context and
# `stride(4)` starts a new window every 4 rows — confirm against the LanceDB docs.
df = (contextualize(data.to_pandas())
      .groupby("title").text_col("text")
      .window(20).stride(4)
      .to_df())
# Preview a single row of the resulting DataFrame.
df.head(1)

Unnamed: 0,title,published,url,video_id,channel_id,id,text,start,end
177622,$5 MILLION AI for FREE,2022-08-12 15:18:07,https://youtu.be/3EjtHs_lXnk,3EjtHs_lXnk,UCfzlCWGWYyIQ0aLC5w48gBQ,3EjtHs_lXnk-t0.0,Imagine an AI where all in the same model you ...,0.0,24.0


## Create embedding function
We'll call the OpenAI embeddings API to get embeddings

In [4]:
import openai

# Set the OPENAI_API_KEY environment variable before starting the kernel,
# OR assign the key in code with openai.api_key = "sk-..." (never commit a real key)

def embed_func(c):
    """Return the OpenAI embeddings for ``c``.

    ``c`` is forwarded unchanged as the ``input`` of an
    ``openai.Embedding.create`` request against the
    ``text-embedding-ada-002`` engine.

    Returns:
        A list holding the ``"embedding"`` value of every record in the
        response's ``"data"`` field, in response order.
    """
    response = openai.Embedding.create(input=c, engine="text-embedding-ada-002")
    embeddings = []
    for item in response["data"]:
        embeddings.append(item["embedding"])
    return embeddings

## Create the LanceDB Table

In [None]:
import lancedb
from lancedb.embeddings import with_embeddings

# Run `embed_func` over `df` via LanceDB's `with_embeddings` helper, showing a
# progress bar (presumably this attaches an embedding to every row — confirm
# against the LanceDB docs).
data = with_embeddings(embed_func, df, show_progress=True)

# Connect to a LanceDB database at /tmp/lancedb, create the "chatbot" table from
# the embedded data, and create an index on it.
db = lancedb.connect("/tmp/lancedb")  # database location under /tmp (not the current directory)
tbl = db.create_table("chatbot", data)
tbl.create_index(num_partitions=64, num_sub_vectors=96)

  0%|          | 0/49 [00:00<?, ?it/s]

## Create and answer the prompt

In [8]:
def create_prompt(query, context):
    """Build a completion prompt from a question and retrieved contexts.

    Contexts are taken from ``context["text"]`` in order and joined with a
    ``---`` separator. A context is only included while the joined context
    section stays below ``limit`` characters; the first context that would
    reach the limit, and every context after it, is dropped.

    Args:
        query: The question to place at the end of the prompt.
        context: A DataFrame-like object with a ``text`` column of strings.
            It may hold any number of rows, including zero or one.

    Returns:
        The prompt string: instructions, the selected contexts, then the
        question followed by ``Answer:``.
    """
    limit = 3750
    separator = "\n\n---\n\n"

    prompt_start = (
        "Answer the question based on the context below.\n\n"
        "Context:\n"
    )
    prompt_end = f"\n\nQuestion: {query}\nAnswer:"

    # Keep a running length instead of re-joining the prefix on every step.
    selected = []
    joined_length = 0
    for text in context["text"]:
        # The separator is only paid for from the second context onwards.
        added = len(text) + (len(separator) if selected else 0)
        if joined_length + added >= limit:
            break
        selected.append(text)
        joined_length += added

    # Always returns a prompt, even when no context fits or none was given.
    return prompt_start + separator.join(selected) + prompt_end

In [9]:
def complete(prompt):
    """Send ``prompt`` to the text-davinci-003 completion engine.

    Returns:
        The text of the first returned choice, with leading and trailing
        whitespace stripped.
    """
    # Request settings for text-davinci-003: temperature 0, up to 400 tokens.
    request = {
        "engine": 'text-davinci-003',
        "prompt": prompt,
        "temperature": 0,
        "max_tokens": 400,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    response = openai.Completion.create(**request)
    first_choice = response['choices'][0]
    return first_choice['text'].strip()

# Sanity check: send a question straight to `complete`, with no retrieved
# context in the prompt, to confirm the completion call works.
query = "who was the 12th person on the moon and when did they land?"
complete(query)

'The 12th person on the moon was Harrison Schmitt, and he landed on December 11, 1972.'

In [12]:
def answer(question):
    """Answer ``question`` using retrieved transcript contexts.

    The question is embedded with ``embed_func``, the 3 nearest rows are
    fetched from the ``tbl`` LanceDB table, a prompt is built from them with
    ``create_prompt``, and the prompt is sent to ``complete``.

    Args:
        question: The natural-language question to answer.

    Returns:
        A ``(completion, context)`` tuple: the completion text and the
        retrieved rows after ``reset_index()``.
    """
    # Embed the argument itself; reading the notebook-level `query` here would
    # silently search for whatever question an earlier cell happened to set.
    emb = embed_func(question)[0]
    # nprobes / refine_factor are LanceDB search tuning options — see the
    # LanceDB docs for their exact effect on recall and latency.
    context = (tbl.search(emb).limit(3)
               .nprobes(20).refine_factor(100)
               .to_df())
    prompt = create_prompt(question, context)
    return complete(prompt), context.reset_index()

## Show the answer and show the video at the right place

In [13]:
from IPython.display import YouTubeVideo

query = ("Which training method should I use for sentence transformers "
         "when I only have pairs of related sentences?")
completion, context = answer(query)

print(completion)
# Use the first retrieved row (presumably the closest match) to pick the video.
top_match = context.iloc[0]
# The video id is the last path segment of the URL (e.g. https://youtu.be/<id>).
# YouTube's `start` player parameter is a whole number of seconds, so the float
# timestamp is truncated to an int before it is put into the embed URL.
YouTubeVideo(top_match["url"].split("/")[-1], start=int(top_match["start"]))

NLI with multiple negative ranking loss.
