In [None]:
import datasets
import torch
from dotenv import dotenv_values
from openai import OpenAI
from transformers import AutoTokenizer, AutoModel

## Retrieval-Augmented Generation (RAG) over IMDB reviews

### Dataset

In [45]:
# Load the IMDB reviews dataset; the result is indexed by split name.
imdb = datasets.load_dataset(path="stanfordnlp/imdb")

In [46]:
# Keep only the training split; every later cell works on it.
imdb = imdb["train"]

In [47]:
# The 'label' column is not used anywhere later in the notebook, so drop it.
imdb = imdb.remove_columns("label")

In [48]:
def _word_count(example):
    """Return the number of whitespace-separated words in a review as ``text_length``."""
    words = example['text'].split()
    return {'text_length': len(words)}


# Attach the word count to every row as a new column.
imdb = imdb.map(_word_count)

In [49]:
# Discard very short reviews: only rows with more than 15 words are kept.
imdb = imdb.filter(lambda n_words: n_words > 15, input_columns='text_length')
imdb

Dataset({
    features: ['text', 'text_length'],
    num_rows: 24993
})

In [55]:
# Embedding checkpoint; the encoder and its tokenizer are both loaded from this name.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)
model = AutoModel.from_pretrained(pretrained_model_name_or_path=model_ckpt)

In [56]:
# Prefer the GPU when one is available, but fall back to the CPU so this cell
# does not crash on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [57]:
def get_embeddings(texts):
    """Embed one text or a list of texts with the notebook's encoder.

    Args:
        texts: A string or a list of strings. They are tokenized with
            padding and truncation enabled.

    Returns:
        A 2-D tensor holding, for each input, the final hidden state of the
        first token (``last_hidden_state[:, 0]``). The tensor lives on the
        same device as the model and carries no autograd history.
    """
    # Follow whatever device the model currently sits on instead of assuming CUDA.
    device = model.device
    encoded_input = tokenizer(
        texts, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: skip building the autograd graph, which otherwise costs
    # memory on every call (this function is invoked once per dataset row).
    with torch.no_grad():
        model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0]

In [68]:
# Sanity-check the embedding function on a single review (row 1).
get_embeddings(imdb[1]["text"])

tensor([[ 7.6005e-03, -2.0960e-01, -1.8307e-01,  2.9431e-02,  2.2302e-02,
         -1.1974e-01,  1.8083e-01,  3.7614e-02, -1.3613e-01, -6.4454e-02,
          8.9349e-02, -2.4593e-01, -3.9624e-02, -7.5779e-02, -2.4006e-01,
          2.0114e-01,  7.4570e-02,  3.8725e-01,  4.4646e-01, -7.8183e-02,
          3.5338e-02, -2.7111e-01, -1.0960e-01, -1.6919e-01, -3.4224e-02,
          1.5124e-01,  2.7274e-01,  3.5244e-01, -6.3806e-02, -2.6364e-02,
          7.0555e-02,  1.4862e-01, -1.3979e-01,  1.4999e-02, -1.1110e-04,
         -3.2058e-01,  1.1339e-01,  8.5978e-02,  2.7671e-01,  5.4564e-02,
         -1.8672e-01, -6.6090e-02, -3.8679e-02, -1.5962e-02, -8.0878e-02,
         -1.7721e-01,  1.7797e-01,  2.2016e-01, -1.2412e-02, -2.6752e-01,
          1.3995e-01,  1.7929e-01, -3.4175e-02,  3.2826e-01,  1.0332e-02,
          3.4963e-01,  3.3313e-02, -1.3043e-01,  2.3345e-01,  5.7585e-01,
         -8.0288e-02,  1.0055e-01, -2.4438e-01,  9.5533e-02,  3.5553e-01,
         -2.5742e-01,  3.3878e-01, -2.

In [59]:
def _embed_example(example):
    """Return the embedding of one review's text as a NumPy vector under ``embeddings``."""
    vector = get_embeddings(example['text'])
    # get_embeddings returns one row per input; take the single row for this review.
    return {'embeddings': vector.detach().cpu().numpy()[0]}


# Embed the reviews one at a time and store the vectors as a new column.
imdb = imdb.map(_embed_example)

Map:   0%|          | 0/24993 [00:00<?, ? examples/s]

In [60]:
# Peek at the first record to check the row contents after the embedding step.
imdb[0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

In [61]:
# Build a FAISS index over the 'embeddings' column for nearest-neighbour lookup.
# NOTE(review): no metric_type is passed, so the library default is used —
# confirm it suits this "dot" checkpoint.
imdb.add_faiss_index('embeddings')

  0%|          | 0/25 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'text_length', 'embeddings'],
    num_rows: 24993
})

### API

In [27]:
# Read settings through python-dotenv so the API token and URL are not
# hard-coded in the notebook.
env = dotenv_values()

In [28]:
# Chat client; both the key and the endpoint come from the dotenv values.
client = OpenAI(
    api_key=env['TOKEN'],
    base_url=env['URL'],
)

### Completions

In [69]:
def find_similar(query, k=5):
    """Look up the ``k`` indexed reviews nearest to ``query``.

    The query is embedded with ``get_embeddings`` and searched against the
    'embeddings' index of ``imdb``. The similarity scores are discarded;
    only the retrieved examples are returned.
    """
    query_vector = get_embeddings([query])
    query_vector = query_vector.detach().cpu().numpy()
    _scores, matches = imdb.get_nearest_examples('embeddings', query_vector, k=k)
    return matches

In [70]:
# Retrieval check: show the review texts returned for a sample query (default k=5).
find_similar('Best acting in series')['text']

['Mr Perlman gives a standout performance (as usual). Sadly, he has to struggle with an underwritten script and some nonsensical set pieces.<br /><br />Larsen is in "Die Hard" mode complete with singlet and bulging muscles, I\'m sure he could do better but seems satisfied to grimace and snarl through his part.<br /><br />The lovely Erika is very decorative (even though fully clothed!) and shows some signs of "getting" acting at last.<br /><br />SFX are mainly poor CGI and steals from other movies.<br /><br />The shootouts are pitiful - worthy of the A-Team<br /><br />Not even worth seeing for Perlman - AVOID',
 'no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!',
 "the acting itself wasn't even that bad, since it did't come to mind in the movie but whatever had this director in mind? the intended climb towards some climax completely missed the mark,..<br /><br />almost all scenes involve acting that stand so far from our own intentions and w

In [30]:
# Prompt template with two placeholders: {context} receives the retrieved
# reviews and {question} the user's question.
prompt = (
    "Answer the question based only on the following context: {context}.\n"
    "Question: {question}\n"
)

In [71]:
def get_completion(question, k=5, llm_model="gpt-3.5"):
    """Answer ``question`` using the ``k`` most similar reviews as context.

    Args:
        question: The user question; it is also used as the retrieval query.
        k: Number of reviews to retrieve (defaults to 5, matching
            ``find_similar``).
        llm_model: Model name sent to the chat completions endpoint.

    Returns:
        The completion object returned by ``client.chat.completions.create``;
        the answer text is at ``.choices[0].message.content``.
    """
    samples = find_similar(question, k)

    # Join the reviews as plain text. Formatting the list directly would embed
    # its Python repr (brackets, quotes, backslash escapes) in the prompt.
    context = "\n\n".join(samples['text'])
    question_prompt = prompt.format(context=context, question=question)
    completion = client.chat.completions.create(
        model=llm_model,
        messages=[
            {"role": "user", "content": question_prompt}
        ]
    )
    return completion

In [72]:
# End-to-end check: retrieve 5 reviews and show only the answer text.
get_completion('Best acting in series', 5).choices[0].message.content

'The best acting in the series is likely found in the review that praises William Powell for his relaxed and debonair performance in the Philo Vance series.'

In [None]:
# Same pipeline with a wider context of 15 reviews.
# NOTE(review): the answer may come from the model's own knowledge rather than
# from the retrieved reviews — verify against the retrieved context.
get_completion('What actors are in Titanic?', 15).choices[0].message.content

'Leonardo DiCaprio, Kate Winslet, Billy Zane, Kathy Bates, Frances Fisher, Victor Garber, Jonathan Hyde, Bill Paxton, and Gloria Stuart.'