In [1]:
import datasets
from transformers import AutoTokenizer, AutoModel
from dotenv import dotenv_values
from openai import OpenAI

## RAG

### Dataset

In [64]:
imdb = datasets.load_dataset("stanfordnlp/imdb", cache_dir="/net/pr2/projects/plgrid/plggaigraphicsk46/.cache")

In [65]:
imdb = imdb["train"]

In [66]:
imdb = imdb.remove_columns(["label"])

In [67]:
imdb = imdb.map(lambda x: {"text_length": len(x["text"].split())})

In [68]:
imdb = imdb.filter(lambda x: x["text_length"] > 15)
imdb

Dataset({
    features: ['text', 'text_length'],
    num_rows: 24993
})

In [61]:
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, cache_dir="/net/pr2/projects/plgrid/plggaigraphicsk46/.cache")
model = AutoModel.from_pretrained(model_ckpt, cache_dir="/net/pr2/projects/plgrid/plggaigraphicsk46/.cache")

In [62]:
model = model.to("cuda")

In [63]:
def get_embeddings(texts):
    encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    encoded_input = {k: v.to("cuda") for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return model_output.last_hidden_state[:, 0]

In [18]:
get_embeddings(imdb["text"][1])

tensor([[ 8.3199e-02, -1.5071e-01,  9.8255e-02,  4.4446e-02, -3.1686e-01,
          1.8682e-01, -2.9042e-01,  1.3492e-01, -2.3458e-01, -4.2669e-02,
         -1.7268e-01,  1.0392e+00, -1.1429e-01,  4.9655e-01,  5.3812e-01,
         -1.5356e-01, -1.7012e-01,  1.1264e+00, -5.3759e-01, -5.2373e-01,
          5.0227e-01,  4.4956e-01, -3.2713e-02, -2.6857e-02,  1.0683e-01,
         -5.9092e-01,  3.5564e-01,  3.0762e-01, -3.0808e-01,  5.6195e-01,
          1.4137e-02, -5.9447e-01, -3.9890e-01, -2.7629e-01,  3.8616e-01,
         -1.0317e-01,  2.8774e-01,  1.1236e-01, -2.4730e-01,  4.4272e-03,
          2.1356e-01,  4.2738e-01,  1.4895e-01,  2.8548e-01,  5.0963e-01,
         -2.3894e-01, -7.8339e-01,  1.2078e-01, -1.2966e-01, -3.0228e-01,
          3.0922e-01, -5.1364e-01,  4.3015e-01, -6.8773e-02, -4.3012e-02,
          7.9915e-01,  1.8918e-01,  2.4105e-02,  7.0270e-01,  5.4920e-01,
          1.0149e-02, -4.8151e-01, -6.2601e-01,  9.2661e-02,  2.1179e-01,
          2.9671e-01,  2.7568e-01, -9.

In [69]:
imdb = imdb.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

Map:   0%|          | 0/24993 [00:00<?, ? examples/s]

In [70]:
# Save imdb to pickle
imdb.save_to_disk("imdb_embeddings_2")

Saving the dataset (0/1 shards):   0%|          | 0/24993 [00:00<?, ? examples/s]

In [72]:
# Load imdb from pickle
imdb = datasets.load_from_disk("imdb_embeddings_2")

In [None]:
imdb[0]

In [73]:
import numpy as np


imdb.add_faiss_index(column="embeddings")

  0%|          | 0/25 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'text_length', 'embeddings'],
    num_rows: 24993
})

### API

In [74]:
env = dotenv_values("/net/people/plgrid/plgjwasala/nlp/lab3/.env")

In [75]:
client = OpenAI(api_key=env["TOKEN"], base_url=env["URL"])

### Completions

In [76]:
def find_similar(query, k=5):
    query_embedding = get_embeddings([query]).detach().cpu().numpy()
    _, samples = imdb.get_nearest_examples("embeddings", query_embedding, k=k)
    return samples

In [77]:
find_similar("What actors are in Titanic?", k=15)["text"]

["Please avoid this movie at all costs. This is without a doubt, the worst movie I've ever seen. Most movies have at least one redeeming value. This has none. Totally horrible!",
 "I was really disappointed in this movie. Those that voted this thing a 10 have a screw lose. The acting was ok, kinda wooden and cardboard. The ending was sorry. I just didn't care for this at all.<br /><br />No way could I recommend this mess.",
 "<br /><br />What an absolutely crappy film this is. How or why this movie was made and what the hell Billy Bob Thornton and Charlize Theron were doing signing up for this mediocre waste of time is beyond me. Strong advise for anyone sitting down to catch a flick: DO NOT waste your time on this 'film'.",
 'no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!',
 "This film is the worst film I have ever seen. The story line is weak - I couldn't even follow it. The acting is high-schoolish. The sound track is irritating. The a

In [79]:
prompt = """Answer the question based only on the following context: {context}.
Question: {question}
"""

In [80]:
def get_completion(question, k):
    samples = find_similar(question, k)

    question_prompt = prompt.format(context=samples["text"], question=question)
    completion = client.chat.completions.create(
        model="gpt-3.5", messages=[{"role": "user", "content": question_prompt}]
    )
    return completion

In [88]:
q = "What actors are in Titanic?"

print(f"Context for '{q}':")
display(find_similar(q, k=15)["text"])

print("Answer:")

get_completion(q, 15).choices[0].message.content

Context for 'What actors are in Titanic?':


["Please avoid this movie at all costs. This is without a doubt, the worst movie I've ever seen. Most movies have at least one redeeming value. This has none. Totally horrible!",
 "I was really disappointed in this movie. Those that voted this thing a 10 have a screw lose. The acting was ok, kinda wooden and cardboard. The ending was sorry. I just didn't care for this at all.<br /><br />No way could I recommend this mess.",
 "<br /><br />What an absolutely crappy film this is. How or why this movie was made and what the hell Billy Bob Thornton and Charlize Theron were doing signing up for this mediocre waste of time is beyond me. Strong advise for anyone sitting down to catch a flick: DO NOT waste your time on this 'film'.",
 'no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!',
 "This film is the worst film I have ever seen. The story line is weak - I couldn't even follow it. The acting is high-schoolish. The sound track is irritating. The a

Answer:


'Leonardo DiCaprio, Kate Winslet, Billy Zane, Kathy Bates, Frances Fisher, Victor Garber, Jonathan Hyde, Bill Paxton, Gloria Stuart'

Model poprawnie wyciągnął informacje o aktorach z różnych komentarzy omawiających film Titanic.

In [91]:
q = "What is a good horror movie to watch?"

print(f"Context for '{q}':")
display(find_similar(q, k=15)["text"])

print("Answer:")

get_completion(q, 15).choices[0].message.content

Context for 'What is a good horror movie to watch?':


['NO SPOILERS.<br /><br />I love horror movies, but this has got to be the poorest attempt to make one ever. Calling it "a movie" is also a stretch. This "random-clips-of-obviously-fake-and-tacky-violence-and-an-ugly- woman-trying-to-act-sexy-edited-poorly-together" is not worth watching.<br /><br />Watching this is about as interesting watching as some random family\'s holiday pictures, and it has about the same quality you would expect when you send your ten year old son into the woods with your new vid-cam, and tell him to make a movie.<br /><br />Terrible.',
 "As far as horror flicks go, this one is pretty darn good. While it may not be a classic tale of horror and suspense, it does provide many quality chuckles that make this movie a must see if you're into the horror/comedy genre.",
 'I love horror movies that brings out a real amount of mystery like say "silent hill" ( which i found to be quite good, but still, was missing something ) and movies that keeps you guessing, this i t

Answer:


'Based on the context provided, "John Carpenter\'s Halloween" is mentioned as a good horror movie to watch.'

Model poprawnie podsumował, że z komentarzy wynika rekomendacja dla filmu Halloween, ale prawdopodobnie nie jest ona reprezentatywna dla całego zbioru - w top 15 wynikach zwróconych przez retriever nie znalazły się szczegółowe informacje o innych filmach, które warto rozważyć.

In [99]:
q = "What can be said about movie Titanic?"

print(f"Context for '{q}':")
display(find_similar(q, k=15)["text"])

print("Answer:")

get_completion(q, 15).choices[0].message.content

Context for 'What can be said about movie Titanic?':


 'Every once in a while the conversation will turn to "favorite movies." I\'ll mention Titanic, and at least a couple people will snicker. I pay them no mind because I know that five years ago, these same people were moved to tears by that very movie. And they\'re too embarrassed now to admit it.<br /><br />I just rewatched Titanic for the first time in a long time. Expecting to simply enjoy the story again, I was surprised to find that the movie has lost none of its power over these five years. I cried again.... in all the same places. It brought me back to 1997 when I can remember how a movie that no one thought would break even became the most popular movie of all time. A movie that burst into the public consciousness like no other movie I can recall (yes, even more than Star Wars). And today, many people won\'t even admit they enjoyed it. Folks, let\'s get something straight -- you don\'t look cool when you badmouth this film. You look like an out of touch cynic.<br /><br />No movi

Answer:


'The movie Titanic is a polarizing film that has received mixed reviews. Some people praise it for its captivating love story, stunning special effects, and emotional impact, while others criticize it for its historical inaccuracies, cheesy dialogue, and shallow characters. Despite this, the film has been a massive commercial success and has left a lasting impact on popular culture.'

Dla przykładu filmu Titanic wystąpiło duzo próbek danych, które wspominają tę nazwę i przekazują opinię, dzięki czemu model był w stanie wygenerować spójną wypowiedź.

In [100]:
q = "What can be said about movie Shawshank Redemption?"

print(f"Context for '{q}':")
display(find_similar(q, k=15)["text"])

print("Answer:")

get_completion(q, 15).choices[0].message.content

Context for 'What can be said about movie Shawshank Redemption?':


['This was a disappointing film. The people seem to have no substance, the lead protagonist Martin Cahil has zero redemptive values, in fact everyone in it including Jon Voight epitomizes sleeze. I would not recommend this film to anyone. The violence is distasteful, though artfully done. The filming is to black, at least the print i saw fit this category. A disappointment.',
 "I am very surprised to see such a high rating for this film, and of the few reviews that there are to be positive. I saw the movie and was pretty dissapointed. I didn't find it very enjoyable at all. It was slow, and lacks the entertainment value. Even the murder scenes are lackluster, with real close-up shots of generic stabbings that don't look good at all. And the supposed great twist ending is really not much, I did see it coming, and then the ending just seemed cliche. This movie may not get much mention, but by the little that it does get, it is overrated. I would not recommend this movie.",
 'This film wa

Answer:


'The movie Shawshank Redemption cannot be judged based on the provided context as it is not mentioned or discussed in the reviews.'

Dla filmu Skazani na Shawshank, nie zostały znalezione żadne opinie, które wymieniają ten film z nazwy. Model w tej sytuacji słusznie zauważył, że z kontekstu nie wynika, o jakich filmach mowa.

In [101]:
q = "Which actors deserve praise for their role and in what movie?"

print(f"Context for '{q}':")
display(find_similar(q, k=15)["text"])

print("Answer:")

get_completion(q, 15).choices[0].message.content

Context for 'Which actors deserve praise for their role and in what movie?':


["I can't believe they got the actors and actresses of that caliber to do this movie. That's all I've got to say - the movie speaks for itself!!",
 "Nicole Kidman is a wonderful actress and here she's great. I really liked Ben Chaplin in The Thin Red Line and he is very good here too. This is not Great Cinema but I was most entertained. Given most films these days this is High Praise indeed.",
 'There is so much of worth in this movie that it is hard to know where to begin with praise. Let me begin by expressing my admiration for a perfect portrayal by Reese Witherspoon. That her performance stands out in the excellent cast is praise indeed. Robert Mulligan has seldom disappointed those of us who have admired his work. Every frame of The Man in the Moon is evidence of film making at its best.',
 'no comment - stupid movie, acting average or worse... screenplay - no sense at all... SKIP IT!',
 'Nominated for the oscar "worst script ever" in my opinion. There\'s no decent story, rediculo

Answer:


'Reese Witherspoon deserves praise for her role in "The Man in the Moon" and Nicole Kidman and Ben Chaplin deserve praise for their roles in the same movie.'

Przykład halucynacji: o ile Reese Witherspoon faktycznie wystąpiła w "The Man in the Moon" i w kontekście znajduje się pozytywna opinia jej aktorstwa, tak pozostali aktorzy występują w innych opiniach, gdzie nawet wspomniane są nazwy innych filmów.