# Retrieval Augmented Generation (Movie Plots Dataset)
### Models
- Embedding model (Retriever)
- Cross Encoder (Reranker)
- QA (Answer generation)

### Dataset
- [Kaggle: Wikipedia Movie Plots](https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots)
- Data already preprocessed
    - splitted into batches of 256 characters

In [22]:
!pip install transformers datasets faiss-gpu sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=c29845ca782f89011a8d5ff89d5286f6bdd08d9a35b64d38db8f87a4080f7f5b
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_tr

In [39]:
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, Pipeline, pipeline
import torch
import torch.nn.functional as F
from datasets import Dataset
from sentence_transformers import CrossEncoder
import pandas as pd
import numpy as np
from pathlib import Path
import os

# Retriever/ Embedding model

In [4]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [5]:
# Load model from HuggingFace Hub
retriever_id = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(retriever_id)
model = AutoModel.from_pretrained(retriever_id)


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)


class EmbeddingPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        return preprocess_kwargs, {}, {}


    def preprocess(self, text):
        encoded_text = self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')
        return encoded_text


    def _forward(self, model_inputs):
        outputs = self.model(**model_inputs)
        return {"outputs": outputs, "attention_mask": model_inputs["attention_mask"]}


    def postprocess(self, model_outputs):
        sentence_embeddings = mean_pooling(model_outputs["outputs"], model_outputs['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return sentence_embeddings[0].numpy()

retriever = EmbeddingPipeline(model=model, tokenizer=tokenizer, device=device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

# Dataset

In [6]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [7]:
# Long movie pltos splitted into chunks
df = pd.read_pickle("/content/drive/MyDrive/Movie_dataset.pkl")
df.head(3)

Unnamed: 0,text_batch,ref_id
0,"A bartender is working at a saloon, serving d...",0
1,"They assault the Irish man, pulling his hat ov...",0
2,The bartender then sprays seltzer water in Nat...,0


In [8]:
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['text_batch', 'ref_id'],
    num_rows: 372606
})

In [None]:
# embedd data
ds = ds.map(lambda row: {"embeddings": retriever(row["text_batch"])})
ds

In [14]:
dataset_path = Path("/content/drive/MyDrive/dataset")

import os

if not os.path.exists(dataset_path):
    os.makedirs(dataset_path)

# Save dataset with embeddings to disk
ds.save_to_disk(dataset_path)

if dataset_path.exists():
    ds = Dataset.load_from_disk(dataset_path)
ds

Dataset({
    features: ['text_batch', 'ref_id', 'embeddings'],
    num_rows: 372606
})

# Add faiss

In [15]:
ds.add_faiss_index(column="embeddings")

  0%|          | 0/373 [00:00<?, ?it/s]

Dataset({
    features: ['text_batch', 'ref_id', 'embeddings'],
    num_rows: 372606
})

## Searching on faiss

In [35]:
q = "Who created a magic ring to rule everything?"
q_embedd = retriever(q)
scores, retrieved_docs = ds.get_nearest_examples('embeddings', q_embedd, k=10)

In [36]:
retrieved_docs["text_batch"]

[' Early in the Second Age of Middle-earth, elven smiths forge nine Rings of Power for mortal men, seven for the Dwarf-Lords, and three for the Elf-Kings. Soon after, the Dark Lord Sauron makes the One Ring, and uses it to attempt to conquer Middle-earth.',
 ' In the Second Age of Middle-earth, the lords of Elves, Dwarves, and Men are given Rings of Power.',
 'Over time, Sauron captures the Nine Rings and transforms their owners into the Ringwraiths. The One Ring is discovered by Déagol, whose friend, Sméagol, kills him and takes the Ring for himself.',
 'Unbeknownst to them, the Dark Lord Sauron forges the One Ring in Mount Doom, infusing into it a great part of his power to dominate, through it and at a distance, the other Rings, so he might conquer Middle-earth.',
 ' The world consists of five elements: gold, wood, water, fire and earth. In the wizard world of "Magic to Win", the story also revolves around the "Five Element Wizardry", portraying a story that surpasses our imaginatio

In [18]:
scores

array([0.79417896, 0.8305794 , 0.9485778 , 0.9667423 , 0.9924834 ,
       1.0250964 , 1.0299743 , 1.032616  , 1.0587047 , 1.0645365 ],
      dtype=float32)

In [54]:
q = "What is the name of the boy who became a wizard?"
q_embedd = retriever(q)
scores, retrieved_docs = ds.get_nearest_examples('embeddings', q_embedd, k=10)

In [55]:
retrieved_docs["text_batch"]

['He also tells Harry of the latter\'s past; Harry is the orphaned son of two wizards who met their demise at the hands of Lord Voldemort, a malevolent, all-powerful wizard, by a Killing Curse, with Harry being the only survivor in the chaos thus, leading to his fame in the wizarding world as "The Boy Who Lived".',
 'He also tells Harry of the latter\'s past; Harry is the orphaned son of two wizards who met their demise at the hands of Lord Voldemort, a malevolent, all-powerful wizard, by a Killing Curse, with Harry being the only survivor in the chaos thus, leading to his fame in the wizarding world as "The Boy Who Lived".',
 'The party encounters the Wizard (J. Charles Haydon), who tricks Mombi by letting the group hide in the Red Wagon, pulled by the sawhorse; when Mombi attempts to follow them, the group escape out the back of the wagon.',
 'Evan follows Arthur to his home in a condemned theater, and is taken in by Maxwell "Wizard" Wallace (Robin Williams), a vagrant, arrogant, and

In [21]:
scores

array([0.8058087 , 0.8058087 , 0.86204576, 0.8865734 , 0.949302  ,
       0.9616364 , 0.96369815, 0.97125214, 0.9870838 , 0.987492  ],
      dtype=float32)

# (WIP) Searching and reranking

## Cross Encoder (Reranker)

Seems to mess up "good" retrieval

In [24]:
cross_encoder_id = "dangvantuan/CrossEncoder-camembert-large"
cross_encoder = CrossEncoder(cross_encoder_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/805 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/809k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

In [46]:
reranker_scores = []
for text in retrieved_docs["text_batch"]:
    _score = cross_encoder.predict([(q, text)])[0]
    reranker_scores.append(_score)
reranker_scores

[0.40997267,
 0.41701207,
 0.33070418,
 0.491077,
 0.4278202,
 0.5055611,
 0.41799185,
 0.31706372,
 0.28559807,
 0.1659543]

In [38]:
retrieved_docs["text_batch"][np.argmax(reranker_scores)]

'Gandalf, suspicious of the Ring, tells Frodo to keep it secret and to keep it safe. Gandalf then investigates the Ring, discovers its true identity, and returns to warn Frodo.'

## QA model

In [40]:
qa_id = "deepset/roberta-base-squad2"
qa_pipeline = pipeline('question-answering', model=qa_id, tokenizer=qa_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [89]:
def get_answer(query, top_k, rerank=False):
    # Embedd query
    q_embedd = retriever(query)
    # Search rlevant documents
    scores, docs = ds.get_nearest_examples('embeddings', q_embedd, k=top_k)
    texts = docs["text_batch"]

    # reranking
    if rerank:
        reranker_scores = []
        for text in texts:
            _score = cross_encoder.predict([(q, text)])[0]
            reranker_scores.append(_score)
        texts = [docs["text_batch"][i] for i in np.argsort(reranker_scores)[::-1]]

    answers = []
    # For retrieved documents extract answer
    for text in texts:
        qa_input = {
            "question": query,
            "context": text
        }

        res = qa_pipeline(qa_input)
        answers.append(res["answer"])
    return answers

In [60]:
get_answer("Who created a magic ring to rule everything?", 5)

['the Dark Lord Sauron', 'Elves', 'Déagol', 'Dark Lord Sauron', 'wizard']

In [81]:
get_answer("Who created a magic ring to rule everything?", 5, True)

['wizard', 'the Dark Lord Sauron', 'Déagol', 'Dark Lord Sauron', 'Elves']

In [90]:
get_answer("What is the name of the boy who became a wizard?", 5)

['Harry', 'Harry', 'J. Charles Haydon', 'Evan', '18 Hopi Native American, Boy']

In [91]:
get_answer("What is the name of the boy who became a wizard?", 5, True)



['Harry', 'Harry', 'J. Charles Haydon', '18 Hopi Native American, Boy', 'Evan']

In [92]:
get_answer("Who destroys the magic ring?", 5)

['Sméagol', "Ōba's men", 'A monkey', 'A monkey', 'Ravus']

In [97]:
get_answer("Who is woody from toy story jealous of?", 5)

['Buzz', 'Andy', 'Sheriff Woody', 'Andy', 'Andy']