# Retrieval Augmented Generation (Movie Plots Dataset)
### Models
- Embedding model (Retriever)
- Cross Encoder (Reranker)
- QA (Answer generation)

### Dataset
- [Kaggle: Wikipedia Movie Plots](https://www.kaggle.com/datasets/jrobischon/wikipedia-movie-plots)
- Data already preprocessed
    - split into batches of 256 characters

In [None]:
!pip install transformers datasets faiss-gpu

In [None]:
from transformers import AutoTokenizer, AutoModel, Pipeline
import torch
import torch.nn.functional as F
from datasets import Dataset
import pandas as pd
import numpy as np

# Retriever / Embedding model

In [None]:
# Pick the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [None]:
# Load the retriever's tokenizer and model from the HuggingFace Hub.
# Both are fetched by the same checkpoint id so they stay matched.
retriever_id = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(retriever_id)
model = AutoModel.from_pretrained(retriever_id)


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, counting only unmasked tokens.

    Args:
        model_output: Indexable model output; element 0 is used as the token
            embeddings (assumed shape (batch, seq_len, hidden) - TODO confirm).
        attention_mask: Mask with one entry per token; it gets a trailing axis
            and is expanded to the shape of the token embeddings.

    Returns:
        The mask-weighted sum of token embeddings divided by the mask sum,
        with the divisor clamped to at least 1e-9 so an all-zero mask does
        not divide by zero.
    """
    token_embeddings = model_output[0]
    # Broadcast the mask across the embedding axis so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
    summed = (token_embeddings * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / counts


class EmbeddingPipeline(Pipeline):
    """Pipeline that turns a text into an L2-normalised, mean-pooled embedding.

    Stages: tokenize -> run the model -> mean-pool with the attention mask ->
    normalise -> return the first row as a NumPy array.
    """

    def _sanitize_parameters(self, **kwargs):
        """Return empty kwargs for all three stages; call-time kwargs are not forwarded."""
        return {}, {}, {}

    def preprocess(self, text):
        """Tokenize ``text`` into padded, truncated PyTorch tensors."""
        return self.tokenizer(text, padding=True, truncation=True, return_tensors='pt')

    def _forward(self, model_inputs):
        """Run the model and keep the attention mask, which pooling needs later."""
        return {
            "outputs": self.model(**model_inputs),
            "attention_mask": model_inputs["attention_mask"],
        }

    def postprocess(self, model_outputs):
        """Mean-pool, L2-normalise along dim 1, and return row 0 as a NumPy array.

        NOTE(review): ``.numpy()`` assumes the tensors are on the CPU and carry no
        gradient by the time this stage runs - confirm against the base Pipeline.
        """
        pooled = mean_pooling(model_outputs["outputs"], model_outputs['attention_mask'])
        unit_norm = F.normalize(pooled, p=2, dim=1)
        first_row = unit_norm[0]
        return first_row.numpy()

# Instantiate the retriever pipeline with the model/tokenizer loaded above on the selected device.
retriever = EmbeddingPipeline(model=model, tokenizer=tokenizer, device=device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

# Dataset

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in this notebook live under it.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Long movie plots split into chunks
# NOTE(review): unpickling can execute arbitrary code; only load this file if it
# comes from a trusted source.
df = pd.read_pickle("/content/drive/MyDrive/Movie_dataset.pkl")
df.head(3)

Unnamed: 0,text_batch,ref_id
0,"A bartender is working at a saloon, serving d...",0
1,"They assault the Irish man, pulling his hat ov...",0
2,The bartender then sprays seltzer water in Nat...,0


In [None]:
# Wrap the DataFrame in a `datasets.Dataset` (needed for .map and the FAISS index below).
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['text_batch', 'ref_id'],
    num_rows: 372606
})

In [None]:
# Embed every row's text chunk and store the vector in a new "embeddings" column.
def _embed_row(row):
    """Return the retriever embedding for one dataset row's ``text_batch``."""
    return {"embeddings": retriever(row["text_batch"])}

ds = ds.map(_embed_row)
ds

Map:   0%|          | 0/372606 [00:00<?, ? examples/s]

Dataset({
    features: ['text_batch', 'ref_id', 'embeddings'],
    num_rows: 372606
})

In [None]:
import os

dataset_path = "/content/drive/MyDrive/dataset"
# exist_ok=True creates the directory only when it is missing, replacing the
# separate exists() check, which could race with something else creating it.
os.makedirs(dataset_path, exist_ok=True)

# Save dataset with embeddings to disk
ds.save_to_disk(dataset_path)

Saving the dataset (0/3 shards):   0%|          | 0/372606 [00:00<?, ? examples/s]

# Add FAISS index

In [None]:
# Build a FAISS index over the "embeddings" column so nearest-neighbour lookups can run against it.
# NOTE(review): no metric is specified here; the ascending scores printed further
# down look like distances (lower = closer) - confirm the index default.
ds.add_faiss_index(column="embeddings")

  0%|          | 0/373 [00:00<?, ?it/s]

Dataset({
    features: ['text_batch', 'ref_id', 'embeddings'],
    num_rows: 372606
})

In [None]:
# Example query: embed the question with the same retriever used for the passages,
# then fetch the 10 nearest rows from the FAISS index on "embeddings".
q = "Who created a magic ring to rule everything?"
q_embedd = retriever(q)
scores, retrieved_docs = ds.get_nearest_examples('embeddings', q_embedd, k=10)



In [None]:
# Text chunks of the retrieved rows.
retrieved_docs["text_batch"]

[' Early in the Second Age of Middle-earth, elven smiths forge nine Rings of Power for mortal men, seven for the Dwarf-Lords, and three for the Elf-Kings. Soon after, the Dark Lord Sauron makes the One Ring, and uses it to attempt to conquer Middle-earth.',
 ' In the Second Age of Middle-earth, the lords of Elves, Dwarves, and Men are given Rings of Power.',
 'Over time, Sauron captures the Nine Rings and transforms their owners into the Ringwraiths. The One Ring is discovered by Déagol, whose friend, Sméagol, kills him and takes the Ring for himself.',
 'Unbeknownst to them, the Dark Lord Sauron forges the One Ring in Mount Doom, infusing into it a great part of his power to dominate, through it and at a distance, the other Rings, so he might conquer Middle-earth.',
 ' The world consists of five elements: gold, wood, water, fire and earth. In the wizard world of "Magic to Win", the story also revolves around the "Five Element Wizardry", portraying a story that surpasses our imaginatio

In [None]:
# Scores returned by get_nearest_examples for the k=10 hits.
scores

array([0.79417896, 0.8305794 , 0.9485778 , 0.9667423 , 0.9924834 ,
       1.0250964 , 1.0299743 , 1.032616  , 1.0587047 , 1.0645365 ],
      dtype=float32)

# (WIP) Searching and reranking

In [None]:
# Checkpoint id for the reranking cross-encoder (not loaded in the visible cells yet).
# NOTE(review): CamemBERT checkpoints are French-language, while the passages
# retrieved above are English - confirm this is the intended reranker.
cross_encoder_id = "dangvantuan/CrossEncoder-camembert-large"

In [None]:
# Checkpoint id for the answer-generation (QA) model (not loaded in the visible cells yet).
qa_id = "deepset/roberta-base-squad2"