# DeBERTa + Wikipedia RAG
### The input data for this notebook comes from the [Kaggle - LLM Science Exam](https://www.kaggle.com/competitions/kaggle-llm-science-exam) competition


In [None]:
# Installing offline dependencies
# Both wheels are installed from local paths under /kaggle/input, so no network
# access is needed. `--no-deps` stops pip from trying to resolve/download their
# dependencies; `-U` upgrades a package that is already present in the image.
# NOTE(review): the dataset folder is called "faiss-gpu-173" but the wheel inside it is faiss-gpu 1.7.2.
!pip install -U --no-deps /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -U --no-deps /kaggle/input/datasets-214/datasets-2.14.5-py3-none-any.whl

Processing /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2
Processing /kaggle/input/datasets-214/datasets-2.14.5-py3-none-any.whl
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.1.0
    Uninstalling datasets-2.1.0:
      Successfully uninstalled datasets-2.1.0
Successfully installed datasets-2.14.5


## Create FAISS index
### This index contains embeddings of wiki text

In [None]:
import collections
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

import faiss
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_from_disk
from torch.nn.utils.rnn import pack_sequence
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel, AutoModelForMultipleChoice
from transformers import Trainer, TrainingArguments
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy

# Configuration for building the Wikipedia embedding index.
# Wiki data URL: https://www.kaggle.com/datasets/eashish/270k-llm-wiki
wikipedia_path = Path("/kaggle/input/270k-llm-wiki")
batch_size = 128   # passages embedded per forward pass
max_length = 512   # every passage is truncated/padded to this many tokens
checkpoint = 'BAAI/bge-base-en-v1.5'
# Width of the vectors stored in the FAISS index. This is the single definition:
# a stale `embedding_size = 384` used to precede it and was overwritten before
# ever being read, so it has been removed.
embedding_size = 768

# Load tokenizer and model (model is moved to the GPU and cast to fp16)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint).cuda().half()

def transform(batch):
    """Tokenise a batch of passages on the fly and move the tensors to the GPU.

    Installed on the dataset with `set_transform`, so it receives a dict with a
    "text" list. Depending on the module-level `checkpoint`, a model-specific
    prefix is prepended to every text (the "text" entry of `batch` is replaced
    in place). Texts are truncated and padded to `max_length` tokens.

    Returns the tokenizer output moved to "cuda".
    """
    # NOTE(review): for BAAI checkpoints the *query* instruction is prepended to
    # passages here - confirm against the BGE usage notes that this is intended.
    if 'BAAI' in checkpoint:
        prefix = "Represent this sentence for searching relevant passages: "
    elif checkpoint == "intfloat/e5-small-v2":
        prefix = "passage: "
    else:
        prefix = None

    if prefix is not None:
        batch["text"] = [prefix + text for text in batch["text"]]

    encoded = tokenizer(
        batch["text"],
        truncation=True,
        padding='max_length',
        return_tensors="pt",
        max_length=max_length,
    )
    return encoded.to("cuda")

# Create faiss index. Vectors are added in dataset order, so position i in the
# index corresponds to row i of the wikipedia dataset (the row number, not an "id" field).
faiss_index = faiss.IndexFlatL2(embedding_size)

# Create dataset and dataloader; `transform` tokenises lazily when rows are fetched
dataset = load_from_disk(wikipedia_path)
dataset.set_transform(transform)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Compute embeddings. They are staged in a float16 buffer and converted to
# float32 only when handed to faiss.
# NOTE(review): `pooler_output` is used as the sentence vector - confirm this is
# the pooling the checkpoint expects.
outputs = np.zeros((len(dataset), embedding_size), dtype=np.float16)
with torch.inference_mode():
    for i, batch in tqdm(enumerate(dataloader), leave=False, total=len(dataloader)):
        embeddings = model(**batch).pooler_output
        embeddings = F.normalize(embeddings, p=2, dim=1)
        # The final batch may be short; the slice is clipped to len(dataset) to match.
        outputs[batch_size*i:batch_size*(i+1)] = embeddings.detach().cpu().numpy()

# Add embeddings to faiss index (same row order as the wikipedia dataset)
faiss_index.add(outputs.astype(np.float32))

# /kaggle/input is mounted read-only, so the index cannot be written next to the
# source dataset; persist it in the writable working directory instead.
index_dir = Path("/kaggle/working")
faiss.write_index(faiss_index, str(index_dir / f"faiss_index_{checkpoint.split('/')[-1]}.index"))

In [None]:
import gc
import logging
from time import time
from concurrent.futures import ThreadPoolExecutor
import ctypes
from functools import partial
import pandas as pd

# For RAG
import faiss
from torch.utils.data import DataLoader
from datasets import Dataset

# Number of nearest neighbours requested from the faiss index for each question.
NUM_TITLES = 5
# Maximum token length used when tokenising text in SentenceTransformer.transform.
MAX_SEQ_LEN = 512
# Directory passed to SentenceTransformer as the checkpoint and from which
# `faiss.index` is read.
# NOTE(review): the index-building cell above writes `faiss_index_<checkpoint>.index`
# for a bge-base checkpoint, not `faiss.index` in this folder - confirm how this dataset was produced.
MODEL_PATH = "/kaggle/input/bge-small-faiss/"

# For LLM
from transformers import AutoConfig, AutoModelForCausalLM
from accelerate import init_empty_weights
from accelerate.utils.modeling import set_module_tensor_to_device
from safetensors.torch import load_file

# NOTE(review): none of these three constants is referenced anywhere else in the
# visible notebook; they look like leftovers from an LLM-based variant - confirm before relying on them.
N_BATCHES = 5
MAX_CONTEXT = 2750
MAX_LENGTH = 4096



In [None]:
# Helper that releases as much RAM and vRAM as possible
def clean_memory():
    """Free host and GPU memory that is no longer referenced.

    Runs the Python garbage collector, asks glibc (loaded as "libc.so.6") to
    hand freed heap pages back to the OS via `malloc_trim(0)`, and empties
    PyTorch's CUDA caching allocator. Returns None.
    """
    libc = ctypes.CDLL("libc.so.6")
    gc.collect()
    libc.malloc_trim(0)
    torch.cuda.empty_cache()

# Read the competition test questions, indexed by their `id` column
df = pd.read_csv("/kaggle/input/kaggle-llm-science-exam/test.csv", index_col="id")
# A frame of exactly 200 rows is treated as "not the real test set"
# (presumably the public sample file - verify); retrieval is skipped in that case.
IS_TEST_SET = df.shape[0] != 200

## 1. Wikipedia Retrieval Augmented Generation (RAG)

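We use the [bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) embedding model to embed the Wikipedia dataset and the exam questions. (Note that the index-building cell above is currently configured with `BAAI/bge-base-en-v1.5`.)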

In [None]:
# New SentenceTransformer class similar to the one used in @Mgöksu notebook but relying on the transformers library only

class SentenceTransformer:
    """Small sentence encoder that depends only on `transformers`.

    Loads an `AutoModel` (cast to fp16) and its tokenizer from `checkpoint`,
    and exposes `encode` to turn a list of sentences into L2-normalised
    embeddings taken from the model's `pooler_output`.
    """

    def __init__(self, checkpoint, device="cuda:0"):
        """Load model and tokenizer from `checkpoint` and place the model on `device`."""
        self.device = device
        self.checkpoint = checkpoint
        self.model = AutoModel.from_pretrained(checkpoint).to(self.device).half()
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def transform(self, batch):
        """Tokenise `batch["text"]` (truncated to MAX_SEQ_LEN, padded to the longest item) and move it to the device."""
        encoded = self.tokenizer(
            batch["text"],
            truncation=True,
            padding=True,
            return_tensors="pt",
            max_length=MAX_SEQ_LEN,
        )
        return encoded.to(self.device)

    def get_dataloader(self, sentences, batch_size=32):
        """Wrap `sentences` (each prefixed with the retrieval instruction) in an unshuffled DataLoader."""
        instruction = "Represent this sentence for searching relevant passages: "
        prefixed = [instruction + sentence for sentence in sentences]
        text_dataset = Dataset.from_dict({"text": prefixed})
        # Tokenisation happens lazily, when the loader fetches rows.
        text_dataset.set_transform(self.transform)
        return DataLoader(text_dataset, batch_size=batch_size, shuffle=False)

    def encode(self, sentences, show_progress_bar=False, batch_size=32):
        """Embed `sentences` and return one numpy array of unit-length row vectors, in input order."""
        batches = self.get_dataloader(sentences, batch_size=batch_size)
        if show_progress_bar:
            batches = tqdm(batches)

        chunks = []
        with torch.no_grad():
            for batch in batches:
                pooled = self.model(**batch).pooler_output
                unit_vectors = F.normalize(pooled, p=2, dim=1)
                chunks.append(unit_vectors.detach().cpu().numpy())
        return np.concatenate(chunks, axis=0)

In [None]:
# Attach a retrieved Wikipedia "context" string to every question in `df`.
if IS_TEST_SET:
    # Load embedding model
    start = time()
    print(f"Starting prompt embedding, t={time() - start :.1f}s")
    model = SentenceTransformer(MODEL_PATH, device="cuda:0")

    # Get embeddings of prompts (question text concatenated with all five options)
    f = lambda row : " ".join([row["prompt"], row["A"], row["B"], row["C"], row["D"], row["E"]])
    inputs = df.apply(f, axis=1).values # better results than prompt only
    prompt_embeddings = model.encode(inputs, show_progress_bar=False)

    # Search closest sentences in the wikipedia index
    print(f"Loading faiss index, t={time() - start :.1f}s")
    faiss_index = faiss.read_index(MODEL_PATH + '/faiss.index')
    # faiss_index = faiss.index_cpu_to_all_gpus(faiss_index) # causes OOM, and not that long on CPU

    print(f"Starting text search, t={time() - start :.1f}s")
    # search() returns (distances, ids); only the ids of the NUM_TITLES neighbours are kept
    search_index = faiss_index.search(np.float32(prompt_embeddings), NUM_TITLES)[1]

    print(f"Starting context extraction, t={time() - start :.1f}s")
    dataset = load_from_disk("/kaggle/input/all-paraphs-parsed-expanded")
    # Build the whole column at once and assign it positionally. The previous
    # `df.loc[i, "context"]` used the loop counter as an index *label*, which is
    # only correct when the `id` index happens to be exactly 0..n-1.
    df["context"] = [
        "-" + "\n-".join([dataset[int(j)]["text"] for j in neighbours])
        for neighbours in search_index
    ]

    # Free memory
    faiss_index.reset()
    del faiss_index, prompt_embeddings, model, dataset
    clean_memory()
    print(f"Context added, t={time() - start :.1f}s")
else:
    # Retrieval is skipped, but later cells read df["context"] unconditionally;
    # provide an empty context so they do not fail with a KeyError.
    df["context"] = ""

## Multiple ways to perform answer selection from the context obtained above via RAG.

1. Use one or more models for inference directly on the context obtained above.
2. Fine-tune a model offline on the training data as well as external data, and use it for inference.
3. Fine-tune via PEFT techniques.

In [None]:
# Prepare the test frame for the multiple-choice model.
# `test_df` is an alias of `df` (no copy is made), so every change below also
# shows up on `df`.
test_df = df
# Replace the existing index with plain 0..n-1 integer labels and expose the
# same numbers as an explicit `id` column (used later for the submission file).
test_df.index = list(range(len(test_df)))
test_df['id'] = list(range(len(test_df)))
# New prompt = first 1750 characters of the retrieved context + " #### " + original question.
# NOTE: this overwrites `prompt`, so re-running the cell prepends the context again.
test_df["prompt"] = test_df["context"].apply(lambda x: x[:1750]) + " #### " +  test_df["prompt"]
# Placeholder answer: `preprocess` looks up example['answer'], so the column must exist.
test_df['answer'] = 'A'

In [None]:
# Competition training set, without its `id` column
df_train = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')
df_train = df_train.drop(columns="id")

# Append the two external question sets
df_train = pd.concat([
    df_train,
    pd.read_csv('/kaggle/input/additional-train-data-for-llm-science-exam/extra_train_set.csv'),
    pd.read_csv('/kaggle/input/llm-mcq-dataset/100_examples.csv')
])
# `drop_duplicates()` returns a new frame, so the result must be assigned back.
# The previous `...drop_duplicates().reset_index(inplace=True, drop=True)` modified
# that temporary frame and discarded it, leaving duplicates (and the repeated
# index labels produced by concat) in `df_train`.
df_train = df_train.drop_duplicates().reset_index(drop=True)

In [None]:
# Answer letter -> class id (A=0 ... E=4) and the reverse lookup
option_to_index = dict(zip('ABCDE', range(5)))
index_to_option = dict(enumerate('ABCDE'))

def preprocess(example):
    """Tokenise one question as five (prompt, option) pairs.

    `example` must provide 'prompt', the options 'A'..'E' and 'answer'.
    The prompt is paired with each option and passed to the module-level
    `tokenizer` without truncation. The tokenizer output is returned with an
    extra 'label' entry holding the class id of the answer letter.
    """
    prompts = [example['prompt'] for _ in range(5)]
    choices = [example[letter] for letter in 'ABCDE']

    encoded = tokenizer(prompts, choices, truncation=False)
    encoded['label'] = option_to_index[example['answer']]
    return encoded

In [None]:
@dataclass
class DataCollatorForMultipleChoice:
    """Collate multiple-choice examples into padded tensors.

    Each incoming feature holds one list entry per answer choice for every
    tokenizer field, plus a 'label' (or 'labels') value. Calling the collator
    returns a dict of tensors viewed as (batch, num_choices, -1) and a
    'labels' int64 tensor.
    """
    # Tokenizer whose `pad` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_key = 'label' if 'label' in features[0].keys() else 'labels'
        # pop() removes the label from each feature dict (the inputs are modified)
        # so that only tokenizer fields are handed to `pad`.
        labels = [feature.pop(label_key) for feature in features]
        n_examples = len(features)
        n_choices = len(features[0]['input_ids'])

        # One dict per (example, choice) pair, example-major order.
        per_choice_rows = []
        for feature in features:
            for choice in range(n_choices):
                per_choice_rows.append({key: values[choice] for key, values in feature.items()})

        padded = self.tokenizer.pad(
            per_choice_rows,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (example, choice) structure that was flattened for padding.
        batch = {key: tensor.view(n_examples, n_choices, -1) for key, tensor in padded.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

### Let's fine-tune the DeBERTa model on the training data

In [None]:
# Weights and tokenizer for the multiple-choice model
model_dir = '/kaggle/input/deberta-v3-large-hf-weights'
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Tokenise the training questions; the raw text columns are dropped afterwards
dataset = Dataset.from_pandas(df_train)
tokenized_dataset = dataset.map(
    preprocess,
    remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
)

# Checkpoints go to the current directory; external experiment logging is disabled
training_args = TrainingArguments(
    output_dir='.',
    report_to='none',
    num_train_epochs=3,
    learning_rate=5e-6,
    warmup_ratio=0.8,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
)

model = AutoModelForMultipleChoice.from_pretrained(model_dir).cuda()
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

trainer.train()

In [None]:
# Tokenise the test questions with the same `preprocess` used for training.
# `preserve_index=False` keeps the pandas index out of the dataset, so no
# `__index_level_0__` column is ever created. Removing that column by name
# afterwards only works while the index is not a RangeIndex, and raises otherwise.
tokenized_test_dataset = Dataset.from_pandas(
    test_df.drop(columns=['id']),
    preserve_index=False,
).map(preprocess, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E'])

### Get predictions on the test set

In [None]:
# Score every option of every test question, then rank options best-first
# (negating the scores makes argsort order them from highest to lowest).
test_predictions = trainer.predict(tokenized_test_dataset).predictions
predictions_as_ids = np.argsort(-test_predictions, axis=1)

# Map class ids back to letters and keep the three best letters per question
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]
predictions_as_string = [
    ' '.join(top_three) for top_three in predictions_as_answer_letters[:, :3]
]
test_df['prediction'] = predictions_as_string

# Submission file: one row per question with its id and space-separated top-3 letters
submission = test_df[['id', 'prediction']]
submission.to_csv('submission.csv', index=False)