<a href="https://colab.research.google.com/github/DS-VERMA-S/KaggleProjects/blob/main/RAG_Solutions/rag-pipeline-for-the-hf-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG Framework using hugginface dataset

In [1]:
!uv pip install sentence-transformers faiss-cpu autoawq==0.2.9 --quiet

In [2]:
# %%writefile rag-pipeline-hf-lib-dataset.py

#!uv pip install sentence-transformers faiss-cpu autoawq==0.2.9 --quiet

import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.special import expit

import torch
import faiss
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def q_retrival_with_corss_encoder(query, model, index, cross_encoder_model, top_k=10):
    q_embedding = model.encode([query])
    distances, indexes = index.search(q_embedding, top_k)
    retrived_document = [unpack_list[i] for i in indexes[0]]
    output = cross_encoder_reranker(query, retrived_document, cross_encoder_model)
    return output


def cross_encoder_reranker(query, retrived_document, cross_encoder_model):
    data_prep = [(query, retrived) for retrived in retrived_document]
    output_socre = cross_encoder_model.predict(data_prep)
    softmax = [round(x, 2) for x in expit(output_socre)/(1 + (expit(output_socre)))]

    document_prob_pair = {'query': retrived_document,
                          'prob': softmax}
    reranked_final_documents = [x for x, y in zip(list(document_prob_pair.values())[0], list(document_prob_pair.values())[1]) if y > 0.1]
    return "\n".join(reranked_final_documents)

## 3. Preparing the prompt for the each query and context fetched by retriver.

def prepare_prompt(question, context):

    prompt_template =  f"""You are a medical professional with knowledge of medical termilogy and field.

    for a given question

    {question}

    with the context provided below:

    {context}

    now based on this give you answer below

    """.format(question, context)

    return prompt_template


def inference_in_batches(df, batch_size=16):

    text_generator = pipeline('text-generation', model='mistralai/Mistral-7B-Instruct-v0.3')

    results = []
    for i in tqdm(range(0, len(df), batch_size)):

        prompts_batch = df['prompt'].iloc[i : i + 16].tolist()
        otuput_batch = text_generator(prompts_batch, batch_size=batch_size,
                                      max_new_tokens=1000, return_full_text=False,
                                      num_return_sequences=1)
        results.extend([o[0]["generated_text"].strip() for o in otuput_batch])

    df['ai_final_answer'] = results

    return df


## download data from the HF
dataset = load_dataset("pubmed_qa", "pqa_labeled")
df = pd.DataFrame(dataset['train'])


cross_encoder_model = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L2-v2")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

list_of_list = [x['contexts'] for x in df["context"]]
unpack_list = [x for sublist in list_of_list for x in sublist]

context_embeddings = embedding_model.encode(unpack_list, show_progress_bar=True)

index = faiss.IndexFlatL2(context_embeddings.shape[1])
index.add(np.array(context_embeddings))

df['retrived_documents'] = df.question.apply(lambda x: q_retrival_with_corss_encoder(x, embedding_model,
                                                                    index, cross_encoder_model))

df['prompt'] = df.apply(lambda x: prepare_prompt(x['question'], x['retrived_documents']), axis=1)

# df.to_csv("fina_output.csv", index=False)

README.md: 0.00B [00:00, ?B/s]

pqa_labeled/train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/17.6M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/105 [00:00<?, ?it/s]

In [3]:
df.head(2)

Unnamed: 0,pubid,question,context,long_answer,final_decision,retrived_documents,prompt
0,21645374,Do mitochondria play a role in remodelling lac...,{'contexts': ['Programmed cell death (PCD) is ...,Results depicted mitochondrial dynamics in viv...,yes,Programmed cell death (PCD) is the regulated d...,You are a medical professional with knowledge ...
1,16418930,Landolt C and snellen e acuity: differences in...,{'contexts': ['Assessment of visual acuity dep...,"Using the charts described, there was only a s...",no,Differences between Landolt C acuity (LR) and ...,You are a medical professional with knowledge ...


In [4]:
def model_setup(model_id="mistralai/Mistral-7B-Instruct-v0.3"):

  tokenizer = AutoTokenizer.from_pretrained(model_id)
  model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.float16,  # use half precision
            load_in_4bit=False           # <-- critical: 4-bit quantization
        )
  pipe = pipeline("text-generation", model=model, tokenizer=tokenizer,
                  torch_dtype=torch.float16, device_map="auto")

  if pipe.tokenizer.pad_token is None:
    pipe.tokenizer.pad_token = pipe.tokenizer.eos_token
    pipe.model.config.pad_token_id = pipe.tokenizer.eos_token_id


  return pipe


def inference_in_batches(df, model_id="mistralai/Mistral-7B-Instruct-v0.3", batch_size=8):  # start very small
    results = []

    model_pipeline = model_setup(model_id)
    for i in tqdm(range(0, len(df), batch_size)):
        prompts_batch = df['prompt'].iloc[i : i + batch_size].tolist()
        outputs = model_pipeline(
            prompts_batch,
            batch_size=batch_size,
            max_new_tokens=512,  # don't use 1000 on 7B model
            return_full_text=False,
            num_return_sequences=1
        )
        results.extend([o[0]["generated_text"].strip() for o in outputs])
        torch.cuda.empty_cache()
    df['ai_final_answer'] = results

    return df


In [5]:
df1 = inference_in_batches(df)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0
  0%|          | 0/3 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 33%|███▎      | 1/3 [01:31<03:02, 91.09s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 67%|██████▋   | 2/3 [02:43<01:20, 80.33s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
100%|██████████| 3/3 [03:31<00:00, 70.44s/it]

CPU times: user 4min 37s, sys: 1min 5s, total: 5min 42s
Wall time: 19min 48s



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ai_final_answer'] = results


In [6]:
df1

Unnamed: 0,pubid,question,context,long_answer,final_decision,retrived_documents,prompt,ai_final_answer
0,21645374,Do mitochondria play a role in remodelling lac...,{'contexts': ['Programmed cell death (PCD) is ...,Results depicted mitochondrial dynamics in viv...,yes,Programmed cell death (PCD) is the regulated d...,You are a medical professional with knowledge ...,"Yes, mitochondria play a role in remodeling la..."
1,16418930,Landolt C and snellen e acuity: differences in...,{'contexts': ['Assessment of visual acuity dep...,"Using the charts described, there was only a s...",no,Differences between Landolt C acuity (LR) and ...,You are a medical professional with knowledge ...,"Based on the provided information, Landolt C a..."
2,9488747,"Syncope during bathing in infants, a pediatric...",{'contexts': ['Apparent life-threatening event...,"""Aquagenic maladies"" could be a pediatric form...",yes,,You are a medical professional with knowledge ...,1. Syncope is a sudden loss of consciousness a...
3,17208539,Are the long-term results of the transanal pul...,{'contexts': ['The transanal endorectal pull-t...,Our long-term study showed significantly bette...,no,The transanal endorectal pull-through (TERPT) ...,You are a medical professional with knowledge ...,"Based on the provided context, this study appe..."
4,10808977,Can tailored interventions increase mammograph...,{'contexts': ['Telephone counseling and tailor...,The effects of the intervention were most pron...,yes,"Compared to usual care alone, telephone counse...",You are a medical professional with knowledge ...,"Yes, tailored interventions can potentially in..."
5,23831910,Double balloon enteroscopy: is it efficacious ...,{'contexts': ['From March 2007 to January 2011...,DBE appears to be equally safe and effective w...,yes,There are few data concerning emergency double...,You are a medical professional with knowledge ...,"Based on the information provided, the double ..."
