In [1]:
from transformers import AutoTokenizer, AutoModel
import numpy as np
import os
import re
import uuid
import torch
import json
import PyPDF2


# RAG
**Step1: Create A vector Database.**
    - Parse the Content
    - Create Chunks
    - Create Embeddings
**Step2: Create A Retrevel.**

# -----------------------------STEP 1 --------------------------------------------------

### ResumeParser
- Create a **Function to Parse** all Resumes and return will be a dictionary. Key's of dictionary are Base path and Value's are the content of Resume.
    - Input: Directory
    - Define the directory of all Resumes papers.
    - Read all Resumes available in the Directory and store their Content in Variable name CV
    - Return: Dictionary

In [2]:
def ResumeParser(Directory):
    ResumeDict = {}
    for Candidate in os.listdir(Directory):
        PaperPath = os.path.join(Directory,Candidate)
        base = os.path.basename(PaperPath)
        if os.path.isfile(PaperPath):
            with open(PaperPath,'rb') as file:
                reader = PyPDF2.PdfReader(file)
                TotalPages = len(reader.pages)
                CV = ""
                for page_num in range(TotalPages):
                    page = reader.pages[page_num]
                    CV += page.extract_text()
            ResumeDict[base] = CV.strip()
    return ResumeDict

### Chunking
- Create a Function to create chunks using the Content of all resumes.
    - Create paragraphs using Desired Methods.
    - Create Words using seprator = " "
    - Create a list of chunks
    - Generate unique Id for each item of chunk list.
    - return all chunk dictionary.

In [3]:
def ChunkGeneration(Content,para_seperator, ChunkSize,tokenizer,sku, separator=" "):
    all_chunks = {}
    paragraphs = re.split(para_seperator, Content)
    for paragraph in paragraphs:
        words = paragraph.split(separator)
        current_chunk_str = ""
        chunk = []
        for word in words:
            new_chunk = current_chunk_str + separator + word
            new_chunk = new_chunk.strip()
            new_chunk_tokens = tokenizer.tokenize(new_chunk)
            if len(new_chunk_tokens) <= ChunkSize:
                current_chunk_str = new_chunk
            else:
                chunk.append(current_chunk_str.strip())
                current_chunk_str = ""
        if len(current_chunk_str.strip())>0:
            chunk.append(current_chunk_str.strip())
        for item in chunk:
            chunk_id = str(uuid.uuid4())
            all_chunks[chunk_id] = {"text": item, "metadata": {"file_name":sku}}
    return all_chunks

### ChunkPipeline
- Create a Function to Manage the Workflow.
    - Iterate over the Dictionary.
    - Create a SKU (Stock Keeping Unit) using Key (base) of item.
    - Generate a Unique ID for the each content using UUID.
    - return a new Dictionary.

In [4]:
def ChunkPipeline(Directory,ChunkSize,tokenizer):
    documents = {}
    ResumeDataBase = ResumeParser(Directory)
    for item in ResumeDataBase:
        sku = os.path.splitext(item)[0]
        doc_id = str(uuid.uuid4())
        ChunkData = ChunkGeneration(ResumeDataBase[item],"/n/n", ChunkSize,tokenizer,sku)
        documents[doc_id] = ChunkData
    return documents

## VectorDataBase
we need to obtain embeddings for each chunk’s text and map these embeddings to their corresponding chunk IDs and document IDs.
- Create a IndexerFunction which will iterate the DocID & Content of each document.

In [5]:
def Vectoraization(Chunks,tokenizer,model):
    MappedEmbeddings = {}
    for DocId, DocContent in Chunks.items():
        ChunkEmbeddings = {}
        for ChunkID,ChunkContent in DocContent.items():
            Content = ChunkContent["text"]
            inputs = tokenizer(Content, return_tensors="pt", padding=True, truncation=True)
            with torch.no_grad():
                embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze().tolist()
            ChunkEmbeddings[ChunkID] = embeddings
        MappedEmbeddings[DocId] = ChunkEmbeddings
    return MappedEmbeddings

# -----------------------------STEP 2 --------------------------------------------------

Retrieval: For retrieving the answer to the query we will be using Cosine Similarity.
- Create a function
- Input: VectorDatabase

In [6]:
def Retrieval(query, top_k,VectorDataBase,tokenizer,model):
    query_inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
    query_embeddings = model(**query_inputs).last_hidden_state.mean(dim=1).squeeze()
    query_embeddings=query_embeddings.tolist()
    query_embeddings=np.array(query_embeddings)
    scores = {}
    for doc_id, chunk_dict in VectorDataBase.items():
        for chunk_id, chunk_embeddings in chunk_dict.items():
            chunk_embeddings = np.array(chunk_embeddings) 
            normalized_query = np.linalg.norm(query_embeddings)
            normalized_chunk = np.linalg.norm(chunk_embeddings)
            if normalized_chunk == 0 or normalized_query == 0:
                score == 0
            else:
                score = np.dot(chunk_embeddings, query_embeddings)/ (normalized_chunk * normalized_query)
            scores[(doc_id, chunk_id )] = score
    sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_k]
    top_results=[]
    for ((doc_id, chunk_id), score) in sorted_scores:
        results = (doc_id, chunk_id, score)
        top_results.append(results)  
    return top_results

In [7]:
def ContextRetrieval(VectorDataBase,ContentTuple):
    Context = ""
    for item in ContentTuple:
        docId = item[0]
        ChunkId = item[1]
        Context = Context + "\n"+VectorDataBase[docId][ChunkId]['text']
    return Context.strip()

# -----------------------------STEP 3 --------------------------------------------------

- Output Generation Using LLM

In [8]:
def generate_llm_response(model,tokenizer,query, relavent_text):
    template = """
    You are an intelligent search engine. You will be provided with some retrieved context, as well as the users query.

    Your job is to understand the request, and answer based on the retrieved context.
    Here is context:

    <context>
    {context}
    </context>

    Question: {question}
    """
    template = template.format(context = relavent_text, question = query)
    inputs = tokenizer(template, return_tensors="pt")
    # Generate text based on the input prompt
    output = model.generate(inputs["input_ids"], max_length=1000,  num_return_sequences=1,  no_repeat_ngram_size=2,  do_sample=True,  temperature=0.7 )
    # Decode the generated tokens back into text
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text


## ModelLoding

In [9]:
model_name = "BAAI/bge-small-en-v1.5"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Inputs for Pipeline

In [10]:
Directory = r"D:\Coding\RAG\Documents"
ChunkSize = 100

In [11]:
query = """Passionate and results-oriented Generative AI Engineer with 2+ years of experience specializing in Natural Language Processing
 (NLP), Data Science, and Computer Vision. Leveraging a strong mathematical background and proven ability to develop
 innovative solutions, I contribute to cutting-edge projects at Collins Aerospace’s R&D team."""

In [12]:
Chunks = ChunkPipeline(Directory,ChunkSize,tokenizer)

In [13]:
VectorDataBase = Vectoraization(Chunks,tokenizer,model)

In [14]:
ContentDoc = Retrieval(query, 2,VectorDataBase,tokenizer,model)

In [15]:
Context = ContextRetrieval(Chunks,ContentDoc)

In [16]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)



In [17]:
generate_llm_response(model,tokenizer,query, Context)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


'\n    You are an intelligent search engine. You will be provided with some retrieved context, as well as the users query.\n\n    Your job is to understand the request, and answer based on the retrieved context.\n    Here is context:\n\n    <context>\n    projects at Collins Aerospace’s R&D team.\nEducation\n•Masters of Science in Mathematics and Computer Science from National Institute of Technology, Warangal with 7.1 GPA.\n•Bachelor of Science in Mathematics from University of Delhi with 7.767 GPA.\nExperience\nGenerative AI Engineer, Raytheon Technology - Collins Aerospace (R&D) – Hyd. July 2022 – Present\n•Applied prompt engineering on a LLM model to address 20% to 25% gaps caused\nLinux\nEngineering Skills: Generative-AI, NLP, AI-ML, Data Science, Computer Vision, Automation, DSA, OOPs, AR-VR\nSoft Skills: Collaborator, Learner, Team Player, Innovator, Networking, Multi-Tasking\nCertifications\n•Language Processing Specialization, NLP with Python for Machine Learning, Prompt Engin

## Important Steps of RAG
- Data Chunks using chunking strategies
- Data indexing in Vector Database using Indexing
- ReRanking based on querry using cross encoders.
- Generation using Retreived context and Querry

In [3]:
import Image
img = Image.open('RAG.png')

# Show the image
img.show()

ModuleNotFoundError: No module named 'Image'