# Opensource LLM based RAG for information retrieval from 100 research papers

Overview of approach:
*   Different opensource light weight LLMs for each task. Embedding creation, Summerization of retrieved chunks and Q&A
*   Fassai for indexing (IndexIVFFlat trained)
*   Ngrok for serving flask through colab

Discussion on improvement (Approach not taken due to unavailability of free gpu credits):


* Text extraction using LLM to retain table and chart info better

* Finetuning on Research papers

* More advanced LLM can also be utilized which can handle all the three task together.

* Prompting can be leveraged to further improve the output.

* Memory chain, aggregation of reasoning, pdf level llm agents are few other techniques that can be leveraged to improve the performance and experience.

How to test code:

* open colab in your account.
* upload extracted pdf file directory and update path.
* change runtime type and set T4 GPU.
* execute all cell then at end you will get ngrok tunnel like "https://*********.ngrok-free.app".
* open it and use chat.

NOTE: Retrieval code is retrieving relevant chunks from documents. Q&A needs some work for consistant result generation.

In [1]:
!pip install PyMuPDF faiss-gpu langchain langchain-community sentence-transformers

Collecting PyMuPDF
  Downloading PyMuPDF-1.24.4-cp310-none-manylinux2014_x86_64.whl (3.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain
  Downloading langchain-0.2.1-py3-none-any.whl (973 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m973.5/973.5 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-community
  Downloading langchain_community-0.2.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K   

In [2]:
!pip install Flask==3.0.0 pyngrok==7.1.2


Collecting Flask==3.0.0
  Downloading flask-3.0.0-py3-none-any.whl (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.7/99.7 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyngrok==7.1.2
  Downloading pyngrok-7.1.2-py3-none-any.whl (22 kB)
Collecting blinker>=1.6.2 (from Flask==3.0.0)
  Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB)
Installing collected packages: pyngrok, blinker, Flask
  Attempting uninstall: blinker
    Found existing installation: blinker 1.4
[31mERROR: Cannot uninstall 'blinker'. It is a distutils installed project and thus we cannot accurately determine which files belong to it which would lead to only a partial uninstall.[0m[31m
[0m

In [4]:
import os
import fitz  # PyMuPDF
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import pickle
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

os.makedirs('/content/pdfs', exist_ok=True)

pdf_dir = '/content/drive/MyDrive/Colab Notebooks/llm_papers'

# Extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            text += page.get_text()
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None

# Chunk text into manageable sizes and include the PDF name
def chunk_text(text, pdf_name, chunk_size=300, overlap=150):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size - overlap):
        chunk = ' '.join(words[i:i + chunk_size])
        chunks.append((chunk, pdf_name))
    return chunks

# Directory containing PDF files
pdf_texts = []
pdf_chunks = []

for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith('.pdf'):
        pdf_path = os.path.join(pdf_dir, pdf_file)
        text = extract_text_from_pdf(pdf_path)
        if text:
            chunks = chunk_text(text, pdf_file)
            if chunks:
                pdf_chunks.extend(chunks)

print(f"Total number of chunks: {len(pdf_chunks)}")

# model for creating embeddings
embedding_model = SentenceTransformer('intfloat/e5-large-v2')

def get_embedding(text_chunk):
    return embedding_model.encode(text_chunk, convert_to_numpy=True)

def batch_process_chunks(chunks, batch_size=128):
    all_embeddings = []
    for i in range(0, len(chunks), batch_size):
        batch_chunks = [chunk[0] for chunk in chunks[i:i + batch_size]]
        batch_embeddings = embedding_model.encode(batch_chunks, convert_to_numpy=True)
        all_embeddings.extend(batch_embeddings)
    return all_embeddings

embeddings_file = 'chunk_embeddings_with_metadata.pkl'
if os.path.exists(embeddings_file):
    with open(embeddings_file, 'rb') as f:
        chunk_embeddings, pdf_chunk_data = pickle.load(f)
    print("Loaded embeddings from file.")
else:
    chunk_embeddings = batch_process_chunks(pdf_chunks)
    pdf_chunk_data = pdf_chunks
    with open(embeddings_file, 'wb') as f:
        pickle.dump((chunk_embeddings, pdf_chunk_data), f)
    print("Generated and saved embeddings to file.")

print(f"Total number of embeddings: {len(chunk_embeddings)}")


Error reading /content/drive/MyDrive/Colab Notebooks/llm_papers/2306.09339v1.pdf: Failed to open file '/content/drive/MyDrive/Colab Notebooks/llm_papers/2306.09339v1.pdf'.
Total number of chunks: 6511


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Generated and saved embeddings to file.
Total number of embeddings: 6511


In [5]:
dimension = chunk_embeddings[0].shape[0]
index = faiss.IndexIVFFlat(faiss.IndexFlatL2(dimension), dimension, 100) # 100 clusters
# Train the index
index.train(np.array(chunk_embeddings))
# Add vectors to the index
index.add(np.array(chunk_embeddings))
faiss.write_index(index, 'faiss_index.index')


In [6]:
# model for summarization
model_name = "t5-large"
summarization_tokenizer = T5Tokenizer.from_pretrained(model_name)
summarization_model = T5ForConditionalGeneration.from_pretrained(model_name)
summarization_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# model for QA
qa_model_name = "sjrhuschlee/bart-base-squad2"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/348 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1'}. The number of labels wil be overwritten to 2.


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

BartForQuestionAnswering(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNor

In [11]:

def summarize_documents(documents):
    summaries = []
    for doc in documents:
        inputs = summarization_tokenizer.encode("summarize: " + doc, return_tensors="pt", max_length=512, truncation=True).to(device)
        outputs = summarization_model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = summarization_tokenizer.decode(outputs[0])
        summaries.append(summary)
    return " ".join(summaries)

def rag(question, retrieved_documents, max_input_length=512, max_output_length=200):
    information_summary = summarize_documents(retrieved_documents)
    # prompt = f"""You are an advanced AI model specialized in summarizing and answering questions based on provided text chunks. answer the query accurately and concisely. If the information is not directly available, then state Solely based on the information shared,the documents don't directly has answer to it .
    # Question: {question} """

    encoding = qa_tokenizer(question, information_summary, return_tensors="pt")
    encoding.to(device)

    # Generate answer logits
    outputs = qa_model(**encoding)
    answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits

    # Decode answer
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1  # +1 to include end token
    answer = qa_tokenizer.decode(encoding["input_ids"][0][answer_start:answer_end])
    return answer

def search_index(query_text, top_k=5):
    query_embedding = get_embedding(query_text)
    D, I = index.search(np.array([query_embedding]), top_k)
    return I[0]

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [1]:
from flask import Flask, render_template_string, request, jsonify
from pyngrok import ngrok

app = Flask(__name__)

# HTML template for chatbot interface
HTML_TEMPLATE = """
<!DOCTYPE html>
<html>
<head>
    <title>Chatbot Interface</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            background-color: #f4f4f9;
            display: flex;
            justify-content: center;
            align-items: center;
            height: 100vh;
            margin: 0;
        }
        .chat-container {
            width: 50%;
            background-color: white;
            box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
            border-radius: 8px;
            overflow: hidden;
        }
        .chat-box {
            border-bottom: 1px solid #ccc;
            padding: 20px;
            height: 300px;
            overflow-y: scroll;
        }
        .chat-box div {
            margin-bottom: 10px;
        }
        .chat-box div.user-message {
            text-align: right;
        }
        .chat-box div.bot-message {
            text-align: left;
        }
        .input-container {
            display: flex;
            padding: 10px;
        }
        .user-input {
            flex-grow: 1;
            padding: 10px;
            border: 1px solid #ccc;
            border-radius: 4px;
            margin-right: 10px;
            font-size: 16px;
        }
        .send-button {
            padding: 10px 20px;
            border: none;
            background-color: #4CAF50;
            color: white;
            border-radius: 4px;
            cursor: pointer;
            font-size: 16px;
        }
        .send-button:hover {
            background-color: #45a049;
        }
        .loading-spinner {
            display: none;
            margin-left: 10px;
        }
    </style>
</head>
<body>
    <div class="chat-container">
        <div class="chat-box" id="chat-box"></div>
        <div class="input-container">
            <input type="text" id="user-input" class="user-input" placeholder="Type your query here..."/>
            <button onclick="sendQuery()" class="send-button">Send</button>
            <span class="loading-spinner" id="loading-spinner">⏳</span>
        </div>
    </div>
    <script>
        async function sendQuery() {
            const query = document.getElementById('user-input').value;
            if (!query.trim()) return;

            const chatBox = document.getElementById('chat-box');
            const loadingSpinner = document.getElementById('loading-spinner');

            chatBox.innerHTML += '<div class="user-message"><strong>You:</strong> ' + query + '</div>';
            chatBox.scrollTop = chatBox.scrollHeight;
            document.getElementById('user-input').value = '';

            loadingSpinner.style.display = 'inline';

            try {
                const response = await fetch('/query', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({ query: query })
                });
                const result = await response.json();
                if (response.ok) {
                    chatBox.innerHTML += '<div class="bot-message"><strong>LLM:</strong> ' + result.results.join('<br>') + '</div>';
                } else {
                    chatBox.innerHTML += '<div class="bot-message"><strong>Error:</strong> ' + result.error + '</div>';
                }
            } catch (error) {
                chatBox.innerHTML += '<div class="bot-message"><strong>Error:</strong> Something went wrong.</div>';
            } finally {
                loadingSpinner.style.display = 'none';
                chatBox.scrollTop = chatBox.scrollHeight;
            }
        }
    </script>
</body>
</html>
"""

@app.route('/')
def home():
    return render_template_string(HTML_TEMPLATE)

@app.route('/query', methods=['POST'])
def query():
    query = request.json.get('query')
    if not query:
        return jsonify({"error": "No query provided"}), 400

    indices = search_index(query)
    relevant_chunks = [pdf_chunk_data[i][0] for i in indices]
    relevant_pdfs = [pdf_chunk_data[i][1] for i in indices]
    print(relevant_pdfs)
    response = rag(query, relevant_chunks)
    print(response)
    return jsonify({"results": [response]}), 200

# Set up ngrok
ngrok_key = "ADD_KEY_HERE"
port = 5000
ngrok.set_auth_token(ngrok_key)
public_url = ngrok.connect(port).public_url
print(f" * ngrok tunnel \"{public_url}\" -> \"http://127.0.0.1:{port}\"")

# Run Flask app
if __name__ == '__main__':
    app.run(port=port)


ModuleNotFoundError: No module named 'pyngrok'