In [None]:
!pip install langchain-community
!pip install litellm
!pip install PyPDF2
! pip install torch
! pip install pypdf
!pip install faiss-cpu
!apt-get install -q -y poppler-utils # Install poppler-utils which contains pdfinfo
!pip install pdf2image pytesseract

/bin/bash: line 1: pip install torch: command not found
Reading package lists...
Building dependency tree...
Reading state information...
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
# Install the Tesseract OCR engine and its development files with apt, then tell
# pytesseract which executable to run.
!apt-get install -y tesseract-ocr # Install Tesseract OCR
!apt-get install -y libtesseract-dev # Install Tesseract development files

import pytesseract
# Explicit executable path instead of relying on PATH lookup.
# NOTE(review): presumably this is where the apt package places the binary — confirm on the target image.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Set the path to the Tesseract executable

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libtesseract-dev is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
# Import necessary libraries
from langchain.text_splitter import CharacterTextSplitter
from litellm import completion
import numpy as np
import faiss
from transformers import AutoTokenizer, AutoModel
import torch
import os
import pytesseract
from pdf2image import convert_from_path
import concurrent.futures
from pprint import pprint

In [None]:
# Helper function to extract text from a single image page
def extract_text_from_image(image, page_num):
    """OCR one page image and label the result with a 1-based page number.

    Args:
        image: Page image handed straight to ``pytesseract.image_to_string``.
        page_num: Zero-based index of the page within the document.

    Returns:
        dict with ``'content'`` (the OCR text) and ``'page_number'``
        (``page_num + 1``).
    """
    ocr_text = pytesseract.image_to_string(image)
    one_based_page = page_num + 1
    return dict(content=ocr_text, page_number=one_based_page)

In [None]:
# Function to handle PDF with images and extract text
def get_text_from_file_tesseract(file_path):
    """Render a PDF to page images and OCR every page in a thread pool.

    Args:
        file_path: Path of the PDF, passed to ``convert_from_path``.

    Returns:
        list of ``{'content', 'page_number'}`` dicts as produced by
        ``extract_text_from_image``, ordered by ascending page number.
    """
    page_images = convert_from_path(file_path)

    with concurrent.futures.ThreadPoolExecutor() as pool:
        pending = [
            pool.submit(extract_text_from_image, page_image, page_index)
            for page_index, page_image in enumerate(page_images)
        ]
        # Results are collected in completion order, which is arbitrary...
        page_records = [
            finished.result()
            for finished in concurrent.futures.as_completed(pending)
        ]

    # ...so restore document order before returning.
    page_records.sort(key=lambda record: record['page_number'])
    return page_records

In [None]:
# Load PDF file and extract text
# `documents` becomes a list of {'content', 'page_number'} dicts, one per page,
# ordered by page number (see get_text_from_file_tesseract).
pdf_path = r"sampledata.pdf"
documents = get_text_from_file_tesseract(pdf_path)
# Pretty-print the complete OCR result for a visual sanity check.
pprint(documents)

[{'content': 'Verizon Interview Experience for Software Developer\n'
             'Last Updated : 28 Nov, 2023 an (2) a\n'
             '\n'
             'Round 1:Online Test\n'
             '\n'
             'It consists of 3 sections:\n'
             '\n'
             '¢ Section 1 is essay writing which will give a topic and 20 '
             'minutes u should write min 200 words to 300 words\n'
             '\n'
             '¢ Section 2 is coding there will 2 coding questions to solve one '
             'is easy and the other is medium-level. Questions on math,\n'
             'arrays, strings, and maps will be asked mostly. | have solved 2 '
             'questions.\n'
             '\n'
             '¢ Section 3 is like a puzzle round questions are infinite u '
             'should answer as many as questions possible in a given time.\n'
             'this is a very crucial round to get selected | have done a good '
             'job in it.\n'
             '\n'
             'Verdi

In [None]:
class CharacterTextSplitterWithPageNumbers(CharacterTextSplitter):
    """Fixed-width character splitter that keeps each chunk's source page number.

    Unlike the inherited ``split_documents``, this override works on plain dicts
    of the form ``{'content': str, 'page_number': int}`` and returns dicts of the
    same shape, one per chunk. Chunks never span two pages.

    Args:
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared between consecutive chunks
            of the same page. Must be smaller than ``chunk_size``.
        **kwargs: Forwarded unchanged to ``CharacterTextSplitter.__init__``.
    """

    def __init__(self, chunk_size=1000, chunk_overlap=200, **kwargs):
        # Call the superclass's __init__ method to initialize inherited attributes
        super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
        # Store chunk_size and chunk_overlap as instance attributes
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def split_documents(self, documents):
        """Split every page's text into overlapping character windows.

        Args:
            documents: Iterable of ``{'content': str, 'page_number': int}`` dicts.

        Returns:
            list of ``{'content': str, 'page_number': int}`` dicts in input
            order; pages with empty content contribute no chunks.

        Raises:
            ValueError: If ``chunk_overlap >= chunk_size``. The window would
                never advance, which previously caused an endless loop.
        """
        # Distance between the start offsets of two consecutive chunks.
        step = self.chunk_size - self.chunk_overlap
        if step <= 0:
            raise ValueError(
                f"chunk_overlap ({self.chunk_overlap}) must be smaller than "
                f"chunk_size ({self.chunk_size})"
            )

        chunks = []
        for doc in documents:
            page_num = doc['page_number']
            content = doc['content']

            for start in range(0, len(content), step):
                end = start + self.chunk_size
                chunks.append({'content': content[start:end], 'page_number': page_num})

                # This chunk already reaches the end of the page. Any further
                # window would only repeat the tail of this chunk, so stop here
                # instead of emitting a redundant fragment.
                if end >= len(content):
                    break

        return chunks

# Define your text splitter with the chunk size and overlap
# (both values are character counts: split_documents slices each page's text by character index)
text_splitter = CharacterTextSplitterWithPageNumbers(chunk_size=1000, chunk_overlap=200)
# `texts` is a flat list of {'content', 'page_number'} dicts, one entry per chunk.
texts = text_splitter.split_documents(documents)

In [None]:
# Load tokenizer and model for embedding
# The module-level `tokenizer` and `model` defined here are read as globals by
# get_embeddings and find_most_similar_document, so this cell must run before them.
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Generate embeddings for each text chunk
def get_embeddings(texts):
    """Embed every chunk with the module-level ``tokenizer`` and ``model``.

    Each chunk is encoded on its own (``max_length=512`` with truncation) and
    its vector is the mean of ``last_hidden_state`` along dimension 1.

    Args:
        texts: Iterable of dicts with ``'content'`` and ``'page_number'`` keys.

    Returns:
        list of ``{'embedding': numpy array, 'page_number': ...}`` dicts in the
        same order as ``texts``.
    """
    def _embed(chunk_text):
        encoded = tokenizer(chunk_text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # Inference only: no gradients are needed for the forward pass.
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        return hidden_states.mean(dim=1).squeeze().numpy()

    return [
        {'embedding': _embed(chunk['content']), 'page_number': chunk['page_number']}
        for chunk in texts
    ]

In [None]:

# Create FAISS index
# Embed every chunk. Row i of `embedding_vectors` corresponds to texts[i];
# find_most_similar_document relies on that alignment to map a hit back to its chunk.
embeddings = get_embeddings(texts)
# Stack the per-chunk vectors into one float32 matrix (one row per chunk).
embedding_vectors = np.array([emb['embedding'] for emb in embeddings], dtype=np.float32)
# Vector width is taken from the data rather than hard-coded.
dimension = embedding_vectors.shape[1]
# L2-distance index over the chunk vectors.
index = faiss.IndexFlatL2(dimension)
index.add(embedding_vectors)

In [None]:
# Save the index to a file
# Only `index` is written here; the chunk text and page numbers held in `texts`
# are not persisted by this cell.
faiss.write_index(index, 'faiss_index.index')

In [None]:
# Find the most similar document to a query
def find_most_similar_document(query):
    """Return the indexed chunk closest to ``query`` together with its page.

    The query is embedded the same way as the chunks (mean of
    ``last_hidden_state`` along dimension 1) and looked up in the module-level
    ``index`` with ``k=1``; the hit is mapped back through the module-level
    ``texts`` list.

    Args:
        query: Question text to embed.

    Returns:
        tuple ``(content, page_number)`` of the best-matching chunk.
    """
    encoded_query = tokenizer(query, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded_query).last_hidden_state
    query_vector = hidden_states.mean(dim=1).squeeze().numpy()

    # The distances returned by the search are not used; only the neighbour ids are.
    _, neighbour_ids = index.search(np.array([query_vector], dtype=np.float32), k=1)
    best_match = texts[neighbour_ids[0][0]]

    return best_match['content'], best_match['page_number']

In [None]:
#from huggingface_hub import notebook_login

#notebook_login()

In [None]:
# Install necessary library if you haven't already
#!pip install transformers

# Download and load the Mistral model for generating answers
#from transformers import AutoTokenizer, AutoModelForCausalLM  # Import necessary classes

#mistral_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
#tokenizer = AutoTokenizer.from_pretrained(mistral_model_name)
#mistral_model = AutoModelForCausalLM.from_pretrained(mistral_model_name) # Use a different variable name to avoid overwriting

In [None]:
from litellm import completion
import os

# Provide the Replicate API key without embedding it in the notebook.
# SECURITY: a literal key used to be hard-coded on this line. Treat that key as
# compromised and revoke it in the Replicate account settings.
# The key is taken from the existing environment when present; otherwise the user
# is prompted with hidden input, so the secret never lands in the saved notebook.
if not os.environ.get("REPLICATE_API_KEY"):
    from getpass import getpass
    os.environ["REPLICATE_API_KEY"] = getpass("Enter your Replicate API key: ")

# Generate a response for a query using the Replicate LLaMA-3 model
def get_response(query):
    """Send ``query`` as a single user message and return the reply text.

    Args:
        query: Prompt text placed in the ``content`` field of the user message.

    Returns:
        The ``content`` of the first choice's message, or ``""`` when the
        ``choices``, ``message`` or ``content`` key is absent from the response.
    """
    user_message = {"content": query, "role": "user"}
    result = completion(
        model="replicate/meta/meta-llama-3-8b-instruct",
        messages=[user_message],
    )

    # Walk choices[0] -> message -> content, falling back to empty defaults
    # whenever a key is missing.
    first_choice = result.get("choices", [{}])[0]
    message = first_choice.get("message", {})
    return message.get("content", "")


In [None]:
# Get the answer from the extracted text
def get_answer_from_pdf(query):
    """Answer ``query`` using the best-matching chunk of the PDF as context.

    Args:
        query: The user's question.

    Returns:
        tuple ``(answer, page_number)``: the text returned by ``get_response``
        and the page number of the chunk used as context.
    """
    context, page_number = find_most_similar_document(query)
    prompt = (
        "Based on the following content:\n\n"
        f"{context}\n\n"
        f"Answer the following question: {query}"
    )
    return get_response(prompt), page_number

In [None]:
# Example usage
#os.environ["HUGGINGFACE_API_KEY"] = "<your-hugging-face-token>"  # placeholder only: never commit a real token

In [None]:
# Ask a question about the PDF and show the answer with the page it came from.
query = "What is Navin from this pdf"
answer, page_number = get_answer_from_pdf(query)
# print, not pprint: pprint shows the repr of a long string, splitting it into
# quoted fragments with escaped newlines instead of rendering the text itself.
print(f"Answer: {answer}, Found on Page: {page_number}")

('Answer: \n'
 '\n'
 'Based on the provided content, Navin is a student pursuing his B.Tech in '
 'Artificial Intelligence and Data Science at Shiv Nadar University, Chennai. '
 'He has also completed his Class 12 and Class 10 from SKV Higher Secondary '
 'School and R.N.Oxford Matriculation School, respectively., Found on Page: 3')
