In [66]:
!pip install langchain
!pip install PyPDF2
!pip install faiss-cpu
!pip install transformers
!pip install pickle5

Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pickle5
  Building wheel for pickle5 (setup.py) ... [?25l[?25hdone
  Created wheel for pickle5: filename=pickle5-0.0.11-cp310-cp310-linux_x86_64.whl size=255318 sha256=290623ad6e30b61e3e843e09ff8e6693de3d8f483c70ccba406bc88eb44ffbd5
  Stored in directory: /root/.cache/pip/wheels/7d/14/ef/4aab19d27fa8e58772be5c71c16add0426acf9e1f64353235c
Successfully built pickle5
Installing collected packages: pickle5
Successfully installed pickle5-0.0.11


In [67]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np
import pickle

In [68]:
# Hugging Face checkpoint used for embedding text. The encoder weights and the
# matching tokenizer are both loaded from the same model id.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [69]:
# Function to get embeddings using the Hugging Face model
def get_embeddings(text):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    The token vectors of the last hidden layer are averaged into one vector
    per input. The average is weighted by the attention mask so that padding
    tokens (added by ``padding=True`` when several inputs of different lengths
    are passed together) do not dilute the result. For a single string there
    is no padding, so the result equals a plain mean over the tokens.

    Args:
        text: A string (as passed by the callers in this notebook) or a batch
            of strings accepted by the tokenizer. Truncation is enabled, so
            over-long inputs are cut to the tokenizer's maximum length.

    Returns:
        A 2-D ``torch.Tensor`` with one row per input and one column per
        hidden dimension of the model.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    # Clamp guards against division by zero for an all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return summed / token_counts

In [70]:
# Mount Google Drive inside the Colab filesystem (Colab-only step).
from google.colab import drive

# NOTE(review): root_dir spells the folder "My Drive", while the other cells
# visible in this notebook hard-code "/content/gdrive/MyDrive/..." and do not
# use root_dir -- confirm which spelling is intended.
root_dir = "/content/gdrive/My Drive/"
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [71]:
# Open the source PDF from the mounted Drive; its pages are read in a later cell.
reader = PdfReader("/content/gdrive/MyDrive/Corpus.pdf")

In [62]:
# Concatenate the extracted text of every page into raw_text.
# Pages whose extraction yields nothing (None or an empty string) contribute
# nothing. A single join avoids repeated string concatenation and does not
# leak loop variables into the notebook namespace.
raw_text = ''.join(page.extract_text() or '' for page in reader.pages)

# Break raw_text into chunks on newlines: chunks are sized at 1000 and overlap
# by 200, with sizes measured by len (i.e. in characters).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    separator="\n",
)
texts = text_splitter.split_text(raw_text)

# Embed each chunk (one model call per chunk) and stack the results into a
# single float32 matrix with one row per chunk.
# Fail early with a clear message: np.vstack on an empty list raises an
# opaque ValueError otherwise.
if not texts:
    raise ValueError(
        "No text chunks to embed; check that text was extracted from the PDF."
    )
embeddings_list = [get_embeddings(chunk).cpu().numpy() for chunk in texts]
embeddings_matrix = np.vstack(embeddings_list).astype('float32')

# Build a flat L2-distance FAISS index over the chunk embeddings.
# Row i of the index corresponds to texts[i].
d = embeddings_matrix.shape[1]  # embedding dimensionality
index = faiss.IndexFlatL2(d)
index.add(embeddings_matrix)

# Function to search for similar documents
def similarity_search(query, k=4):
    """Return the text chunks closest to ``query`` in the FAISS index.

    Args:
        query: Text to embed with ``get_embeddings`` and look up.
        k: Maximum number of chunks to return. It is capped at the number of
            vectors stored in the index.

    Returns:
        A list of up to ``k`` entries of ``texts``, in the order returned by
        the index search. Empty when ``k`` is not positive or the index holds
        no vectors.
    """
    # FAISS pads missing neighbours with id -1 when k exceeds the index size;
    # texts[-1] would then silently return the last chunk. Cap k up front.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    query_embedding = get_embeddings(query).cpu().numpy().astype('float32')
    _, neighbor_ids = index.search(query_embedding, k)
    # Only the first query row is used; drop any remaining -1 placeholders.
    return [texts[i] for i in neighbor_ids[0] if i >= 0]

In [72]:
# Persist the chunk list together with its embedding matrix as one pickled
# (texts, embeddings_matrix) tuple on the mounted Drive.
with open("/content/gdrive/MyDrive/texts_and_embeddings.pkl", "wb") as pkl_file:
    pickle.dump((texts, embeddings_matrix), pkl_file)