# In [None]:
import os
import PyPDF2
from transformers import BertTokenizer, BertModel
import torch
import faiss
import numpy as np

def extract_text_from_pdfs_in_folder(folder_path):
    """Extract the text of every PDF file located directly in *folder_path*.

    Entries are visited in sorted filename order. ``os.listdir`` returns
    names in arbitrary order, so sorting makes the returned dict (and
    anything derived from its iteration order, such as row ids in a
    vector index) reproducible across runs.

    Args:
        folder_path: Directory to scan. Sub-directories are not descended
            into.

    Returns:
        A dict mapping each PDF's filename (not its full path) to the
        concatenated text of all of its pages. A page that yields no
        text contributes an empty string.
    """
    pdf_texts = {}
    for filename in sorted(os.listdir(folder_path)):
        # Compare the extension case-insensitively so "REPORT.PDF" is
        # picked up as well as "report.pdf".
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(folder_path, filename)
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Pages must be read while the file is still open.
            # `or ""` is defensive: a falsy result (e.g. None or "") from
            # extract_text() is treated as "no text" instead of raising.
            pdf_texts[filename] = "".join(
                page.extract_text() or "" for page in reader.pages
            )
    return pdf_texts

def get_text_embeddings(text, model, tokenizer):
    """Embed *text* by mean-pooling the model's last hidden state.

    The text is tokenized with truncation and padding enabled, run through
    *model* without gradient tracking, and the token vectors are averaged
    using the attention mask as weights so that padding positions do not
    dilute the result. For input that contains no padding this equals a
    plain mean over the token axis.

    NOTE: ``truncation=True`` means tokens beyond the tokenizer's maximum
    length are dropped, so only the beginning of a long text is embedded.

    Args:
        text: Input passed straight to *tokenizer* (a string in this file).
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable returning a mapping of tensors that includes
            ``attention_mask``.

    Returns:
        A NumPy array with one row per input sequence and one column per
        hidden dimension.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Keep the inputs on the same device as the model (if it exposes one),
    # otherwise the forward pass fails for a model moved off the CPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Expand the mask to broadcast over the hidden dimension.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    # clamp avoids a division by zero for a fully masked row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    pooled = summed / token_counts

    # .cpu() makes the NumPy conversion work regardless of the model device.
    return pooled.cpu().numpy()

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Extract and vectorize PDF texts
folder_path = "test_source"
pdf_texts = extract_text_from_pdfs_in_folder(folder_path)
if not pdf_texts:
    # Fail with a clear message instead of a bare StopIteration from
    # next(iter(...)) further down when there is nothing to index.
    raise ValueError(f"No PDF files found in {folder_path!r}; nothing to index.")
pdf_embeddings = {filename: get_text_embeddings(text, model, tokenizer) for filename, text in pdf_texts.items()}

# Vectors are added to the index in dict iteration order, so row i of the
# index corresponds to filenames[i].
filenames = list(pdf_embeddings)

# Create FAISS index
dimension = next(iter(pdf_embeddings.values())).shape[1]
index = faiss.IndexFlatL2(dimension)
for embedding in pdf_embeddings.values():
    # normalize_L2 works in place; with unit-length vectors, ranking by L2
    # distance is equivalent to ranking by cosine similarity.
    faiss.normalize_L2(embedding)
    index.add(embedding)

# Query vectorization and search
query_text = "your query text"
query_vector = get_text_embeddings(query_text, model, tokenizer)
faiss.normalize_L2(query_vector)
distances, indices = index.search(query_vector, k=5)  # Top 5 results

# Print the results
print(indices)
# Also report the matching filenames, since raw row ids are meaningless to
# a reader. FAISS pads with -1 when fewer than k vectors are indexed.
for rank, (doc_id, distance) in enumerate(zip(indices[0], distances[0]), start=1):
    if doc_id < 0:
        continue
    print(f"{rank}. {filenames[doc_id]} (distance: {distance:.4f})")
