In [1]:
import getpass
import os
import warnings

import torch
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from transformers import AutoTokenizer, AutoModel
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
# Step 1: Parse PDFs using the provided method
def langchain_parse_pdf(directory_path):
    """Extract the text of every PDF directly inside ``directory_path``.

    Each PDF is loaded with ``PyPDFLoader`` and the ``page_content`` of the
    returned documents is joined with newlines.

    Args:
        directory_path: Directory to scan (not recursive) for PDF files.

    Returns:
        dict mapping file name -> extracted text for every PDF that parsed
        successfully. A file that fails to parse is reported and skipped;
        if the directory itself cannot be listed, the error is reported and
        an empty dict is returned.
    """
    results = {}
    try:
        # os.listdir returns entries in arbitrary order; sort so that runs
        # are reproducible.
        filenames = sorted(os.listdir(directory_path))
    except Exception as e:
        print(f"An error occurred while parsing PDFs with LangChain: {e}")
        return results

    for filename in filenames:
        # Accept ".PDF" / ".Pdf" as well as ".pdf".
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(directory_path, filename)
        # Handle errors per file so one corrupt PDF does not stop the
        # remaining files from being parsed.
        try:
            documents = PyPDFLoader(file_path).load()
            results[filename] = "\n".join(doc.page_content for doc in documents)
        except Exception as e:
            print(f"An error occurred while parsing {filename} with LangChain: {e}")
            continue
        print(f"Finished parsing {filename} with LangChain")
    return results

# Directory holding the PDFs to index; a relative path, so it is resolved
# against the notebook's current working directory.
directory_path = "books"
# Maps each parsed PDF's file name to its full extracted text.
langchain_results = langchain_parse_pdf(directory_path)

Finished parsing كتاب التقنية الرقمية 1-1 مسارات أول ثانوي 1445.pdf with LangChain


In [3]:
# Step 2: Use an Arabic embedding model from Hugging Face
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "asafaya/bert-base-arabic"
# `tokenizer` and `model` are module-level globals that embed_texts reads.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [4]:
def embed_texts(texts):
    """Embed each text as the mean of its token hidden states.

    Uses the module-level ``tokenizer`` and ``model``. Every text is encoded
    on its own (truncated to 512 tokens) and run through the model with
    gradients disabled.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        list with one numpy array per input text, in input order.
    """

    def _mean_pooled(passage):
        encoded = tokenizer(
            passage,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Average over the token axis, then drop the batch axis of size 1.
        pooled = hidden_states.mean(dim=1)
        return pooled.squeeze().numpy()

    return [_mean_pooled(passage) for passage in texts]

In [5]:
# Cut every parsed document into overlapping chunks so that retrieval works
# on small passages rather than whole books.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
# Flat list of chunk strings across all parsed files.
documents = [
    chunk
    for full_text in langchain_results.values()
    for chunk in text_splitter.split_text(full_text)
]

class CustomEmbeddings(Embeddings):
    """Adapter exposing a plain embedding function as a LangChain ``Embeddings``.

    ``embed_function`` must accept a list of strings and return one vector
    per string. Vectors may be any iterable of numbers (e.g. numpy arrays);
    they are converted to plain lists of Python floats, which is the return
    type the ``Embeddings`` interface declares.
    """

    def __init__(self, embed_function):
        """Store the callable used to embed a list of texts."""
        self.embed_function = embed_function

    @staticmethod
    def _as_float_list(vector):
        """Convert one embedding vector to a list of Python floats."""
        return [float(value) for value in vector]

    def embed_documents(self, texts):
        """Embed ``texts`` and return one list of floats per text."""
        return [self._as_float_list(vector) for vector in self.embed_function(texts)]

    def embed_query(self, text):
        """Embed a single query string and return it as a list of floats."""
        return self._as_float_list(self.embed_function([text])[0])

# Create an instance of CustomEmbeddings
custom_embeddings = CustomEmbeddings(embed_texts)

# Fail early with an actionable message: building a FAISS store from an
# empty list of texts otherwise errors deep inside the library.
if not documents:
    raise ValueError(
        "No text chunks to index; check that the PDF directory exists and "
        "that at least one PDF was parsed successfully."
    )

# Create FAISS vector store from embeddings
vectorstore = FAISS.from_texts(documents, embedding=custom_embeddings)

In [6]:
# Never hardcode the key. Keep a key that is already set in the environment;
# otherwise prompt for it without echoing so it is not saved in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [7]:
# Step 3: Build the Retrieval-based QA pipeline
# Chat model that writes the final answer.
llm = ChatOpenAI(model_name="gpt-3.5-turbo")

# Chain that fetches chunks from the vector store, passes them to the LLM
# using the "stuff" chain type, and also returns the retrieved chunks.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    return_source_documents=True,
)

: 

In [8]:
# Example query (Arabic; roughly "What is the hexadecimal system?")
query = "ما هو النظام الستة عشري؟"

# Run the chain via invoke; the returned mapping is read below for the
# answer ('result') and the retrieved chunks ('source_documents').
result = qa_chain.invoke({"query": query})

print(f"Response: {result['result']}")
print("Source Documents:")
for doc in result['source_documents']:
    # Show only a short preview: the first 100 characters of each chunk.
    print(doc.page_content[:100] + "...")  # Print first 100 characters of each source document