In [None]:
# !pip install PyPDF2
import os
import pickle
import PyPDF2
from datetime import datetime
from google.colab import drive

drive.mount('/content/drive')

pdf_directory = "/content/drive/My Drive/codefiles/"
data_file = "/content/drive/My Drive/pdf_data.pkl"

# Function to extract content and metadata from PDFs
def extract_pdf_data(directory):
    pdf_data = []
    unique_id = 1

    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            file_path = os.path.join(directory, filename)

            # Read PDF content
            with open(file_path, "rb") as f:
                reader = PyPDF2.PdfReader(f)
                content = ""
                for page in reader.pages:
                    content += page.extract_text()

            # Generate metadata
            metadata = {
                "unique_id": unique_id,
                "file_name": filename,
                "content": content,
                "upload_datetime": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            }
            pdf_data.append(metadata)
            unique_id += 1

    return pdf_data

# Load existing data or process PDFs if running for the first time
if os.path.exists(data_file):
    with open(data_file, "rb") as f:
        pdf_data = pickle.load(f)
    print("Data loaded from previous run.")
else:
    pdf_data = extract_pdf_data(pdf_directory)
    with open(data_file, "wb") as f:
        pickle.dump(pdf_data, f)
    print("PDF data extracted and saved.")

# Display the extracted data
for pdf in pdf_data:
    print(f"ID: {pdf['unique_id']}, Name: {pdf['file_name']}, Uploaded: {pdf['upload_datetime']}")
    print(f"Content: {pdf['content'][:200]}...")  # Displaying first 200 characters


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data loaded from previous run.
ID: 1, Name: 6492.pdf, Uploaded: 2025-03-19 19:41:03
Content:           
 
 
 تهران – بلوار  میرداماد – پالک  198 :تلفن  29951   :کدپستی 33111 - 15496 افک س: 66735 674    سایت اینترنتی: www.cbi .ir   
 
 
                                                         ...
ID: 2, Name: 117238.pdf, Uploaded: 2025-03-19 19:41:04
Content:           
 
 
 تهران – بلوار  میرداماد – پالک  198 :تلفن  29951   :کدپستی 33111 - 15496 افک س: 66735 674    سایت اینترنتی: www.cbi .ir   
 
 
                                    

In [None]:
# !pip install faiss-cpu requests sentence-transformers
import numpy as np
import faiss
import pickle
import requests
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer

# Configuration
DEEPSEEK_API_KEY = "sk-ad6********************761"  
LOCAL_EMBEDDING_MODEL = "all-MiniLM-L6-v2"  # Fallback model

class HybridEmbeddings:
    def __init__(self, api_key=None):
        self.api_key = api_key
        self.local_model = SentenceTransformer(LOCAL_EMBEDDING_MODEL)
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json"
        } if api_key else None

    def embed_documents(self, texts):
        if self.api_key:
            try:
                response = requests.post(
                    "https://api.deepseek.com/v1/embeddings",
                    json={
                        "model": "text-embedding",
                        "input": texts,
                        "encoding_format": "float"
                    },
                    headers=self.headers,
                    timeout=30
                )
                if response.status_code == 200:
                    # Sort embeddings by index to maintain order
                    embeddings = sorted(response.json()["data"], key=lambda x: x["index"])
                    return [e["embedding"] for e in embeddings]
            except Exception as e:
                print(f"DeepSeek API failed, using local model: {str(e)}")

        # Fallback to local model
        return self.local_model.encode(texts).tolist()

# Load PDF data
data_file = "/content/drive/My Drive/pdf_data.pkl"
with open(data_file, "rb") as f:
    pdf_data = pickle.load(f)

# Split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = []
metadata = []

for pdf in pdf_data:
    chunks = text_splitter.split_text(pdf["content"])
    for chunk in chunks:
        documents.append({"page_content": chunk})
        metadata.append({
            "file_name": pdf["file_name"],
            "unique_id": pdf["unique_id"],
            "upload_datetime": pdf["upload_datetime"]
        })

print(f"Total chunks created: {len(documents)}")

# Initialize embeddings
embedder = HybridEmbeddings(api_key=DEEPSEEK_API_KEY)
embeddings = embedder.embed_documents([doc["page_content"] for doc in documents])

# Create FAISS index
dimension = len(embeddings[0])
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings, dtype="float32"))

# Save index
index_file = "/content/drive/My Drive/faiss_index/index.faiss"
os.makedirs(os.path.dirname(index_file), exist_ok=True)
faiss.write_index(index, index_file)
print("FAISS index created and saved.")

# Example search (uncomment to test)
# query = "Explain banking regulations"
# query_embedding = embedder.embed_documents([query])[0]
# D, I = index.search(np.array([query_embedding], dtype="float32"), k=3)
# for idx in I[0]:
#     print(f"Match: {metadata[idx]['file_name']}")
#     print(f"Content: {documents[idx]['page_content'][:200]}...")

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

FAISS index created and saved.


In [9]:
# !pip install sentence-transformers langchain-community faiss-cpu

import pickle
from langchain_core.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import os
from typing import List, Dict

class LocalEmbeddings(Embeddings):
    def __init__(self):
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text).tolist()

# Load PDF Data
data_file = "/content/drive/My Drive/pdf_data.pkl"
with open(data_file, "rb") as f:
    pdf_data = pickle.load(f)

# Split Documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = []
metadatas = []

for pdf in pdf_data:
    chunks = text_splitter.split_text(pdf["content"])
    for chunk in chunks:
        documents.append(chunk)
        metadatas.append({
            "file_name": pdf["file_name"],
            "unique_id": pdf["unique_id"],
            "upload_datetime": pdf["upload_datetime"]
        })

print(f"Total chunks: {len(documents)}")

# Initialize Embeddings
embeddings = LocalEmbeddings()

# Create/load FAISS index
index_path = "/content/drive/My Drive/faiss_index"
if os.path.exists(index_path):
    print("Loading existing index")
    vectorstore = FAISS.load_local(
        index_path,
        embeddings,
        allow_dangerous_deserialization=True
    )
else:
    print("Creating new index")
    vectorstore = FAISS.from_texts(
        documents,
        embeddings,
        metadatas=metadatas
    )
    vectorstore.save_local(index_path)

# Retriever Class
class ParentDocumentRetriever:
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore

    def get_parent_documents(self, query: str, top_k: int = 5) -> List[Dict]:
        results = self.vectorstore.similarity_search(query, k=top_k*2)
        seen_ids = set()
        return [
            doc.metadata
            for doc in results
            if not (doc.metadata["unique_id"] in seen_ids or seen_ids.add(doc.metadata["unique_id"]))
        ][:top_k]

# Test it
retriever = ParentDocumentRetriever(vectorstore)
results = retriever.get_parent_documents("banking regulations in Iran", 3)
for i, doc in enumerate(results, 1):
    print(f"{i}. {doc['file_name']} (ID: {doc['unique_id']})")

Total chunks: 3019
Loading existing index
1. 60446-1.pdf (ID: 7)
2. 104009-1.pdf (ID: 6)


In [None]:

# !pip install -q llama-cpp-python sentence-transformers langchain-community faiss-cpu

import pickle
from llama_cpp import Llama
from langchain_core.embeddings import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import os
from IPython.display import Markdown, display
import textwrap

# Model Setup 
MODEL_URL = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
MODEL_PATH = "/content/llama-2-7b-chat.Q4_K_M.gguf"

if not os.path.exists(MODEL_PATH):
    print("Downloading model...")
    !wget -q --show-progress -O {MODEL_PATH} {MODEL_URL}

# Improved Components 
class LocalEmbeddings(Embeddings):
    def __init__(self):
        from sentence_transformers import SentenceTransformer
        self.model = SentenceTransformer("all-MiniLM-L6-v2")

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        return self.model.encode(text).tolist()

class ChatLLM:
    def __init__(self):
        self.llm = Llama(
            model_path=MODEL_PATH,
            n_ctx=2048,
            n_threads=8,
            n_gpu_layers=33,
            verbose=False
        )

    def generate_response(self, prompt: str, max_tokens=256):
        # Clean prompt template
        clean_prompt = f"""<s>[INST] <<SYS>>
You are a helpful banking expert. Answer concisely using ONLY the provided context.
<</SYS>>

Context:
{prompt}[/INST]"""

        response = self.llm(
            clean_prompt,
            max_tokens=max_tokens,
            temperature=0.5,  # Lower for more focused answers
            top_p=0.9,
            stop=["</s>", "INST"],
            echo=False
        )
        return response['choices'][0]['text'].strip()

# Initialize System 
embeddings = LocalEmbeddings()
index_path = "/content/drive/My Drive/faiss_index"

if os.path.exists(index_path):
    print("Loading vectorstore...")
    vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
else:
    raise FileNotFoundError("FAISS index not found")

llm = ChatLLM()

# Optimized RAG Function
def get_clean_answer(question: str, top_k=3):
    # Retrieve documents
    docs = vectorstore.similarity_search(question, k=top_k)
    context = "\n\n".join(f"--- Document {i+1} ---\n{doc.page_content}"
                         for i, doc in enumerate(docs))

    # Generate response
    prompt = f"{context}\n\nQuestion: {question}\nAnswer:"
    response = llm.generate_response(prompt)

    # Clean output
    clean_response = response.split("Answer:")[-1].strip()
    clean_response = textwrap.fill(clean_response, width=80)

    display(Markdown(f"**Question:** {question}"))
    display(Markdown(f"**Answer:** {clean_response}"))
    return clean_response

print("System Ready!\n")
get_clean_answer("Explain the key banking regulations in Iran")

Loading vectorstore...


llama_init_from_model: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


System Ready!



**Question:** Explain the key banking regulations in Iran

**Answer:** Based on the provided documents, the key banking regulations in Iran are: 1.
Banking Act (2010): This act establishes the legal framework for the banking
sector in Iran, including the supervisory regime for banks. 2. Basel Committee
on Banking Supervision (2012): The "Core principles for effective banking
supervision (revised)" provides a framework for banking supervision, including
the key principles for effective supervision. 3. Code of Federal Regulations
(2017): Title 12-Banks and Banking Part 30 safety and soundness standards set
out the safety and soundness standards for banks in the United States. 4.
Comisión Nacional Bancaria y de Valores (2005): Disposiciones de carácter
general aplicables a las instituciones de crédito provide general regulations
for banks in Mexico. 5. International Finance Corporation (2015): Guidelines for
identifying and dealing with weak banks provide a framework for identifying and
addressing weak banks. 6. Asian Development Bank (2015): Frameworks for early
supervisory intervention provide a

'Based on the provided documents, the key banking regulations in Iran are: 1.\nBanking Act (2010): This act establishes the legal framework for the banking\nsector in Iran, including the supervisory regime for banks. 2. Basel Committee\non Banking Supervision (2012): The "Core principles for effective banking\nsupervision (revised)" provides a framework for banking supervision, including\nthe key principles for effective supervision. 3. Code of Federal Regulations\n(2017): Title 12-Banks and Banking Part 30 safety and soundness standards set\nout the safety and soundness standards for banks in the United States. 4.\nComisión Nacional Bancaria y de Valores (2005): Disposiciones de carácter\ngeneral aplicables a las instituciones de crédito provide general regulations\nfor banks in Mexico. 5. International Finance Corporation (2015): Guidelines for\nidentifying and dealing with weak banks provide a framework for identifying and\naddressing weak banks. 6. Asian Development Bank (2015): Fr

Loading vectorstore...


llama_init_from_model: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


System Ready!

