In [3]:
import os
import re
import requests
from langchain_community.document_loaders import ArxivLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

# -------------- STEP 3: LOAD ALL PDFs --------------
print("\nLoading all PDFs...")
all_docs = []
downloaded_folder = "../Database/arXiv_papers"

# Get full paths of all PDF files in the folder
downloaded_files = [os.path.join(downloaded_folder, f) for f in os.listdir(downloaded_folder) if f.endswith(".pdf")]

for path in downloaded_files:
    try:
        loader = PyPDFLoader(path)
        docs = loader.load()
        
        filename = os.path.basename(path)
        arxiv_id = filename.replace(".pdf", "")
        arxiv_url = f"https://arxiv.org/pdf/{arxiv_id}"

        for doc in docs:
            doc.metadata["source"] = arxiv_url
        all_docs.extend(docs)
    except Exception as e:
        print(f"Failed to load {path}: {e}")

# -------------- STEP 4: SPLIT TEXT --------------
print("Splitting documents...")
from langchain.text_splitter import TokenTextSplitter
from transformers import AutoTokenizer

# Load the tokenizer - choose one closest to your Ollama model
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")  # or the model you used

# Define a tokenizer function as expected by LangChain
def token_length_function(text):
    return len(tokenizer.encode(text))

# Now use LangChain's TokenTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=50,
    length_function=token_length_function,
)

chunks = text_splitter.split_documents(all_docs)


# -------------- STEP 5: EMBEDDINGS + VECTOR DB --------------
print("Embedding and creating Chroma DB...")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_db = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory="../Database/Vector-DB")

# Optional: Persist the DB
vector_db.persist()

print(f"\n VectorDB created with {len(chunks)} chunks!")



Loading all PDFs...


Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 10 0 (offset 0)
Ignoring wrong pointing object 12 0 (offset 0)
Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 38 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 46 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 59 0 (offset 0)
Ignoring wrong pointing object 65 0 (offset 0)
Ignoring wrong pointing object 70 0 (offset 0)
Ignoring wrong pointing object 79 0 (offset 0)
Ignoring wrong pointing object 81 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)
Ignoring wrong pointing object 104 0 (offset 0)
Ignoring wrong

Splitting documents...
Embedding and creating Chroma DB...


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



 VectorDB created with 493 chunks!


  vector_db.persist()
