Load raw Persian text (e.g., enhelal.txt), normalize, clean, tokenize into sentences and paragraphs, and split into chunks suitable for embedding.

In [None]:
import sys
import os

# Make the project's `modules` package importable regardless of whether the
# kernel was started inside the notebooks folder or at the project root.
notebooks_dir = os.getcwd()
print("Notebook cwd before:", notebooks_dir)

# Candidate project root: the directory one level above the cwd.
project_root_candidate = os.path.abspath(os.path.join(notebooks_dir, os.pardir))

# Choose which directory (if any) to register, together with the message to
# print. The parent directory is preferred; the cwd itself is the fallback.
if os.path.isdir(os.path.join(project_root_candidate, "modules")):
    _root = project_root_candidate
    _label = "Inserted project root into sys.path:"
elif os.path.isdir(os.path.join(notebooks_dir, "modules")):
    _root = notebooks_dir
    _label = "Inserted notebooks_dir as project root into sys.path:"
else:
    # Neither location holds a `modules` folder: leave sys.path untouched.
    _root = None
    _label = None

# Insert at the front only when a root was found and is not already listed.
if _root is not None and _root not in sys.path:
    sys.path.insert(0, _root)
    print(_label, _root)

# Confirm sys.path
print("First entries of sys.path:", sys.path[:3])


In [None]:
# Standard libraries
import os
import re
from hazm import Normalizer, word_tokenize, sent_tokenize
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document

# Bring in GPU memory cleanup (import from Notebook 1 if needed)
import torch

# Import utility functions from modules/utils.py
from modules.utils import clean_text 

In [None]:
# Locate the raw file relative to the current working directory.
# The default assumes the notebook runs from the notebooks folder (data lives
# one level up); if that file is missing but the same relative path exists
# under the cwd (i.e. the kernel was started at the project root), use that.
_raw_rel = os.path.join("data", "docs", "enhelal.txt")
raw_path = os.path.join("..", _raw_rel)
if not os.path.isfile(raw_path) and os.path.isfile(_raw_rel):
    raw_path = _raw_rel

# Read the file as UTF-8 text into LangChain Document objects.
loader = TextLoader(raw_path, encoding="utf-8")
raw_docs = loader.load()
print(f"Loaded {len(raw_docs)} raw documents.")

In [None]:
# Number of consecutive sentences grouped into one pseudo-paragraph.
SENTENCES_PER_PARAGRAPH = 3

# Clean each document.
# Each Document gets its own copy of the metadata dict so that a later
# in-place metadata edit on one Document cannot leak into the others.
cleaned_docs = [
    Document(page_content=clean_text(doc.page_content), metadata=dict(doc.metadata))
    for doc in raw_docs
]

# Split into paragraphs of SENTENCES_PER_PARAGRAPH sentences each
# (the last group of a document may be shorter).
paragraphs = []
for d in cleaned_docs:
    sentences = sent_tokenize(d.page_content)
    for i in range(0, len(sentences), SENTENCES_PER_PARAGRAPH):
        para = " ".join(sentences[i:i + SENTENCES_PER_PARAGRAPH])
        # Skip groups that contain only whitespace.
        if para.strip():
            paragraphs.append(Document(page_content=para, metadata=dict(d.metadata)))
print(f"Number of paragraphs: {len(paragraphs)}")


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks with fewer than this many words are discarded after splitting.
MIN_CHUNK_WORDS = 20


def _word_count(text):
    """Return the number of whitespace-separated tokens in *text*."""
    return len(text.split())


# chunk_size and chunk_overlap are measured in words (not characters) because
# the splitter is given a word-counting length function.
# NOTE(review): the first separator appears to be U+06D4 (Arabic full stop);
# Persian prose usually ends sentences with ".", so this separator may never
# match -- confirm against the corpus before changing the list.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=30,
    length_function=_word_count,
    separators=["۔", "؛", "\n", " ", ""]
)

chunks = splitter.split_documents(paragraphs)
# Filter out very short chunks (fewer than MIN_CHUNK_WORDS words); a chunk of
# exactly MIN_CHUNK_WORDS words is kept.
chunks = [c for c in chunks if _word_count(c.page_content) >= MIN_CHUNK_WORDS]
# Number the surviving chunks consecutively so each carries its position.
for i, chunk in enumerate(chunks):
    chunk.metadata['chunk_index'] = i
print(f"Number of chunks after splitting: {len(chunks)}")

In [None]:
# Optionally, save `chunks` to disk (e.g., as JSON or pickle) for future loading
import pickle

# Resolve the data directory relative to the current working directory.
# The default assumes the notebook runs from the notebooks folder; if that
# directory is missing but a `data` folder exists under the cwd (kernel
# started at the project root), write there instead.
data_dir = os.path.join("..", "data")
if not os.path.isdir(data_dir) and os.path.isdir("data"):
    data_dir = "data"

# NOTE: pickle files can execute arbitrary code when loaded; only reload this
# file from a location you trust.
with open(os.path.join(data_dir, "chunks.pkl"), "wb") as f:
    pickle.dump(chunks, f)
print("Chunks saved to data/chunks.pkl")
