<a href="https://colab.research.google.com/github/Dotunbey/Branham-ai/blob/main/preprocessing%20data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install pymupdf -q

from google.colab import drive
import os
import re
import json
import fitz  # PyMuPDF
from datetime import datetime

# Mount Google Drive at /content/drive so files stored there are reachable
# through the local filesystem.
drive.mount('/content/drive')

# Folder that process_sermons() walks (recursively) looking for sermon PDFs.
SOURCE_FOLDER = '/content/drive/My Drive/Branham_Sermons'
# JSONL file the processed sermons are written to; a relative name, so it is
# created in the current working directory.
OUTPUT_FILENAME = 'branham_rag_dataset.jsonl'

def clean_branham_text(text):
    """
    Strip Voice of God Recordings PDF boilerplate from extracted sermon text.

    The steps run in a fixed order: single quotes, page headers, the
    "THE SPOKEN WORD" running header, editorial notes, blank-tape markers,
    line-break hyphenation, and finally whitespace normalisation.

    Args:
        text: Raw text extracted from a sermon PDF (may contain newlines).

    Returns:
        The cleaned text as a single line: every run of whitespace is
        collapsed to one space and leading/trailing whitespace is removed.
    """
    # 1. Delete every single-quote character.
    # NOTE(review): this also removes apostrophes inside contractions
    # ("don't" -> "dont") -- confirm that is the intended behaviour.
    text = re.sub(r"'", '', text)

    # 2. Remove page headers (e.g., --- PAGE 1 ---)
    text = re.sub(r'--- PAGE \d+ ---', '', text)

    # 3. Remove the "THE SPOKEN WORD" header (matched case-insensitively,
    #    wherever it occurs in the text).
    text = re.sub(r'THE SPOKEN WORD', '', text, flags=re.IGNORECASE)

    # 4. Remove editorial notes (e.g., [Congregation says... -Ed.]).
    #    A bracket-free character class is used instead of '.' so that a note
    #    wrapped across lines is still matched, and so that the match cannot
    #    start at an earlier, unrelated '[' and swallow the text in between.
    text = re.sub(r'\[[^\[\]]*Ed\.\]', '', text)

    # 5. Remove tape-blank markers. \s+ between the words and [^)]* for the
    #    tail let the marker match even when it is wrapped across lines.
    text = re.sub(r'\(Blank\s+spot\s+on\s+tape[^)]*\)', '', text)

    # 6. Re-join words hyphenated across a line break ("congre-\ngation").
    #    Must run before step 7, which removes the newlines it keys on.
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)

    # 7. Collapse multiple spaces and newlines into a single space
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def parse_metadata(filename):
    """
    Extract the sermon date and title from a file name.

    Expected format: "63_0113M_Letting_Off_Pressure.PDF", i.e.
    <2-digit year>_<MMDD><optional suffix such as M or E>_<Title_With_Underscores>.
    The year is assumed to be 19xx. Only the shape of the name is checked
    (digits in the right places); the month/day are not validated against
    the calendar.

    Args:
        filename: File name (without directory) of the sermon PDF.

    Returns:
        {"date": "1963-01-13", "title": "Letting Off Pressure"} for a matching
        name, otherwise the fallback {"date": "Unknown", "title": filename}.
    """
    fallback = {"date": "Unknown", "title": filename}

    try:
        # Remove extension
        stem = os.path.splitext(filename)[0]
        # Year, month and day must be ASCII digits; anything between the day
        # and the next underscore is the service suffix ("M", "E", ...).
        match = re.fullmatch(
            r'([0-9]{2})_([0-9]{2})([0-9]{2})[^_]*_(.*)',
            stem,
            flags=re.DOTALL,
        )
    except TypeError:
        # Non-string input (e.g. None or bytes): keep the best-effort fallback
        # instead of crashing the caller.
        return fallback

    if match is None:
        # File name does not follow the Year_MonthDay_Title convention.
        return fallback

    year_short, month, day, raw_title = match.groups()
    return {
        "date": f"19{year_short}-{month}-{day}",
        "title": raw_title.replace('_', ' '),
    }

def process_sermons(source_folder, output_file):
    """
    Convert every PDF found under a folder into one JSONL record each.

    The folder is walked recursively. For each file whose name ends in
    ".pdf" (case-insensitive) the text of all pages is extracted, cleaned
    with clean_branham_text(), and written as one JSON object per line with
    the keys "id" (the file name), "metadata" (from parse_metadata()) and
    "text". The whole sermon is stored as a single record; no chunking is
    done here.

    A file that raises during processing is reported on stdout and skipped;
    it does not stop the run.

    Args:
        source_folder: Root directory to search for PDFs.
        output_file: Path of the JSONL file to create (overwritten if it
            already exists).

    Returns:
        None. Progress and a final summary are printed.
    """
    print(f"🚀 Starting processing in: {source_folder}")

    success_count = 0
    with open(output_file, 'w', encoding='utf-8') as f_out:
        # Walk through the folder
        for root, _dirs, files in os.walk(source_folder):
            for filename in files:
                if not filename.lower().endswith('.pdf'):
                    continue

                file_path = os.path.join(root, filename)

                try:
                    # 1. Open the PDF and pull the text of every page. The
                    #    context manager closes the document even when
                    #    extraction fails, so handles do not pile up over
                    #    hundreds of files.
                    with fitz.open(file_path) as doc:
                        full_text = "".join(page.get_text() for page in doc)

                    # 2. Clean Text
                    cleaned_text = clean_branham_text(full_text)

                    # 3. Get Metadata
                    metadata = parse_metadata(filename)

                    # 4. Chunking is intentionally not done here: the whole
                    #    sermon is saved as one unit. Smaller chunks
                    #    (roughly 1000 characters) are usually preferable
                    #    for vector storage.
                    entry = {
                        "id": filename,
                        "metadata": metadata,
                        "text": cleaned_text,
                    }

                    # Write to JSONL
                    f_out.write(json.dumps(entry) + '\n')
                    success_count += 1

                    if success_count % 10 == 0:
                        print(f"Processed {success_count} sermons...")

                except Exception as e:
                    # Best-effort batch job: report the bad file and move on.
                    print(f"Error processing {filename}: {e}")

    print(f"✅ COMPLETED. Processed {success_count} sermons.")
    print(f"📂 Output saved to: {output_file}")

# Run the pipeline: read the PDFs under SOURCE_FOLDER and write the dataset
# to OUTPUT_FILENAME.
process_sermons(SOURCE_FOLDER, OUTPUT_FILENAME)


Mounted at /content/drive
🚀 Starting processing in: /content/drive/My Drive/Branham_Sermons
Processed 10 sermons...
Processed 20 sermons...
Processed 30 sermons...
Processed 40 sermons...
Processed 50 sermons...
Processed 60 sermons...
Processed 70 sermons...
Processed 80 sermons...
Processed 90 sermons...
Processed 100 sermons...
Processed 110 sermons...
Processed 120 sermons...
Processed 130 sermons...
Processed 140 sermons...
Processed 150 sermons...
Processed 160 sermons...
Processed 170 sermons...
Processed 180 sermons...
Processed 190 sermons...
Processed 200 sermons...
Processed 210 sermons...
Processed 220 sermons...
Processed 230 sermons...
Processed 240 sermons...
Processed 250 sermons...
Processed 260 sermons...
Processed 270 sermons...
Processed 280 sermons...
Processed 290 sermons...
Processed 300 sermons...
Processed 310 sermons...
Processed 320 sermons...
Processed 330 sermons...
Processed 340 sermons...
Processed 350 sermons...
Processed 360 sermons...
Processed 370 