# Data Preparation

## Imports

In [1]:
!pip install PyPDF2 nltk pandas

import PyPDF2
import re
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
from pathlib import Path
import json

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m143.4/232.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


## PDF Preprocessor

In [4]:
class PDFPreprocessor:
    def __init__(self):
        # Download required NLTK data
        nltk.download('punkt')
        nltk.download('punkt_tab')

    def extract_text_from_pdf(self, pdf_path):
        """Extract raw text from PDF file."""
        text = ""
        try:
            with open(pdf_path, 'rb') as file:
                # Create PDF reader object
                pdf_reader = PyPDF2.PdfReader(file)

                # Extract text from each page
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"

            return text
        except Exception as e:
            print(f"Error processing {pdf_path}: {str(e)}")
            return ""

    def clean_text(self, text):
        """Clean and normalize the extracted text."""
        # Convert to lowercase
        text = text.lower()

        # Remove page numbers
        text = re.sub(r'\b\d+\b(?=\s*$)', '', text)

        # Remove headers and footers (customize patterns based on your PDFs)
        text = re.sub(r'^\s*page\s+\d+\s*$', '', text, flags=re.MULTILINE)

        # Remove extra whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove special characters but keep periods for sentence splitting
        text = re.sub(r'[^a-z0-9\s\.]', '', text)

        return text.strip()

    def split_into_chunks(self, text, max_tokens=512):
        """Split text into chunks suitable for training."""
        # First split into sentences
        sentences = sent_tokenize(text)

        chunks = []
        current_chunk = ""
        current_token_count = 0

        for sentence in sentences:
            # Rough estimation of tokens (words + punctuation)
            sentence_tokens = len(sentence.split())

            if current_token_count + sentence_tokens > max_tokens:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                current_chunk = sentence
                current_token_count = sentence_tokens
            else:
                current_chunk += " " + sentence
                current_token_count += sentence_tokens

        if current_chunk:
            chunks.append(current_chunk.strip())

        return chunks

    def create_training_data(self, input_dir, output_file):
        """Process all PDFs in a directory and create training data."""
        pdf_files = list(Path(input_dir).glob('*.pdf'))
        all_chunks = []

        for pdf_file in pdf_files:
            print(f"Processing {pdf_file}")

            # Extract and clean text
            raw_text = self.extract_text_from_pdf(pdf_file)
            cleaned_text = self.clean_text(raw_text)

            # Split into chunks
            chunks = self.split_into_chunks(cleaned_text)
            all_chunks.extend(chunks)

        # Create training examples
        training_data = []
        for chunk in all_chunks:
            # Create example with context
            example = {
                "text": chunk,
                "metadata": {
                    "source": "rice_farming_manual",
                    "tokens": len(chunk.split())
                }
            }
            training_data.append(example)

        # Save to JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(training_data, f, indent=2)

        print(f"Created {len(training_data)} training examples")
        return training_data

    def analyze_dataset(self, training_data):
        """Analyze the created dataset."""
        df = pd.DataFrame([{
            'text_length': len(example['text']),
            'token_count': example['metadata']['tokens']
        } for example in training_data])

        stats = {
            'total_examples': len(training_data),
            'avg_text_length': df['text_length'].mean(),
            'avg_tokens': df['token_count'].mean(),
            'min_tokens': df['token_count'].min(),
            'max_tokens': df['token_count'].max()
        }

        return stats

In [5]:
preprocessor = PDFPreprocessor()

# Set your input and output paths
input_directory = "pdfs"
output_file = "rice_farming_training_data.json"

# Process PDFs and create training data
training_data = preprocessor.create_training_data(input_directory, output_file)

# Analyze the dataset
stats = preprocessor.analyze_dataset(training_data)
print("\nDataset Statistics:")
for key, value in stats.items():
    print(f"{key}: {value:.2f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Processing pdfs/Kenya Rice-Cultivation-Manual.pdf
Created 31 training examples

Dataset Statistics:
total_examples: 31.00
avg_text_length: 3125.06
avg_tokens: 473.45
min_tokens: 162.00
max_tokens: 512.00
