### Package install

In [None]:
! pip3 install trafilatura requests bs4 fitz pytesseract pillow surya-ocr faster-whisper openai-whisper datasketch

# install ffmpg for Whisper to process your audio
# On macOS (with Homebrew)
# ! brew install ffmpeg
# On Ubuntu/Debian:
# ! sudo apt-get update -y
# ! sudo apt-get install -y ffmpeg
# ðŸ‘‰ On Windows (if using WSL or native):
# You can download it from:
# ðŸ”— https://ffmpeg.org/download.html
# Or use a package manager like Chocolatey:
# ! choco install ffmpeg

# Week 3: Pretraining Data Collection & Extraction - Hands-on Notebook

## 1. Clean Web Page Text Using trafilatura

In [None]:
# Show which Python interpreter this kernel runs on; the install cells below
# reuse `sys.executable` so packages are installed for that interpreter.
import sys
print(sys.executable)

In [None]:
# Install trafilatura with the kernel's own interpreter (`sys.executable -m pip`).
!{sys.executable} -m pip install trafilatura

In [None]:
# Print the version of the `python` found on PATH.
# NOTE(review): this may be a different interpreter than `sys.executable` — confirm.
!python --version

In [None]:
# Install requests with the kernel's own interpreter (`sys.executable -m pip`).
!{sys.executable} -m pip install requests

In [None]:
# Verify that trafilatura is importable in this kernel and show its version.
import trafilatura
print(trafilatura.__version__)

In [None]:
# Verify that requests is importable in this kernel and show its version.
import requests
print(requests.__version__)

In [None]:
# Example: an arXiv paper abstract page
url = "https://arxiv.org/abs/2404.00001"

# Step 1: Fetch raw HTML.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on 4xx/5xx instead of extracting an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

# Step 2: Use Trafilatura to extract clean text (comments and tables excluded)
downloaded_text = trafilatura.extract(html, include_comments=False, include_tables=False)

# Step 3: Display the result
print("ðŸ“„ Extracted Text Preview:\n")
if downloaded_text is None:
    # extract() returns None when it finds no main content; slicing None would
    # raise a TypeError, so report the situation instead.
    print(f"(no main content could be extracted from {url})")
else:
    print(downloaded_text[:1000])  # Show first 1000 characters


Explanation:
trafilatura.extract() pulls main article content while removing headers, menus, and boilerplate.

This works great on academic websites like arXiv, blog posts, or news articles.

No need to write custom HTML parsers.

## 2. OCR – Convert Images to Text
### Option A: Tesseract OCR (Offline)

In [None]:
# you might use the following install if the pytesseract is not installed
# ! sudo apt-get update -y
# ! sudo apt-get install -y tesseract-ocr

In [None]:
# Install the pytesseract Python wrapper with the kernel's own interpreter.
!{sys.executable} -m pip install pytesseract

In [None]:
# Requires both the Tesseract binary (e.g. `sudo apt install tesseract-ocr`)
# and the Python packages (`pip install pytesseract Pillow`).
import pytesseract
from PIL import Image

In [None]:
# Show the current working directory; relative paths such as ./test_data/...
# used in this notebook are resolved against it.
%pwd

In [None]:
# Open the image to OCR and convert it to grayscale ("L" mode).
# NOTE(review): this is an absolute, machine-specific path — point it at an
# image that exists on your machine before running.
image_path = "C:\\Users\\ch939\\Downloads\\LLMBootCampCodes\\MyGPU.png"
image = Image.open(image_path).convert("L")

In [None]:
# Run Tesseract on the grayscale image loaded above and keep the recognized text.
text = pytesseract.image_to_string(image)

# Preview only the first 500 characters to keep the output short.
print("ðŸ“„ Tesseract OCR Output (first 500 chars):")
print(text[:500])


### Option B: Surya OCR (Fast PyTorch-based layout-aware tool)
https://github.com/VikParuchuri/surya

### Usage
To perform OCR on an image, PDF, or a folder containing them:

* Good for: simple single-column text, PDFs converted to images
* Struggles with layout, math, or low-res scans 
    * As you can see from the image, "Download Models" has not been extracted correctly.

In [1]:
# Confirm which interpreter/environment the kernel uses before installing Surya.
import sys
print(sys.executable)

C:\Users\ch939\anaconda3\envs\llmweek3env\python.exe


In [None]:
# Install Surya from the GitHub repository (unpinned) into the kernel's environment.
!{sys.executable} -m pip install git+https://github.com/datalab-to/surya.git

In [None]:
# Query the NVIDIA driver/GPU status via the nvidia-smi command-line tool.
!nvidia-smi

In [3]:
# Show the full version string of the Python interpreter running this kernel.
import sys
print(sys.version)

3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:42:04) [MSC v.1943 64 bit (AMD64)]


In [None]:
# Install torch/torchvision/torchaudio from the PyTorch cu118 wheel index.
!{sys.executable} -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# NOTE(review): same command as the preceding cell; running it once is enough.
!{sys.executable} -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [None]:
# Same install, but with PyPI added as an extra index so packages that are not
# on the PyTorch cu118 index can still be found.
!{sys.executable} -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 --extra-index-url https://pypi.org/simple

In [None]:
# Check that the surya_ocr command-line entry point is available on PATH.
!surya_ocr --version

In [None]:
# Run Surya OCR on the sample image and write the results under results/.
# NOTE(review): the accepted flags depend on the installed Surya version — confirm with `surya_ocr --help`.
! surya_ocr ./test_data/image/image.png --langs en --images --output_dir results/

Where:

**DATA_PATH** is the path to your image, PDF, or folder.

**--langs** specifies the language(s) for OCR (e.g., en for English).

**--images** saves images of the pages and detected text lines (optional).

**--output_dir** specifies the directory to save results.

This command will generate a results.json file containing the detected text and bounding boxes.

Sample Output Structure
The **results.json** will have entries like:

{
  "image": [
    {
      "text_lines": [
        {
          "polygon": [
            [
              13,
              48
            ],
            [
              538,
              51
            ],
            [
              538,
              87
            ],
            [
              12,
              84
            ]
          ],
          "confidence": 0.9970703125,
          "text": "Llama 4: Leading intelligence.",
          "bbox": [
            12,
            48,
            538,
            87
          ]
        },
        ...
        {
          "polygon": [
            [
              47,
              364
            ],
            [
              176,
              364
            ],
            [
              176,
              378
            ],
            [
              47,
              378
            ]
          ],
          "confidence": 0.9716796875,
          "text": "Download models",
          "bbox": [
            47,
            364,
            176,
            378
          ]
        }
      ],
      "languages": [
        "en"
      ],
      "image_bbox": [
        0,
        0,
        600,
        471
      ],
      "page": 1
    }
  ]
}

#### or in python code

In [None]:
# Run Surya OCR from Python instead of the command line.
# NOTE(review): the predictor constructors and call signature used here depend on
# the installed Surya version — confirm against the version you installed.
from PIL import Image
from surya.detection import DetectionPredictor
from surya.recognition import RecognitionPredictor

# Load the image
image = Image.open("./test_data/image/image.png")  # Replace with your image path
langs = ["en"]  # Specify the language(s)

# Initialize predictors
detection_predictor = DetectionPredictor()
recognition_predictor = RecognitionPredictor()

# Perform OCR: one image and one language list per image, with the detection
# predictor passed in alongside them
predictions = recognition_predictor([image], [langs], detection_predictor)

# Display results with polygon coordinates: one entry in `predictions` per input
# image, each exposing its recognized lines as `text_lines`
for page in predictions:
    for line in page.text_lines:
        print(f"Text: {line.text}")
        print(f"Confidence: {line.confidence}")
        print(f"Polygon: {line.polygon}\n")


* Good for: structured layouts like academic papers
* Fast inference and easy to integrate with PDF workflows

### Option C: OpenAI GPT-4o Vision OCR (Highly Accurate & Multicolumn)
Don't forget to set your `OPENAI_API_KEY`.

In [None]:
import base64
import requests

def vision_extract(b64_image, prompt, api_key, model="gpt-4o-mini",
                   max_tokens=3000, mime_type="image/jpeg", timeout=60):
    """Send a base64-encoded image plus a text prompt to the OpenAI chat completions API.

    Args:
        b64_image: Image bytes encoded as a base64 string (no ``data:`` prefix).
        prompt: Instruction text sent alongside the image.
        api_key: OpenAI API key, sent as a Bearer token.
        model: Name of the vision-capable chat model to use.
        max_tokens: Upper bound on the number of tokens in the reply.
        mime_type: MIME type declared in the image data URL. The default keeps
            the previous behavior; pass ``"image/png"`` for PNG files.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON body of the HTTP response as returned by the API.
        No status check is performed here, so the caller must inspect the
        body for success or failure.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": model,
        "temperature": 0.0,  # fixed at 0.0, as in the original request
        "messages": [
            {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{b64_image}"}}
            ]}
        ],
        "max_tokens": max_tokens
    }

    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=timeout,
    )
    return response.json()

# Load image and run GPT-4o OCR
import os
from getpass import getpass

with open("test_data/image/image.png", "rb") as f:
    b64_img = base64.b64encode(f.read()).decode("utf-8")

# Never hardcode the key in the notebook: read it from the OPENAI_API_KEY
# environment variable, or prompt for it (hidden input) if it is not set.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

result = vision_extract(b64_img, "Extract all the readable text from this document.", api_key=api_key)

# A failed request has no "choices" key; surface the API's error payload
# instead of failing with an opaque KeyError.
if "error" in result:
    raise RuntimeError(f"OpenAI API request failed: {result['error']}")
print(result["choices"][0]["message"]["content"])


* Good for: complex, multi-column documents and natural layout reasoning
* Great fallback when you need accuracy over speed

## 3. Automatic Speech Recognition (ASR)
### Option A: Whisper by OpenAI

In [None]:
# ! brew install ffmpeg


In [None]:
# Install: pip install openai-whisper (ffmpeg must also be installed; see the
# install cell at the top of the notebook)
import whisper

# Load model
model = whisper.load_model("base")  # or "small", "medium", "large"

# Transcribe audio; the transcript is read from the "text" entry of the result
result = model.transcribe("./test_data/audio/sample-1.mp3")
print("ðŸ“„ Whisper Transcription:")
print(result["text"])


* Great for: balanced speed and accuracy
* Supports many audio formats: mp3, wav, m4a, webm

### Option B: Faster-Whisper (Fast & Lightweight)

In [None]:
# ! pip install faster-whisper

In [None]:
from faster_whisper import WhisperModel

# Load the "base" model on the CPU with int8 computation.
# NOTE: this rebinds `model`, which the Whisper cell above also assigns.
model = WhisperModel("base", device="cpu", compute_type="int8")  # For CPUs

# Transcribe; the second return value is not used here
segments, _ = model.transcribe("./test_data/audio/sample-1.mp3")

# Print each segment with its start/end values formatted to two decimals
print("ðŸ“„ Faster-Whisper Transcription:")
for segment in segments:
    print(f"[{segment.start:.2f} - {segment.end:.2f}] {segment.text}")


* Optimized for GPU or even CPU 
* Useful when batch-processing long audio datasets

## 4. Pretraining Data Cleaning Pipeline
### Step 1: Remove duplicates using MinHash

In [None]:
from datasketch import MinHash, MinHashLSH

def minhash_deduplication(texts, threshold=0.7, num_perm=128):
    """Drop near-duplicate documents, keeping the first occurrence of each.

    Each document is represented by the MinHash of its set of whitespace-separated
    words. A document is kept only if the LSH index returns no previously kept
    document for it; kept documents are then added to the index.

    Args:
        texts: Iterable of document strings.
        threshold: Similarity threshold passed to ``MinHashLSH``.
        num_perm: Number of permutations, used for both the LSH index and every
            per-document MinHash (the two must agree).

    Returns:
        List of the kept documents, in their original order.
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    unique_texts = []
    for i, doc in enumerate(texts):
        m = MinHash(num_perm=num_perm)
        # Word order and repetition are ignored: only the set of words is hashed.
        for word in set(doc.split()):
            m.update(word.encode('utf8'))
        # An empty query result means nothing similar has been kept so far.
        if not lsh.query(m):
            lsh.insert(f"doc{i}", m)
            unique_texts.append(doc)
    return unique_texts


### Step 2: Filter for language and strip HTML noise

In [None]:
# Install langdetect, used below for language filtering.
# NOTE(review): `! pip` uses the pip found on PATH, which may not be the kernel's environment — confirm.
! pip install langdetect

In [None]:
from langdetect import detect
from bs4 import BeautifulSoup

def clean_html_and_filter_lang(texts, lang='en'):
    """Strip HTML markup from each text and keep only those detected as `lang`.

    Args:
        texts: Iterable of strings that may contain HTML.
        lang: Language code that `detect` must return for a text to be kept.

    Returns:
        List of the plain-text versions (markup removed, surrounding whitespace
        stripped) of the texts whose detected language equals `lang`. Texts for
        which detection fails (for example empty text) are skipped.
    """
    filtered = []
    for txt in texts:
        plain = BeautifulSoup(txt, 'html.parser').get_text().strip()
        try:
            detected = detect(plain)
        except Exception:
            # Best effort: skip texts the detector cannot handle. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate so a long run can still be interrupted.
            continue
        if detected == lang:
            filtered.append(plain)
    return filtered

### Step 3: Strip PII using regex

In [None]:
import re

# Local part may contain "+" (john+tag@...); the domain cannot end in "." so a
# sentence-ending period after an address is left in place.
_EMAIL_RE = re.compile(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)*')
# Either 12-19 contiguous digits, or groups of four digits separated by a space
# or hyphen (e.g. "4111 1111 1111 1111").
_CREDIT_CARD_RE = re.compile(r'\b(?:\d{4}[ -]){3}\d{1,7}\b|\b\d{12,19}\b')
# 555-123-4567, 555.123.4567 and (555) 123-4567.
_PHONE_RE = re.compile(r'(?:\(\d{3}\)\s?|\b\d{3}[-.])\d{3}[-.]\d{4}\b')


def strip_pii(text):
    """Replace e-mail addresses, card-like numbers and phone numbers with placeholders.

    The patterns are regex heuristics, not validators: they can both miss PII in
    unusual formats and flag harmless digit sequences.

    Args:
        text: Input string.

    Returns:
        The text with matches replaced by ``[EMAIL]``, ``[CREDIT_CARD]`` and
        ``[PHONE]``, applied in that order.
    """
    text = _EMAIL_RE.sub('[EMAIL]', text)
    text = _CREDIT_CARD_RE.sub('[CREDIT_CARD]', text)
    text = _PHONE_RE.sub('[PHONE]', text)
    return text

### Step 4: Remove repetitive n-grams

In [None]:
import re
from collections import Counter

def remove_repetitive_ngrams(text, n=3, threshold=3):
    """Collapse back-to-back repetitions of frequent word n-grams to one copy.

    An n-gram (n whitespace-separated words) is a candidate when it occurs at
    least `threshold` times anywhere in the text. Every run of at least
    `threshold` consecutive copies of a candidate is then replaced by a single
    copy. Finally, runs of two or more whitespace characters are squeezed to one
    space and the result is stripped.

    Args:
        text: Input string.
        n: Number of words per n-gram; values below 1 disable collapsing.
        threshold: Minimum occurrence count, and minimum run length, to collapse.

    Returns:
        The cleaned text.
    """
    words = text.split()
    # n < 1 has no meaningful n-grams; skip collapsing instead of matching "".
    ngrams = [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)] if n >= 1 else []

    counts = Counter(ngrams)
    repetitive = [ngram for ngram, count in counts.items() if count >= threshold]

    # A run of `threshold` copies is one copy followed by threshold-1 more.
    extra_repeats = max(threshold - 1, 0)
    for phrase in repetitive:
        # Words may be separated by any whitespace in `text` (n-grams come from
        # split()), so join the escaped words with \s+ rather than a literal space.
        phrase_pattern = r'\s+'.join(re.escape(word) for word in phrase.split(' '))
        # The lookarounds anchor the run on whitespace/text boundaries so it can
        # neither start nor end inside a longer word (e.g. "sat" in "saturday").
        run_pattern = rf'(?<!\S){phrase_pattern}(?:\s+{phrase_pattern}){{{extra_repeats},}}(?!\S)'
        # A callable replacement inserts the phrase literally; a plain string
        # would be treated as a template and choke on backslashes in the phrase.
        text = re.sub(run_pattern, lambda _match, phrase=phrase: phrase, text)

    # Remove extra spaces
    text = re.sub(r'\s{2,}', ' ', text).strip()
    return text


### Step 5: Prepare the text data
Load `Fake_Pretraining_Texts.csv`.

In [None]:
# Load the sample CSV and take its "Raw Text" column as the input to the pipeline.
import pandas as pd
fake_texts = pd.read_csv("test_data/data/Fake_Pretraining_Texts.csv")
raw_dataset = fake_texts["Raw Text"]
print(raw_dataset)

### Step 7: Apply the Cleaning Pipeline

In [None]:
# Step 1: Remove HTML + Language Filter
step1 = clean_html_and_filter_lang(raw_dataset)
display(step1)

In [None]:
# Step 2: Deduplicate Paragraphs
step2 = minhash_deduplication(step1)
display(step2)


In [None]:
# Step 3: Strip PII
step3 = [strip_pii(t) for t in step2]
display(step3)

In [None]:
# Step 4: Remove Repetitive N-grams
cleaned_data = [remove_repetitive_ngrams(t) for t in step3]
display(cleaned_data)

In [None]:
# Done!
print("âœ… Cleaned dataset sample:")
for idx, text in enumerate(cleaned_data):
    print(f"--- Article {idx + 1} ---")
    print(text)
