In [None]:
# 1. Install Dependencies
!pip install pymupdf

# 2. Imports
import os
import io
import re
import random
import requests
import fitz  # PyMuPDF
import numpy as np
import torch
import pandas as pd
from PIL import Image
from dataclasses import dataclass
from google.colab import drive



In [None]:
# 3. Mount Google Drive
# The extraction cell below writes its images and labels.csv under
# /content/drive/MyDrive, so Drive has to be mounted before it runs.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def _label_from_answer_text(text):
    """Derive the class label from the text of a flashcard answer page.

    Returns "Normal" when the Question 3 answer is VIA-negative, "Cancerous"
    when it is VIA-positive and the Question 1 answer is "Yes", "Precancerous"
    for any other VIA-positive page, and "Unknown" when no Question 3 answer
    is found.
    """
    q3_match = re.search(r"Question 3[\s\S]*?\?[\s\S]*?(VIA-positive|VIA-negative)", text, re.IGNORECASE)
    if not q3_match:
        return "Unknown"

    if "negative" in q3_match.group(1).lower():
        return "Normal"

    # VIA-positive: Question 1 decides between cancerous and precancerous.
    # NOTE(review): (Yes|No) is matched case-insensitively and without word
    # boundaries, so it can also hit the start of words such as "Not" or
    # "Normal" - confirm against the PDF text before tightening the pattern.
    q1_match = re.search(r"Question 1[\s\S]*?\?[\s\S]*?(Yes|No)", text, re.IGNORECASE)
    if q1_match and "yes" in q1_match.group(1).lower():
        return "Cancerous"
    return "Precancerous"


def _render_cropped_page(doc, page_index):
    """Render one PDF page at 2x zoom and return it cropped as a PIL image."""
    page = doc.load_page(page_index)
    pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # Crop margins are expressed in pixels of the 2x render:
    # 150 px left, 160 px top, 145 px right, 365 px bottom.
    width, height = img.size
    return img.crop((150, 160, width - 145, height - 365))


def extract_and_create_dataset(
    output_dir="/content/drive/MyDrive/ML Project/via_dataset",
    pdf_url="https://screening.iarc.fr/doc/cecap_flashcards.pdf",
    timeout=60,
):
    """Download the flashcard PDF and turn it into a labelled image dataset.

    Pages are consumed in pairs starting at page index 36: the first page of
    each pair is rendered and cropped to an image, the following page is read
    as text to determine the label. Each image is saved as
    ``sample<N>_<label>.jpg`` and a ``labels.csv`` index (sample_id, filename,
    label) is written next to the images.

    Args:
        output_dir: Folder that receives the images and ``labels.csv``;
            created if it does not exist.
        pdf_url: URL of the PDF to download.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.RequestException: If the download times out or the server
            answers with an error status.

    A failure on an individual page pair is printed and skipped; it does not
    abort the run.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created folder: {output_dir}")

    # Download PDF. Fail fast on a hung connection or an HTTP error instead of
    # handing an error page to the PDF parser.
    print("Downloading PDF...")
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()
    pdf_stream = io.BytesIO(response.content)
    doc = fitz.open(stream=pdf_stream, filetype="pdf")

    print("Processing PDF...")

    dataset_data = []
    sample_id = 1

    try:
        # Step through (image page, answer page) pairs; the upper bound keeps
        # the loop away from the trailing pages of the document.
        for i in range(36, len(doc) - 3, 2):
            try:
                img = _render_cropped_page(doc, i)
                label = _label_from_answer_text(doc.load_page(i + 1).get_text("text"))

                filename = f"sample{sample_id}_{label}.jpg"
                img.save(os.path.join(output_dir, filename))

                dataset_data.append({
                    'sample_id': sample_id,
                    'filename': filename,
                    'label': label
                })

                print(f"Saved: {filename}")
                # Only advance the id after a successful save so ids stay contiguous.
                sample_id += 1

            except Exception as e:
                # Best effort: report the failing pair and continue with the next one.
                print(f"Error on page pair starting index {i}: {e}")
    finally:
        # Release the parsed document even if the loop is interrupted.
        doc.close()

    # Explicit columns keep the CSV header and the summary below valid even
    # when no sample could be extracted.
    csv_path = os.path.join(output_dir, "labels.csv")
    df = pd.DataFrame(dataset_data, columns=['sample_id', 'filename', 'label'])
    df.to_csv(csv_path, index=False)

    print("-" * 30)
    print(f"Done! Saved images and 'labels.csv' to: {output_dir}")
    print(df['label'].value_counts())

# Run it
# Downloads the flashcard PDF, then writes the cropped images and labels.csv
# into the Google Drive output folder (Drive must already be mounted).
extract_and_create_dataset()

Downloading PDF...
Processing PDF...
Saved: sample1_Precancerous.jpg
Saved: sample2_Precancerous.jpg
Saved: sample3_Normal.jpg
Saved: sample4_Normal.jpg
Saved: sample5_Cancerous.jpg
Saved: sample6_Normal.jpg
Saved: sample7_Cancerous.jpg
Saved: sample8_Precancerous.jpg
Saved: sample9_Normal.jpg
Saved: sample10_Precancerous.jpg
Saved: sample11_Precancerous.jpg
Saved: sample12_Normal.jpg
Saved: sample13_Precancerous.jpg
Saved: sample14_Normal.jpg
Saved: sample15_Normal.jpg
Saved: sample16_Normal.jpg
Saved: sample17_Precancerous.jpg
Saved: sample18_Precancerous.jpg
Saved: sample19_Normal.jpg
Saved: sample20_Precancerous.jpg
Saved: sample21_Precancerous.jpg
Saved: sample22_Normal.jpg
Saved: sample23_Precancerous.jpg
Saved: sample24_Precancerous.jpg
Saved: sample25_Cancerous.jpg
Saved: sample26_Normal.jpg
Saved: sample27_Normal.jpg
Saved: sample28_Precancerous.jpg
Saved: sample29_Precancerous.jpg
Saved: sample30_Normal.jpg
Saved: sample31_Normal.jpg
Saved: sample32_Precancerous.jpg
Saved: sa