In [None]:
# ==============================================================================
# NOTEBOOK 1: BALANCED PREPROCESSING (High Volume Augmentation)
# ==============================================================================

import os
import shutil
import random
import re
import cv2
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from google.colab import drive

# 1. Setup Paths
# Mount Google Drive so the CSV and the source images below are reachable.
drive.mount('/content/drive')

# --- CONFIGURATION ---
# Inputs: label spreadsheet and the folder holding the raw images (on Drive).
CSV_PATH = "/content/drive/MyDrive/ML Project/labels_combine.csv"
SOURCE_IMG_DIR = "/content/drive/MyDrive/ML Project/via_dataset"

# Outputs: processed dataset root (outside the mounted Drive folder) with one
# sub-folder per split.
BASE_DIR = '/content/via_dataset_final_processed'
TRAIN_DIR, TEST_DIR = (os.path.join(BASE_DIR, split) for split in ('train', 'test'))
# ---------------------

# 2. COMPLETE CLEANUP
# Start from an empty output tree so files from a previous run cannot linger.
print(f"Cleaning up {BASE_DIR}...")
if os.path.exists(BASE_DIR):
    shutil.rmtree(BASE_DIR)
for split_dir in (TRAIN_DIR, TEST_DIR):
    os.makedirs(split_dir)

# 3. Load Data
print(f"Loading metadata from: {CSV_PATH}")
df = pd.read_csv(CSV_PATH)

# Normalise labels (string, trimmed, first letter upper-case and the rest
# lower-case) so later exact comparisons against the class names match.
raw_labels = df['label'].astype(str)
df['label'] = raw_labels.str.strip().str.capitalize()

# 4. Patient Split
def get_patient_id(filename):
    """Derive a patient identifier from an image filename.

    The filename is converted to ``str`` and stripped of surrounding
    whitespace, then resolved in this order:

    1. If it contains "IARC": the first run of digits followed by capital
       letters, or capital letters followed by digits, is returned.
    2. If it contains "sample" (also reached when rule 1 finds no such run):
       everything before the first underscore is returned.
    3. Otherwise the stripped filename itself is returned.
    """
    name = str(filename).strip()

    if "IARC" in name:
        hit = re.search(r"([0-9]+[A-Z]+|[A-Z]+[0-9]+)", name)
        if hit is not None:
            return hit[0]

    if "sample" in name:
        # Text before the first "_" (the whole name when there is none).
        prefix, _, _ = name.partition('_')
        return prefix

    return name

# Tag every row with its patient so the split is made per patient, not per
# image: all images of one patient end up on the same side of the split.
df['patient_id'] = df['filename'].map(get_patient_id)
unique_patients = df['patient_id'].unique()

# 80/20 split of the patient ids with a fixed seed.
train_patients, test_patients = train_test_split(
    unique_patients,
    test_size=0.2,
    random_state=42,
)
print(f"Train Patients: {len(train_patients)} | Test Patients: {len(test_patients)}")

# 5. Move Originals
# Lookup from image filename to its cleaned label (a later duplicate filename
# overrides an earlier one).
filename_to_label = {
    img_name: img_label
    for img_name, img_label in zip(df['filename'], df['label'])
}

def move_files(patient_list, destination):
    """Copy every image belonging to the given patients into ``destination``.

    Despite the name, files are copied with ``shutil.copy2``; the source
    folder is left untouched.

    Args:
        patient_list: Patient ids to select. Rows of the module-level ``df``
            whose ``patient_id`` is in this collection are processed.
        destination: Existing directory that receives the copies.

    Returns:
        The number of files actually copied. Rows whose image does not exist
        under ``SOURCE_IMG_DIR`` are skipped and not counted; a warning
        listing how many (and a few example names) is printed so the gap
        between CSV rows and copied files is visible.
    """
    subset = df[df['patient_id'].isin(patient_list)]

    count = 0
    missing = []
    # Only the filename column is needed, so iterate it directly.
    for fname in subset['filename']:
        src = os.path.join(SOURCE_IMG_DIR, fname)
        if not os.path.exists(src):
            missing.append(fname)
            continue
        shutil.copy2(src, os.path.join(destination, fname))
        count += 1

    if missing:
        preview = ", ".join(str(name) for name in missing[:5])
        print(
            f"Warning: {len(missing)} file(s) listed in the CSV were not found "
            f"in {SOURCE_IMG_DIR} and were skipped (e.g. {preview})"
        )
    return count

# Copy each split's original images into its folder (move_files copies with
# shutil.copy2, so the source dataset stays in place).
print("Moving original files...")
n_train = move_files(train_patients, TRAIN_DIR)
n_test = move_files(test_patients, TEST_DIR)
print(f"Originals in Train: {n_train}")
# Report the test count as well: it was computed but never shown, so an empty
# or partial test copy would have gone unnoticed.
print(f"Originals in Test: {n_test}")

# 6. AGGRESSIVE AUGMENTATION LOGIC
def generate_random_variant(image):
    """Return one randomly augmented version of ``image``.

    The input array is not modified. Three steps are applied in order:

    1. Rotation about the image centre by a uniform random angle in
       [-45, 45] degrees, keeping the original size and filling the exposed
       border by reflection.
    2. A horizontal flip and then a vertical flip, each decided by its own
       independent random draw (applied when the draw exceeds 0.5).
    3. When a further draw exceeds 0.3, a linear contrast/brightness change
       ``alpha * pixel + beta`` with alpha in [0.8, 1.2] and beta in
       [-20, 20].

    NOTE(review): step 3 uses ``cv2.convertScaleAbs``, which takes the
    absolute value of the scaled result before saturating, so very dark
    pixels pushed below zero come back as small positive values rather than
    being clipped to 0 - confirm this is acceptable.
    """
    height, width = image.shape[:2]

    # 1. Rotation (Higher range)
    theta = random.uniform(-45, 45)
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), theta, 1.0)
    variant = cv2.warpAffine(
        image.copy(),
        rotation,
        (width, height),
        borderMode=cv2.BORDER_REFLECT,
    )

    # 2. Flips: code 1 = around the vertical axis, code 0 = around the
    # horizontal axis; one random draw per flip, in this order.
    for flip_code in (1, 0):
        if random.random() > 0.5:
            variant = cv2.flip(variant, flip_code)

    # 3. Color/Contrast
    if random.random() > 0.3:
        gain = random.uniform(0.8, 1.2)
        bias = random.uniform(-20, 20)
        variant = cv2.convertScaleAbs(variant, alpha=gain, beta=bias)

    return variant

# Seed the augmentation RNG so a re-run regenerates the same variants; the
# patient split is already pinned with random_state=42, but the `random`
# module used by generate_random_variant was left unseeded.
random.seed(42)

# Number of augmented variants written per original image, keyed by cleaned
# label. Any label not listed here (including the "Unknown" fallback) gets no
# augmentation.
COPIES_PER_LABEL = {
    'Cancerous': 30,     # 30x multiplier
    'Precancerous': 15,  # 15x multiplier
    'Normal': 5,         # 5x multiplier
}

print("\nStarting Aggressive Augmentation...")
# List the originals once, before any augmented file is written into the same
# folder, and sort them: os.listdir order is arbitrary, which would otherwise
# change which random draws each image receives despite the fixed seed.
train_files = sorted(f for f in os.listdir(TRAIN_DIR) if f.lower().endswith('.jpg'))

augmented_count = 0

for fname in train_files:
    image = cv2.imread(os.path.join(TRAIN_DIR, fname))
    if image is None:
        # cv2.imread yields None when the file cannot be decoded; skip it.
        continue

    label = filename_to_label.get(fname, "Unknown")
    num_copies = COPIES_PER_LABEL.get(label, 0)
    base_name = os.path.splitext(fname)[0]

    for i in range(num_copies):
        aug_variant = generate_random_variant(image)
        save_path = os.path.join(TRAIN_DIR, f"{base_name}_aug_{i}.jpg")
        # cv2.imwrite reports failure through its return value instead of
        # raising, so only count files that were really written.
        if cv2.imwrite(save_path, aug_variant):
            augmented_count += 1
        else:
            print(f"Warning: failed to write {save_path}")

print("-" * 30)
print("Augmentation Stats:")
print(f"Originals: {n_train}")
print(f"New Augmentations: {augmented_count}")
print(f"TOTAL Training Images: {len(os.listdir(TRAIN_DIR))}")
print("-" * 30)

# 7. Zip and Overwrite
# make_archive appends ".zip" to the base name and replaces any archive that
# is already at that path on Drive.
zip_name = '/content/drive/MyDrive/ML Project/Combined_Dataset_Processed'
shutil.make_archive(zip_name, 'zip', BASE_DIR)
print(f"Dataset successfully overwritten at: {zip_name}.zip")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Cleaning up /content/via_dataset_final_processed...
Loading metadata from: /content/drive/MyDrive/ML Project/labels_combine.csv
Train Patients: 244 | Test Patients: 62
Moving original files...
Originals in Train: 235

Starting Aggressive Augmentation...
------------------------------
Augmentation Stats:
Originals: 235
New Augmentations: 2685
TOTAL Training Images: 2920
------------------------------
Dataset successfully overwritten at: /content/drive/MyDrive/ML Project/Combined_Dataset_Processed.zip
