  week1

In [1]:
# Install libraries
!pip install Pillow pandas

# Import necessary libraries
import os
import pandas as pd
from PIL import Image
import glob # Useful for finding files in directories



In [2]:
# Mount Google Drive so its files are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Root folder of the scanned-document dataset on the mounted Drive.
BASE_DIR = '/content/drive/MyDrive/scanned_documents/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os

# Dataset root on the mounted Drive (the trailing slash shows up in the messages below).
BASE_DIR = '/content/drive/MyDrive/scanned_documents/'

# Sanity check: report a bad path first, otherwise show the top-level entries.
if not os.path.isdir(BASE_DIR):
    print(f"ERROR: Directory not found at {BASE_DIR}. Please check the path.")
else:
    print(f"Contents of {BASE_DIR}:")
    base_entries = os.listdir(BASE_DIR)
    print(base_entries)

Contents of /content/drive/MyDrive/scanned_documents/:
['Wikipedia', 'Flatfield', 'Official', 'scanner_data.csv']


In [6]:
import os

# Visual evidence: look inside one class folder to see how its contents are laid out.

EXAMPLE_SCANNER_PATH = '/content/drive/MyDrive/scanned_documents/Wikipedia/'

print(f"Listing all contents (files AND folders) inside: {EXAMPLE_SCANNER_PATH}")

if not os.path.isdir(EXAMPLE_SCANNER_PATH):
    print(f"ERROR: Subfolder not found at {EXAMPLE_SCANNER_PATH}. Check the name.")
else:
    # Every direct child of the folder, files and sub-folders alike.
    all_contents = os.listdir(EXAMPLE_SCANNER_PATH)

    # Keep only regular, non-hidden files that sit directly in this folder;
    # anything stored in nested sub-folders is not counted here.
    files_only = []
    for entry in all_contents:
        if os.path.isfile(os.path.join(EXAMPLE_SCANNER_PATH, entry)) and not entry.startswith('.'):
            files_only.append(entry)

    print(f"Total items found in subfolder: {len(all_contents)}")
    print(f"Total files found in subfolder: {len(files_only)}")
    print("\n--- FIRST 20 FILES LISTED ---")
    # Slicing caps the listing at 20 names without a manual counter.
    for file_name in files_only[:20]:
        print(file_name)

Listing all contents (files AND folders) inside: /content/drive/MyDrive/scanned_documents/Wikipedia/
Total items found in subfolder: 11
Total files found in subfolder: 0

--- FIRST 20 FILES LISTED ---


In [7]:
# --- Configuration ---
DATA_ROOT = '/content/drive/MyDrive/scanned_documents/'

# Image suffixes to index; each file's suffix is lower-cased before the lookup,
# so '.TIF', '.Tif' and '.tif' are all accepted.
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.tiff', '.tif'}

# Folders that the Week 2 cells place under DATA_ROOT. They hold derived
# outputs, not scans, so they must not be treated as scanner classes when this
# cell is re-run.
NON_SCANNER_DIRS = {'preprocessed_data', 'report_visuals'}

# Column order of the manifest. Passing it explicitly keeps the DataFrame (and
# the CSV header) well-formed even when no image is found.
MANIFEST_COLUMNS = [
    'file_path', 'scanner_model', 'label_id', 'width', 'height',
    'format', 'color_mode', 'resolution',
]

data = []

# --- Script to build the DataFrame (Recursive Version) ---
print(f"Scanning files RECURSIVELY in: {DATA_ROOT}")

# Top-level folders are the class labels. os.listdir returns entries in
# arbitrary order, so sort them to make label_id reproducible across runs.
scanner_classes = sorted(
    d for d in os.listdir(DATA_ROOT)
    if os.path.isdir(os.path.join(DATA_ROOT, d))
    and not d.startswith('.')
    and d not in NON_SCANNER_DIRS
)

for label_id, scanner_model in enumerate(scanner_classes):
    # Path to the top-level class folder (e.g., /.../Wikipedia)
    scanner_path = os.path.join(DATA_ROOT, scanner_model)

    # Traverse the class folder once ('**' descends into every sub-directory)
    # and filter by suffix afterwards, instead of one traversal per extension.
    # Sorting keeps the manifest row order stable between runs.
    candidates = glob.glob(os.path.join(scanner_path, '**', '*'), recursive=True)
    file_list = sorted(
        p for p in candidates
        if os.path.splitext(p)[1].lower() in IMAGE_EXTENSIONS
    )

    for file_path in file_list:
        try:
            # The context manager closes the file handle as soon as the
            # metadata has been read.
            with Image.open(file_path) as img:
                width, height = img.size
                file_format = img.format
                color_mode = img.mode
        except Exception as e:
            # Best effort: an unreadable file is reported and left out of the
            # manifest rather than aborting the whole scan.
            print(f"Skipping file {file_path}. Error: {e}")
            continue

        data.append({
            'file_path': file_path,
            'scanner_model': scanner_model,
            'label_id': label_id,
            'width': width,
            'height': height,
            'format': file_format,
            'color_mode': color_mode,
            'resolution': f"{width}x{height}"
        })

# Create the final DataFrame
df = pd.DataFrame(data, columns=MANIFEST_COLUMNS)

# Save the manifest
OUTPUT_CSV_PATH = os.path.join(DATA_ROOT, 'scanner_data_final.csv')
df.to_csv(OUTPUT_CSV_PATH, index=False)

print("\n--- Week 1 Labeling & Manifest Creation Complete ---")
print(f"Total images found: {len(df)}")
# Safe on an empty manifest because the columns are always present.
print(f"Scanner Models Found: {df['scanner_model'].nunique()}")

# --- Initial Analysis ---
if not df.empty:
    print("\n--- Initial Dataset Analysis ---")
    print("\nClass Balance (Number of Samples per Scanner):")
    print(df['scanner_model'].value_counts())

    print("\nResolution Distribution (Top 10):")
    print(df['resolution'].value_counts().head(10))

    print("\nColor Mode Distribution:")
    print(df['color_mode'].value_counts())
else:
    # Reached when the scan yields no readable image at all.
    print("FATAL ERROR: Still found 0 images. Please manually check Google Drive for file extensions.")

Scanning files RECURSIVELY in: /content/drive/MyDrive/scanned_documents/

--- Week 1 Labeling & Manifest Creation Complete ---
Total images found: 4590
Scanner Models Found: 3

--- Initial Dataset Analysis ---

Class Balance (Number of Samples per Scanner):
scanner_model
Wikipedia    2368
Official     2200
Flatfield      22
Name: count, dtype: int64

Resolution Distribution (Top 10):
resolution
2480x3508    1871
1240x1754     837
1240x1752     626
1236x1754     420
2478x3508     209
1239x1754     209
2481x3487     146
1240x1743     137
1240x1727      72
2481x3471      62
Name: count, dtype: int64

Color Mode Distribution:
color_mode
RGB    4590
Name: count, dtype: int64


WEEK2

In [8]:
# Install OpenCV if it's not already in Colab (it usually is)
!pip install opencv-python

import os
import pandas as pd
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm # for showing progress bar

# --- Define Paths and Parameters ---
# Dataset root on the mounted Drive.
BASE_DIR = '/content/drive/MyDrive/scanned_documents/'
# Manifest CSV; same file name the Week 1 manifest cell writes under this folder.
MANIFEST_PATH = os.path.join(BASE_DIR, 'scanner_data_final.csv')

# Define the target directory for the preprocessed data structure
# NOTE(review): this folder sits inside BASE_DIR, whose sub-folders the Week 1
# manifest cell enumerates as scanner classes; confirm it is excluded on re-runs.
OUTPUT_DIR = os.path.join(BASE_DIR, 'preprocessed_data')

# TARGET RESOLUTION (Common choice for ML models)
TARGET_SIZE = (256, 256)
# Note: the most common scan size in the Week 1 output is 2480x3508
# (width x height), so 256x256 is a significant reduction.
# If computational resources allow, a larger size like 512x512 could retain more detail.
# We'll use 256x256 for efficiency. The report-visuals cell further down
# rebinds TARGET_SIZE to (512, 512).



In [10]:
import os
import cv2
import numpy as np
from PIL import Image


# Sample scans used for the before/after report visuals.
# NOTE(review): unlike the other paths in this notebook, these do not go through
# the 'scanned_documents' folder; confirm that is intentional.
OFFICIAL_IMAGE_PATH = '/content/drive/MyDrive/Official/Canon120-1/150/s1_1.tif'
WIKIPEDIA_IMAGE_PATH = '/content/drive/MyDrive/Wikipedia/Canon120-1/150/s1_1.tif'

# Folder that receives the generated PNGs; created up front if it is missing.
OUTPUT_VISUALS_DIR = '/content/drive/MyDrive/scanned_documents/report_visuals/'
os.makedirs(OUTPUT_VISUALS_DIR, exist_ok=True)

# Size handed to cv2.resize. This rebinds TARGET_SIZE, which the Week 2 setup
# cell set to (256, 256).
TARGET_SIZE = (512, 512)

# --- Preprocessing Function (for Report Visuals) ---
def preprocess_for_report(file_path, sample_name, target_size=None):
    """Save a before/after pair of PNG visuals for one sample scan.

    The image is loaded unchanged and written out as the "original" visual.
    It is then converted to 8-bit grayscale, resized, and written out as the
    "preprocessed" visual. Both files are placed in OUTPUT_VISUALS_DIR.

    Args:
        file_path: Path of the source image.
        sample_name: Prefix used for both output file names.
        target_size: Optional (width, height) of the resized image. Defaults
            to the module-level TARGET_SIZE.

    Returns:
        None. Progress and failures are reported with print().
    """
    if target_size is None:
        target_size = TARGET_SIZE
    out_width, out_height = target_size

    # 1. Load Image (IMREAD_UNCHANGED keeps the stored bit depth and channels)
    img = cv2.imread(file_path, cv2.IMREAD_UNCHANGED)
    if img is None:
        print(f"Error: Could not load {file_path}")
        return

    # Save ORIGINAL Image (for visual). cv2.imwrite reports failure through its
    # return value, so only announce the file when it was actually written.
    original_save_path = os.path.join(OUTPUT_VISUALS_DIR, f'{sample_name}_Original.png')
    if cv2.imwrite(original_save_path, img):
        print(f"Saved Original Visual: {original_save_path}")
    else:
        print(f"Error: Could not write {original_save_path}")

    # 2. Grayscale conversion for multi-channel input
    if img.ndim == 3:
        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    else:
        gray_img = img

    # 16-bit to 8-bit conversion, applied after the grayscale step so that
    # 16-bit colour scans are reduced as well. Shifting right by 8 bits gives
    # the same result as dividing by 256 and truncating, without a float copy.
    if gray_img.dtype == np.uint16:
        gray_img = (gray_img >> 8).astype(np.uint8)

    # 3. Resize to the target size
    resized_img = cv2.resize(gray_img, (out_width, out_height), interpolation=cv2.INTER_AREA)

    # Save PREPROCESSED Image; the file name carries the size actually used.
    processed_save_path = os.path.join(
        OUTPUT_VISUALS_DIR,
        f'{sample_name}_Preprocessed_{out_width}x{out_height}.png',
    )
    if cv2.imwrite(processed_save_path, resized_img):
        print(f"Saved Preprocessed Visual: {processed_save_path}")
    else:
        print(f"Error: Could not write {processed_save_path}")

# --- Execution ---
print("--- Generating Report Visuals ---")

# One (source path, output prefix) pair per sample, processed in order.
report_samples = [
    (OFFICIAL_IMAGE_PATH, "Official_Sample"),
    (WIKIPEDIA_IMAGE_PATH, "Wikipedia_Sample"),
]
for sample_path, sample_label in report_samples:
    preprocess_for_report(sample_path, sample_label)

print("\n--- Visuals Generation Complete ---")
print(f"Find your 4 visualization files in: {OUTPUT_VISUALS_DIR}")

--- Generating Report Visuals ---
Saved Original Visual: /content/drive/MyDrive/scanned_documents/report_visuals/Official_Sample_Original.png
Saved Preprocessed Visual: /content/drive/MyDrive/scanned_documents/report_visuals/Official_Sample_Preprocessed_512x512.png
Saved Original Visual: /content/drive/MyDrive/scanned_documents/report_visuals/Wikipedia_Sample_Original.png
Saved Preprocessed Visual: /content/drive/MyDrive/scanned_documents/report_visuals/Wikipedia_Sample_Preprocessed_512x512.png

--- Visuals Generation Complete ---
Find your 4 visualization files in: /content/drive/MyDrive/scanned_documents/report_visuals/
