In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive used by
# the cells below are reachable.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


# Locate the IARC image bank


In [None]:
import os
import pandas as pd

# Location of the case-level metadata workbook on the mounted Drive.
path = "/content/drive/MyDrive/ML Project/IARCImageBankVIA/IARCImageBankVIA/Cases Meta data.xlsx"

if not os.path.exists(path):
    print(f"File not found at: {path}")
else:
    print("Found it!")
    # Read the whole workbook into memory.
    df = pd.read_excel(path)

    # Preview three columns for the first 30 cases.
    # Column names are case-sensitive and must match the workbook exactly.
    preview_columns = ['CaseNumber', 'CaseID', 'VIA']
    subset = df.loc[:, preview_columns].head(30)

    print(subset)

Found it!
    CaseNumber CaseID       VIA
0            1    AFC  Negative
1            2    AJL  Negative
2            3    AGY  Negative
3            4    AJE  Negative
4            5    AHS  Negative
5            6    AMY  Negative
6            7    AFE  Negative
7            8    AMF  Negative
8            9    AIF  Negative
9           10    AJG  Negative
10          11    AGW  Negative
11          12    AMK  Negative
12          13    AFH  Negative
13          14    ANC  Negative
14          15    AIH  Negative
15          16    AHT  Negative
16          17    AGV  Negative
17          18    AMT  Negative
18          19    AIL  Negative
19          20    AFI  Negative
20          21    AKB  Negative
21          22    AHR  Negative
22          23    ANP  Negative
23          24    AGM  Negative
24          25    AJW  Negative
25          26    AID  Negative
26          27    ANV  Negative
27          28    AFJ  Negative
28          29    ANS  Negative
29          30    AJZ  Negativ

# Generate Label CSV

In [None]:
# 1. Load the case metadata workbook (`path` is set in the earlier "Locate" cell)
df = pd.read_excel(path)

# 2. Define the classification logic
def classify_via(via_value):
    """Map a raw VIA finding to one of the dataset's class labels.

    Args:
        via_value: Raw cell value from the ``VIA`` column, e.g. ``"Negative"``,
            ``"Positive"`` or ``"Suspicious of cancer"``. Matching ignores
            case and surrounding whitespace.

    Returns:
        ``'Normal'`` for a negative finding, ``'Precancerous'`` for a positive
        finding, ``'Cancerous'`` for any other non-empty finding, and ``None``
        when the value is missing or blank.
    """
    # A missing cell arrives as None/NaN. Without this guard str(NaN) == "nan"
    # would fall through to the catch-all branch and be labelled 'Cancerous'.
    if via_value is None or pd.isna(via_value):
        return None

    status = str(via_value).strip().lower()

    # A whitespace-only cell carries no finding either.
    if not status:
        return None

    if status == 'negative':
        return 'Normal'
    if status == 'positive':
        return 'Precancerous'

    # Every remaining finding (e.g. 'Suspicious of cancer') is treated as cancerous.
    return 'Cancerous'

# Show the raw categories first so the mapping can be sanity-checked by eye.
raw_via_values = df['VIA'].unique()
print("Unique VIA values found in source:", raw_via_values)

# 3. Derive the class label for every case
df['VIA result'] = df['VIA'].apply(classify_via)

# 4. Keep only the case ID and its label; the ID column is exposed as 'caseID'
output_df = df[['CaseID', 'VIA result']].rename(columns={'CaseID': 'caseID'})

# 5. Write labels.csv next to the source workbook
source_dir = os.path.dirname(path)
output_path = os.path.join(source_dir, 'labels.csv')
output_df.to_csv(output_path, index=False)

# Report what was written: location, first rows and per-class counts.
print("-" * 30)
print(f"Success! labels.csv saved to: {output_path}")
print("\nPreview of the new dataset:")
print(output_df.head(10))
print("\nDistribution of classes:")
print(output_df['VIA result'].value_counts())

Unique VIA values found in source: ['Negative' 'Positive' 'Suspicious of cancer']
------------------------------
Success! labels.csv saved to: /content/drive/MyDrive/ML Project/IARCImageBankVIA/IARCImageBankVIA/labels.csv

Preview of the new dataset:
  caseID VIA result
0    AFC     Normal
1    AJL     Normal
2    AGY     Normal
3    AJE     Normal
4    AHS     Normal
5    AMY     Normal
6    AFE     Normal
7    AMF     Normal
8    AIF     Normal
9    AJG     Normal

Distribution of classes:
VIA result
Normal          92
Precancerous    74
Cancerous       20
Name: count, dtype: int64


# Extract only the post-VIA images and store their IDs in a CSV

In [None]:
import pandas as pd
import os

# 1. Define Paths
base_dir = "/content/drive/MyDrive/ML Project/IARCImageBankVIA/IARCImageBankVIA/"
input_path = os.path.join(base_dir, "Cases - Images.xlsx")
output_path = os.path.join(base_dir, "cases_images_post_via.csv")

if os.path.exists(input_path):
    print(f"Reading file: {input_path}")
    df = pd.read_excel(input_path)

    print("Original Columns:", df.columns.tolist())

    # 2. Filter for Post-VIA images
    # Locate the column that describes the image type by content rather than
    # by name: it is the first column in which any value mentions "acetic acid".
    type_col = None
    for col in df.columns:
        if df[col].astype(str).str.contains("acetic acid", case=False).any():
            type_col = col
            break

    # Compare against None explicitly: a column labelled 0 or "" is a valid
    # match but would be falsy in a plain truthiness test.
    if type_col is not None:
        print(f"Found description column: '{type_col}'")

        # Keep only rows taken *after* applying *acetic acid*. Matching on
        # "After" alone would also keep any other "After ..." image type,
        # which is not a post-VIA image.
        type_text = df[type_col].astype(str)
        is_post_via = (
            type_text.str.contains("after", case=False)
            & type_text.str.contains("acetic acid", case=False)
        )
        filtered_df = df[is_post_via]

        # 3. Save to CSV
        filtered_df.to_csv(output_path, index=False)

        print("-" * 30)
        print(f"Success! Filtered dataset saved to: {output_path}")
        print(f"Original rows: {len(df)}")
        print(f"Filtered rows: {len(filtered_df)}")
        print("\nPreview:")
        print(filtered_df.head())
    else:
        print("Error: Could not identify the column containing 'Before/After' descriptions.")
        print("Please check the column names printed above.")

else:
    print(f"File not found: {input_path}")
    print("Please verify the file name and directory.")

Reading file: /content/drive/MyDrive/ML Project/IARCImageBankVIA/IARCImageBankVIA/Cases - Images.xlsx
Original Columns: ['CaseNumber', 'File', 'Type']
Found description column: 'Type'
------------------------------
Success! Filtered dataset saved to: /content/drive/MyDrive/ML Project/IARCImageBankVIA/IARCImageBankVIA/cases_images_post_via.csv
Original rows: 420
Filtered rows: 234

Preview:
   CaseNumber      File                              Type
1           1  AFC1.jpg  After application of acetic acid
3           2  AJL1.jpg  After application of acetic acid
5           3  AGY1.jpg  After application of acetic acid
7           4  AJE1.jpg  After application of acetic acid
9           5  AHS1.jpg  After application of acetic acid


# Store all the VIA images with the Jhpiego images

In [None]:
import shutil
def collect_iarc_images(
    source_root="/content/drive/MyDrive/ML Project/IARCImageBankVIA/IARCImageBankVIA/",
    dest_dir="/content/drive/MyDrive/ML Project/via_dataset/",
    filename_prefix="IARC_image_bank_",
):
    """Copy the post-VIA image of every IARC case folder into one flat folder.

    Every sub-folder of ``source_root`` whose name starts with ``"Case "`` is
    scanned for files whose lower-cased name ends in ``1.jpg``. Each match is
    copied to ``dest_dir`` as ``<filename_prefix><original file name>``.
    Files that already exist at the destination are skipped, so re-running
    only copies what is missing.

    Args:
        source_root: Folder that contains the ``Case ...`` sub-folders.
        dest_dir: Folder that receives the copies; created when absent.
        filename_prefix: Text prepended to every copied file name.

    Returns:
        None. Progress, a final summary and any errors are printed.
    """
    # Basic checks
    if not os.path.exists(source_root):
        print(f"Error: Source directory not found at: {source_root}")
        print("Please ensure you have created the shortcut in 'My Drive/ML Project'.")
        return

    if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)
        print(f"Created destination folder: {dest_dir}")
    else:
        print(f"Destination folder found: {dest_dir}")

    print(f"\nStarting image collection from: {source_root}")
    print("Looking for Post-VIA images (ending in '1.jpg')...\n")

    copied_count = 0
    errors = []

    # Sorted so the log and processing order are the same on every run
    # (os.listdir returns entries in arbitrary order).
    for case_folder_name in sorted(os.listdir(source_root)):
        case_dir_path = os.path.join(source_root, case_folder_name)

        # Only real directories named "Case ..." hold case images.
        if not (os.path.isdir(case_dir_path) and case_folder_name.startswith("Case ")):
            continue

        try:
            filenames = sorted(os.listdir(case_dir_path))
        except Exception as e:
            error_msg = f"Error processing {case_folder_name}: {e}"
            print(error_msg)
            errors.append(error_msg)
            continue

        for filename in filenames:
            # Keep .jpg files whose name ends with '1' before the extension,
            # e.g. 'AFC1.jpg' is copied, 'AFC0.jpg' is skipped.
            if not filename.lower().endswith('1.jpg'):
                continue

            src_path = os.path.join(case_dir_path, filename)
            new_filename = f"{filename_prefix}{filename}"
            dest_path = os.path.join(dest_dir, new_filename)

            # Never overwrite: an existing copy is assumed to be up to date.
            if os.path.exists(dest_path):
                print(f"[Skipped] {new_filename} already exists.")
                continue

            # Errors are handled per file so one failed copy does not abandon
            # the remaining images of the same case folder.
            try:
                shutil.copy2(src_path, dest_path)
            except Exception as e:
                error_msg = f"Error processing {case_folder_name}/{filename}: {e}"
                print(error_msg)
                errors.append(error_msg)
                continue

            print(f"[Copied] {case_folder_name}/{filename}  ->  {new_filename}")
            copied_count += 1

    # Final summary
    print("-" * 30)
    print(f"Completed! Successfully copied {copied_count} new images.")
    print(f"All images are now in: {dest_dir}")

    if errors:
        print("\nEncountered some errors:")
        for e in errors:
            print(f"- {e}")

# Run the function
collect_iarc_images()

Destination folder found: /content/drive/MyDrive/ML Project/via_dataset/

Starting image collection from: /content/drive/MyDrive/ML Project/IARCImageBankVIA/IARCImageBankVIA/
Looking for Post-VIA images (ending in '1.jpg')...

[Copied] Case 001/AFC1.jpg  ->  IARC_image_bank_AFC1.jpg
[Copied] Case 151/ABQ1.jpg  ->  IARC_image_bank_ABQ1.jpg
[Copied] Case 132/AAO1.jpg  ->  IARC_image_bank_AAO1.jpg
[Copied] Case 071/ACN1.jpg  ->  IARC_image_bank_ACN1.jpg
[Copied] Case 014/ANC1.jpg  ->  IARC_image_bank_ANC1.jpg
[Copied] Case 134/ABO1.jpg  ->  IARC_image_bank_ABO1.jpg
[Copied] Case 168/AEB1.jpg  ->  IARC_image_bank_AEB1.jpg
[Copied] Case 085/ABE1.jpg  ->  IARC_image_bank_ABE1.jpg
[Copied] Case 102/AJA1.jpg  ->  IARC_image_bank_AJA1.jpg
[Copied] Case 160/ADX1.jpg  ->  IARC_image_bank_ADX1.jpg
[Copied] Case 105/ABJ1.jpg  ->  IARC_image_bank_ABJ1.jpg
[Copied] Case 167/ABP1.jpg  ->  IARC_image_bank_ABP1.jpg
[Copied] Case 157/ADN1.jpg  ->  IARC_image_bank_ADN1.jpg
[Copied] Case 158/ACW1.jpg  ->  

# Create the IARC label table with case ID, file name, and VIA result

In [None]:
def merge_iarc_data(
    base_dir="/content/drive/MyDrive/ML Project/IARCImageBankVIA/IARCImageBankVIA/",
    filename_prefix="IARC_image_bank_",
):
    """Attach the per-case label to every listed image and save the result.

    Reads ``labels.csv`` (columns ``caseID``, ``VIA result``) and
    ``cases_images_post_via.csv`` (must contain a ``File`` column) from
    ``base_dir``, derives a ``caseID`` from each file name, left-joins the
    labels onto the images and writes ``iarc_merged_dataset.csv`` to
    ``base_dir``.

    Args:
        base_dir: Folder holding the two input CSVs; also receives the output.
        filename_prefix: Prefix used to build the ``new_filename`` column.

    Returns:
        None. Progress, a summary and a preview are printed.
    """
    labels_path = os.path.join(base_dir, "labels.csv")
    images_path = os.path.join(base_dir, "cases_images_post_via.csv")
    output_path = os.path.join(base_dir, "iarc_merged_dataset.csv")

    # Both inputs are required; report the paths that were checked.
    if not os.path.exists(labels_path) or not os.path.exists(images_path):
        print("Error: Could not find one of the CSV files.")
        print(f"Checking: {labels_path}")
        print(f"Checking: {images_path}")
        return

    print("Loading files...")
    df_labels = pd.read_csv(labels_path)
    df_images = pd.read_csv(images_path)

    def extract_id_from_filename(filename):
        """Return the file name without its extension and last character.

        Example: "AFC1.jpg" -> "AFC".
        NOTE(review): assumes the image index is exactly one character long;
        a name such as "AFC10.jpg" would yield "AFC1" -- confirm no such
        files exist.
        """
        name_no_ext = os.path.splitext(str(filename))[0]
        return name_no_ext[:-1]

    df_images['caseID'] = df_images['File'].apply(extract_id_from_filename)

    # Normalise both join keys to stripped strings so they compare equal.
    df_images['caseID'] = df_images['caseID'].astype(str).str.strip()
    df_labels['caseID'] = df_labels['caseID'].astype(str).str.strip()

    # Left join: every image row is kept, with a label where one matches.
    merged_df = pd.merge(df_images, df_labels, on='caseID', how='left')

    # Prefixed file name column, intended to match the copied image names.
    merged_df['new_filename'] = filename_prefix + merged_df['File']

    merged_df.to_csv(output_path, index=False)

    print("-" * 30)
    print("Success! Data merged.")
    print(f"Saved to: {output_path}")
    print(f"Total Rows: {len(merged_df)}")

    # Rows without a label are images whose caseID is absent from labels.csv
    # (or whose label is empty there).
    missing_labels = merged_df['VIA result'].isna().sum()
    if missing_labels > 0:
        print(f"Warning: {missing_labels} images could not be matched to a label.")
    else:
        print("All images matched with a label!")

    print("\nPreview:")
    print(merged_df[['caseID', 'File', 'new_filename', 'VIA result']].head())

# Run it
merge_iarc_data()

Loading files...
------------------------------
Success! Data merged.
Saved to: /content/drive/MyDrive/ML Project/IARCImageBankVIA/IARCImageBankVIA/iarc_merged_dataset.csv
Total Rows: 191
All images matched with a label!

Preview:
  caseID      File              new_filename VIA result
0    AFC  AFC1.jpg  IARC_image_bank_AFC1.jpg     Normal
1    AJL  AJL1.jpg  IARC_image_bank_AJL1.jpg     Normal
2    AGY  AGY1.jpg  IARC_image_bank_AGY1.jpg     Normal
3    AJE  AJE1.jpg  IARC_image_bank_AJE1.jpg     Normal
4    AHS  AHS1.jpg  IARC_image_bank_AHS1.jpg     Normal


In [None]:
import pandas as pd
import os
from google.colab import drive

# Mounting again is harmless: when Drive is already mounted Colab only prints
# a "Drive already mounted" notice (see this cell's output).
drive.mount('/content/drive')

def merge_datasets_final(project_root="/content/drive/MyDrive/ML Project", random_state=42):
    """Combine the IARC and Jhpiego label tables into one shuffled CSV.

    Reads ``IARCImageBankVIA/IARCImageBankVIA/iarc_merged_dataset.csv`` and
    ``via_dataset/labels.csv`` below ``project_root``, reduces both to the
    columns ``filename`` and ``label``, concatenates them (Jhpiego rows
    first), shuffles the rows and writes ``labels_combine.csv`` into
    ``project_root``.

    Args:
        project_root: Folder that contains both dataset folders and receives
            the combined CSV.
        random_state: Seed for the row shuffle, so the order is reproducible.

    Returns:
        None. Progress, the label distribution and a preview are printed.
    """
    # IARC table as written by merge_iarc_data (iarc_merged_dataset.csv).
    iarc_csv_path = os.path.join(project_root, "IARCImageBankVIA/IARCImageBankVIA/iarc_merged_dataset.csv")
    # Jhpiego table; read below with 'filename' and 'label' columns.
    jhpiego_csv_path = os.path.join(project_root, "via_dataset/labels.csv")
    output_path = os.path.join(project_root, "labels_combine.csv")

    # --- Load IARC Data ---
    if not os.path.exists(iarc_csv_path):
        print(f"Error: Could not find IARC CSV at {iarc_csv_path}")
        return

    print("Loading IARC data...")
    df_iarc = pd.read_csv(iarc_csv_path)

    # Bring the IARC columns in line with the Jhpiego naming:
    # new_filename -> filename, VIA result -> label.
    df_iarc = df_iarc.rename(columns={'new_filename': 'filename', 'VIA result': 'label'})
    df_iarc_clean = df_iarc[['filename', 'label']].copy()
    print(f"IARC samples: {len(df_iarc_clean)}")

    # --- Load Jhpiego Data ---
    if not os.path.exists(jhpiego_csv_path):
        print(f"Error: Could not find Jhpiego CSV at {jhpiego_csv_path}")
        return

    print("Loading Jhpiego data...")
    df_jh = pd.read_csv(jhpiego_csv_path)

    # Selecting explicitly drops any extra columns the file may carry.
    df_jh_clean = df_jh[['filename', 'label']].copy()
    print(f"Jhpiego samples: {len(df_jh_clean)}")

    # --- Merge ---
    print("Merging...")
    final_df = pd.concat([df_jh_clean, df_iarc_clean], ignore_index=True)

    # Shuffle with a fixed seed so the two sources are interleaved reproducibly.
    final_df = final_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # --- Save ---
    final_df.to_csv(output_path, index=False)

    print("-" * 30)
    print("Success! Final combined dataset created.")
    print(f"Saved to: {output_path}")
    print(f"Total Images: {len(final_df)}")
    print("\nLabel Distribution:")
    print(final_df['label'].value_counts())
    print("\nPreview:")
    print(final_df.head())

merge_datasets_final()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loading IARC data...
IARC samples: 191
Loading Jhpiego data...
Jhpiego samples: 115
Merging...
------------------------------
Success! Final combined dataset created.
Saved to: /content/drive/MyDrive/ML Project/labels_combine.csv
Total Images: 306

Label Distribution:
label
Normal          149
Precancerous    131
Cancerous        26
Name: count, dtype: int64

Preview:
                     filename         label
0    IARC_image_bank_ADI1.jpg        Normal
1    IARC_image_bank_AHQ1.jpg        Normal
2  sample112_Precancerous.jpg  Precancerous
3    IARC_image_bank_AAA1.jpg        Normal
4   sample61_Precancerous.jpg  Precancerous
