In [None]:

# Every tile identifier known to this notebook, one per line, in ascending
# string order. The validation and test lists further down in the file are
# drawn from these identifiers.
all_files = [
    '107f24d6e9_F1BE1D4184INSPIRE',
    '11cdce7802_B6A62F8BE0INSPIRE',
    '12fa5e614f_53197F206FOPENPIPELINE',
    '130a76ebe1_68B40B480AOPENPIPELINE',
    '1476907971_CHADGRISMOPENPIPELINE',
    '1553541487_APIGENERATED',
    '1553541585_APIGENERATED',
    '1553627230_APIGENERATED',
    '15efe45820_D95DF0B1F4INSPIRE',
    '1726eb08ef_60693DB04DINSPIRE',
    '1d056881e8_29FEA32BC7INSPIRE',
    '1d4fbe33f3_F1BE1D4184INSPIRE',
    '1df70e7340_4413A67E91INSPIRE',
    '2552eb56dd_2AABB46C86OPENPIPELINE',
    '25f1c24f30_EB81FE6E2BOPENPIPELINE',
    '2ef3a4994a_0CCD105428INSPIRE',
    '2ef883f08d_F317F9C1DFOPENPIPELINE',
    '34fbf7c2bd_E8AD935CEDINSPIRE',
    '3502e187b2_23071E4605OPENPIPELINE',
    '39e77bedd0_729FB913CDOPENPIPELINE',
    '420d6b69b8_84B52814D2OPENPIPELINE',
    '520947aa07_8FCB044F58OPENPIPELINE',
    '551063e3c5_8FCB044F58INSPIRE',
    '57426ebe1e_84B52814D2OPENPIPELINE',
    '5fa39d6378_DB9FF730D9OPENPIPELINE',
    '6f93b9026b_F1BFB8B17DOPENPIPELINE',
    '7008b80b00_FF24A4975DINSPIRE',
    '74d7796531_EB81FE6E2BOPENPIPELINE',
    '7c719dfcc0_310490364FINSPIRE',
    '84410645db_8D20F02042OPENPIPELINE',
    '8710b98ea0_06E6522D6DINSPIRE',
    '888432f840_80E7FD39EBINSPIRE',
    '9170479165_625EDFBAB6OPENPIPELINE',
    'a1af86939f_F1BE1D4184OPENPIPELINE',
    'b61673f780_4413A67E91INSPIRE',
    'b705d0cc9c_E5F5E0E316OPENPIPELINE',
    'b771104de5_7E02A41EBEOPENPIPELINE',
    'c2e8370ca3_3340CAC7AEOPENPIPELINE',
    'c37dbfae2f_84B52814D2OPENPIPELINE',
    'c644f91210_27E21B7F30OPENPIPELINE',
    'c6d131e346_536DE05ED2OPENPIPELINE',
    'c8a7031e5f_32156F5DC2INSPIRE',
    'cc4b443c7d_A9CBEF2C97INSPIRE',
    'd06b2c67d2_2A62B67B52OPENPIPELINE',
    'd9161f7e18_C05BA1BC72OPENPIPELINE',
    'dabec5e872_E8AD935CEDINSPIRE',
    'e87da4ebdb_29FEA32BC7INSPIRE',
    'ebffe540d0_7BA042D858OPENPIPELINE',
    'ec09336a6f_06BA0AF311OPENPIPELINE',
    'f0747ed88d_E74C0DD8FDOPENPIPELINE',
    'f4dd768188_NOLANOPENPIPELINE',
    'f56b6b2232_2A62B67B52OPENPIPELINE',
    'f971256246_MIKEINSPIRE',
    'f9f43e5144_1DB9E6F68BINSPIRE',
    'fc5837dcf8_7CD52BE09EINSPIRE',
]


# Tile identifiers reserved for the validation split. They are used as
# `tile_id` prefixes when the metadata is partitioned.
val_files = [
    'c644f91210_27E21B7F30OPENPIPELINE',
    'f9f43e5144_1DB9E6F68BINSPIRE',
    '1d056881e8_29FEA32BC7INSPIRE',
    '3502e187b2_23071E4605OPENPIPELINE',
    'd9161f7e18_C05BA1BC72OPENPIPELINE',
    'c8a7031e5f_32156F5DC2INSPIRE',
    '551063e3c5_8FCB044F58INSPIRE',
    'fc5837dcf8_7CD52BE09EINSPIRE',
    '39e77bedd0_729FB913CDOPENPIPELINE',
]

# Tile identifiers reserved for the test split. They are used as `tile_id`
# prefixes when the metadata is partitioned.
test_files = [
    '25f1c24f30_EB81FE6E2BOPENPIPELINE',
    '1d4fbe33f3_F1BE1D4184INSPIRE',
    '15efe45820_D95DF0B1F4INSPIRE',
    'c6d131e346_536DE05ED2OPENPIPELINE',
    '12fa5e614f_53197F206FOPENPIPELINE',
    '5fa39d6378_DB9FF730D9OPENPIPELINE',
    'ebffe540d0_7BA042D858OPENPIPELINE',
    '8710b98ea0_06E6522D6DINSPIRE',
    '84410645db_8D20F02042OPENPIPELINE',
    'a1af86939f_F1BE1D4184OPENPIPELINE',
]



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display


# Human-readable class names; a name's position in the list is its class index.
CLASS_NAMES = ['Building', 'Clutter', 'Vegetation', 'Water', 'Background', 'Car']
NUM_CLASSES = len(CLASS_NAMES)

# Per-class metadata column names, formatted as "<index>: <name>".
class_cols = [f'{idx}: {label}' for idx, label in enumerate(CLASS_NAMES)]

# RGB colour -> class index. Index 6 has no entry in CLASS_NAMES and is left
# out of the reverse mapping below (presumably an ignore/unlabelled colour —
# TODO confirm).
COLOR_TO_CLASS = {
    (230, 25, 75): 0,
    (145, 30, 180): 1,
    (60, 180, 75): 2,
    (245, 130, 48): 3,
    (255, 255, 255): 4,
    (0, 130, 200): 5,
    (255, 0, 255): 6,
}

# Class index -> RGB colour, restricted to the named classes.
CLASS_TO_COLOR = {
    label: rgb for rgb, label in COLOR_TO_CLASS.items() if label < NUM_CLASSES
}


def compute_chip_score(row):
    """Score one chip for training-subset selection (lower scores are preferred).

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide a ``"<i>: <ClassName>_norm"`` entry for every class in
        ``CLASS_NAMES``, holding that class's share of the chip.

    Returns
    -------
    float
        * ``-1``  -- always keep: the chip contains any Car pixels, or its
          Water share is strictly between 0 and 0.9.
        * ``inf`` -- skip: Background share is at least 0.825, or at most one
          class exceeds a 0.001 share (single-class chips, and chips whose
          ratios are all NaN because they had no pixels).
        * otherwise a finite score; callers sort ascending and take the head.
    """
    class_ratios = {f"{i}: {cls}": row[f"{i}: {cls}_norm"] for i, cls in enumerate(CLASS_NAMES)}

    if class_ratios["5: Car"] > 0:
        return -1  # Always keep

    if 0 < class_ratios["3: Water"] < 0.9:
        return -1  # Keep

    if class_ratios["4: Background"] >= 0.825:
        return float('inf')  # Skip

    # Count the classes that occupy a non-negligible share of the chip.
    unique_class_count = sum(1 for val in class_ratios.values() if val > 0.001)
    if unique_class_count <= 1:
        # Too homogeneous. A count of 0 only occurs when every ratio is NaN,
        # which previously slipped through and produced a NaN score.
        return float('inf')

    score = 0.0
    score += 2.5 * max(1 - class_ratios["0: Building"], 0)
    score += 11.0 * max(1 - class_ratios["3: Water"], 0)
    # Adds 19.5 per unit of Vegetation share above 0.1 (zero below that).
    score += -19.5 * min(0.1 - class_ratios["2: Vegetation"], 0)

    # Diversity rewards. The count is at least 2 here, so the old
    # "exactly one class" penalty branch was unreachable and has been dropped.
    if unique_class_count >= 4:
        score -= 6
    elif unique_class_count == 3:
        score -= 3
    else:
        score -= 0.5

    return score


def csv_to_df(split,
              metadata_path="/content/chipped_data/content/chipped_data/train_metadata.csv",
              target_subset=0.27):
    """Load the chip metadata CSV and return the rows for one data split.

    Parameters
    ----------
    split : {'train', 'val', 'test'}
        Which split to build.
    metadata_path : str, optional
        Location of the metadata CSV. Defaults to the path this function
        previously hard-coded.
    target_subset : float, optional
        Fraction of the (held-out-excluded) chips to aim for when building the
        training subset. Only used for ``split='train'``.

    Returns
    -------
    pandas.DataFrame
        * ``'train'``: chips whose ``tile_id`` does not start with any entry of
          ``val_files`` / ``test_files``, with ``total``, ``<class>_norm`` and
          ``score`` columns added. Every chip scored ``-1`` is kept, then the
          lowest-scoring remaining chips fill up the target size. Also prints
          selection statistics and plots the class distribution.
        * ``'val'`` / ``'test'``: chips whose ``tile_id`` starts with an entry
          of the matching list and whose ``x`` and ``y`` are multiples of 256.

    Raises
    ------
    ValueError
        If ``split`` is not one of the three supported names.
    """
    # Validate before touching the file system so a typo fails fast.
    if split not in ('train', 'val', 'test'):
        raise ValueError(f"Invalid split: {split}. Choose from 'train', 'val', or 'test'.")

    df = pd.read_csv(metadata_path)

    if split in ('val', 'test'):
        prefixes = tuple(val_files if split == 'val' else test_files)
        # astype(bool) keeps the mask boolean even when the frame is empty.
        wanted = df['tile_id'].apply(lambda tid: tid.startswith(prefixes)).astype(bool)
        df = df[wanted].copy()
        df = df[(df['x'] % 256 == 0) & (df['y'] % 256 == 0)].copy()
        return df

    # --- training split -----------------------------------------------------
    held_out = tuple(val_files + test_files)
    is_held_out = df['tile_id'].apply(lambda tid: tid.startswith(held_out)).astype(bool)
    df = df[~is_held_out].copy()

    df['total'] = df[class_cols].sum(axis=1)
    for col in class_cols:
        df[col + '_norm'] = df[col] / df['total']

    if df.empty:
        # Nothing to score or plot; the percentage prints below would divide by zero.
        print("No training chips remain after excluding val/test tiles.")
        df["score"] = pd.Series(dtype=float)
        return df

    df["score"] = df.apply(compute_chip_score, axis=1)

    # Breakdown stats. A score of -1 covers car chips AND partial-water chips;
    # inf covers background-heavy AND single-class chips, so the labels say so.
    always_keep = df[df["score"] == -1].copy()
    skipped_chips = df[df["score"] == float('inf')]
    print(f"🚗 Chips always kept (car or partial water): {len(always_keep)} ({len(always_keep)/len(df):.2%})")
    print(f"🧱 Chips skipped (background-heavy or single-class): {len(skipped_chips)} ({len(skipped_chips)/len(df):.2%})")

    # Select chips: everything marked always-keep, then the best of the rest.
    n_chips = int(len(df) * target_subset)
    rest = df[(df['score'] != -1) & (df['score'] != float('inf'))].sort_values('score')

    # Clamp at zero: DataFrame.head(-k) means "all but the last k rows", so a
    # negative remainder would silently pull in almost every remaining chip.
    n_extra = max(n_chips - len(always_keep), 0)
    best_chips = pd.concat([always_keep, rest.head(n_extra)])
    final_n = len(best_chips)

    plot_class_distribution_from_df(best_chips, title="Training Class Distribution")
    print(f"\n📦 Selected {final_n:,} chips from {len(df):,} total ({final_n / len(df):.2%})")

    return best_chips


# Second declaration of the class constants, with the same values as the one
# earlier in the file; the column names are derived from the class names.
CLASS_NAMES = ['Building', 'Clutter', 'Vegetation', 'Water', 'Background', 'Car']
NUM_CLASSES = len(CLASS_NAMES)
class_cols = [f'{idx}: {label}' for idx, label in enumerate(CLASS_NAMES)]

from IPython.display import display


def filter_for_water_stage(df, building_drop_percentile=5):
    """Select chips containing Water, Car or Building for stage 1 training.

    Steps, in order:

    1. Add ``total`` and ``<class>_norm`` columns if any ``_norm`` column is
       missing (done on a copy; the caller's frame is never modified).
    2. Keep chips with a non-zero Water, Car or Building share.
    3. Drop chips whose only classes above a 0.001 share are Vegetation and
       Background.
    4. Drop chips with at most one class above a 0.001 share.
    5. Among chips that have Building but no Water and no Car, drop those whose
       Building share is at or below the ``building_drop_percentile``-th
       percentile of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Chip metadata holding the ``class_cols`` columns and/or their
        ``_norm`` counterparts.
    building_drop_percentile : float, optional
        Percentile (0-100) used as the cut-off in step 5. The default of 5 is
        the value the code has always used (older comments said "10%").

    Returns
    -------
    pandas.DataFrame
        The filtered chips with an added ``unique_class_count`` column. Key
        counts are printed as a side effect.
    """
    norm_cols = [col + '_norm' for col in class_cols]

    # Check for the exact columns needed, so an unrelated or partial set of
    # "*_norm" columns cannot cause the normalisation to be skipped.
    if any(col not in df.columns for col in norm_cols):
        df = df.copy()  # do not write new columns into the caller's frame
        df['total'] = df[class_cols].sum(axis=1)
        for col in class_cols:
            df[col + '_norm'] = df[col] / df['total']

    # Identify chips containing the focus classes
    has_focus_class = (
        (df["3: Water_norm"] > 0) |
        (df["5: Car_norm"] > 0) |
        (df["0: Building_norm"] > 0)
    )
    df_focus = df[has_focus_class].copy()

    # Which classes are meaningfully present in each chip, and how many
    present = df_focus[norm_cols].gt(0.001)
    df_focus["unique_class_count"] = present.sum(axis=1)

    # Drop chips that only have Vegetation and Background
    veg_bg_only = (
        (df_focus["unique_class_count"] == 2)
        & present["2: Vegetation_norm"]
        & present["4: Background_norm"]
    )
    df_focus = df_focus[~veg_bg_only]

    # Remove chips with only 1 unique class
    df_focus = df_focus[df_focus["unique_class_count"] > 1]

    # Remove the weakest Building chips (lowest Building share), excluding
    # those with Water or Car.
    building_only = (
        (df_focus["0: Building_norm"] > 0) &
        (df_focus["3: Water_norm"] == 0) &
        (df_focus["5: Car_norm"] == 0)
    )
    if building_only.any():  # a percentile of an empty selection is undefined
        cutoff = np.percentile(
            df_focus.loc[building_only, "0: Building_norm"], building_drop_percentile
        )
        df_focus = df_focus[~(building_only & (df_focus["0: Building_norm"] <= cutoff))]

    # Recount after filtering
    print(f"📦 Total chips for stage 1: {len(df_focus)}")
    print(f"💧 Chips with Water: {len(df_focus[df_focus['3: Water_norm'] > 0])}")
    print(f"🚗 Chips with Car: {len(df_focus[df_focus['5: Car_norm'] > 0])}")
    print(f"🏢 Chips with Building: {len(df_focus[df_focus['0: Building_norm'] > 0])}")
    print("🎨 Distribution of unique classes per chip:")
    print(df_focus["unique_class_count"].value_counts().sort_index())

    return df_focus


def balanced_stage1_filter(df):
    """Return the chips whose Car or Water share is in the stage-1 range.

    Keep:
    - Any chip whose Car share is strictly between 0.1 and 0.9
    - Any chip whose Water share is strictly between 0 and 0.9

    The thresholds are fractions, so each class is read from its
    ``"<class>_norm"`` column when the frame has one. The raw ``"5: Car"`` /
    ``"3: Water"`` column is used only as a fallback, which is correct only if
    it already holds fractions (NOTE(review): if the raw columns are pixel
    counts, pass a frame that has the ``_norm`` columns).

    Parameters
    ----------
    df : pandas.DataFrame
        Chip metadata with ``"5: Car"`` and ``"3: Water"`` columns and/or
        their ``_norm`` counterparts.

    Returns
    -------
    pandas.DataFrame
        A copy of the matching rows.
    """
    def _share(col):
        # Prefer the normalised column so thresholds compare like with like.
        norm_col = col + "_norm"
        return df[norm_col] if norm_col in df.columns else df[col]

    car_share = _share("5: Car")
    water_share = _share("3: Water")

    car_mask = (car_share > 0.1) & (car_share < 0.9)
    water_mask = (water_share > 0) & (water_share < 0.9)

    return df[car_mask | water_mask].copy()



def plot_class_distribution_from_df(dataframe, title="Training Class Distribution"):
    """Show a bar chart of each class's share of the summed class columns.

    The columns named in ``class_cols`` are summed over ``dataframe`` and
    divided by their grand total. One bar is drawn per class, coloured from
    ``CLASS_TO_COLOR`` and annotated with its percentage.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain every column listed in ``class_cols``.
    title : str, optional
        Title placed above the chart.
    """
    totals_per_class = dataframe[class_cols].sum()
    shares = totals_per_class / totals_per_class.sum()

    class_ids = range(NUM_CLASSES)
    tick_labels = [f"{idx}: {CLASS_NAMES[idx]}" for idx in class_ids]
    # Matplotlib expects RGB components in the 0-1 range.
    bar_colours = [np.array(CLASS_TO_COLOR[idx]) / 255.0 for idx in class_ids]

    fig, ax = plt.subplots(figsize=(10, 5))
    rects = ax.bar(tick_labels, shares, color=bar_colours, edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel("Class")
    ax.set_ylabel("Proportion")
    ax.grid(True, axis='y', linestyle='--', alpha=0.5)

    # Write the percentage just above the top centre of each bar.
    for rect, share in zip(rects, shares):
        x_centre = rect.get_x() + rect.get_width() / 2
        ax.text(x_centre, rect.get_height(), f"{share:.2%}",
                ha='center', va='bottom', fontsize=9)

    fig.tight_layout()
    plt.show()
