In [1]:
import yaml
import pandas as pd
from sklearn.utils import resample
import os 


In [2]:
def load_config(config_path):
    with open(config_path, 'r') as file:
        config = yaml.safe_load(file)
    return config

In [3]:
CONFIG_PATH = "/home/julian/git-repo/juliangdz/GovernanceIRP/Autonomous-Governance-in-Disaster-Management/post_disaster_classification/configs/satellite_config.yaml"

In [4]:
config = load_config(CONFIG_PATH)

train_csv_path = os.path.join(config['original_data']['root_dir'],'multi_label_dataset','val','logs/dataset.csv')

# Load the dataset
dataset_df = pd.read_csv(train_csv_path)

# Separate majority and minority classes
no_damage_df = dataset_df[dataset_df['no_damage'] == 1]
minor_damage_df = dataset_df[dataset_df['minor_damage'] == 1]
major_damage_df = dataset_df[dataset_df['major_damage'] == 1]
destroyed_df = dataset_df[dataset_df['destroyed'] == 1]

# Find the number of instances in the minority classes
minority_count = min(len(minor_damage_df), len(major_damage_df), len(destroyed_df))

# Undersample the majority class
no_damage_undersampled = resample(no_damage_df,
                                  replace=False,  # sample without replacement
                                  n_samples=minority_count,  # match minority count
                                  random_state=42)  # reproducible results

# Undersample the majority class
minor_damage_undersampled = resample(minor_damage_df,
                                  replace=False,  # sample without replacement
                                  n_samples=minority_count,  # match minority count
                                  random_state=42)  # reproducible results

# Combine undersampled majority class with minority classes
undersampled_df = pd.concat([no_damage_undersampled, minor_damage_undersampled, major_damage_df, destroyed_df])

# Shuffle the resulting dataframe
undersampled_df = undersampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Save the undersampled dataset
undersampled_df.to_csv(os.path.join(config['original_data']['root_dir'],'multi_label_dataset','val','logs/sampled_dataset.csv'), index=False)


In [5]:
# Display basic information about the dataset
dataset_info = undersampled_df.info()
dataset_head = undersampled_df.head()

# Count the number of patches for each class
label_counts = undersampled_df.iloc[:, 1:].sum()

# Calculate the distribution of labels
label_distribution = label_counts / label_counts.sum()

(dataset_info, dataset_head, label_counts, label_distribution)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5012 entries, 0 to 5011
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   image_path    5012 non-null   object
 1   no_damage     5012 non-null   int64 
 2   minor_damage  5012 non-null   int64 
 3   major_damage  5012 non-null   int64 
 4   destroyed     5012 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 195.9+ KB


(None,
                                           image_path  no_damage  minor_damage  \
 0  /home/julian/datasets/XBD/multi_label_dataset/...          1             0   
 1  /home/julian/datasets/XBD/multi_label_dataset/...          1             0   
 2  /home/julian/datasets/XBD/multi_label_dataset/...          1             0   
 3  /home/julian/datasets/XBD/multi_label_dataset/...          1             0   
 4  /home/julian/datasets/XBD/multi_label_dataset/...          1             0   
 
    major_damage  destroyed  
 0             0          1  
 1             0          0  
 2             1          0  
 3             1          1  
 4             0          1  ,
 no_damage       5012
 minor_damage    3024
 major_damage    1854
 destroyed       1747
 dtype: int64,
 no_damage       0.430695
 minor_damage    0.259861
 major_damage    0.159319
 destroyed       0.150125
 dtype: float64)