# Split data
***

## 1. Set up environment
***

First, let's import the libraries we need (the usual suspects):

In [1]:
# Data Science libraries
import pandas as pd # data processing, CSV file I/O
import numpy as np

# Sklearn libraries
from sklearn.model_selection import StratifiedGroupKFold

# System libraries
import os
import shutil

## 2. Load data
***

In [2]:
# Work from the repository root so that the relative paths used below (e.g. 'data/raw') resolve.
# The checkout location is machine-specific: set the CAMERATRAP_PROJECT_ROOT environment
# variable to point somewhere else; the original path is kept as the default.
project_root = os.environ.get('CAMERATRAP_PROJECT_ROOT', r'C:\GitHub\CameraTrap-Animal-Classification')
os.chdir(project_root)

Read in the train and test CSVs first and see what they look like.

In [3]:
# Root folder of the raw dataset, relative to the current working directory.
# Absolute alternative kept for reference:
# dataset_path = r'C:\GitHub\CameraTrap-Animal-Classification\data\raw'
dataset_path = 'data/raw'
# exist_ok=True: do nothing if the folder is already there
os.makedirs(dataset_path, exist_ok=True)

In [4]:
# Load the three metadata tables from dataset_path; each one is indexed by its "id" column.
csv_names = ('train_features.csv', 'test_features.csv', 'train_labels.csv')
train_features, test_features, train_labels = [
    pd.read_csv(os.path.join(dataset_path, csv_name), index_col="id")
    for csv_name in csv_names
]

In [5]:
def _jpg_names(folder):
    """Return the names of the entries in ``folder`` that end with '.jpg' (case-sensitive)."""
    return [entry for entry in os.listdir(folder) if entry.endswith('.jpg')]


# Image file names found in the raw train and test image folders
train_features_images = _jpg_names(os.path.join(dataset_path, 'train_features'))
test_features_images = _jpg_names(os.path.join(dataset_path, 'test_features'))

print("Number of image files: train: {} test: {}".format(len(train_features_images), len(test_features_images)))

Number of image files: train: 16488 test: 4464


Let's store a sorted list of the labels, so that we can sort the inputs and outputs to our model in a consistent way.

In [6]:
# The label columns are the species names; keep them in sorted order so that
# model inputs and outputs can always be arranged the same way.
unique_label_columns = train_labels.columns.unique()
species_labels = sorted(unique_label_columns)
print(species_labels)

['antelope_duiker', 'bird', 'blank', 'civet_genet', 'hog', 'leopard', 'monkey_prosimian', 'rodent']


## 3. Split data into train and validation sets
***

Next, we'll need to split the images in the `train_features` folder into train and validation sets. We'll put aside 20% of the data for evaluation, stratifying by the target labels so that each class has a similar relative frequency in both sets, and grouping by site so that all images from a given site end up in the same set.

Feel free to adjust `frac` (set in the split cell below) if you want to run the split on a different subset of the data; `frac = 1` uses every labelled image.

In [7]:
subsets = ['train', 'validation']
classes = [
    'antelope_duiker',
    'bird',
    'blank',
    'civet_genet',
    'hog',
    'leopard',
    'monkey_prosimian',
    'rodent',
]

# Pre-create <dataset_path>/<subset>/<class> for every subset/class pair
for subset in subsets:
    subset_root = os.path.join(dataset_path, subset)
    for class_name in classes:
        os.makedirs(os.path.join(subset_root, class_name), exist_ok=True)

# Folders holding the original, unsplit images
train_features_dir = os.path.join(dataset_path, 'train_features')
test_features_dir = os.path.join(dataset_path, 'test_features')

# Labels are read again here from the same train_labels.csv that was loaded earlier in the notebook
labels_csv_path = os.path.join(dataset_path, 'train_labels.csv')
train_labels = pd.read_csv(labels_csv_path, index_col="id")

The cell below (currently commented out) segregates the images in the `train_features` folder by copying them into per-species subfolders of that same folder. Run it only once.

In [11]:
# # Function to segregate images into 8 class in folders: train_features
# def segregate_images(src_dir, dest_dir):

#     for img_id, row in dest_dir.iterrows():

#         img_id = row.name  # Assuming 'id' is the index of the dataframe
#         img_file = f"{img_id}.jpg"  # Ensure the file extension matches your dataset
#         src_path = os.path.join(src_dir, img_file)

#         if not os.path.exists(src_path):
#             print(f"Image {img_file} does not exist in the source directory.")
#             continue

#         # Copy image to the specific class folder based on binary class columns
#         for species in classes:
#             if row[species] == 1.0:
#                 specific_dest_path = os.path.join(src_dir, species, img_file)

#                 if src_path != specific_dest_path:
#                     shutil.copy(src_path, specific_dest_path)
#                 break

# segregate_images(train_features_dir, train_labels)

The cell below defines two helpers: `clear_directory` empties a directory before use, and `copy_images` copies the images selected by the split (`x_train`/`y_train`, `x_val`/`y_val`) into per-class subfolders of the `train` and `validation` folders.

In [8]:
# Function to clear the contents of a directory
def clear_directory(dir_path):
    """Reset ``dir_path`` to an empty directory.

    If ``dir_path`` already exists it is deleted recursively with
    ``shutil.rmtree``; the directory is then created again (including any
    missing parent directories).
    """
    path_is_present = os.path.exists(dir_path)
    if path_is_present:
        shutil.rmtree(dir_path)
    os.makedirs(dir_path, exist_ok=True)

# Start from empty train/ and validation/ folders
for split_folder in ('train', 'validation'):
    clear_directory(os.path.join(dataset_path, split_folder))


# Function to copy images to directory
def copy_images(x_df, y_df, dest_dir, src_root=None):
    """Copy the images listed in ``x_df`` into per-class subfolders of ``dest_dir``.

    Parameters
    ----------
    x_df : pandas.DataFrame
        Must contain a ``filepath`` column with image paths relative to
        ``src_root``.
    y_df : pandas.DataFrame
        Label table looked up with the index values of ``x_df``. The column
        holding the largest value in a row names the class (assumes one-hot
        encoded labels).
    dest_dir : str
        Directory that receives one subfolder per class.
    src_root : str, optional
        Folder the ``filepath`` values are relative to. Defaults to the
        notebook-level ``dataset_path``.

    Notes
    -----
    A source image that does not exist is reported with ``print`` and skipped.
    A destination file that already exists is left untouched.
    """
    if src_root is None:
        # Fall back to the notebook-level dataset root
        src_root = dataset_path

    # exist_ok=True already covers the "directory is there" case
    os.makedirs(dest_dir, exist_ok=True)

    # Walk the (index, relative path) pairs; the index is the key into y_df
    for idx, relative_path in x_df['filepath'].items():
        image_path = os.path.join(src_root, relative_path)

        if not os.path.isfile(image_path):
            print(f"Image {image_path} does not exist.")
            continue

        # Identify the class label: the column with the maximum value in this row
        class_label = y_df.loc[idx].idxmax()

        # Class-specific folder inside the destination directory
        class_dir = os.path.join(dest_dir, class_label)
        os.makedirs(class_dir, exist_ok=True)

        # Copy only when the file is not already in place
        class_dest_path = os.path.join(class_dir, os.path.basename(image_path))
        if not os.path.isfile(class_dest_path):
            shutil.copy(image_path, class_dest_path)

### StratifiedGroup K-Fold

In [9]:
# Sample a fraction of the labelled rows for the split. `sample` also shuffles the rows;
# random_state=42 keeps that order reproducible. frac = 1 keeps every row.
frac = 1
y = train_labels.sample(frac=frac, random_state=42)

# Inputs for StratifiedGroupKFold, all aligned with y.index:
# the site of each sample (used as the group), the sample IDs, and integer class codes
sampled_features = train_features.loc[y.index]
list_of_sites = sampled_features['site'].tolist()
list_of_ids = sampled_features.index.tolist()
list_of_class_numbers = y.idxmax(axis=1).astype('category').cat.codes.tolist()

# Define the validation proportion and derive the number of folds from it
test_size = 0.2
n_splits = int(1 / test_size)

# No extra shuffling inside the splitter; the rows were already shuffled by `sample` above
sgkf = StratifiedGroupKFold(n_splits=n_splits, shuffle=False, random_state=None)

# Generate the train/validation index splits (X = IDs, y = class codes, groups = sites)
iteration_splits = list(sgkf.split(list_of_ids, list_of_class_numbers, list_of_sites))

fold_data = {}         # "fold_<n>" -> {"x_train", "y_train", "x_val", "y_val"}
fold_results = []      # (train_proportion, val_proportion) per fold
mean_differences = []  # mean absolute class-percentage gap between train and validation, per fold

for iteration, (train_idx, val_idx) in enumerate(iteration_splits, start=1):
    # Translate the positional split indices back into sample IDs
    train_ids = [list_of_ids[i] for i in train_idx]
    val_ids = [list_of_ids[i] for i in val_idx]

    # Prepare train and validation datasets based on the current split
    x_train = train_features.loc[train_ids]
    y_train = train_labels.loc[train_ids]
    x_val = train_features.loc[val_ids]
    y_val = train_labels.loc[val_ids]

    # Keep the split so the best fold can be retrieved after the loop
    fold_data[f"fold_{iteration}"] = {
        "x_train": x_train,
        "y_train": y_train,
        "x_val": x_val,
        "y_val": y_val,
    }

    # Per-class sample counts in each subset
    train_distribution = y_train.sum()
    val_distribution = y_val.sum()

    # Per-class share of each subset, in percent
    train_proportion = train_distribution / len(y_train) * 100
    val_proportion = val_distribution / len(y_val) * 100

    fold_results.append((train_proportion, val_proportion))

    # Mean absolute difference between the train and validation class percentages;
    # recorded here once so it does not have to be recomputed after the loop
    differences = np.abs(train_proportion - val_proportion)
    mean_difference = differences.mean()
    mean_differences.append(mean_difference)

    # Debug: validate the sample counts and proportions of the split
    num_train = len(x_train)
    num_val = len(x_val)
    total = num_train + num_val
    actual_train_ratio = num_train / total
    actual_val_ratio = num_val / total

    print(f"Iteration {iteration}")
    print(f"Number of training samples: {num_train}")
    print(f"Number of validation samples: {num_val}")
    print(f"Total samples: {total}")
    print(f"Actual train ratio: {actual_train_ratio:.2f} (should be {1 - test_size})")
    print(f"Actual validation ratio: {actual_val_ratio:.2f} (should be {test_size})")
    print(f"Mean difference: {mean_difference:.4f}")
    print()

    # Display the class distribution percentages for both train and validation sets
    split_pcts = pd.DataFrame(
        {
            "train": y_train.idxmax(axis=1).value_counts(normalize=True),
            "val": y_val.idxmax(axis=1).value_counts(normalize=True),
        }
    )

    print("Species percentages by split:\n")
    print((split_pcts.fillna(0) * 100).astype(int))

# Select the fold whose train and validation class distributions are closest
best_iteration_index = np.argmin(mean_differences)
print(f"The best iteration is iteration {best_iteration_index + 1} with a mean difference of {mean_differences[best_iteration_index]:.4f}")

# Retrieve the train/validation datasets of the best fold
best_iteration_key = f"fold_{best_iteration_index + 1}"
best_fold = fold_data[best_iteration_key]
x_train = best_fold["x_train"]
y_train = best_fold["y_train"]
x_val = best_fold["x_val"]
y_val = best_fold["y_val"]

# Clear any previously stored train and validation data from directories
clear_directory(os.path.join(dataset_path, 'train'))
clear_directory(os.path.join(dataset_path, 'validation'))

# Copy the new train/validation data into the appropriate directories for model training
copy_images(x_train, y_train, os.path.join(dataset_path, 'train'))
copy_images(x_val, y_val, os.path.join(dataset_path, 'validation'))

Iteration 1
Number of training samples: 13305
Number of validation samples: 3183
Total samples: 16488
Actual train ratio: 0.81 (should be 0.8)
Actual validation ratio: 0.19 (should be 0.2)
Mean difference: 1.0099

Species percentages by split:

                  train  val
antelope_duiker      14   15
bird                  9   10
blank                13   13
civet_genet          15   11
hog                   5    6
leopard              13   14
monkey_prosimian     15   15
rodent               12   12
Iteration 2
Number of training samples: 13350
Number of validation samples: 3138
Total samples: 16488
Actual train ratio: 0.81 (should be 0.8)
Actual validation ratio: 0.19 (should be 0.2)
Mean difference: 1.3790

Species percentages by split:

                  train  val
antelope_duiker      14   15
bird                  9   10
blank                13   14
civet_genet          15   10
hog                   5    6
leopard              13   14
monkey_prosimian     14   15
rodent           

The function below checks whether any site (location) appears in both the training and validation sets, and whether any site is missing from both of them.

In [10]:
def check_site_overlap_and_missing_sites(x_train, x_val, train_features):
    """Print a report on how sites are shared between the train and validation sets.

    Parameters
    ----------
    x_train, x_val : pandas.DataFrame
        Train and validation frames; only their index is used, to look the
        samples up in ``train_features``.
    train_features : pandas.DataFrame
        Table with a ``site`` column, indexed like ``x_train`` / ``x_val``.

    The report lists the number of unique sites per set, every site present in
    both sets, and every site of ``train_features`` present in neither set.
    Sites are printed in sorted order so that the output is the same on every
    run (plain set iteration order can differ between kernel sessions).
    Nothing is returned.
    """
    # Site of every sample in each set
    train_sites = train_features.loc[x_train.index, 'site']
    val_sites = train_features.loc[x_val.index, 'site']

    unique_train_sites = set(train_sites)
    unique_val_sites = set(val_sites)

    # Every site known to the full feature table
    all_sites = set(train_features['site'])

    # Sites present in both sets, and sites present in neither
    common_sites = unique_train_sites & unique_val_sites
    missing_sites = all_sites - (unique_train_sites | unique_val_sites)

    print("Number of unique sites in the training set:", len(unique_train_sites))
    print("Number of unique sites in the validation set:", len(unique_val_sites))
    print("Total number of unique sites in the dataset:", len(all_sites))

    if common_sites:
        print("\nOverlap between training and validation sets:")
        # key=str keeps the sort well-defined even if site values are not all strings
        for site in sorted(common_sites, key=str):
            print(f"Site {site} appears in both the training and validation sets.")
    else:
        print("\nNo overlap between training and validation sets.")

    if missing_sites:
        print("\nSites that are missing from both training and validation sets:")
        for site in sorted(missing_sites, key=str):
            print(f"Site {site} is missing from both the training and validation sets.")
    else:
        print("\nNo sites are missing from both training and validation sets.")

# Run the check on the selected split; sites are looked up in the full train_features table
check_site_overlap_and_missing_sites(x_train, x_val, train_features)


Number of unique sites in the training set: 120
Number of unique sites in the validation set: 28
Total number of unique sites in the dataset: 148

No overlap between training and validation sets.

No sites are missing from both training and validation sets.


Here's what `x_train` and `y_train` look like now:

In [11]:
# Preview the first rows of the training features of the selected fold
x_train.head()

Unnamed: 0_level_0,filepath,site
id,Unnamed: 1_level_1,Unnamed: 2_level_1
ZJ008416,train_features/ZJ008416.jpg,S0036
ZJ009568,train_features/ZJ009568.jpg,S0088
ZJ012401,train_features/ZJ012401.jpg,S0198
ZJ008603,train_features/ZJ008603.jpg,S0031
ZJ009236,train_features/ZJ009236.jpg,S0002


In [12]:
# Preview the first rows of the matching training labels (one column per species)
y_train.head()

Unnamed: 0_level_0,antelope_duiker,bird,blank,civet_genet,hog,leopard,monkey_prosimian,rodent
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ZJ008416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
ZJ009568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
ZJ012401,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
ZJ008603,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZJ009236,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [13]:
# (rows, columns) of the four frames of the selected split
x_train.shape, y_train.shape, x_val.shape, y_val.shape

((13305, 2), (13305, 8), (3183, 2), (3183, 8))

Next, let's validate that our split has resulted in roughly similar relative distributions of species across the train and val sets (the `StratifiedGroupKFold` split above stratifies on the class labels).

In [14]:
# Share of each species within the train and validation sets
# (index: species, columns: split)
split_pcts = pd.DataFrame(
    {
        "train": y_train.idxmax(axis=1).value_counts(normalize=True),
        "val": y_val.idxmax(axis=1).value_counts(normalize=True),
    }
)

print("Species percentages by split:")
# Percentages rounded to two decimals; a species absent from a split shows as 0.
# This is the last expression of the cell, so it is what gets displayed.
(split_pcts.fillna(0) * 100).round(2)

Species percentages by split:


Unnamed: 0,train,val
antelope_duiker,14.84,15.71
bird,9.87,10.3
blank,13.3,13.92
civet_genet,15.48,11.44
hog,5.89,6.09
leopard,13.54,14.2
monkey_prosimian,15.0,15.58
rodent,12.08,12.76


In [15]:
# One list of records per category; each record is {<category>: folder path, 'number of images': count}
data_train = []
data_validation = []
data_test_features = []

# Checked in this order, so a path matching several categories is filed under the first one
records_by_category = {
    'train': data_train,
    'validation': data_validation,
    'test_features': data_test_features,
}

# Walk every folder below dataset_path
for dirpath, dirnames, filenames in os.walk(dataset_path):
    # Sorting dirnames in place makes os.walk visit subfolders in alphabetical order,
    # so the tables below come out sorted on every platform
    dirnames.sort()

    # Count the .jpg files (any letter case) directly inside the current folder
    num_images = sum(1 for filename in filenames if filename.lower().endswith('.jpg'))
    if num_images == 0:
        continue

    # A folder belongs to a category when a path component equals the category name exactly
    # (so 'train_features' is not mistaken for 'train')
    parts = dirpath.split(os.sep)
    for category, records in records_by_category.items():
        if category in parts:
            records.append({category: dirpath, 'number of images': num_images})
            break

# Create DataFrames for each category with the specified column names
df_train = pd.DataFrame(data_train)
df_validation = pd.DataFrame(data_validation)
df_test_features = pd.DataFrame(data_test_features)

# Display the three tables
display(df_train)
display(df_validation)
display(df_test_features)


Unnamed: 0,train,number of images
0,data/raw\train\antelope_duiker,1974
1,data/raw\train\bird,1313
2,data/raw\train\blank,1770
3,data/raw\train\civet_genet,2059
4,data/raw\train\hog,784
5,data/raw\train\leopard,1802
6,data/raw\train\monkey_prosimian,1996
7,data/raw\train\rodent,1607


Unnamed: 0,validation,number of images
0,data/raw\validation\antelope_duiker,500
1,data/raw\validation\bird,328
2,data/raw\validation\blank,443
3,data/raw\validation\civet_genet,364
4,data/raw\validation\hog,194
5,data/raw\validation\leopard,452
6,data/raw\validation\monkey_prosimian,496
7,data/raw\validation\rodent,406


Unnamed: 0,test_features,number of images
0,data/raw\test_features,4464


After the split, the final file structure looks like this:

```
data/raw/
├── train_features/
│   ├── ZJ000001.jpg
│   └── ...
│
├── test_features/
│   ├── ZJ16488.jpg
│   └── ...
│
├── train/
│   ├── antelope_duiker (class 1)/
│   │   ├── ZJ000001.jpg
│   │   └── ...
│   ├── ...
│   └── rodent (class 8)/
│       ├── ZJ000008.jpg
│       └── ...
│
└── validation/
    ├── antelope_duiker (class 1)/
    │   ├── ZJ000009.jpg
    │   └── ...
    ├── ...
    └── rodent (class 8)/
        ├── ZJ000016.jpg
        └── ...

```