# In [2]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Split the per-class dataset into a held-out Kaggle test set (20% of each
# class) and the remaining data (80%), moving the files on disk and writing a
# solution CSV that lists the held-out files with their class.

# Original dataset folder (one sub-folder per class)
original_folder = '/content/drive/MyDrive/ML-GroupProject/Data'

# New folders for Kaggle test set and the remaining data
kaggle_test_folder = '/content/drive/MyDrive/ML-GroupProject/KaggleTest'
remaining_data_folder = '/content/drive/MyDrive/ML-GroupProject/RemainingData'

# Rows of the solution file used for Kaggle scoring (one dict per test file)
solution_csv = []
solution_csv_path = '/content/drive/MyDrive/ML-GroupProject/solution.csv'

# Phase 1: compute the split for every class WITHOUT touching the disk.
# The moves below are destructive, so every split has to succeed before the
# first file is moved; otherwise a failure on a later class would leave the
# dataset half-moved and no solution file written.
split_plan = []
# os.listdir returns entries in arbitrary order; sorting makes the seeded
# split reproducible across runs and machines.
for label in sorted(os.listdir(original_folder)):
    src_path = os.path.join(original_folder, label)

    # Only sub-folders are classes; skip stray files such as .DS_Store.
    if not os.path.isdir(src_path):
        continue

    # Get all regular files in the class folder, in a stable order
    files = sorted(
        f for f in os.listdir(src_path)
        if os.path.isfile(os.path.join(src_path, f))
    )

    # train_test_split cannot produce two non-empty parts from fewer than
    # two samples; fail early with a message that names the class.
    if len(files) < 2:
        raise ValueError(
            f"Class folder {src_path!r} contains {len(files)} file(s); "
            "at least 2 are needed for an 80-20 split. No files were moved."
        )

    # 80-20 split within the class. Splitting class by class already keeps
    # the class proportions, so no `stratify` argument is needed.
    remaining_files, test_files = train_test_split(
        files, test_size=0.2, random_state=42
    )
    split_plan.append((label, src_path, remaining_files, test_files))

# Phase 2: every split succeeded, so create the folders and move the files.
os.makedirs(kaggle_test_folder, exist_ok=True)
os.makedirs(remaining_data_folder, exist_ok=True)

for label, src_path, remaining_files, test_files in split_plan:
    # Per-class destination folders in the test set and the remaining data
    dest_path_test = os.path.join(kaggle_test_folder, label)
    dest_path_remaining = os.path.join(remaining_data_folder, label)

    os.makedirs(dest_path_test, exist_ok=True)
    os.makedirs(dest_path_remaining, exist_ok=True)

    # Move the files
    for f in remaining_files:
        os.rename(os.path.join(src_path, f), os.path.join(dest_path_remaining, f))

    for f in test_files:
        os.rename(os.path.join(src_path, f), os.path.join(dest_path_test, f))
        # NOTE(review): ID is the bare file name; if two classes can contain
        # files with the same name the IDs will collide - confirm.
        solution_csv.append({'ID': f, 'Class': label, 'Usage': 'Private'})

# Create the solution CSV file for Kaggle scoring. Explicit columns keep the
# header present and ordered even when there are no rows.
df = pd.DataFrame(solution_csv, columns=['ID', 'Class', 'Usage'])
df.to_csv(solution_csv_path, index=False)
