In [1]:
import os
import pandas as pd
import shutil   #for copying the files
from sklearn.model_selection import train_test_split

# Original dataset folder
original_folder = r'original_dataset'  

# New folders for Kaggle test set and the remaining data
kaggle_test_folder = r'kaggle_solutionSet'
remaining_data_folder = r'competition_data'

# Create new directories
os.makedirs(kaggle_test_folder, exist_ok=True)
os.makedirs(remaining_data_folder, exist_ok=True)

# CSV file to hold the solution for Kaggle scoring
solution_csv = []
solution_csv_path = 'solution.csv'

# Check current working directory
print("Current Working Directory:", os.getcwd())

# Check if 'original_folder' exists
if os.path.exists(original_folder):
    print(f"The folder {original_folder} exists.")
else:
    print(f"The folder {original_folder} does not exist.")

# Loop through each sub-folder (i.e., each class) in the original dataset
for label in os.listdir(original_folder):
    src_path = os.path.join(original_folder, label)

    # Create a destination folder for each class in the Kaggle test folder and remaining data folder
    dest_path_test = os.path.join(kaggle_test_folder, label)
    dest_path_remaining = os.path.join(remaining_data_folder, label)

    os.makedirs(dest_path_test, exist_ok=True)
    os.makedirs(dest_path_remaining, exist_ok=True)

    # Get all files in the source folder
    files = [f for f in os.listdir(src_path) if os.path.isfile(os.path.join(src_path, f))]
    labels = [label] * len(files)

    # Stratified 80-20 split
    remaining_files, test_files = train_test_split(files, test_size=0.2, random_state=42, stratify=labels)

    # Copy the files
    for f in remaining_files:
        shutil.copy(os.path.join(src_path, f), os.path.join(dest_path_remaining, f))

    for f in test_files:
        shutil.copy(os.path.join(src_path, f), os.path.join(dest_path_test, f))
        solution_csv.append({'ID': f, 'Class': label, 'Usage': 'Public'})

# Create dataframe for solution
df = pd.DataFrame(solution_csv)
# Shuffle the rows of the DataFrame
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# Create the solution CSV file for Kaggle scoring
df.to_csv(solution_csv_path, index=False)




Current Working Directory: f:\Grad_School\CSCE_5215\GroupProject\Git\BrainSynergy\data
The folder original_dataset exists.


Current solution set images contain class information in file name. Code below will reassign unique file names while masking associated class. Will also aggregate all files into one folder, instead of subfolders corresponding to each class.

In [2]:
# Load the current solution CSV
df = pd.read_csv('solution.csv')

# Flatten kaggle_solutionSet directory and obfuscate filenames
counter = 1
for label in os.listdir(original_folder):
    class_folder_path = os.path.join(kaggle_test_folder, label)

    for filename in os.listdir(class_folder_path):
        old_filepath = os.path.join(class_folder_path, filename)
        new_filename = f"image_{counter}.jpg"
        new_filepath = os.path.join(kaggle_test_folder, new_filename)

        # Move and rename the file
        shutil.move(old_filepath, new_filepath)

        # Update solution.csv
        df.loc[df['ID'] == filename, 'ID'] = new_filename

        counter += 1

    # Remove the now-empty class subfolder
    os.rmdir(class_folder_path)

# Save the updated solution.csv
df.to_csv('solution.csv', index=False)

Code below will create *sandbox.csv* to test scoring metric on kaggle. 

*sandbox.csv* will be same format as *solution.csv*, but with randomized class labels. 

Will need to compare class columns of *sandbox.csv* & *solution.csv* to validate kaggle scoring metric  

In [9]:
# Create dataframes
solution_df = df
randomized_class = df['Class'].sample(frac=1).reset_index(drop=True) #randomize class column

sandbox_df = solution_df.copy()
sandbox_df['Class'] = randomized_class

# Drop the 'Usage' column
sandbox_df = sandbox_df[['ID', 'Class']]

# Export the randomized DataFrame to sandbox.csv
sandbox_df.to_csv('sandbox.csv', index=False)

*sandbox.csv*  Kaggle Score: 0.27053

Being that there are 4 classes, this result seems reasonable as anywhere near 25% would account for random chance.

Let us validate score anyway.

In [10]:
# Compare the 'Class' columns
matching_rows = (solution_df['Class'] == sandbox_df['Class']).sum()

# Calculate the match percentage
match_percentage = (matching_rows / len(solution_df)) * 100

print(f"Percentage of matching rows: {match_percentage:.2f}%")

Percentage of matching rows: 27.05%


Results validated!