In [2]:
cat_folder = "PetImages/Cat"
dog_folder = "PetImages/Dog"

In [5]:
import os
import shutil
import pandas as pd
import random
from pathlib import Path

def create_pet_dataset(cat_folder="../PetImages/Cat", dog_folder="../PetImages/Dog", train_ratio=0.7):
    """
    Creates train/test dataset from cat and dog image folders.
    Images are stored in single folders with numeric filenames.
    Class is indicated in CSV file only.
    
    Args:
        cat_folder: Path to cat images folder
        dog_folder: Path to dog images folder  
        train_ratio: Ratio for training data (default 0.7 for 70%)
    """
    
    # Create output directories
    os.makedirs('train', exist_ok=True)
    os.makedirs('test', exist_ok=True)
    
    # Collect all images first, then shuffle and assign numbers
    all_train_images = []
    all_test_images = []
    
    # Process cats
    if os.path.exists(cat_folder):
        cat_files = [f for f in os.listdir(cat_folder) 
                    if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff'))]
        random.shuffle(cat_files)
        
        train_count = int(len(cat_files) * train_ratio)
        
        # Add cat images to train list
        for filename in cat_files[:train_count]:
            src = os.path.join(cat_folder, filename)
            all_train_images.append((src, 'cat'))
        
        # Add cat images to test list
        for filename in cat_files[train_count:]:
            src = os.path.join(cat_folder, filename)
            all_test_images.append((src, 'cat'))
        
        print(f"Processed {len(cat_files)} cat images:")
        print(f"  - Train: {train_count}")
        print(f"  - Test: {len(cat_files) - train_count}")
    
    # Process dogs
    if os.path.exists(dog_folder):
        dog_files = [f for f in os.listdir(dog_folder) 
                    if f.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp', '.tiff'))]
        random.shuffle(dog_files)
        
        train_count = int(len(dog_files) * train_ratio)
        
        # Add dog images to train list
        for filename in dog_files[:train_count]:
            src = os.path.join(dog_folder, filename)
            all_train_images.append((src, 'dog'))
        
        # Add dog images to test list
        for filename in dog_files[train_count:]:
            src = os.path.join(dog_folder, filename)
            all_test_images.append((src, 'dog'))
        
        print(f"Processed {len(dog_files)} dog images:")
        print(f"  - Train: {train_count}")
        print(f"  - Test: {len(dog_files) - train_count}")
    
    # Shuffle both lists to mix cats and dogs randomly
    random.shuffle(all_train_images)
    random.shuffle(all_test_images)
    
    # Now copy images with sequential numbering
    train_data = []
    test_data = []
    
    # Process shuffled training images
    for i, (src, label) in enumerate(all_train_images, 1):
        new_filename = f"{i:04d}.jpg"
        dst = os.path.join('train', new_filename)
        shutil.copy2(src, dst)
        train_data.append({
            'filename': new_filename,
            'label': label
        })
    
    # Process shuffled test images
    for i, (src, label) in enumerate(all_test_images, 1):
        new_filename = f"{i:04d}.jpg"
        dst = os.path.join('test', new_filename)
        shutil.copy2(src, dst)
        test_data.append({
            'filename': new_filename,
            'label': label
        })
    
    # Create CSV files
    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)
    
    # Shuffle the data
    train_df = train_df.sample(frac=1).reset_index(drop=True)
    test_df = test_df.sample(frac=1).reset_index(drop=True)
    
    # Save CSV files
    train_df.to_csv('train.csv', index=False)
    test_df.to_csv('test.csv', index=False)
    
    print(f"\nDataset created successfully!")
    print(f"Train samples: {len(train_df)}")
    print(f"Test samples: {len(test_df)}")
    print(f"Train CSV saved as 'train.csv'")
    print(f"Test CSV saved as 'test.csv'")
    
    # Display sample of CSV files
    print(f"\nTrain CSV sample:")
    print(train_df.head())
    print(f"\nTest CSV sample:")
    print(test_df.head())
    
    return train_df, test_df

# Run the function
if __name__ == "__main__":
    # Set random seed for reproducibility
    random.seed(42)
    
    # Create the dataset
    train_df, test_df = create_pet_dataset()
    
    # Print dataset statistics
    print(f"\n=== Dataset Statistics ===")
    print(f"Training set:")
    print(train_df['label'].value_counts())
    print(f"\nTest set:")
    print(test_df['label'].value_counts())

Processed 12499 cat images:
  - Train: 8749
  - Test: 3750
Processed 12499 dog images:
  - Train: 8749
  - Test: 3750

Dataset created successfully!
Train samples: 17498
Test samples: 7500
Train CSV saved as 'train.csv'
Test CSV saved as 'test.csv'

Train CSV sample:
    filename label
0  17221.jpg   dog
1  11371.jpg   cat
2   1869.jpg   dog
3   5411.jpg   cat
4   1065.jpg   cat

Test CSV sample:
   filename label
0  4209.jpg   dog
1  4161.jpg   cat
2  0841.jpg   dog
3  2311.jpg   cat
4  4429.jpg   cat

=== Dataset Statistics ===
Training set:
label
dog    8749
cat    8749
Name: count, dtype: int64

Test set:
label
dog    3750
cat    3750
Name: count, dtype: int64


In [6]:
# open train and test csv and sort by filename
def sort_csv_by_filename(csv_file):
    """
    Sorts a CSV file by the 'filename' column and saves it back.
    
    Args:
        csv_file: Path to the CSV file to sort
    """
    df = pd.read_csv(csv_file)
    df_sorted = df.sort_values(by='filename').reset_index(drop=True)
    df_sorted.to_csv(csv_file, index=False)
    print(f"Sorted {csv_file} by filename.")
if __name__ == "__main__":
    sort_csv_by_filename('train.csv')
    sort_csv_by_filename('test.csv')
    
    print("CSV files sorted by filename.")

Sorted train.csv by filename.
Sorted test.csv by filename.
CSV files sorted by filename.


In [1]:
import pandas as pd
# remove label on test.csv
def remove_label_from_test_csv(test_csv_file):
    """
    Removes the 'label' column from the test CSV file.
    
    Args:
        test_csv_file: Path to the test CSV file
    """
    df = pd.read_csv(test_csv_file)
    if 'label' in df.columns:
        df.drop(columns=['label'], inplace=True)
        df.to_csv(test_csv_file, index=False)
        print(f"Removed 'label' column from {test_csv_file}.")
    else:
        print(f"'label' column not found in {test_csv_file}.")
if __name__ == "__main__":
    remove_label_from_test_csv('test.csv')
    
    print("Label column removed from test.csv.")

Removed 'label' column from test.csv.
Label column removed from test.csv.


In [3]:
# add column in test_answer.csv named "Usage" that have two values ["Public", "Private"]
# randomly assign "Public" or "Private" to each row
# make sure that the column has 50% "Public" and 50% "Private"
import pandas as pd
import random
def add_usage_column_to_test_answer(test_answer_csv_file):
    """
    Adds a 'Usage' column to the test answer CSV file with random values 'Public' or 'Private'.
    
    Args:
        test_answer_csv_file: Path to the test answer CSV file
    """
    df = pd.read_csv(test_answer_csv_file)
    
    # Create a new column 'Usage' with 50% 'Public' and 50% 'Private'
    usage_values = ['Public', 'Private']
    df['Usage'] = [random.choice(usage_values) for _ in range(len(df))]
    
    # Ensure equal distribution
    public_count = df['Usage'].value_counts().get('Public', 0)
    private_count = df['Usage'].value_counts().get('Private', 0)
    
    if public_count != private_count:
        print("Warning: The distribution of 'Public' and 'Private' is not equal.")
    
    df.to_csv(test_answer_csv_file, index=False)
    print(f"Added 'Usage' column to {test_answer_csv_file} with random values.")
if __name__ == "__main__":
    add_usage_column_to_test_answer('test_answer.csv')
    
    print("Usage column added to test.csv.")

Added 'Usage' column to test_answer.csv with random values.
Usage column added to test.csv.


In [5]:
import numpy as np
df = pd.read_csv('test_answer.csv')
print(np.sum(df["Usage"] == "Private"))

3752


In [6]:
# create a sample submission file from test.csv filename column answer only ["cat", "dog"]
# randomly assign "cat" or "dog" to each row

def create_sample_submission(test_csv_file, sample_submission_file='sample_submission.csv'):
    """
    Creates a sample submission file from the test CSV file with random 'cat' or 'dog' labels.
    
    Args:
        test_csv_file: Path to the test CSV file
        sample_submission_file: Path to save the sample submission file
    """
    df = pd.read_csv(test_csv_file)
    
    # Randomly assign 'cat' or 'dog' to each row
    df['label'] = np.random.choice(['cat', 'dog'], size=len(df))
    
    # Save the sample submission file
    df[['filename', 'label']].to_csv(sample_submission_file, index=False)
    print(f"Sample submission file created: {sample_submission_file}")
if __name__ == "__main__":
    create_sample_submission('test.csv', 'sample_submission.csv')
    
    print("Sample submission file created with random labels.")
    
    # Display the first few rows of the sample submission file
    sample_df = pd.read_csv('sample_submission.csv')
    print(sample_df.head())

Sample submission file created: sample_submission.csv
Sample submission file created with random labels.
   filename label
0  0001.jpg   dog
1  0002.jpg   dog
2  0003.jpg   cat
3  0004.jpg   cat
4  0005.jpg   dog
