In [1]:
import pandas as pd
import os

# Read both CSV files
fashion_dataset_images = pd.read_csv('fashion-dataset/images.csv', on_bad_lines='skip')
fashion_dataset_styles = pd.read_csv('fashion-dataset/styles.csv', on_bad_lines='skip')


In [2]:
fashion_dataset_images['id'] = fashion_dataset_images['filename'].str.replace('.jpg', '').astype(int)

# Merge the datasets
fashion_dataset_merged = pd.merge(fashion_dataset_images, fashion_dataset_styles, on='id', how='inner')

# Display the results
print("Fashion Dataset Merged Shape:", fashion_dataset_merged.shape)

# Optional: Display first few rows to verify the merge
print("\nFashion Dataset Merged Preview:")
print(fashion_dataset_merged.head())

Fashion Dataset Merged Shape: (44424, 12)

Fashion Dataset Merged Preview:
    filename                                               link     id gender  \
0  15970.jpg  http://assets.myntassets.com/v1/images/style/p...  15970    Men   
1  39386.jpg  http://assets.myntassets.com/v1/images/style/p...  39386    Men   
2  59263.jpg  http://assets.myntassets.com/v1/images/style/p...  59263  Women   
3  21379.jpg  http://assets.myntassets.com/v1/images/style/p...  21379    Men   
4  53759.jpg  http://assets.myntassets.com/v1/images/style/p...  53759    Men   

  masterCategory subCategory  articleType baseColour  season    year   usage  \
0        Apparel     Topwear       Shirts  Navy Blue    Fall  2011.0  Casual   
1        Apparel  Bottomwear        Jeans       Blue  Summer  2012.0  Casual   
2    Accessories     Watches      Watches     Silver  Winter  2016.0  Casual   
3        Apparel  Bottomwear  Track Pants      Black    Fall  2011.0  Casual   
4        Apparel     Topwear      Tshi

In [3]:
fashion_dataset_merged.drop(columns=['link','gender', 'masterCategory', 'articleType', 'baseColour', 'season', 'year', 'usage', 'productDisplayName'], inplace=True)

In [4]:
fashion_dataset_merged

Unnamed: 0,filename,id,subCategory
0,15970.jpg,15970,Topwear
1,39386.jpg,39386,Bottomwear
2,59263.jpg,59263,Watches
3,21379.jpg,21379,Bottomwear
4,53759.jpg,53759,Topwear
...,...,...,...
44419,17036.jpg,17036,Shoes
44420,6461.jpg,6461,Flip Flops
44421,18842.jpg,18842,Topwear
44422,46694.jpg,46694,Fragrance


In [None]:
fashion_dataset_merged.dtypes


In [5]:
import os
import shutil

# Create new directory for matched images if it doesn't exist
os.makedirs('matched_images', exist_ok=True)

# Get list of actual image files in images folder
image_files = set(f for f in os.listdir('fashion-dataset/images') if f.endswith('.jpg'))

# Filter dataset to only include rows where filename exists in images folder
fashion_dataset_filtered = fashion_dataset_merged[fashion_dataset_merged['filename'].isin(image_files)].copy()

print(f"Original dataset size: {len(fashion_dataset_merged)}")
print(f"Filtered dataset size: {len(fashion_dataset_filtered)}")





Original dataset size: 44424
Filtered dataset size: 44419


In [6]:
# Move matched images to new folder
for filename in fashion_dataset_filtered['filename']:
    src = os.path.join('fashion-dataset/images', filename)
    dst = os.path.join('matched_images', filename)
    if os.path.exists(src):
        shutil.move(src, dst)



In [7]:
fashion_dataset_merged

Unnamed: 0,filename,id,subCategory
0,15970.jpg,15970,Topwear
1,39386.jpg,39386,Bottomwear
2,59263.jpg,59263,Watches
3,21379.jpg,21379,Bottomwear
4,53759.jpg,53759,Topwear
...,...,...,...
44419,17036.jpg,17036,Shoes
44420,6461.jpg,6461,Flip Flops
44421,18842.jpg,18842,Topwear
44422,46694.jpg,46694,Fragrance


In [8]:
fashion_dataset_merged.subCategory.value_counts()

subCategory
Topwear                     15402
Shoes                        7343
Bags                         3055
Bottomwear                   2694
Watches                      2542
Innerwear                    1808
Jewellery                    1079
Eyewear                      1073
Fragrance                    1011
Sandal                        963
Wallets                       933
Flip Flops                    913
Belts                         811
Socks                         698
Lips                          527
Dress                         478
Loungewear and Nightwear      470
Saree                         427
Nails                         329
Makeup                        307
Headwear                      293
Ties                          258
Accessories                   129
Scarves                       118
Cufflinks                     108
Apparel Set                   106
Free Gifts                    104
Stoles                         90
Skin Care                      77
Sk

In [9]:
clothing_items_top_10 = [
    "Topwear",
    "Bottomwear", 
    "Innerwear",
    "Dress",
    "Loungewear and Nightwear",
    "Saree",
    "Headwear", 
    "Ties",
    "Scarves",
    "Apparel Set"
]

In [10]:
os.makedirs('categorized_matched_images', exist_ok=True)

In [11]:
image_files = set(f for f in os.listdir('matched_images') if f.endswith('.jpg'))


In [12]:
fashion_dataset_merged


Unnamed: 0,filename,id,subCategory
0,15970.jpg,15970,Topwear
1,39386.jpg,39386,Bottomwear
2,59263.jpg,59263,Watches
3,21379.jpg,21379,Bottomwear
4,53759.jpg,53759,Topwear
...,...,...,...
44419,17036.jpg,17036,Shoes
44420,6461.jpg,6461,Flip Flops
44421,18842.jpg,18842,Topwear
44422,46694.jpg,46694,Fragrance


In [13]:
# Filter dataset to only include rows where filename exists AND subcategory is in our list
fashion_dataset_filtered = fashion_dataset_merged[
    (fashion_dataset_merged['filename'].isin(image_files)) & 
    (fashion_dataset_merged['subCategory'].isin(clothing_items_top_10))
].copy()

In [14]:
fashion_dataset_filtered

Unnamed: 0,filename,id,subCategory
0,15970.jpg,15970,Topwear
1,39386.jpg,39386,Bottomwear
3,21379.jpg,21379,Bottomwear
4,53759.jpg,53759,Topwear
5,1855.jpg,1855,Topwear
...,...,...,...
44414,30614.jpg,30614,Topwear
44415,13496.jpg,13496,Topwear
44417,12544.jpg,12544,Topwear
44418,42234.jpg,42234,Topwear


In [15]:
for filename in fashion_dataset_filtered['filename']:
    src = os.path.join('matched_images', filename)
    dst = os.path.join('categorized_matched_images', filename)
    if os.path.exists(src):
        shutil.move(src, dst)

# Update the main dataframe
fashion_dataset_merged = fashion_dataset_filtered

In [16]:
fashion_dataset_merged

Unnamed: 0,filename,id,subCategory
0,15970.jpg,15970,Topwear
1,39386.jpg,39386,Bottomwear
3,21379.jpg,21379,Bottomwear
4,53759.jpg,53759,Topwear
5,1855.jpg,1855,Topwear
...,...,...,...
44414,30614.jpg,30614,Topwear
44415,13496.jpg,13496,Topwear
44417,12544.jpg,12544,Topwear
44418,42234.jpg,42234,Topwear


In [17]:
fashion_dataset_merged['subCategory'].value_counts()

subCategory
Topwear                     15398
Bottomwear                   2693
Innerwear                    1808
Dress                         478
Loungewear and Nightwear      470
Saree                         427
Headwear                      293
Ties                          258
Scarves                       118
Apparel Set                   106
Name: count, dtype: int64

In [25]:
# Get the count of images per subcategory
subcategory_counts = fashion_dataset_merged['subCategory'].value_counts()

In [18]:
keep_safe = fashion_dataset_merged

In [19]:
fashion_dataset_merged

Unnamed: 0,filename,id,subCategory
0,15970.jpg,15970,Topwear
1,39386.jpg,39386,Bottomwear
3,21379.jpg,21379,Bottomwear
4,53759.jpg,53759,Topwear
5,1855.jpg,1855,Topwear
...,...,...,...
44414,30614.jpg,30614,Topwear
44415,13496.jpg,13496,Topwear
44417,12544.jpg,12544,Topwear
44418,42234.jpg,42234,Topwear


In [20]:
keep_safe

Unnamed: 0,filename,id,subCategory
0,15970.jpg,15970,Topwear
1,39386.jpg,39386,Bottomwear
3,21379.jpg,21379,Bottomwear
4,53759.jpg,53759,Topwear
5,1855.jpg,1855,Topwear
...,...,...,...
44414,30614.jpg,30614,Topwear
44415,13496.jpg,13496,Topwear
44417,12544.jpg,12544,Topwear
44418,42234.jpg,42234,Topwear


In [21]:
# Get current counts
subcategory_counts = fashion_dataset_merged['subCategory'].value_counts()
print("Before capping:")
print(subcategory_counts)

# Initialize empty dataframe for results
capped_dataset = pd.DataFrame()

# For each subcategory, take up to 1000 images
for subcategory in clothing_items_top_10:
    subset = fashion_dataset_merged[fashion_dataset_merged['subCategory'] == subcategory]
    if len(subset) > 1000:
        subset = subset.sample(n=1000, random_state=42)  # random_state for reproducibility
    capped_dataset = pd.concat([capped_dataset, subset])

# Update the main dataframe
fashion_dataset_merged = capped_dataset

print("\nAfter capping at 1000:")
print(fashion_dataset_merged['subCategory'].value_counts())

Before capping:
subCategory
Topwear                     15398
Bottomwear                   2693
Innerwear                    1808
Dress                         478
Loungewear and Nightwear      470
Saree                         427
Headwear                      293
Ties                          258
Scarves                       118
Apparel Set                   106
Name: count, dtype: int64

After capping at 1000:
subCategory
Topwear                     1000
Bottomwear                  1000
Innerwear                   1000
Dress                        478
Loungewear and Nightwear     470
Saree                        427
Headwear                     293
Ties                         258
Scarves                      118
Apparel Set                  106
Name: count, dtype: int64


In [22]:
fashion_dataset_merged

Unnamed: 0,filename,id,subCategory
1227,27087.jpg,27087,Topwear
14454,11023.jpg,11023,Topwear
23585,26930.jpg,26930,Topwear
24109,12731.jpg,12731,Topwear
11789,38335.jpg,38335,Topwear
...,...,...,...
43353,54703.jpg,54703,Apparel Set
43479,39135.jpg,39135,Apparel Set
43571,54704.jpg,54704,Apparel Set
43966,25597.jpg,25597,Apparel Set


In [23]:
# Create new directory for the capped dataset
os.makedirs('capped_images', exist_ok=True)

# Move images based on the capped dataset
for filename in fashion_dataset_merged['filename']:
    src = os.path.join('categorized_matched_images', filename)
    dst = os.path.join('capped_images', filename)
    if os.path.exists(src):
        shutil.move(src, dst)

# Verify the count of moved files
moved_files = len([f for f in os.listdir('capped_images') if f.endswith('.jpg')])
print(f"Moved {moved_files} files to capped_images directory")
print(f"Should match dataframe size: {len(fashion_dataset_merged)}")

Moved 5150 files to capped_images directory
Should match dataframe size: 5150


In [25]:
import numpy as np
from PIL import Image
import tensorflow as tf
import random

# Get current max ID to ensure new IDs don't overlap
max_id = fashion_dataset_merged['id'].max()
current_new_id = max_id + 1

# Create augmentation layer
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomRotation(0.2),
    tf.keras.layers.RandomZoom(0.2),
    tf.keras.layers.RandomBrightness(0.2),
    tf.keras.layers.RandomContrast(0.2),
])

# Function to augment single image
def augment_image(image_path):
    # Read and convert image to tensor
    img = tf.keras.preprocessing.image.load_img(image_path)
    img_array = tf.keras.preprocessing.image.img_to_array(img)
    img_array = tf.expand_dims(img_array, 0)
    
    # Augment
    augmented_img = data_augmentation(img_array)
    
    # Convert back to PIL Image
    augmented_img = tf.keras.preprocessing.image.array_to_img(augmented_img[0])
    return augmented_img

# Process each subcategory
new_rows = []
for category in clothing_items_top_10:
    category_df = fashion_dataset_merged[fashion_dataset_merged['subCategory'] == category]
    current_count = len(category_df)
    
    if current_count < 1000:
        needed_augmentations = 1000 - current_count
        print(f"Augmenting {category}: need {needed_augmentations} more images")
        
        # Randomly select images to augment (with replacement)
        source_files = category_df['filename'].tolist()
        
        for _ in range(needed_augmentations):
            # Select random source image
            source_filename = random.choice(source_files)
            source_path = os.path.join('capped_images', source_filename)
            
            # Generate new filename and ID
            new_filename = f"{current_new_id}.jpg"
            
            # Augment and save image
            augmented_img = augment_image(source_path)
            augmented_img.save(os.path.join('capped_images', new_filename))
            
            # Add new row to dataframe
            new_rows.append({
                'filename': new_filename,
                'id': current_new_id,
                'subCategory': category
            })
            
            current_new_id += 1

# Add new rows to dataframe
if new_rows:
    new_df = pd.DataFrame(new_rows)
    fashion_dataset_merged = pd.concat([fashion_dataset_merged, new_df], ignore_index=True)

# Verify results
print("\nFinal counts per category:")
print(fashion_dataset_merged['subCategory'].value_counts())

Augmenting Dress: need 522 more images
Augmenting Loungewear and Nightwear: need 530 more images
Augmenting Saree: need 573 more images
Augmenting Headwear: need 707 more images
Augmenting Ties: need 742 more images
Augmenting Scarves: need 882 more images
Augmenting Apparel Set: need 894 more images

Final counts per category:
subCategory
Topwear                     1000
Bottomwear                  1000
Innerwear                   1000
Dress                       1000
Loungewear and Nightwear    1000
Saree                       1000
Headwear                    1000
Ties                        1000
Scarves                     1000
Apparel Set                 1000
Name: count, dtype: int64


In [2]:
import pandas as pd
df = pd.read_csv('../data/fashion_dataset_merged.csv')

In [3]:
df.head()

Unnamed: 0,filename,id,subCategory
0,27087.jpg,27087,Topwear
1,11023.jpg,11023,Topwear
2,26930.jpg,26930,Topwear
3,12731.jpg,12731,Topwear
4,38335.jpg,38335,Topwear


In [4]:
df['label'] = df['subCategory']

In [5]:
df.label

0           Topwear
1           Topwear
2           Topwear
3           Topwear
4           Topwear
           ...     
9995    Apparel Set
9996    Apparel Set
9997    Apparel Set
9998    Apparel Set
9999    Apparel Set
Name: label, Length: 10000, dtype: object

In [None]:
df_test = df

In [7]:
import os
import shutil

# Create base directory for categorized images if it doesn't exist
base_dir = '../data/categorized_images'
if not os.path.exists(base_dir):
    os.makedirs(base_dir)

# Create subdirectories for each unique category
categories = df['label'].unique()
for category in categories:
    category_dir = os.path.join(base_dir, category)
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)

# Move files from capped_images to their category folders
for _, row in df.iterrows():
    src_path = os.path.join('../data/capped_images', row['filename'])
    dst_path = os.path.join(base_dir, row['label'], row['filename'])
    
    # Check if source file exists before attempting to move
    if os.path.exists(src_path):
        shutil.copy2(src_path, dst_path)  # Using copy2 to preserve metadata

print("Images have been organized into their respective category folders")


Images have been organized into their respective category folders


In [10]:
# Create a new column for GCS paths using same folder structure
bucket_name = "pa-poc-mlspec-3-cs"
df['image_path'] = df.apply(
    lambda row: f"gs://{bucket_name}/categorised_images/{row['label']}/{row['filename']}", 
    axis=1
)


In [11]:
df.head()

Unnamed: 0,filename,id,subCategory,label,gcs_path,image_path
0,27087.jpg,27087,Topwear,Topwear,gs://pa-poc-mlspec-3-cs/categorized_images/Top...,gs://pa-poc-mlspec-3-cs/categorised_images/Top...
1,11023.jpg,11023,Topwear,Topwear,gs://pa-poc-mlspec-3-cs/categorized_images/Top...,gs://pa-poc-mlspec-3-cs/categorised_images/Top...
2,26930.jpg,26930,Topwear,Topwear,gs://pa-poc-mlspec-3-cs/categorized_images/Top...,gs://pa-poc-mlspec-3-cs/categorised_images/Top...
3,12731.jpg,12731,Topwear,Topwear,gs://pa-poc-mlspec-3-cs/categorized_images/Top...,gs://pa-poc-mlspec-3-cs/categorised_images/Top...
4,38335.jpg,38335,Topwear,Topwear,gs://pa-poc-mlspec-3-cs/categorized_images/Top...,gs://pa-poc-mlspec-3-cs/categorised_images/Top...


In [15]:
df = df.drop(columns=['filename', 'id', 'gcs_path'], inplace=True)

KeyError: "['filename', 'id', 'gcs_path'] not found in axis"

In [18]:
df.to_csv('gs://pa-poc-mlspec-3-cs/fashion_dataset_processed.csv', index=False)

In [16]:
df.drop(columns=['subCategory'], inplace=True)

In [17]:
df

Unnamed: 0,label,image_path
0,Topwear,gs://pa-poc-mlspec-3-cs/categorised_images/Top...
1,Topwear,gs://pa-poc-mlspec-3-cs/categorised_images/Top...
2,Topwear,gs://pa-poc-mlspec-3-cs/categorised_images/Top...
3,Topwear,gs://pa-poc-mlspec-3-cs/categorised_images/Top...
4,Topwear,gs://pa-poc-mlspec-3-cs/categorised_images/Top...
...,...,...
9995,Apparel Set,gs://pa-poc-mlspec-3-cs/categorised_images/App...
9996,Apparel Set,gs://pa-poc-mlspec-3-cs/categorised_images/App...
9997,Apparel Set,gs://pa-poc-mlspec-3-cs/categorised_images/App...
9998,Apparel Set,gs://pa-poc-mlspec-3-cs/categorised_images/App...


In [None]:
# Update the 'link' column to reflect the GCS path
bucket_name = "your-gcs-bucket-name"

df['gcs_path'] 
# Create the GCS file path for each image
df['gcs_path'] = df.apply(
    lambda row: f"gs://{bucket_name}/{row['subCategory']}/{row['filename']}", axis=1
)