In [None]:
# ==========
# IMPORT
# ==========

from google.colab import files, drive
import os
import pandas as pd
import requests
import shutil
import zipfile
import concurrent.futures
from PIL import Image
from multiprocessing.pool import ThreadPool
from functools import partial
from multiprocessing import Pool
import matplotlib.pyplot as plt

# CSV Extracting

In [None]:
uploaded_files = files.upload()

Saving styles.csv.zip to styles.csv.zip
Saving images.csv.zip to images.csv.zip


In [None]:
!unzip images.csv.zip
!unzip styles.csv.zip
os.remove("images.csv.zip")
os.remove("styles.csv.zip")

Archive:  images.csv.zip
  inflating: images.csv              
Archive:  styles.csv.zip
  inflating: styles.csv              


# Dataset Merging, Dataset Downloading

In [None]:
# ==============================
# DATASET LOADING AND PROCESSING
# ==============================

# read the csvs
df_images = pd.read_csv("images.csv")
df_styles = pd.read_csv("styles.csv", on_bad_lines='skip')

# create filename, join it using inner
df_styles["filename"] = df_styles["id"].astype(str) + ".jpg"
print(f"df_images shape = {df_images.shape} vs df_styles shape = {df_styles.shape}")
df = df_images.merge(df_styles, on="filename", how="inner")

# Take only the Apparel
df = df[df["masterCategory"] == "Apparel"]

# Change the subCategory to Top and Bottom
df["subCategory"].replace({"Topwear": "Top", "Bottomwear": "Bottom"}, inplace=True)

# Remove that is not top or bottom
df = df[(df['subCategory'] == 'Top') | (df['subCategory'] == 'Bottom')]

# Count the occurrences of each category
category_counts = df['subCategory'].value_counts()

# Specify the target number of samples per category
target_count = 2694

# Sample an equal number of rows for each category
balanced_df = pd.DataFrame()
for category in category_counts.index:
    category_df = df[df['subCategory'] == category]
    sampled_df = category_df.sample(n=target_count, random_state=42)  # Adjust the random_state if needed
    balanced_df = balanced_df.append(sampled_df)

# Shuffle the rows of the balanced DataFrame
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"df_shape: {df.shape}")
print(f"The df subCategory:")
print(df["subCategory"].value_counts())
print(f"The balanced_df subCategory:")
print(balanced_df["subCategory"].value_counts())

df_images shape = (44446, 2) vs df_styles shape = (44424, 11)
df_shape: (18096, 12)
The df subCategory:
Top       15402
Bottom     2694
Name: subCategory, dtype: int64
The balanced_df subCategory:
Bottom    2694
Top       2694
Name: subCategory, dtype: int64


  balanced_df = balanced_df.append(sampled_df)
  balanced_df = balanced_df.append(sampled_df)


In [None]:
df = balanced_df

In [None]:
# =================
# IMAGE DOWNLOADING
# =================

# Function to download an image given the URL and local directory
def download_image(image_url, local_dir):
    try:
        response = requests.get(image_url)
        response.raise_for_status()
        with open(local_dir, "wb") as file:
            file.write(response.content)
        return True  # Return True to indicate successful download
    except Exception as e:
        return False  # Return False to indicate failed download

# Initiate local_path
df["local_dir"] = ""
os.makedirs("dataset", exist_ok=True)

# Variables to track progress
total_rows = len(df)
completed_count = 0

# Iterate over the DataFrame rows
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = []
    for index, row in df.iterrows():
        image_url = row["link"]
        file_name = row["filename"]
        local_dir = os.path.join("dataset", file_name)
        df.at[index, "local_dir"] = local_dir

        # Check if the file already exists
        if os.path.exists(local_dir):
            print(f"Skipping download for file: {file_name} (already exists)")
            completed_count += 1  # Increment completed count
            continue

        # Submit the download task to the executor
        future = executor.submit(download_image, image_url, local_dir)
        futures.append(future)

    # Track the progress of the tasks
    for future in concurrent.futures.as_completed(futures):
        result = future.result()
        if result:
            completed_count += 1  # Increment completed count
        progress = (completed_count / total_rows) * 100
        print(f"Progress: {progress:.2f}% ({completed_count}/{total_rows} rows)")

print("All downloads completed.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Progress: 7.24% (390/5388 rows)
Progress: 7.26% (391/5388 rows)
Progress: 7.28% (392/5388 rows)
Progress: 7.29% (393/5388 rows)
Progress: 7.31% (394/5388 rows)
Progress: 7.33% (395/5388 rows)
Progress: 7.35% (396/5388 rows)
Progress: 7.37% (397/5388 rows)
Progress: 7.39% (398/5388 rows)
Progress: 7.41% (399/5388 rows)
Progress: 7.42% (400/5388 rows)
Progress: 7.44% (401/5388 rows)
Progress: 7.46% (402/5388 rows)
Progress: 7.48% (403/5388 rows)
Progress: 7.50% (404/5388 rows)
Progress: 7.52% (405/5388 rows)
Progress: 7.54% (406/5388 rows)
Progress: 7.55% (407/5388 rows)
Progress: 7.57% (408/5388 rows)
Progress: 7.59% (409/5388 rows)
Progress: 7.61% (410/5388 rows)
Progress: 7.63% (411/5388 rows)
Progress: 7.65% (412/5388 rows)
Progress: 7.67% (413/5388 rows)
Progress: 7.68% (414/5388 rows)
Progress: 7.70% (415/5388 rows)
Progress: 7.72% (416/5388 rows)
Progress: 7.74% (417/5388 rows)
Progress: 7.76% (418/5388 rows)
Progres

In [None]:
# Remove error images from rows
for index, row in df.iterrows():
    local_dir = row["local_dir"]
    try:
        image = Image.open(local_dir)
    except (IOError, OSError) as e:
        df.drop(index, inplace=True)

# Reset the index of the DataFrame after removing rows
df.reset_index(drop=True, inplace=True)

df.to_csv("dataset.csv", index=False)

In [None]:
print(f"df_shape: {df.shape}")

df_shape: (5387, 13)


# Modeling

In [None]:
df.head()

Unnamed: 0,filename,link,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,local_dir
0,46893.jpg,http://assets.myntassets.com/v1/images/style/p...,46893,Men,Apparel,Bottom,Jeans,Blue,Summer,2012.0,Casual,Locomotive Men Blue Jeans,dataset/46893.jpg
1,27914.jpg,http://assets.myntassets.com/v1/images/style/p...,27914,Women,Apparel,Bottom,Trousers,Brown,Summer,2012.0,Formal,Scullers For Her Women Brown Trousers,dataset/27914.jpg
2,6860.jpg,http://assets.myntassets.com/v1/images/style/p...,6860,Women,Apparel,Bottom,Skirts,Black,Summer,2011.0,Casual,Forever New Women's Short Black Skirt,dataset/6860.jpg
3,19981.jpg,http://assets.myntassets.com/v1/images/style/p...,19981,Men,Apparel,Top,Tshirts,White,Summer,2011.0,Casual,United Colors of Benetton Men Printed White TS...,dataset/19981.jpg
4,9960.jpg,http://assets.myntassets.com/v1/images/style/p...,9960,Men,Apparel,Top,Tshirts,Black,Fall,2011.0,Sports,Nike Men Jdi Remix Crew Black T-Shirts,dataset/9960.jpg


In [None]:
# ==========
# MODELLING
# ==========

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.3)

# Create an ImageDataGenerator for data augmentation and preprocessing
train_datagen = ImageDataGenerator(rescale=1.0/255,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True)

test_datagen = ImageDataGenerator(rescale=1.0/255)

# Specify the target image dimensions
image_width, image_height = 224, 224  # VGG16 input size

# Create the training set generator
train_generator = train_datagen.flow_from_dataframe(dataframe=train_df,
                                                    x_col="local_dir",
                                                    y_col="subCategory",
                                                    target_size=(image_width, image_height),
                                                    batch_size=32,
                                                    class_mode="binary",
                                                    shuffle=True)

# Create the testing set generator
test_generator = test_datagen.flow_from_dataframe(dataframe=test_df,
                                                  x_col="local_dir",
                                                  y_col="subCategory",
                                                  target_size=(image_width, image_height),
                                                  batch_size=32,
                                                  class_mode="binary",
                                                  shuffle=False)

# Load the VGG16 pre-trained model
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(image_width, image_height, 3))
base_model.trainable=False

# Create the model architecture
model = Sequential()
model.add(base_model)
model.add(GlobalAveragePooling2D())
model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.01)))  # Apply L2 regularization
model.add(Dropout(0.3))  # Add dropout layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Set up early stopping callback
early_stopping = EarlyStopping(patience=3, monitor='val_loss', min_delta=0.001, restore_best_weights=True)

# Train the model with early stopping
model.fit(train_generator, epochs=10, validation_data=test_generator, callbacks=[early_stopping])

# Evaluate the model on the test set
loss, accuracy = model.evaluate(test_generator)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Found 3770 validated image filenames belonging to 2 classes.
Found 1617 validated image filenames belonging to 2 classes.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.12023846805095673
Test Accuracy: 0.9851577281951904


In [None]:
model.save("top_down_new_model.h5")