In [23]:
import pandas as pd
import os
import configparser
# Parse the project configuration file; a later cell reads config['gcp'] from it.
config = configparser.ConfigParser()
config.read("../config.ini")

# Read both CSV files
# on_bad_lines='skip' makes pandas drop malformed rows instead of raising.
fashion_dataset_images = pd.read_csv('../data/images.csv', on_bad_lines='skip')
fashion_dataset_styles = pd.read_csv('../data/styles.csv', on_bad_lines='skip')


In [24]:
# Derive the integer id from each image filename ("15970.jpg" -> 15970).
# regex=False treats '.jpg' as a literal substring; on pandas < 2.0 the default
# was regex=True, where '.' matches any character.
fashion_dataset_images['id'] = (
    fashion_dataset_images['filename']
    .str.replace('.jpg', '', regex=False)
    .astype(int)
)

# Merge the datasets (inner join: only ids present in both frames are kept)
fashion_dataset_merged = pd.merge(fashion_dataset_images, fashion_dataset_styles, on='id', how='inner')

# Display the results
print("Fashion Dataset Merged Shape:", fashion_dataset_merged.shape)

# Optional: Display first few rows to verify the merge
print("\nFashion Dataset Merged Preview:")
print(fashion_dataset_merged.head())

Fashion Dataset Merged Shape: (44424, 12)

Fashion Dataset Merged Preview:
    filename                                               link     id gender  \
0  15970.jpg  http://assets.myntassets.com/v1/images/style/p...  15970    Men   
1  39386.jpg  http://assets.myntassets.com/v1/images/style/p...  39386    Men   
2  59263.jpg  http://assets.myntassets.com/v1/images/style/p...  59263  Women   
3  21379.jpg  http://assets.myntassets.com/v1/images/style/p...  21379    Men   
4  53759.jpg  http://assets.myntassets.com/v1/images/style/p...  53759    Men   

  masterCategory subCategory  articleType baseColour  season    year   usage  \
0        Apparel     Topwear       Shirts  Navy Blue    Fall  2011.0  Casual   
1        Apparel  Bottomwear        Jeans       Blue  Summer  2012.0  Casual   
2    Accessories     Watches      Watches     Silver  Winter  2016.0  Casual   
3        Apparel  Bottomwear  Track Pants      Black    Fall  2011.0  Casual   
4        Apparel     Topwear      Tshi

In [25]:
# Remove the metadata columns that are not used further on. The result is
# reassigned rather than dropped in place, and errors='ignore' skips names that
# are already gone, so re-running this cell no longer raises KeyError.
fashion_dataset_merged = fashion_dataset_merged.drop(
    columns=['link', 'gender', 'masterCategory', 'articleType', 'baseColour',
             'season', 'year', 'usage', 'productDisplayName'],
    errors='ignore',
)

In [26]:
# Display the merged frame after the column drop.
fashion_dataset_merged

Unnamed: 0,filename,id,subCategory
0,15970.jpg,15970,Topwear
1,39386.jpg,39386,Bottomwear
2,59263.jpg,59263,Watches
3,21379.jpg,21379,Bottomwear
4,53759.jpg,53759,Topwear
...,...,...,...
44419,17036.jpg,17036,Shoes
44420,6461.jpg,6461,Flip Flops
44421,18842.jpg,18842,Topwear
44422,46694.jpg,46694,Fragrance


In [None]:
# Show the dtype of each remaining column.
fashion_dataset_merged.dtypes


In [None]:
import os
import shutil

# Make sure the destination folder for matched images is present
os.makedirs('../data/raw_images', exist_ok=True)

# Collect the names of the .jpg files that are actually on disk
image_files = {
    entry
    for entry in os.listdir('../data/fashion-dataset/images')
    if entry.endswith('.jpg')
}

# Keep only the metadata rows whose image file was found
has_image = fashion_dataset_merged['filename'].isin(image_files)
fashion_dataset_filtered = fashion_dataset_merged[has_image].copy()

print(f"Original dataset size: {len(fashion_dataset_merged)}")
print(f"Filtered dataset size: {len(fashion_dataset_filtered)}")





In [11]:
# Relocate every matched image from the dataset folder into ../data/raw_images
raw_source_dir = '../data/fashion-dataset/images'
raw_target_dir = '../data/raw_images'
for filename in fashion_dataset_filtered['filename']:
    src = os.path.join(raw_source_dir, filename)
    dst = os.path.join(raw_target_dir, filename)
    if not os.path.exists(src):
        continue  # no file to move for this row
    shutil.move(src, dst)



In [None]:
# Display the merged frame.
fashion_dataset_merged

In [None]:
# Count how many rows fall in each subCategory value.
fashion_dataset_merged.subCategory.value_counts()

In [28]:
# The ten subCategory values that later cells filter on and iterate over.
clothing_items_top_10 = [
    "Topwear", "Bottomwear", "Innerwear", "Dress", "Loungewear and Nightwear",
    "Saree", "Headwear", "Ties", "Scarves", "Apparel Set",
]

In [12]:
# Create the folder that a later cell moves the selected-subcategory images into.
os.makedirs('../data/raw_matched_images', exist_ok=True)

In [27]:
# Names of the .jpg files currently present in ../data/raw_images
image_files = {
    name
    for name in os.listdir('../data/raw_images')
    if name.endswith('.jpg')
}


In [None]:
# Display the merged frame.
fashion_dataset_merged


In [29]:
# Keep rows that both have an image on disk and belong to a selected subcategory
file_present = fashion_dataset_merged['filename'].isin(image_files)
category_selected = fashion_dataset_merged['subCategory'].isin(clothing_items_top_10)
fashion_dataset_filtered = fashion_dataset_merged[file_present & category_selected].copy()

In [30]:
# Display the rows that passed the file and subcategory filter.
fashion_dataset_filtered

Unnamed: 0,filename,id,subCategory
0,15970.jpg,15970,Topwear
1,39386.jpg,39386,Bottomwear
3,21379.jpg,21379,Bottomwear
4,53759.jpg,53759,Topwear
5,1855.jpg,1855,Topwear
...,...,...,...
44414,30614.jpg,30614,Topwear
44415,13496.jpg,13496,Topwear
44417,12544.jpg,12544,Topwear
44418,42234.jpg,42234,Topwear


In [31]:
# Move the images of the selected rows from raw_images into raw_matched_images
matched_source_dir = '../data/raw_images'
matched_target_dir = '../data/raw_matched_images'
for filename in fashion_dataset_filtered['filename']:
    src = os.path.join(matched_source_dir, filename)
    dst = os.path.join(matched_target_dir, filename)
    if not os.path.exists(src):
        continue  # no file to move for this row
    shutil.move(src, dst)

# From here on the main dataframe name refers to the filtered rows
fashion_dataset_merged = fashion_dataset_filtered

In [None]:
# Display the main frame, which now refers to the filtered rows.
fashion_dataset_merged

In [32]:
# Row count per subCategory after filtering.
fashion_dataset_merged['subCategory'].value_counts()

subCategory
Topwear                     15398
Bottomwear                   2693
Innerwear                    1808
Dress                         478
Loungewear and Nightwear      470
Saree                         427
Headwear                      293
Ties                          258
Scarves                       118
Apparel Set                   106
Name: count, dtype: int64

In [25]:
# Store the number of rows per subCategory value (sorted by count, as value_counts returns them)
subcategory_counts = fashion_dataset_merged['subCategory'].value_counts()

In [18]:
# Backup of the filtered frame. .copy() makes it an independent object; a plain
# assignment would only create a second name for the same DataFrame, so any
# in-place change to fashion_dataset_merged would show up in the "backup" too.
keep_safe = fashion_dataset_merged.copy()

In [None]:
# Display the main frame.
fashion_dataset_merged

In [None]:
# Display the keep_safe frame.
keep_safe

In [None]:
# Get current counts
subcategory_counts = fashion_dataset_merged['subCategory'].value_counts()
print("Before capping:")
print(subcategory_counts)

# For each subcategory, take up to 1000 images. The subsets are gathered in a
# list and concatenated once: calling pd.concat inside the loop re-copies the
# accumulated rows on every pass and needs an empty placeholder frame to start.
capped_subsets = []
for subcategory in clothing_items_top_10:
    subset = fashion_dataset_merged[fashion_dataset_merged['subCategory'] == subcategory]
    if len(subset) > 1000:
        subset = subset.sample(n=1000, random_state=42)  # random_state for reproducibility
    capped_subsets.append(subset)

if capped_subsets:
    capped_dataset = pd.concat(capped_subsets)
else:
    # pd.concat rejects an empty list; fall back to a zero-row frame with the same columns
    capped_dataset = fashion_dataset_merged.iloc[0:0]

# Update the main dataframe
fashion_dataset_merged = capped_dataset

print("\nAfter capping at 1000:")
print(fashion_dataset_merged['subCategory'].value_counts())

In [None]:
# Display the frame after the per-subcategory cap.
fashion_dataset_merged

In [None]:
# Create new directory for the capped dataset
# NOTE(review): both folders are relative to the working directory, while other
# cells in this notebook use ../data/raw_matched_images and ../data/capped_images
# — confirm which location is intended.
capped_source_dir = 'categorized_matched_images'
capped_target_dir = 'capped_images'
os.makedirs(capped_target_dir, exist_ok=True)

# Move images based on the capped dataset. Rows whose file is in neither folder
# are collected and reported instead of being skipped silently.
missing_sources = []
for filename in fashion_dataset_merged['filename']:
    src = os.path.join(capped_source_dir, filename)
    dst = os.path.join(capped_target_dir, filename)
    if os.path.exists(src):
        shutil.move(src, dst)
    elif not os.path.exists(dst):
        # Not in the source folder and not already moved by an earlier run
        missing_sources.append(filename)

if missing_sources:
    print(f"Warning: {len(missing_sources)} images were not found in {capped_source_dir}")

# Verify the count of moved files
moved_files = len([f for f in os.listdir(capped_target_dir) if f.endswith('.jpg')])
print(f"Moved {moved_files} files to capped_images directory")
print(f"Should match dataframe size: {len(fashion_dataset_merged)}")

In [None]:
import numpy as np
from PIL import Image
import tensorflow as tf
import random

# Get current max ID to ensure new IDs don't overlap
max_id = fashion_dataset_merged['id'].max()
current_new_id = max_id + 1

# Fixed seed for the random steps below (augmentation layers and source-image
# selection), matching the random_state=42 used when sampling the capped subsets.
augmentation_seed = 42
source_picker = random.Random(augmentation_seed)

# Create augmentation layer
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomRotation(0.2, seed=augmentation_seed),
    tf.keras.layers.RandomZoom(0.2, seed=augmentation_seed),
    tf.keras.layers.RandomBrightness(0.2, seed=augmentation_seed),
    tf.keras.layers.RandomContrast(0.2, seed=augmentation_seed),
])


# Function to augment single image
def augment_image(image_path):
    """Return a randomly augmented copy of the image stored at image_path.

    Args:
        image_path: Path of the source image file.

    Returns:
        A PIL image obtained by passing the source through data_augmentation.
    """
    # Read and convert image to tensor, adding a leading batch axis
    img = tf.keras.preprocessing.image.load_img(image_path)
    img_array = tf.keras.preprocessing.image.img_to_array(img)
    img_array = tf.expand_dims(img_array, 0)

    # Augment. training=True asks for the random transforms explicitly instead
    # of relying on how the default training mode is resolved.
    augmented_img = data_augmentation(img_array, training=True)

    # Convert back to PIL Image
    augmented_img = tf.keras.preprocessing.image.array_to_img(augmented_img[0])
    return augmented_img


# Process each subcategory
# NOTE(review): 'capped_images' is relative to the working directory, while a
# later cell reads from ../data/capped_images — confirm the intended location.
new_rows = []
for category in clothing_items_top_10:
    category_df = fashion_dataset_merged[fashion_dataset_merged['subCategory'] == category]
    current_count = len(category_df)

    if current_count >= 1000:
        continue
    if current_count == 0:
        # Choosing from an empty list would raise IndexError
        print(f"Skipping {category}: no source images available to augment")
        continue

    needed_augmentations = 1000 - current_count
    print(f"Augmenting {category}: need {needed_augmentations} more images")

    # Randomly select images to augment (with replacement)
    source_files = category_df['filename'].tolist()

    for _ in range(needed_augmentations):
        # Select random source image
        source_filename = source_picker.choice(source_files)
        source_path = os.path.join('capped_images', source_filename)

        # Generate new filename and ID
        new_filename = f"{current_new_id}.jpg"

        # Augment and save image
        augmented_img = augment_image(source_path)
        augmented_img.save(os.path.join('capped_images', new_filename))

        # Add new row to dataframe
        new_rows.append({
            'filename': new_filename,
            'id': current_new_id,
            'subCategory': category
        })

        current_new_id += 1

# Add new rows to dataframe
if new_rows:
    new_df = pd.DataFrame(new_rows)
    fashion_dataset_merged = pd.concat([fashion_dataset_merged, new_df], ignore_index=True)

# Verify results
print("\nFinal counts per category:")
print(fashion_dataset_merged['subCategory'].value_counts())

In [2]:
import pandas as pd
# Load the merged dataset from CSV.
# NOTE(review): no visible cell writes ../data/fashion_dataset_merged.csv — confirm where it is produced.
df = pd.read_csv('../data/fashion_dataset_merged.csv')

In [None]:
# Preview the first rows of the loaded CSV.
df.head()

In [4]:
# Add a 'label' column holding the same values as 'subCategory'.
df['label'] = df['subCategory']

In [None]:
# Show the label column.
df.label

In [None]:
# df_test is a second name for the same DataFrame object; no copy is made here.
df_test = df

In [None]:
import os
import shutil

# Create base directory for categorized images if it doesn't exist
base_dir = '../data/categorized_images'
os.makedirs(base_dir, exist_ok=True)

# Create subdirectories for each unique category
categories = df['label'].unique()
for category in categories:
    category_dir = os.path.join(base_dir, category)
    os.makedirs(category_dir, exist_ok=True)

# Copy files from capped_images into their category folders; the originals
# stay in ../data/capped_images because copy2 is used rather than move.
copied_count = 0
for filename, label in zip(df['filename'], df['label']):
    src_path = os.path.join('../data/capped_images', filename)
    dst_path = os.path.join(base_dir, label, filename)

    # Check if source file exists before attempting to copy
    if os.path.exists(src_path):
        shutil.copy2(src_path, dst_path)  # Using copy2 to preserve metadata
        copied_count += 1

print("Images have been organized into their respective category folders")
# Report how many rows actually had a file, so a wrong source path is visible
print(f"Copied {copied_count} of {len(df)} images")


In [None]:
import configparser
# Parse ../config.ini again (the notebook's first cell performs the same read).
config = configparser.ConfigParser()
config.read("../config.ini")

In [10]:
# Create a new column for GCS paths using a <label>/<filename> folder structure.
# Built by zipping the two columns instead of a row-wise df.apply(axis=1): this
# avoids constructing a Series per row and always yields one string per row,
# including when the frame is empty.
# NOTE(review): the URI uses 'categorised_images' while the local folder is
# '../data/categorized_images' — confirm the spelling used in the bucket.
bucket_name = config['gcp']['bucket']
df['image_path'] = [
    f"gs://{bucket_name}/categorised_images/{label}/{filename}"
    for label, filename in zip(df['label'], df['filename'])
]


In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Remove the helper columns. DataFrame.drop(..., inplace=True) returns None, so
# assigning its result replaced df with None; drop without inplace and keep the
# returned frame. errors='ignore' skips names that are absent ('gcs_path' is
# only created by a cell further down).
df = df.drop(columns=['filename', 'id', 'gcs_path'], errors='ignore')

In [18]:
# Write the processed table to the configured bucket. The path has to be an
# f-string using bucket_name; as a plain string the literal text "{bucket}"
# was used as the bucket name.
# NOTE(review): the CSV is written with a header row — confirm the dataset
# import that consumes it expects one.
df.to_csv(f"gs://{bucket_name}/fashion_dataset_processed.csv", index=False)

In [16]:
# Remove 'subCategory' (its values are also held in 'label'). Reassigning with
# errors='ignore' lets the cell be re-run without raising KeyError.
df = df.drop(columns=['subCategory'], errors='ignore')

In [None]:
# Display the current frame.
df

In [None]:
# Create the GCS file path for each image.
# A bare df['gcs_path'] lookup used to precede this assignment; it raised
# KeyError on a fresh run because the column does not exist until the
# statement below has executed, so it has been removed.
# NOTE(review): this needs the 'subCategory' and 'filename' columns, which
# earlier cells drop, and it omits the categorised_images/ folder used for
# df['image_path'] — confirm whether this cell is still needed.
df['gcs_path'] = df.apply(
    lambda row: f"gs://{bucket_name}/{row['subCategory']}/{row['filename']}", axis=1
)

In [None]:
from google.cloud import aiplatform

project_id = config['gcp']['project']
region = config['gcp']['region']

# Import the CSV that this notebook exports (fashion_dataset_processed.csv).
# The name used here before, clothing_dataset_processed.csv, is not written by
# any visible cell.
gcs_source = [f"gs://{bucket_name}/fashion_dataset_processed.csv"]

aiplatform.init(project=project_id, location=region)

# Create an image dataset for single-label classification from the CSV
dataset = aiplatform.ImageDataset.create(
    display_name="multi_class_image_dataset",
    gcs_source=gcs_source,
    import_schema_uri=aiplatform.schema.dataset.ioformat.image.single_label_classification,
)

print(f"Dataset resource name: {dataset.resource_name}")

In [None]:
# Display the current frame.
df

In [26]:
# Copy the image_path values into a column named gcs_file_path.
df['gcs_file_path'] = df['image_path']

In [None]:
# Display the current frame.
df

In [28]:
# Remove image_path now that its values live in gcs_file_path. Reassigning with
# errors='ignore' lets the cell be re-run without raising KeyError.
df = df.drop(columns=['image_path'], errors='ignore')

In [None]:
# Display the current frame.
df

In [30]:
# Keep just the two export columns: file path first, label second
export_columns = ['gcs_file_path', 'label']
df = df.loc[:, export_columns]


In [None]:
# Display the two-column frame.
df

In [38]:
# Configure an AutoML image training job for single-label classification.
# NOTE: despite its name, `model` holds the AutoMLImageTrainingJob object here;
# the next cell rebinds it to whatever run() returns.
model = aiplatform.AutoMLImageTrainingJob(
    display_name="image_classification_training",
    prediction_type="classification",  # Use "classification" for multi-class classification
    multi_label=False,  # Set to True if it's a multi-label classification problem
)


In [None]:
# Run the training job on the dataset and rebind `model` to the value run() returns.
# NOTE(review): after this rebinding `model` no longer refers to the training
# job, so re-running this cell on its own would call run() on the returned
# object instead — re-run the job-definition cell first.
model = model.run(
    dataset=dataset,
    model_display_name="image_classification_model",
    budget_milli_node_hours=8000,  # Budget in milli node hours (8,000 = 8 node hours)
    disable_early_stopping=False,  # Early stopping for efficiency
)

In [None]:
# Deploy the trained model with exactly one replica and keep the returned
# endpoint object for reporting.
endpoint = model.deploy(
    machine_type="n1-standard-4",  # Adjust machine type as needed
    min_replica_count=1,          # Minimum number of replicas
    max_replica_count=1,          # Maximum number of replicas must match minimum for this model type
)

print(f"Model deployed to endpoint: {endpoint.resource_name}")

In [None]:
import base64
import json
import requests
from google.auth.transport.requests import Request
from google.auth import default

# Replace with your endpoint URL
ENDPOINT_URL = ""

# OAuth scope requested for the application-default credentials
CLOUD_PLATFORM_SCOPE = "https://www.googleapis.com/auth/cloud-platform"

# Seconds to wait for the prediction service before giving up
REQUEST_TIMEOUT_SECONDS = 60


# Authenticate and get an access token
def get_access_token():
    """Return a freshly refreshed access token from application-default credentials."""
    # The second element of the tuple (the project id) is not needed here
    credentials, _ = default(scopes=[CLOUD_PLATFORM_SCOPE])
    credentials.refresh(Request())
    return credentials.token


# Prepare the payload
def make_prediction(image_path):
    """Send one image to ENDPOINT_URL and return the decoded JSON response.

    Args:
        image_path: Path of the image file to send.

    Returns:
        The parsed JSON body of the prediction response.

    Raises:
        ValueError: If ENDPOINT_URL has not been filled in.
        OSError: If the image file cannot be opened.
        RuntimeError: If the service answers with a status other than 200.
    """
    # Fail early with a clear message instead of an opaque URL error from requests
    if not ENDPOINT_URL:
        raise ValueError("ENDPOINT_URL is empty; set it to the endpoint's prediction URL")

    # Load the image and encode it as base64
    with open(image_path, "rb") as image_file:
        image_content = base64.b64encode(image_file.read()).decode("utf-8")

    # Prepare the payload in the required format
    payload = {
        "instances": [
            {
                "content": image_content
            }
        ]
    }

    # Get the access token
    token = get_access_token()

    # Set up headers
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }

    # Make the request; the timeout keeps the cell from hanging indefinitely
    response = requests.post(
        ENDPOINT_URL,
        headers=headers,
        json=payload,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )

    # Check response
    if response.status_code != 200:
        raise RuntimeError(f"Prediction failed: {response.status_code}, {response.text}")
    return response.json()


# Call the function with your image
image_path = "../data/23451234567.jpg"  # Replace with the path to your image
try:
    prediction = make_prediction(image_path)
    print("Prediction result:")
    print(json.dumps(prediction, indent=2))  # Pretty print the result
except Exception as e:
    print(f"Error: {e}")
