# 1.  Stratified Splitting of the Dataset
Load json

In [1]:
subcategory_to_category = {
    '2-seaters':'sofas',
    '3-seaters': 'sofas',
    '4-seaters-up': 'sofas',
    'sofa-beds': 'sofas',
    'armchairs': 'sofas',
    'l-shape': 'sofas',
    'leather-sofas': 'sofas',
    'lounge-chairs': 'sofas',
    'outdoor-sofas': 'sofas',
    'recliners': 'sofas',
    'sofa-beds': 'sofas',
    'sofa-sets': 'sofas',
    'bar-stools': 'chairs',
    'bean-bags-poufs': 'chairs',
    'benches': 'chairs',
    'dining-benches': 'chairs',
    'dining-chairs': 'chairs',
    'office-chairs': 'chairs',
    'outdoor-dining-sets': 'chairs',
    'stools-ottomans': 'chairs',
}

In [2]:
import json
import pandas as pd

with open('manual-clusters-sofas-chairs.json') as f:
    data = json.load(f)

records = []

for subcategory, products in data.items():
    for product_id, group_id in products.items():
        records.append({
            "subcategory": subcategory,
            "product_id": product_id,
            "group_id": group_id
        })

df = pd.DataFrame(records)
df['product_id'] = df['product_id'].astype(str)
df.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,subcategory,product_id,group_id
0,2-seaters,54312,13
1,2-seaters,52072,13
2,2-seaters,91532,13
3,2-seaters,54882,13
4,2-seaters,92674,13


`manual-clusters-sofas-chairs.json` contains products which do not have images, need to filter and drop them from the df

In [3]:
import os

def image_exists(subcategory, product_id, base_dir='all'):
    subcategory_path = os.path.join(base_dir, subcategory_to_category[subcategory], subcategory)
    for filename in os.listdir(subcategory_path):
        if filename.startswith(f"{product_id}_") and filename.endswith("_resized.jpg"):
            return True
    return False

print(df.shape[0])
df['image_exists'] = df.apply(lambda row: image_exists(row['subcategory'], row['product_id'], base_dir='all'), axis=1)
df_filtered = df[df['image_exists']]
df_filtered = df_filtered.drop(columns=['image_exists'])
df_filtered.head()
print(df_filtered.shape[0])

2620
2288


Filter out products without the primary image (index 0)

In [4]:
import os

# Base directory where your images are stored
base_dir = 'all'

# Initialize an empty set to store product IDs with a primary image
product_ids_with_primary_image = set()

# Walk through the file system
for root, dirs, files in os.walk(base_dir):
    for file in files:
        if "_image_0_resized.jpg" in file:
            # Extract product ID from filename
            product_id = file.split('_')[0]
            product_ids_with_primary_image.add(product_id)
            
# Filter the DataFrame
df_filtered = df_filtered[df_filtered['product_id'].isin(product_ids_with_primary_image)]

Filter out groups with one product

In [5]:
group_counts = df_filtered['group_id'].value_counts()

valid_groups = group_counts[group_counts > 1].index
print(df_filtered.shape[0])
df_filtered = df_filtered[df_filtered['group_id'].isin(valid_groups)]
print(df_filtered.shape[0])

2287
2274


Split dataset. 80% training, 10% for testing, 10% for validation

In [6]:
df_filtered['group_id'].nunique()

455

Do not have enough group ids to do stratify splitting with scikit train_test_split \n
1. Ensure train dataset has one product from each group first
2. Perform random splitting for the rest
3. Verify distribution after

In [67]:
from sklearn.model_selection import train_test_split

train_df_initial = pd.DataFrame()

groups = df_filtered['group_id'].unique()

for group in groups:
    group_subset = df_filtered[df_filtered['group_id'] == group]
    train_df_initial = pd.concat([train_df_initial, group_subset.sample(n=2, random_state=42)])

total_dataset_size = len(df_filtered)
desired_training_size = total_dataset_size * 0.8
additional_training_needed = max(0, desired_training_size - len(train_df_initial))

df_remaining = df_filtered.drop(train_df_initial.index)
split_size_for_remaining = (len(df_remaining) - additional_training_needed) / len(df_remaining)

if additional_training_needed > 0:
    additional_train_df, df_remaining = train_test_split(df_remaining, test_size=(split_size_for_remaining), random_state=42)
    train_df = pd.concat([train_df_initial, additional_train_df], ignore_index=True)
else:
    train_df = train_df_initial.copy()

validation_df, test_df = train_test_split(df_remaining, test_size=0.5, random_state=42)

print(f"Total dataset size: {total_dataset_size}")
print(f"Training set size: {len(train_df)} ({(len(train_df)/total_dataset_size)*100:.2f}%)")
print(f"Validation set size: {len(validation_df)} ({(len(validation_df)/total_dataset_size)*100:.2f}%)")
print(f"Test set size: {len(test_df)} ({(len(test_df)/total_dataset_size)*100:.2f}%)")

Total dataset size: 2274
Training set size: 1819 (79.99%)
Validation set size: 227 (9.98%)
Test set size: 228 (10.03%)


Ensure no overlap of product ids between datasets

In [112]:
train_product_ids = set(train_df['image_path'])
validation_product_ids = set(validation_df['image_path'])
test_product_ids = set(test_df['image_path'])

train_validation_overlap = train_product_ids.intersection(validation_product_ids)
train_test_overlap = train_product_ids.intersection(test_product_ids)
validation_test_overlap = validation_product_ids.intersection(test_product_ids)

print(f"Overlap between training and validation sets: {len(train_validation_overlap)} products")
print(f"Overlap between training and testing sets: {len(train_test_overlap)} products")
print(f"Overlap between validation and testing sets: {len(validation_test_overlap)} products")

Overlap between training and validation sets: 0 products
Overlap between training and testing sets: 0 products
Overlap between validation and testing sets: 0 products


Checking entropy scores to identify diverstiy and uniformity within each dataset

In [72]:
from scipy.stats import entropy

train_distribution = train_df['group_id'].value_counts()
validation_distribution = validation_df['group_id'].value_counts()
test_distribution = test_df['group_id'].value_counts()

train_entropy = entropy(train_distribution.values)
validation_entropy = entropy(validation_distribution.values)
test_entropy = entropy(test_distribution.values)

print(train_entropy, validation_entropy, test_entropy)

5.832410477296748 4.580859369951971 4.529220633633233


In [73]:
groups_with_less_than_2_products = train_distribution[train_distribution < 2]

print("Groups with less than 2 products in the training set:\n", groups_with_less_than_2_products)
print("\nNumber of groups with at less than 2 products in the training set:", len(groups_with_less_than_2_products))

Groups with less than 2 products in the training set:
 Series([], Name: count, dtype: int64)

Number of groups with at less than 2 products in the training set: 0


Verify split ratio

In [110]:
print(train_df.shape[0], validation_df.shape[0], test_df.shape[0])
total_instances = len(train_df) + len(validation_df) + len(test_df)
train_ratio = len(train_df) / total_instances
validation_ratio = len(validation_df) / total_instances
test_ratio = len(test_df) / total_instances

print(f"Training set ratio: {train_ratio:.2f}")
print(f"Validation set ratio: {validation_ratio:.2f}")
print(f"Test set ratio: {test_ratio:.2f}")


1819 227 228
Training set ratio: 0.80
Validation set ratio: 0.10
Test set ratio: 0.10


In [81]:
# Original distribution
original_distribution = df_filtered['group_id'].value_counts(normalize=True)

# Training distribution
train_distribution = train_df['group_id'].value_counts(normalize=True)

# Validation distribution
validation_distribution = validation_df['group_id'].value_counts(normalize=True)

# Test distribution
test_distribution = test_df['group_id'].value_counts(normalize=True)


# 2. Copy images over to their respective folders

In [82]:
import os
import shutil

def copy_images(df, dest_dir, source_dir='all'):
    if dest_dir == None or dest_dir == '':
        raise ValueError("Please provide a destination directory.")

    dest_path = dest_dir
    
    for _, row in df.iterrows():
        subcategory = row['subcategory']
        product_id = str(row['product_id'])
        
        src_path = os.path.join(source_dir, subcategory_to_category[subcategory], subcategory, f"{product_id}_image_0_resized.jpg")
        dst_path = os.path.join(dest_path,  subcategory_to_category[subcategory], subcategory, f"{product_id}_image_0_resized.jpg")
        
        os.makedirs(os.path.dirname(dst_path), exist_ok=True)
        
        shutil.copy2(src_path, dst_path)

In [83]:
# source_images_dir = 'all'
# train_images_dir = 'training'
# validation_images_dir = 'validation'
# test_images_dir = 'test'

# copy_images(train_df, source_dir=source_images_dir, dest_dir=train_images_dir)
# copy_images(validation_df, source_dir=source_images_dir, dest_dir=validation_images_dir)
# copy_images(test_df, source_dir=source_images_dir, dest_dir=test_images_dir)

Save dataframes into csv

In [84]:
def save_csv(df, filename):
    df['image_path'] = df.apply(lambda row: f"{subcategory_to_category[row['subcategory']]}/{row['subcategory']}/{row['product_id']}_image_0_resized.jpg", axis=1)
    df.drop(columns=['subcategory', 'product_id'], inplace=True)
    df.rename(columns={'group_id': 'group'}, inplace=True)
    df.reset_index(drop=True, inplace=True)
    display(df.head())
    df.to_csv(f'new_images_csv/{filename}', index=False)

save_csv(train_df, 'training_images.csv')
save_csv(validation_df, 'validation_images.csv')
save_csv(test_df, 'testing_images.csv')

Unnamed: 0,group,image_path
0,13,sofas/2-seaters/91513_image_0_resized.jpg
1,13,sofas/2-seaters/87437_image_0_resized.jpg
2,14,sofas/2-seaters/88716_image_0_resized.jpg
3,14,sofas/2-seaters/94018_image_0_resized.jpg
4,15,sofas/2-seaters/91579_image_0_resized.jpg


Unnamed: 0,group,image_path
0,391,sofas/outdoor-sofas/14619_image_0_resized.jpg
1,93,sofas/3-seaters/46964_image_0_resized.jpg
2,61,sofas/3-seaters/92228_image_0_resized.jpg
3,332,sofas/lounge-chairs/49322_image_0_resized.jpg
4,879,chairs/stools-ottomans/41477_image_0_resized.jpg


Unnamed: 0,group,image_path
0,453,sofas/sofa-beds/91503_image_0_resized.jpg
1,104,sofas/4-seaters-up/94024_image_0_resized.jpg
2,185,sofas/l-shape/91717_image_0_resized.jpg
3,115,sofas/4-seaters-up/88163_image_0_resized.jpg
4,811,chairs/office-chairs/25435_image_0_resized.jpg


# 3. Generation of triplet pairs

In [3]:
import pandas as pd

train_df = pd.read_csv('new_images_csv/training_images.csv')
validation_df = pd.read_csv('new_images_csv/validation_images.csv')
test_df = pd.read_csv('new_images_csv/testing_images.csv')

Using simple generation by sampling rows

In [34]:
from itertools import permutations
from math import factorial
from random import shuffle


def calculate_permutations(n, k):
    return factorial(n) // factorial(n - k)

def ensure_representation_and_all_orderings_triplets(dataset, max_rows=None):
    grouped_data = dataset.groupby('group')['image_path'].apply(list).to_dict()
    all_groups = list(grouped_data.keys())
    no_of_triplets_per_group = max_rows // len(all_groups) if max_rows else None
    no_of_triplets_generated_per_group = {group: 0 for group in all_groups}
    
    if not max_rows:
        max_triplets = 0
        for images in grouped_data.values():
            if len(images) >= 2:
                n = len(images)
                k = 2
                max_triplets += calculate_permutations(n, k) * (len(dataset) - n)
        max_rows = int(max_triplets)
    
    triplets = []

    for group in all_groups:
        for other_group in all_groups:
            if group == other_group:
                continue
            positive_images = grouped_data[group]
            negative_images = grouped_data[other_group]
            
            if len(positive_images) < 2 or not negative_images:
                continue 
            
            A1, A2 = positive_images[:2]
            B = negative_images[0]
            triplets.append((A1, A2, B, group, other_group))
            triplets.append((A2, A1, B, group, other_group))
            no_of_triplets_generated_per_group[group] += 1
            
            if len(triplets) >= max_rows:
                return triplets

    for group in all_groups:
        positive_images = grouped_data[group]
        shuffle(positive_images)
        for A1, A2 in permutations(positive_images, 2):
            if no_of_triplets_per_group and no_of_triplets_generated_per_group[group] >= no_of_triplets_per_group:
                break
            
            for other_group in all_groups:
                if group == other_group:
                    continue
                
                negative_images = grouped_data[other_group]
                shuffle(negative_images)
                
                for B in negative_images:
                    triplets.append((A1, A2, B, group, other_group))
                    triplets.append((A2, A1, B, group, other_group))
                    no_of_triplets_generated_per_group[group] += 2
                    
                    if no_of_triplets_per_group and no_of_triplets_generated_per_group[group] >= no_of_triplets_per_group:
                        break
                    
                    if len(triplets) >= max_rows:
                        return triplets
    
    return triplets

In [38]:
train_triplets = ensure_representation_and_all_orderings_triplets(train_df, max_rows=10000000)
triplets_train_df = pd.DataFrame(train_triplets, columns=["anchor", "similar", "dissimilar", "similar_group", "dissimilar_group"])
print(triplets_train_df.shape[0])
triplets_train_df.drop_duplicates(subset=['anchor', 'similar', 'dissimilar'], inplace=True)
print(triplets_train_df.shape[0])

21978
7415056
4953630


In [39]:
# reduced training set for hyperparameter tuning ~10% of the original training size
max_no_of_triplets = round(0.1 * triplets_train_df.shape[0])
reduced_train_triplets = ensure_representation_and_all_orderings_triplets(train_df, max_rows=max_no_of_triplets)
triplets_reduced_train_df = pd.DataFrame(reduced_train_triplets, columns=["anchor", "similar", "dissimilar", "similar_group", "dissimilar_group"])
print(triplets_reduced_train_df.shape[0])
triplets_reduced_train_df.drop_duplicates(subset=['anchor', 'similar', 'dissimilar'], inplace=True)
print(triplets_reduced_train_df.shape[0])

1088
495364
485614


In [40]:
# ensure that training triplets and reduced training triplets set has all groups as well
train_df['group'].nunique(), triplets_train_df['similar_group'].nunique(), triplets_reduced_train_df['similar_group'].nunique()

(455, 455, 455)

In [19]:
triplets_reduced_train_df = triplets_reduced_train_df.sample(frac=1).reset_index(drop=True)
triplets_reduced_train_df.to_csv('new_images_csv/reduced_training_triplets.csv', index=False)

In [None]:
triplets_train_df = triplets_train_df.sample(frac=1).reset_index(drop=True)
triplets_train_df.to_csv('new_images_csv/training_triplets.csv', index=False)

In [107]:
triplets_train_df = triplets_train_df.sample(frac=1).reset_index(drop=True)
triplets_train_df.to_csv('new_images_csv/training_triplets.csv', index=False)

In [2]:
total_rows = 116614
batch_size = 32
steps_per_epoch = total_rows // batch_size

# If you want to include the last partial batch in the count, you can do the following instead:
import math
steps_per_epoch = math.ceil(total_rows / batch_size)

print(f"Steps per epoch: {steps_per_epoch}")


Steps per epoch: 3645


check if triplet pairs missing anything

In [22]:
import numpy as np

def jensen_shannon_divergence(p, q):
    """
    Compute Jensen-Shannon Divergence between two probability distributions.
    """
    p = np.array(p)
    q = np.array(q)
    m = 0.5 * (p + q)
    jsd = 0.5 * (np.sum(p * np.log2(p / m)) + np.sum(q * np.log2(q / m)))
    return jsd

print(triplets_train_df['similar_group'].nunique(), train_df['group'].nunique())

triplet_group_distribution = triplets_train_df['similar_group'].value_counts(normalize=True)
original_group_distribution = train_df['group'].value_counts(normalize=True)

jsd_score = jensen_shannon_divergence(triplet_group_distribution, original_group_distribution)

print("Jensen-Shannon Divergence between distribution of triplet pairs and original dataset:", jsd_score)

455 455
Jensen-Shannon Divergence between distribution of triplet pairs and original dataset: 0.03763558725148959


In [23]:
common_groups = set(test_df['group'].unique()).intersection(set(triplets_reduced_train_df['similar_group'].unique()))
# Convert the set of common groups to a list
common_groups_list = list(common_groups)

# Filter distributions to include only common groups
filtered_distribution_original = original_group_distribution.loc[common_groups_list]
filtered_distribution_triplets = triplet_group_distribution.loc[common_groups_list]

# Normalize distributions if not already normalized
filtered_distribution_original /= filtered_distribution_original.sum()
filtered_distribution_triplets /= filtered_distribution_triplets.sum()

# Calculate JSD
jsd_score = jensen_shannon_divergence(filtered_distribution_original, filtered_distribution_triplets)
print("Jensen-Shannon Divergence between distribution of triplet pairs and original dataset:", jsd_score)


Jensen-Shannon Divergence between distribution of triplet pairs and original dataset: 0.05404983284481284
