In [32]:
import os
import glob
import math
import re
import shutil
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sn
from pathlib import Path

from collections import Counter
from itertools import cycle, islice

from PIL import Image
from sys import argv
import random

from warnings import filterwarnings
filterwarnings("ignore")

In [2]:
df = pd.read_csv("images.csv")
df

Unnamed: 0,image,sender_id,label,kids
0,4285fab0-751a-4b74-8e9b-43af05deee22,124,Not sure,False
1,ea7b6656-3f84-4eb3-9099-23e623fc1018,148,T-Shirt,False
2,00627a3f-0477-401c-95eb-92642cbe078d,94,Not sure,False
3,ea2ffd4d-9b25-4ca8-9dc2-bd27f1cc59fa,43,T-Shirt,False
4,3b86d877-2b9e-4c8b-a6a2-1d87513309d0,189,Shoes,False
...,...,...,...,...
5398,dfd4079d-967b-4b3e-8574-fbac11b58103,204,Shorts,False
5399,befa14be-8140-4faf-8061-1039947e329d,204,Body,True
5400,5379356a-40ee-4890-b416-2336a7d84061,310,Shorts,False
5401,65507fb8-3456-4c15-b53e-d1b03bf71a59,204,Shoes,False


### Remove "Not Sure" and "Other" labels data from dataframe 

In [4]:
df = df[~df["label"].isin(["Not sure", "Other"])].reset_index()
df

Unnamed: 0,level_0,index,image,sender_id,label,kids
0,0,1,ea7b6656-3f84-4eb3-9099-23e623fc1018,148,T-Shirt,False
1,1,3,ea2ffd4d-9b25-4ca8-9dc2-bd27f1cc59fa,43,T-Shirt,False
2,2,4,3b86d877-2b9e-4c8b-a6a2-1d87513309d0,189,Shoes,False
3,3,5,5d3a1404-697f-479f-9090-c1ecd0413d27,138,Shorts,False
4,4,6,b0c03127-9dfb-4573-8934-1958396937bf,138,Shirt,False
...,...,...,...,...,...,...
5103,5103,5398,dfd4079d-967b-4b3e-8574-fbac11b58103,204,Shorts,False
5104,5104,5399,befa14be-8140-4faf-8061-1039947e329d,204,Body,True
5105,5105,5400,5379356a-40ee-4890-b416-2336a7d84061,310,Shorts,False
5106,5106,5401,65507fb8-3456-4c15-b53e-d1b03bf71a59,204,Shoes,False


### Change labels names of labels with kids as true for differentation 

In [5]:
df.loc[df["kids"] == True, "label"] = df.loc[df["kids"] == True, "label"].astype(str) + "-kids"
df

Unnamed: 0,level_0,index,image,sender_id,label,kids
0,0,1,ea7b6656-3f84-4eb3-9099-23e623fc1018,148,T-Shirt,False
1,1,3,ea2ffd4d-9b25-4ca8-9dc2-bd27f1cc59fa,43,T-Shirt,False
2,2,4,3b86d877-2b9e-4c8b-a6a2-1d87513309d0,189,Shoes,False
3,3,5,5d3a1404-697f-479f-9090-c1ecd0413d27,138,Shorts,False
4,4,6,b0c03127-9dfb-4573-8934-1958396937bf,138,Shirt,False
...,...,...,...,...,...,...
5103,5103,5398,dfd4079d-967b-4b3e-8574-fbac11b58103,204,Shorts,False
5104,5104,5399,befa14be-8140-4faf-8061-1039947e329d,204,Body-kids,True
5105,5105,5400,5379356a-40ee-4890-b416-2336a7d84061,310,Shorts,False
5106,5106,5401,65507fb8-3456-4c15-b53e-d1b03bf71a59,204,Shoes,False


In [6]:
unique_labels = df["label"].unique().tolist()
unique_labels

['T-Shirt',
 'Shoes',
 'Shorts',
 'Shirt',
 'Pants',
 'Skirt',
 'Top',
 'Outwear',
 'Dress',
 'Body-kids',
 'Longsleeve',
 'T-Shirt-kids',
 'Undershirt',
 'Pants-kids',
 'Hat',
 'Top-kids',
 'Hat-kids',
 'Polo',
 'Blouse',
 'Dress-kids',
 'Body',
 'Hoodie',
 'Skip',
 'Undershirt-kids',
 'Outwear-kids',
 'Blazer',
 'Longsleeve-kids',
 'Shoes-kids',
 'Polo-kids',
 'Shorts-kids',
 'Skirt-kids',
 'Shirt-kids',
 'Skip-kids',
 'Blazer-kids',
 'Hoodie-kids']

In [24]:

path = "images_compressed/"
for label in sorted(unique_labels):
    print(label)
        
    df_unique = df[df["label"] == label]
    #display(df_unique)
    
    image_list = df_unique["image"].to_list() #extract image names
    image_list = [image+".jpg" for image in image_list]  #add file extension to image names
    #print(image_list)
    
    #ensure that only labels with more than 10 images are copied
    if len(image_list) >= 10 and label not in ["Skip", "Skip-kids"]:
        print(f"{label}: {len(image_list)}")
        
        #create label path
        if not os.path.isdir(f"sorted_data/{label}"):
            os.makedirs(f"sorted_data/{label}")
            
        #copy images to newly created label pth
        for img in image_list:
            shutil.copy(f"{path}{img}", f"sorted_data/{label}/{img}")
            pass
    
    print()
    

Blazer
Blazer: 107

Blazer-kids

Blouse
Blouse: 23

Body

Body-kids
Body-kids: 68

Dress
Dress: 309

Dress-kids
Dress-kids: 48

Hat
Hat: 161

Hat-kids
Hat-kids: 10

Hoodie
Hoodie: 97

Hoodie-kids

Longsleeve
Longsleeve: 661

Longsleeve-kids
Longsleeve-kids: 38

Outwear
Outwear: 277

Outwear-kids
Outwear-kids: 35

Pants
Pants: 605

Pants-kids
Pants-kids: 87

Polo
Polo: 115

Polo-kids

Shirt
Shirt: 372

Shirt-kids

Shoes
Shoes: 371

Shoes-kids
Shoes-kids: 60

Shorts
Shorts: 284

Shorts-kids
Shorts-kids: 24

Skip

Skip-kids

Skirt
Skirt: 148

Skirt-kids

T-Shirt
T-Shirt: 984

T-Shirt-kids
T-Shirt-kids: 27

Top
Top: 38

Top-kids

Undershirt
Undershirt: 103

Undershirt-kids
Undershirt-kids: 15



### Prepare to split images to train and validation 

In [26]:
path = "sorted_data"

In [27]:
def get_section_images(section: str) -> list:  
    
    """
    Function to get all iamges in a specified label directory"""
    path_img_jpg = glob.glob(f"{path}/{section}/*?.jpg")
    path_img_webp = glob.glob(f"{path}/{section}/*.webp")
    path_img_jpeg = glob.glob(f"{path}/{section}/*.jpeg")  # Match jpeg files
    paths = path_img_jpg + path_img_jpeg + path_img_webp
    
    return paths

In [28]:
#get list of newly sorted labels
labels = sorted(os.listdir(path))
print(labels)

['Blazer', 'Blouse', 'Body-kids', 'Dress', 'Dress-kids', 'Hat', 'Hat-kids', 'Hoodie', 'Longsleeve', 'Longsleeve-kids', 'Outwear', 'Outwear-kids', 'Pants', 'Pants-kids', 'Polo', 'Shirt', 'Shoes', 'Shoes-kids', 'Shorts', 'Shorts-kids', 'Skirt', 'T-Shirt', 'T-Shirt-kids', 'Top', 'Undershirt', 'Undershirt-kids']


In [29]:
def images_train_val_split(path: str, test_images: int) -> (list, list): 
    """
    function to split images to train and validation"""
    test_list_images = []   #initialize empty list for storing random unique names for validation datset
    ite = 0

    #while loop to extract random unique names
    while len(test_list_images) < test_images:
        ran = random.randint(0, len(path)-1)  #generate random numbers from total length of unique names
        curr_name = path[ran]
        if len(test_list_images) == 0:  #save first name into list
            test_list_images.append(curr_name)
        elif len(test_list_images) != 0 and curr_name not in test_list_images:  #check if every other name does not exist before appending
            test_list_images.append(curr_name)

        ite += 1


    train_list_images = [i for i in path if i not in test_list_images]
    return train_list_images, test_list_images

### Create directories to store train and validation data if they do not exist 

In [30]:
main_dir = "Clothes_Classification_Data"
train_dir = "train"
val_dir = "test"
img_dir = "images"



if os.path.isdir(f"{main_dir}"):
    #os.rmdir(f"{main_dir}")
    shutil.rmtree(f"{main_dir}")


if not os.path.isdir(f"{main_dir}"):
    os.makedirs(f"{main_dir}")
    
#main train directory
if not os.path.isdir(f"{main_dir}/{train_dir}"):
    os.makedirs(f"{main_dir}/{train_dir}")

#main test directory   
if not os.path.isdir(f"{main_dir}/{val_dir}"):
    os.makedirs(f"{main_dir}/{val_dir}")
    
    

for label in labels:
    #train label directory
    if not os.path.isdir(f"{main_dir}/{train_dir}/{label}"):
        os.makedirs(f"{main_dir}/{train_dir}/{label}")

    #test test directory
    if not os.path.isdir(f"{main_dir}/{val_dir}/{label}"):
        os.makedirs(f"{main_dir}/{val_dir}/{label}")

In [33]:
#iterate through labels split images by label
for label in labels:

    label_images = get_section_images(label)   
    #print(label, len(label_images))
    
    training_images = int(len(label_images) * 0.85)
    test_images = len(label_images) - training_images
    print(f"{label}: {training_images} {test_images}")
    
    train_image_list, test_image_list = images_train_val_split(label_images, test_images)
    print(len(train_image_list), len(test_image_list))
    
    tr = 0
    for imgs in train_image_list:
        img_name = imgs.split("/")[-1]
        shutil.copy(f"{imgs}", f"{main_dir}/{train_dir}/{label}")
        tr += 1

    te = 0
    for imgs in test_image_list:
        img_name = imgs.split("/")[-1]
        shutil.copy(f"{imgs}", f"{main_dir}/{val_dir}/{label}")
        te += 1
    print(tr, te)
    print()

Blazer: 90 17
90 17
90 17

Blouse: 19 4
19 4
19 4

Body-kids: 57 11
57 11
57 11

Dress: 262 47
262 47
262 47

Dress-kids: 40 8
40 8
40 8

Hat: 136 25
136 25
136 25

Hat-kids: 8 2
8 2
8 2

Hoodie: 82 15
82 15
82 15

Longsleeve: 561 100
561 100
561 100

Longsleeve-kids: 32 6
32 6
32 6

Outwear: 235 42
235 42
235 42

Outwear-kids: 29 6
29 6
29 6

Pants: 514 91
514 91
514 91

Pants-kids: 73 14
73 14
73 14

Polo: 97 18
97 18
97 18

Shirt: 316 56
316 56
316 56

Shoes: 315 56
315 56
315 56

Shoes-kids: 51 9
51 9
51 9

Shorts: 241 43
241 43
241 43

Shorts-kids: 20 4
20 4
20 4

Skirt: 125 23
125 23
125 23

T-Shirt: 836 148
836 148
836 148

T-Shirt-kids: 22 5
22 5
22 5

Top: 32 6
32 6
32 6

Undershirt: 87 16
87 16
87 16

Undershirt-kids: 12 3
12 3
12 3

