In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import os 
from os import listdir
from random import sample
import pandas as pd
import random
# Seed the global random number generators used by this notebook.
# (A bare `np.random.RandomState(2)` call only builds a throwaway generator
# object and seeds nothing, so it is replaced by seeding Python's `random`
# module, which is imported above and was previously left unseeded.)
np.random.seed(2)
random.seed(2)
import sklearn
import cv2
from skimage.filters import threshold_otsu
from sklearn.utils import resample
from google.colab.patches import cv2_imshow

# A Note on File Paths

The code below was written in Google Colab because of the time required to model the data. The notebook was run with Google Drive mounted, so the file paths in the code reflect that location.

As long as files and folders are named the same way, e.g. the original images are saved in a folder called 'raw_dataset', the code should run without issues.

If any guidance is required, users are encouraged to reach out via Github.

In [None]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


Set up path to image dataset, create list of file names.

In [None]:
# Folder on the mounted Drive that holds the original images.
data_path = '/'.join(['/content/drive/MyDrive', 'clothing_class', 'raw_dataset'])

In [None]:
# Names of the raw image files.
# Only regular files are kept (a stray sub-folder would make cv2.imread return
# None in the segmentation loops) and the names are sorted, because os.listdir
# returns entries in arbitrary, filesystem-dependent order.
data_files = sorted(
    name for name in os.listdir(data_path)
    if os.path.isfile(os.path.join(data_path, name))
)

## Write Image Folder to CSV

The image name and path are now written to a CSV file that is saved in the project folder.

Code guidance received from:

https://gist.github.com/GermanCM/03754e11ac7e9a6343754ff389eb47f0

https://stackoverflow.com/questions/58625312/printing-file-name-and-file-path-to-a-csv

In [None]:
# Inventory every file under data_path as one (file_name, file_path) row.
# Directories and files are visited in sorted order so the row order -- and
# therefore the seeded sampling/shuffling done later -- is the same on every run.
image_records = []
for root, dirs, files in os.walk(data_path):
    dirs.sort()  # in-place sort controls the order in which os.walk descends
    for file_name in sorted(files):
        image_records.append({'file_name': file_name,
                              'file_path': os.path.join(root, file_name)})

# Let pandas write the CSV so names containing commas or quotes are escaped
# correctly; the explicit columns keep the header even when no file is found.
pd.DataFrame(image_records, columns=['file_name', 'file_path']).to_csv(
    '/content/drive/MyDrive/clothing_class/dataset.csv', index=False)

In [None]:
# Read the file inventory back in as a DataFrame.
dataset_csv = '/content/drive/MyDrive/clothing_class/dataset.csv'
clothes_df = pd.read_csv(dataset_csv)

In [None]:
# Preview the first rows of the inventory.
clothes_df.head()

Unnamed: 0,file_name,file_path
0,vertical_top_1012.jpg,/content/drive/MyDrive/Capstone/raw_dataset/ve...
1,horiz_tshirt_1020.jpg,/content/drive/MyDrive/Capstone/raw_dataset/ho...
2,horiz_sweater_1001.jpg,/content/drive/MyDrive/Capstone/raw_dataset/ho...
3,horiz_top_1000.jpg,/content/drive/MyDrive/Capstone/raw_dataset/ho...
4,horiz_top_1003.jpg,/content/drive/MyDrive/Capstone/raw_dataset/ho...


The naming convention of the files, when saved into the raw dataset, included the assigned pattern label, an underscore, and the assigned garment label.

In [None]:
# File names follow "<pattern>_<garment>_...": split each name once and take
# the first two pieces as the pattern and garment labels.
name_parts = [name.split('_') for name in clothes_df['file_name']]
clothes_df['pattern_label'] = [parts[0] for parts in name_parts]
clothes_df['garment_label'] = [parts[1] for parts in name_parts]

Fixing typo in naming convention.

In [None]:
# Map the mislabelled pattern value onto its intended label.
label_fixes = {'Copy of animal': 'animal'}
clothes_df['pattern_label'] = clothes_df['pattern_label'].replace(label_fixes)

There is an imbalance in the dataset regarding pattern labeling, with solid patterned clothing comprising 56.7% of the images.

In [None]:
# Share of images per pattern label (normalize=True returns proportions, not counts).
clothes_df['pattern_label'].value_counts(normalize=True)

solid        0.567527
horiz        0.161798
polka        0.091426
vertical     0.040971
checkered    0.039264
chevron      0.039264
animal       0.036039
paisley      0.023710
Name: pattern_label, dtype: float64

In [None]:
# Share of images per garment label (normalize=True returns proportions, not counts).
clothes_df['garment_label'].value_counts(normalize=True)

dress      0.139226
sweater    0.136002
coat       0.134484
pants      0.129932
tshirt     0.116654
top        0.107360
jacket     0.087253
shorts     0.086495
skirt      0.062595
Name: garment_label, dtype: float64

## Resampling for Pattern Modeling

In [None]:
# Rows whose pattern label is "solid" (the majority class).
is_solid = clothes_df["pattern_label"] == "solid"
solids = clothes_df.loc[is_solid]

The number of solids in the resampled dataset will be reduced by a factor of 5.

In [None]:
# Downsample the dominant "solid" class to one fifth of its original size.
# Sampling is done WITHOUT replacement: with replace=True the same image can be
# drawn several times, which leaves duplicate rows in the resampled dataset
# (and fewer unique solids than intended).
solid_downsample = resample(
    solids,
    replace=False,
    n_samples=len(solids) // 5,
    random_state=2,
)

# All other pattern classes are kept in full.
non_solid = clothes_df[clothes_df["pattern_label"] != "solid"]

data_resampled = pd.concat([solid_downsample, non_solid])

The value counts of the resampled dataset are shown below. While an imbalance remains in the dataset, it has been reduced.

In [None]:
# Share of images per pattern label in the resampled dataset.
data_resampled['pattern_label'].value_counts(normalize=True)

horiz        0.296386
solid        0.207783
polka        0.167477
vertical     0.075052
chevron      0.071925
checkered    0.071925
animal       0.066018
paisley      0.043433
Name: pattern_label, dtype: float64

In [None]:
# Garment-label breakdown of the RESAMPLED dataset. This cell previously
# re-displayed clothes_df, repeating the pre-resampling figures shown earlier;
# re-run it to refresh the stored output.
data_resampled['garment_label'].value_counts(normalize=True)

dress      0.139226
sweater    0.136002
coat       0.134484
pants      0.129932
tshirt     0.116654
top        0.107360
jacket     0.087253
shorts     0.086495
skirt      0.062595
Name: garment_label, dtype: float64

## The datasets will now be shuffled and saved down.

In [None]:
# Shuffle every row (frac=1) with a fixed seed, then save without the index column.
# NOTE: clothes_df is reassigned, so re-running this cell shuffles the already
# shuffled frame again; the write also overwrites the dataset.csv created earlier.
clothes_df = clothes_df.sample(frac=1,random_state=2)
clothes_df.to_csv('/content/drive/MyDrive/clothing_class/dataset.csv',index=False)

In [None]:
# Shuffle the resampled rows (frac=1) with a fixed seed and save without the index column.
# NOTE: data_resampled is reassigned, so re-running this cell reshuffles it again.
data_resampled = data_resampled.sample(frac=1,random_state=2)
data_resampled.to_csv('/content/drive/MyDrive/clothing_class/data_resampled.csv',index=False)

## Creating New Image Datasets

Four new image datasets are created below, containing a modified version of each image in the dataset. Each uses a different technique of image segmentation, which will then be analyzed for impact on model performance.

OpenCV library was used to create the new images, coding guidance provided via the below link.

https://machinelearningknowledge.ai/image-segmentation-in-python-opencv/

# Image Segmentation - KMeans, 5 Clusters

In [None]:
# K-means colour quantisation (5 clusters) of every raw image.
kmeans5_dir = '/content/drive/MyDrive/clothing_class/segmented_images/cv2images_kmeans'
os.makedirs(kmeans5_dir, exist_ok=True)

# Loop-invariant k-means settings: stop after 100 iterations or once the
# centres move by less than 0.2; run 10 attempts and keep the best one.
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.2)
K = 5
attempts = 10
cv2.setRNGSeed(2)  # seed OpenCV's RNG so the k-means++ initialisation is repeatable

for file in data_files:
  img = cv2.imread(os.path.join(data_path, file))
  if img is None:
    # cv2.imread returns None (it does not raise) for unreadable / non-image files.
    print('Skipping unreadable file: {}'.format(file))
    continue
  # The image stays in OpenCV's native BGR order: cv2.imwrite expects BGR, so
  # converting to RGB before saving would swap red and blue in the output file.
  img = cv2.resize(img,(224,224))
  twoDimage = np.float32(img.reshape((-1,3)))  # one row per pixel, one column per channel
  ret,label,center = cv2.kmeans(twoDimage,K,None,criteria,attempts,cv2.KMEANS_PP_CENTERS)
  center = np.uint8(center)
  res = center[label.flatten()]  # replace every pixel by the colour of its cluster centre
  result_image = res.reshape((img.shape))
  # Write through an explicit path instead of os.chdir, which would change the
  # working directory for the rest of the notebook.
  if not cv2.imwrite(os.path.join(kmeans5_dir, file), result_image):
    print('Could not write segmented image: {}'.format(file))

K-Means segmented images saved to new datafile. 5 clusters were used for the segmentation.

# Image Segmentation - KMeans, 3 Clusters

In [None]:
# K-means colour quantisation (3 clusters) of every raw image.
kmeans3_dir = '/content/drive/MyDrive/clothing_class/segmented_images/cv2images_kmeans3'
os.makedirs(kmeans3_dir, exist_ok=True)

# Loop-invariant k-means settings: stop after 100 iterations or once the
# centres move by less than 0.2; run 10 attempts and keep the best one.
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.2)
K = 3
attempts = 10
cv2.setRNGSeed(2)  # seed OpenCV's RNG so the k-means++ initialisation is repeatable

for file in data_files:
  img = cv2.imread(os.path.join(data_path, file))
  if img is None:
    # cv2.imread returns None (it does not raise) for unreadable / non-image files.
    print('Skipping unreadable file: {}'.format(file))
    continue
  # The image stays in OpenCV's native BGR order: cv2.imwrite expects BGR, so
  # converting to RGB before saving would swap red and blue in the output file.
  img = cv2.resize(img,(224,224))
  twoDimage = np.float32(img.reshape((-1,3)))  # one row per pixel, one column per channel
  ret,label,center = cv2.kmeans(twoDimage,K,None,criteria,attempts,cv2.KMEANS_PP_CENTERS)
  center = np.uint8(center)
  res = center[label.flatten()]  # replace every pixel by the colour of its cluster centre
  result_image = res.reshape((img.shape))
  # Write through an explicit path instead of os.chdir, which would change the
  # working directory for the rest of the notebook.
  if not cv2.imwrite(os.path.join(kmeans3_dir, file), result_image):
    print('Could not write segmented image: {}'.format(file))

An additional K-Means image dataset was created, with 3 clusters used for the segmentation.

## Image Segmentation - Contour

In [None]:
# Contour-based segmentation: keep only the region inside the largest contour.
contour_dir = '/content/drive/MyDrive/clothing_class/segmented_images/cv2images_contour'
os.makedirs(contour_dir, exist_ok=True)

for file in data_files:
  img = cv2.imread(os.path.join(data_path, file))
  if img is None:
    # cv2.imread returns None (it does not raise) for unreadable / non-image files.
    print('Skipping unreadable file: {}'.format(file))
    continue
  gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
  gray = cv2.resize(gray,(224,224))
  # Resize the colour image as well: the contour is found on the 224x224
  # grayscale, so the mask must be built at, and applied to, that same size.
  # Masking the original-size image put the contour in the wrong place.
  img = cv2.resize(img,(224,224))

  # Binarise around the mean intensity, then trace the edges of the binary mask.
  _,thresh = cv2.threshold(gray, np.mean(gray), 255, cv2.THRESH_BINARY_INV)
  edges = cv2.dilate(cv2.Canny(thresh,0,255),None)
  # [-2] selects the contour list regardless of how many values findContours returns.
  found = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)[-2]
  mask = np.zeros(img.shape[0:2], np.uint8)
  if len(found) > 0:
    contours = sorted(found, key=cv2.contourArea)[-1]  # largest contour by area
    cv2.drawContours(mask, [contours],-1, 255, -1)      # thickness -1 fills the contour
  else:
    # No edges at all (e.g. a uniform image): keep the whole resized image so
    # every raw file still has a counterpart, instead of raising IndexError.
    print('No contour found, saving unsegmented image: {}'.format(file))
    mask[:] = 255
  dst = cv2.bitwise_and(img, img, mask=mask)
  # dst is still BGR, which is what cv2.imwrite expects; converting to RGB first
  # would swap red and blue in the saved file.
  segmented = dst
  if not cv2.imwrite(os.path.join(contour_dir, file), segmented):
    print('Could not write segmented image: {}'.format(file))

Images created via contouring saved to new datafile.

## Image Segmentation - Thresholding

In [None]:
def filter_image(image, mask):
  """Multiply the first three channels of `image` by `mask`.

  `mask` is broadcast over the channel axis, so with a boolean mask every
  pixel where it is False becomes 0 in all three channels. The channel order
  of `image` is preserved and the result always has exactly three channels.
  """
  per_pixel = np.asarray(mask)[..., np.newaxis]  # add a trailing channel axis
  return image[:, :, :3] * per_pixel

# Otsu-threshold segmentation: keep only the pixels darker than the Otsu threshold.
thresholding_dir = '/content/drive/MyDrive/clothing_class/segmented_images/cv2images_thresholding'
os.makedirs(thresholding_dir, exist_ok=True)

for file in data_files:
  img = cv2.imread(os.path.join(data_path, file))
  if img is None:
    # cv2.imread returns None (it does not raise) for unreadable / non-image files.
    print('Skipping unreadable file: {}'.format(file))
    continue
  img = cv2.resize(img,(224,224))
  # Convert straight from BGR to grayscale; going through RGB first and
  # resizing the already-224x224 grayscale again were redundant steps.
  img_gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
  thresh = threshold_otsu(img_gray)
  img_otsu = img_gray < thresh  # boolean mask: True where the pixel is darker than the threshold
  # img is still BGR, so the filtered result is written in the order cv2.imwrite expects.
  filtered = filter_image(img, img_otsu)
  # Write through an explicit path instead of os.chdir, which would change the
  # working directory for the rest of the notebook.
  if not cv2.imwrite(os.path.join(thresholding_dir, file), filtered):
    print('Could not write segmented image: {}'.format(file))

Images created via thresholding saved to new datafile.