This piece of code is used to filter to generate the data used for training/testing/validation. The data is filtered by the following criteria:
1. We filter on the following material classes: "plastic", "metal", "wood/paper"
2. we only consider the singular images
3. We know that each item contains 12 of the same images so we only consider the first one and will use random transformations on the remaining 11 images

In [10]:
import numpy as np
import pandas as pd
import os
import shutil
import torchvision.transforms as transforms
import PIL

In [11]:
annotations = pd.read_csv('./annotations_text.csv')
annotations

Unnamed: 0.1,Unnamed: 0,No.,Object name,Material,Stained,Surface properties
0,0,1,Christmas bear,['cloth'],['pluriform'],['']
1,1,2,Lab-keys,"['metal', 'cloth']",[''],"['composite', 'shiny']"
2,2,3,Apricot,['fruit'],['uniform'],['']
3,3,4,Round candle,"['candle', 'metal']",[''],"['composite', 'shiny']"
4,4,5,Nut,['wood'],['uniform'],['ribbed']
...,...,...,...,...,...,...
995,995,996,lotion,"['plastic', 'paper']",['pluriform'],"['shiny', 'composite']"
996,996,997,cleansing milk,"['plastic', 'paper']",['pluriform'],"['shiny', 'composite']"
997,997,998,vogue deodorant,"['metal', 'plastic']",['pluriform'],"['shiny', 'composite']"
998,998,999,ALOI recording,"['metal', 'plastic']",['uniform'],"['shiny', 'composite']"


In [12]:
#convert material string to list
def convert_to_list(x):
    x = x.replace('[','')
    x = x.replace(']','')
    x = x.replace('\'','')
    x = x.replace('\'','')
    x = x.replace(' ','')
    x = x.split(',')
    return x

annotations['Material'] = annotations['Material'].apply(convert_to_list)

Transform the duplicate images

In [13]:
source_dir = "./aloi_red4_col/png4/"
target_dir = "./Transformed/"

In [26]:
#remove target directory if it exists
if os.path.exists(target_dir):
    shutil.rmtree(target_dir)

In [29]:
#make target directory
if not os.path.exists(target_dir):
    os.mkdir(target_dir)

In [30]:
#list of transformations
transform_list = [transforms.RandomHorizontalFlip(p=1), transforms.RandomVerticalFlip(p=1), 
transforms.RandomRotation(90), transforms.RandomRotation(180), 
transforms.RandomAffine(0, shear=10, scale=(0.8,1.2)), transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 2.0)),
transforms.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5)), transforms.RandomAdjustSharpness(sharpness_factor= 0.5), transforms.AugMix(),
transforms.RandomPerspective(distortion_scale=0.5, p=1, interpolation=3), transforms.RandomVerticalFlip(p=1)]
len(transform_list)
    
    



11

In [31]:
for folder in os.listdir(source_dir):
    os.mkdir(target_dir + folder)
    count = 0
    for file in os.listdir(source_dir + folder):
        if (count > 0):
            img = PIL.Image.open(source_dir + folder + "/" + file)
            transformed_img = transform_list[count-1](img)
            transformed_img.save(target_dir + folder + "/" + file)
        else:
            shutil.copy(source_dir + folder + "/" + file, target_dir + folder + "/" + file)
        count += 1

Filter the transformed data such that only:
1. singular material classes are considered
2. "plastic", "metal", "wood/paper"
3. Ignore the other material classes

Finally create label csv files

In [32]:
source_dir = "./Transformed/"
target_dir = "./DataFiltered/"

In [33]:
#delete target directory if it exists
if os.path.exists(target_dir):
    shutil.rmtree(target_dir)

In [34]:
if not os.path.exists(target_dir):
    os.mkdir(target_dir)

In [35]:
pd_label = pd.DataFrame(columns=['filename', 'item_type' ,'label'])

In [36]:
for folder in os.listdir(source_dir):
    item_number = int(folder)
    material = annotations.loc[annotations['No.'] == item_number, 'Material'].iloc[0]
    if (len(material) == 1):
        material = material[0]
        if (material == 'metal'):
            for file in os.listdir(source_dir + folder):
                shutil.copy(source_dir + folder + "/" + file, target_dir + file)
                pd_label = pd_label.append({'filename': file, 'item_type': int(folder), 'label': 'metal'}, ignore_index=True)
        elif (material == 'plastic'):
            for file in os.listdir(source_dir + folder):
                shutil.copy(source_dir + folder + "/" + file, target_dir + file)
                pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ignore_index=True)
        elif (material == 'wood' or material == 'paper'):
            for file in os.listdir(source_dir + folder):
                shutil.copy(source_dir + folder + "/" + file, target_dir + file)
                pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'wood,paper'}, ignore_index=True)

  pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ignore_index=True)
  pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ignore_index=True)
  pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ignore_index=True)
  pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ignore_index=True)
  pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ignore_index=True)
  pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ignore_index=True)
  pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ignore_index=True)
  pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ignore_index=True)
  pd_label = pd_label.append({'filename': file,'item_type': int(folder), 'label': 'plastic'}, ig

In [37]:
pd_label

Unnamed: 0,filename,item_type,label
0,100_i110.png,100,plastic
1,100_i120.png,100,plastic
2,100_i130.png,100,plastic
3,100_i140.png,100,plastic
4,100_i150.png,100,plastic
...,...,...,...
5959,994_i180.png,994,"wood,paper"
5960,994_i190.png,994,"wood,paper"
5961,994_i210.png,994,"wood,paper"
5962,994_i230.png,994,"wood,paper"


In [38]:
pd_label.to_csv('labels.csv', index=False)

Balancing the data:
We downsample the data based on minimum class frequency
we'll safe the data in a new folder called convnetdownsample


In [39]:
pd_label = pd.read_csv('labels.csv')
pd_label

Unnamed: 0,filename,item_type,label
0,100_i110.png,100,plastic
1,100_i120.png,100,plastic
2,100_i130.png,100,plastic
3,100_i140.png,100,plastic
4,100_i150.png,100,plastic
...,...,...,...
5959,994_i180.png,994,"wood,paper"
5960,994_i190.png,994,"wood,paper"
5961,994_i210.png,994,"wood,paper"
5962,994_i230.png,994,"wood,paper"


In [40]:
#frequency of each class
pd_label['label'].value_counts()

plastic       3480
wood,paper    1740
metal          744
Name: label, dtype: int64

In [41]:
#downsample the data based on the frequency of each class we'll use the minimum frequency as the threshold
min_freq = pd_label['label'].value_counts().min()
min_freq

744

In [42]:
#downsample the data
pd_label_downsampled = pd_label.groupby('label').apply(lambda x: x.sample(min_freq)).reset_index(drop=True)
pd_label_downsampled

Unnamed: 0,filename,item_type,label
0,859_i180.png,859,metal
1,328_i190.png,328,metal
2,376_i160.png,376,metal
3,848_i160.png,848,metal
4,789_i160.png,789,metal
...,...,...,...
2227,456_i110.png,456,"wood,paper"
2228,321_i180.png,321,"wood,paper"
2229,940_i190.png,940,"wood,paper"
2230,946_i180.png,946,"wood,paper"


In [43]:
pd_label_downsampled['label'].value_counts()

metal         744
plastic       744
wood,paper    744
Name: label, dtype: int64

In [44]:
#create new directory for the downsampled data
source_dir = "./DataFiltered/"
target_dir = "./convnetdownsample/DataFilteredDownsampled/"

#remove the directory if it already exists
if os.path.exists(target_dir):
    shutil.rmtree(target_dir)

#create the directory
os.mkdir(target_dir)

#copy the files to the new directory
for file in pd_label_downsampled['filename']:
    shutil.copy(source_dir + file, target_dir + file)


In [None]:
#save the downsampled labels
pd_label_downsampled.to_csv('./convnetdownsample/labels_downsampled.csv', index=False)