In [1]:
import numpy as np
import pandas as pd
import os
import random
import shutil
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
%matplotlib inline

In [62]:
## Collect the names of the image files in a list for shuffling
## Make sure that the images are in the current working directory
## Only names that END with '.jpg' are kept, so a name that merely contains
## '.jpg' (e.g. 'photo.jpg.bak') is not picked up.
## The list is sorted because os.listdir returns entries in arbitrary order,
## which would otherwise make every later step depend on the filesystem.
files = sorted(
    entry for entry in os.listdir(os.getcwd())
    if entry.endswith('.jpg')
)

In [63]:
## Shuffle the images in place, reproducibly.
## Sorting first makes the result independent of the incoming order (and of
## how many times this cell has been run); the dedicated fixed-seed generator
## then always produces the same permutation, so the train/validation/test
## split below is the same on every run.
files.sort()
random.Random(42).shuffle(files)

In [65]:
## 80% / 10% / 10% split of the file list: training, validation, testing
n_files = len(files)
split_1 = int(0.8 * n_files)  # index where the training slice ends
split_2 = int(0.9 * n_files)  # index where the validation slice ends
train_filenames, val_filenames, test_filenames = (
    files[:split_1],
    files[split_1:split_2],
    files[split_2:],
)

In [66]:
## Make sure the train, validation and test folders exist
for folder in ('train', 'validation', 'test'):
    if os.path.exists(folder):
        continue  # already there, nothing to create
    os.makedirs(folder)

In [None]:
## Resize the images to 224x224 and save them into the respective folders
## Training set
## Images that cannot be opened, resized or saved are skipped on purpose
## (best effort), but every skip is recorded and reported instead of being
## silently swallowed. `except Exception` (rather than a bare `except`) lets
## KeyboardInterrupt / SystemExit through so the loop can still be stopped.
train_skipped = []
for file in train_filenames:
    try:
        ## The context manager closes the source file handle when done
        with Image.open(file) as image:
            new_image = image.resize((224, 224))
            new_image.save(os.path.join('train', file))
    except Exception as exc:
        train_skipped.append(file)
        print('Skipping', file, '->', repr(exc))

In [69]:
## Resize the images to 224x224 and save them into the respective folders
## Validation set
## Images that cannot be opened, resized or saved are skipped on purpose
## (best effort), but every skip is recorded and reported instead of being
## silently swallowed. `except Exception` (rather than a bare `except`) lets
## KeyboardInterrupt / SystemExit through so the loop can still be stopped.
## NOTE(review): no visible cell resizes test_filenames into 'test/' --
## confirm whether that step exists elsewhere.
val_skipped = []
for file in val_filenames:
    try:
        ## The context manager closes the source file handle when done
        with Image.open(file) as image:
            new_image = image.resize((224, 224))
            new_image.save(os.path.join('validation', file))
    except Exception as exc:
        val_skipped.append(file)
        print('Skipping', file, '->', repr(exc))

Create the metadata file (`.lst` file). Each line of the file should have the following tab-separated format:

<b>image_index, label, image_name/image_path</b>

Since this is a multilabel classification problem, each line carries one column per label:

<b>image_index, label_1, label_2,..label_n, image_name/image_path</b>

In [82]:
## Build the training metadata file 'train.lst', one tab-separated line per image:
##   row_id, label_0, label_1, label_2, label_3, image_name
## The class is inferred from the filename; a name that contains none of the
## keywords below falls into class 3.
class_keywords = ('cloudy', 'rain', 'shine')  # position in the tuple == class id
fallback_class = len(class_keywords)          # 3

## Sorted so the row order does not depend on os.listdir's arbitrary ordering
filenames = sorted(os.listdir('train'))
labels = [
    next((idx for idx, keyword in enumerate(class_keywords) if keyword in name),
         fallback_class)
    for name in filenames
]

train_df = pd.DataFrame({'labels': labels, 's3_path': filenames})
train_df = train_df.reset_index().rename(columns={'index': 'row_id'})

## dtype=int: pandas >= 2.0 returns boolean dummies by default, which would be
## written to the .lst file as True/False instead of 1/0.
one_hot = pd.get_dummies(train_df['labels'], prefix='label', dtype=int)
train_df = pd.concat([train_df, one_hot], axis=1).drop('labels', axis=1)

new_columns = ['row_id', 'label_0', 'label_1', 'label_2', 'label_3', 's3_path']
## fill_value=0: a class with no image in this folder gets no dummy column, and
## a plain reindex would write NaN (an empty field) for it instead of 0.
train_df = train_df.reindex(columns=new_columns, fill_value=0)

train_df.to_csv('train.lst', sep='\t', index=False, header=False)

In [84]:
## Build the validation metadata file 'validation.lst', one tab-separated line per image:
##   row_id, label_0, label_1, label_2, label_3, image_name
## The class is inferred from the filename; a name that contains none of the
## keywords below falls into class 3.
class_keywords = ('cloudy', 'rain', 'shine')  # position in the tuple == class id
fallback_class = len(class_keywords)          # 3

## Sorted so the row order does not depend on os.listdir's arbitrary ordering
filenames = sorted(os.listdir('validation'))
labels = [
    next((idx for idx, keyword in enumerate(class_keywords) if keyword in name),
         fallback_class)
    for name in filenames
]

validation_df = pd.DataFrame({'labels': labels, 's3_path': filenames})
validation_df = validation_df.reset_index().rename(columns={'index': 'row_id'})

## dtype=int: pandas >= 2.0 returns boolean dummies by default, which would be
## written to the .lst file as True/False instead of 1/0.
one_hot = pd.get_dummies(validation_df['labels'], prefix='label', dtype=int)
val_df = pd.concat([validation_df, one_hot], axis=1).drop('labels', axis=1)

new_columns = ['row_id', 'label_0', 'label_1', 'label_2', 'label_3', 's3_path']
## fill_value=0: a class with no image in this folder gets no dummy column, and
## a plain reindex would write NaN (an empty field) for it instead of 0.
val_df = val_df.reindex(columns=new_columns, fill_value=0)

val_df.to_csv('validation.lst', sep='\t', index=False, header=False)

Once the training and testing sets, along with the metadata files, are available, upload them to the S3 buckets.