In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

import PIL
from tqdm import tqdm
from PIL import Image
from keras.preprocessing.image import ImageDataGenerator

In [None]:
#If running on Colab, mount Google Drive so the project folder is reachable.
#The import is guarded so the notebook still runs top-to-bottom elsewhere.
try:
  from google.colab import drive
except ImportError:
  #Not on Colab: there is nothing to mount, point `root` at a local folder instead
  print('google.colab not available - skipping Google Drive mount')
else:
  drive.mount('/content/drive')

In [None]:
#Project locations - change `root` to wherever the data lives
root = '/content/drive/MyDrive/ML_Project'
#Labels file and the two image folders all sit directly under root
train_csv = f'{root}/train.csv'
train_images = f'{root}/train_images'
test_images = f'{root}/test_images'

In [None]:
#Load the training labels from the .csv and repair the species labelling errors
train_df = pd.read_csv(train_csv)
#Full path to every training image, built from the file name column
train_df['image_path'] = train_images + '/' + train_df['image']

#Substring to look for in the species label -> label to store for every matching row
species_fixes = {
    'beluga': 'beluga_whale',
    'globis': 'globis_whale',
    'dolpin': 'bottlenose_dolphin',
    'kiler': 'killer_whale',
}
for fragment, fixed_name in species_fixes.items():
    needs_fix = train_df['species'].str.contains(fragment)
    train_df.loc[needs_fix, 'species'] = fixed_name

#Report the label set after the corrections
print(f'List of Unique Species:\n {train_df.species.unique()}')

print(f'\n Number of individual species, updated: {train_df.species.nunique()} \n')

In [None]:
#Count how many images each individual ID has
ID = train_df['individual_id'].value_counts().to_frame()

#Flatten the counts into a two-column table: one row per ID, with its image count
ID_df = pd.DataFrame({
    'individual_id': ID.index,
    'image_freq': ID.iloc[:, 0].to_numpy().astype(int),
})

#ID's that have exactly 2 images
Two_df = ID_df.loc[ID_df['image_freq'].eq(2)]
print(len(Two_df))

#ID's that have exactly 1 image
One_df = ID_df.loc[ID_df['image_freq'].eq(1)]

In [None]:

#Do not truncate long cell values (the image paths) when frames are displayed
pd.set_option('display.max_colwidth', None)

#Rows of train_df whose ID is listed in Two_df
in_two_group = train_df['individual_id'].isin(Two_df['individual_id'])
#Each of these ID's appears twice; keep only the first row, since a single
#image per ID is augmented
low_freq_two_df = (
    train_df[in_two_group]
    .drop_duplicates(subset=['individual_id'])
    .reset_index(drop=True)
)
print(len(low_freq_two_df))

#Rows of train_df whose ID is listed in One_df
in_one_group = train_df['individual_id'].isin(One_df['individual_id'])
low_freq_one_df = train_df[in_one_group].reset_index(drop=True)
print(len(low_freq_one_df))


In [None]:
#Augment the images
def augImages(root, imgpaths, spec, indi_id, datagen, numImg, size=(80, 56)):
  """Create augmented copies of the given images and save them as .jpg files.

  Every source image is opened, converted to RGB, resized to `size` and fed
  to `datagen`. `numImg` augmented variants are drawn per source image and
  written to '<root>/final_generated_train_images/<individual_id><j>.jpg',
  with j running from 0 to numImg - 1.

  Parameters
  ----------
  root : str
      Folder under which 'final_generated_train_images' is created if missing.
  imgpaths, spec, indi_id : list or pandas Series
      Parallel collections holding, per source image, its file path, species
      label and individual ID. They are paired by position, so the index
      labels of a Series are ignored.
  datagen : object with a `flow(array, batch_size=...)` method
      Returns an iterator of augmented image batches (in this notebook a
      Keras ImageDataGenerator).
  numImg : int
      Number of augmented images to generate per source image.
  size : (width, height), optional
      Size each source image is resized to before augmentation.

  Returns
  -------
  pandas.DataFrame
      One row per saved image with columns 'individual_id', 'species' and
      'image_path'.
  """
  out_dir = root + '/final_generated_train_images'
  #Image.save does not create missing folders, so make sure the target exists
  os.makedirs(out_dir, exist_ok=True)

  records = []
  #Pair the inputs by position rather than by label, so a Series whose index
  #is not 0..n-1 (e.g. a filtered frame that was never reset) is read correctly
  samples = zip(imgpaths, spec, indi_id)
  for src_path, species, ident in tqdm(samples, total=len(imgpaths)):
    #The context manager releases the source file even if decoding fails
    with Image.open(src_path) as src:
      resized = src.convert("RGB").resize(tuple(size)) #width,height

    #Add a leading batch axis: datagen.flow works on a batch of images
    batch = np.expand_dims(np.asarray(resized), 0)
    augmented = datagen.flow(batch, batch_size=1)

    #Draw numImg random variants of this one source image
    for j in range(numImg):
      pixels = next(augmented)[0].astype('uint8')
      out_path = f'{out_dir}/{ident}{j}.jpg'
      #Save the augmented image to the output folder
      Image.fromarray(pixels).save(out_path)
      records.append([ident, species, out_path])

  #Dataframe of labels and path for all new images
  return pd.DataFrame(records, columns = ['individual_id', 'species', 'image_path'])


In [None]:

#Data Generator with parameter ranges
#Each entry bounds one random transform; fill_mode sets how pixels exposed
#by a shift/rotation are filled
augmentation_settings = {
    'rotation_range': 20,
    'zoom_range': 0.20,
    'brightness_range': [0.7,1.0],
    'channel_shift_range': 0.7,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'fill_mode': 'nearest',
}
augDatagen = ImageDataGenerator(**augmentation_settings)


In [None]:
#ID's that already have 2 images: generate 1 augmented image each
augImg_Two_df = augImages(
    root=root,
    imgpaths=low_freq_two_df['image_path'],
    spec=low_freq_two_df['species'],
    indi_id=low_freq_two_df['individual_id'],
    datagen=augDatagen,
    numImg=1,
)

#ID's that have a single image: generate 2 augmented images each
augImg_One_df = augImages(
    root=root,
    imgpaths=low_freq_one_df['image_path'],
    spec=low_freq_one_df['species'],
    indi_id=low_freq_one_df['individual_id'],
    datagen=augDatagen,
    numImg=2,
)

#A future consideration would be to make this code more robust, so the number of images being augmented can easily be changed.
#Probably create one dataframe and have the number of images to augment be a column variable

In [None]:
#Combine the dataframes
#DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
#stacks the rows and renumbers the index 0..n-1 in one step
augImg_df = pd.concat([augImg_Two_df, augImg_One_df], ignore_index=True)
print(augImg_df)

In [None]:
#Write the labels next to the generated images; the path is built from `root`
#so that changing the project folder also moves this file
augImg_df.to_csv(root + '/final_generated_train_images.csv')