In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns

from PIL import Image
from tqdm import tqdm
from keras.preprocessing.image import ImageDataGenerator
from matplotlib import rcParams


In [None]:
#If being run on Colab, mount Google Drive so the data folders are reachable.
#The import is guarded so the notebook still runs top-to-bottom outside Colab,
#where the google.colab package does not exist.
try:
  from google.colab import drive
except ImportError:
  print('google.colab not available; skipping Google Drive mount')
else:
  drive.mount('/content/drive')

In [None]:
#Project locations: edit `root` (and the three names below) to match where the data lives
root = '/content/drive/MyDrive/ML_Project'
train_csv, train_images, test_images = (
  f'{root}/{name}' for name in ('train.csv', 'train_images', 'test_images')
)

In [None]:
#Load the training labels from the .csv and give every row the full path of its image file
train_df = pd.read_csv(train_csv)
train_df['image_path'] = train_df['image'].radd(train_images + '/')

print(train_df.head())

#Headline counts: rows, distinct species labels, distinct individuals
print(
  f'Number of training images: {train_df.shape[0]}',
  f'Number of individual species: {train_df["species"].nunique()}',
  f'Number of individual IDs: {train_df["individual_id"].nunique()}',
  sep='\n',
)
    

In [None]:

#Some of the species labels have errors. Replace with the correct label.
#Each key is a text fragment that identifies the affected rows; every row whose
#species contains that fragment is rewritten to the mapped label. The fixes are
#applied in the order listed.
species_fixes = {
  'beluga': 'beluga_whale',
  'globis': 'globis_whale',
  'dolpin': 'bottlenose_dolphin',
  'kiler': 'killer_whale',
}
for fragment, corrected in species_fixes.items():
  #regex=False: the fragments are literal text, not patterns.
  #na=False: a missing species must give False, otherwise the mask holds NaN
  #and cannot be used for boolean indexing.
  matches = train_df['species'].str.contains(fragment, regex=False, na=False)
  train_df.loc[matches, 'species'] = corrected

print(f'List of Unique Species:\n {train_df.species.unique()}')

print(f'\n Number of individual species, updated: {train_df.species.nunique()} \n')

#Add a class column
#The column is created with object dtype up front: writing the string labels
#into an int64 column (what a plain `= 0` creates) is an incompatible-dtype
#assignment that pandas deprecates. 0 remains the placeholder for any species
#whose name contains neither keyword.
train_df['class'] = pd.Series(0, index=train_df.index, dtype=object)
#'dolphin' is applied last so it takes precedence when both keywords match.
for keyword in ('whale', 'dolphin'):
  has_keyword = train_df['species'].str.contains(keyword, regex=False, na=False)
  train_df.loc[has_keyword, 'class'] = keyword

print(train_df.head())

In [None]:
#Get the frequency of each individual_id and make a dataframe
#value_counts() already sorts by descending frequency, so head(20) below is the top 20.
id_counts = train_df['individual_id'].value_counts()
ID = pd.DataFrame(id_counts)

#Two flat columns (individual_id, image_freq) on a fresh 0..n-1 index
ID_df = id_counts.rename_axis('individual_id').reset_index(name = 'image_freq')

#Plot the 20 most frequent individual ID's
top_ids = ID_df.head(20)
fig, ax = plt.subplots(dpi = 1200)
ax.bar(x = top_ids['individual_id'], height = top_ids['image_freq'])
ax.set_title("Top 20 Individual IDs\n")
ax.set_ylabel("Number of Images", x = 0.2, fontsize = 12)
ax.set_xlabel("Individual ID",fontsize = 12)
#Rotate the existing tick labels instead of overwriting them with set_xticklabels,
#which needs the tick positions to be fixed explicitly first.
ax.tick_params(axis = 'x', labelrotation = 90)
#plt.savefig(root + '/ImageperID.png', bbox_inches = 'tight')

#Denominator for the shares below; max(..., 1) avoids a ZeroDivisionError on an empty frame
n_ids = max(len(ID_df), 1)

count = ID_df[ID_df['image_freq'] <= 3]

print(f'Number of IDs with 3 or fewer training images: {len(count)}')
#':.1%' renders the ratio as a real percentage, matching the label
print(f'Percentage with 3 or fewer training images: {len(count)/n_ids:.1%}')

count = ID_df[ID_df['image_freq'] <= 50]

print(f'Percentage with 50 or fewer training images: {len(count)/n_ids:.1%}')


In [None]:

#Keep a single row for every individual ID (the first occurrence)
onePerClass = train_df.drop_duplicates(subset = 'individual_id')

#Bar chart: how many distinct individuals each species has
fig, ax = plt.subplots(dpi = 1200)
sns.countplot(x = onePerClass['species'], ax = ax)
sns.despine()
ax.set_title("Number of Individual ID's per Species\n", fontsize = 14)
ax.set_xlabel("Species", fontsize = 12)
ax.set_ylabel("Number of individual ID's", x = 0.2, fontsize = 12)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
#plt.savefig(root + '/IDperSpecies.png', bbox_inches = 'tight')

#Same numbers as a table: one row per species with its count of individuals
ID_spec = onePerClass['species'].value_counts().to_frame()

ID_spec_df = pd.DataFrame({
  'species': ID_spec.index.to_numpy(),
  'freq': ID_spec.to_numpy().astype(int).ravel(),
})

print(ID_spec_df)


In [None]:

#Bar chart: how many training images each species has
fig, ax = plt.subplots(dpi = 1200)
sns.countplot(x = train_df['species'], ax = ax)
sns.despine()
ax.set_title("Number of Images per Species\n", fontsize = 14)
ax.set_xlabel("Species", fontsize = 12)
ax.set_ylabel("Number of images", x = 0.2, fontsize = 12)
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
#plt.savefig(root + '/ImageperSpecies.png', bbox_inches = 'tight')

#Same numbers as a table: one row per species with its image count
ID_image = train_df['species'].value_counts().to_frame()

ID_image_df = pd.DataFrame({
  'species': ID_image.index.to_numpy(),
  'freq': ID_image.to_numpy().astype(int).ravel(),
})

print(ID_image_df)

In [None]:
## Plot Image Size ## - require consistent image size to ensure there are no mismatched matrices

sample = 2000 #number of images to check
#Slice first, then copy: only the sampled rows are duplicated, not the whole frame
train_df_size = train_df.head(sample).copy()
height = []
width = []
aspect = []
for i in tqdm(train_df_size['image_path']):
  #The context manager closes the file even if reading the size raises
  with Image.open(i) as im:
    img_w, img_h = im.size  #PIL reports size as (width, height)
  height.append(img_h)
  width.append(img_w)
  #Aspect ratio here is height / width, rounded to 2 decimals
  aspect.append(round(img_h/img_w,2))

In [None]:

#Attach the measured dimensions to the sampled rows, then scatter width against height
train_df_size = train_df_size.assign(
  height = height,
  width = width,
  aspect_ratio = aspect,
)

#Default figure size for this and later figures
rcParams['figure.figsize'] = (12, 9)

#One point per sampled image, coloured by the whale/dolphin class column
plot = sns.scatterplot(x="width", y="height", hue="class", data=train_df_size)
plot.set_title("Image Size of the First 2000 Images", fontsize = 14)
plot.set_xlabel("Width (pixels)", fontsize = 12)
plot.set_ylabel("Height (pixels)", x = 0.2, fontsize = 12)


In [None]:
#Most common width, height and stored aspect ratio in the sample,
#plus the ratio implied by dividing the height mode by the width mode
w_mode = train_df_size['width'].mode()
h_mode = train_df_size['height'].mode()
aspect_ratio = train_df_size['aspect_ratio'].mode()
#mode() returns a Series; entry 0 is its first value
print(f"Height Mode: {h_mode[0]}")
print(f"Width Mode: {w_mode[0]}")
print(f"Aspect Ratio Mode: {aspect_ratio[0]}\nCalculated Aspect Ratio: {h_mode[0]/w_mode[0]}")

In [None]:
#Bar chart of how often each aspect ratio (rounded to one decimal) occurs in the sample

fig, ax = plt.subplots(dpi = 1200)
sns.countplot(x = train_df_size['aspect_ratio'].round(1), ax = ax)
sns.despine()
ax.set_title("Aspect Ratio Frequency of the First 2000 Images\n", fontsize = 14)
ax.set_xlabel("Aspect Ratio", fontsize = 12)
ax.set_ylabel("Count", x = 0.2, fontsize = 12)
#Keeping the return value in a name stops the notebook echoing the label list
labels = ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
#plt.savefig(root + '/ImageperSpecies.png', bbox_inches = 'tight')


In [None]:
#Print some random animals, as a treat
#Shows a 2 x 3 grid of randomly chosen training images, each titled with its
#individual ID and species.

n_rows, n_cols = 2, 3
fig = plt.figure(figsize = (20,20))

for i in range(n_rows):
  for j in range(n_cols):
    #Every panel must address the same n_rows x n_cols grid. Using a different
    #row count per pass puts the second row on top of the first.
    ax = fig.add_subplot(n_rows, n_cols, i*n_cols + j + 1)
    ax.axis('off')
    index = np.random.randint(len(train_df))
    record = train_df.iloc[index]  #look the row up once instead of three times
    #The context manager releases the file handle once the pixels are copied out
    with Image.open(record.image_path) as img:
      pixels = np.asarray(img.resize((250, 230)))
    ax.set_title(f'{record.individual_id}, {record.species}')
    ax.imshow(pixels)

#Spacing is a figure-wide setting, so it is applied once rather than per panel
fig.subplots_adjust(wspace=0.4, hspace=0.4)
