# 🐳Happywhale 🐬
Identify and group all images that contain the same individual through time.

# 1. Initial Setup
## Importing libraries

In [4]:
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import numpy as np
import sklearn
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, Layout
import IPython.display
from IPython.display import display, clear_output

# import cudf, cupy, cuml
# from cuml.neighbors import NearestNeightbors
# from cuml.manifold import TSNE, PCE

import os
import imageio
import gc
import random
import warnings
import functools
!pip install imagesize --quiet
import imagesize
from tqdm.notebook import tqdm

tqdm.pandas()
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as patches
from matplotlib import animation, rc
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
import PIL
import plotly
import cv2
from PIL import Image, ImageEnhance
import plotly.io as pio
print(pio.renderers)

%config Completer.use_jedi = False

print(f"TensorFlow version: {tf.__version__}")
print(f"TensorFlow addons version: {tfa.__version__}")
print(f"SkLearn version: {sklearn.__version__}")

In [5]:
# Directories holding the competition's train / test images.
# Keep the trailing slash: image paths are built further down by plain string
# concatenation (TRAIN_PATH + file name), not with os.path.join.
TRAIN_PATH = "../input/happy-whale-and-dolphin/train_images/"
TEST_PATH = "../input/happy-whale-and-dolphin/test_images/"

In [6]:
# Matplotlib and Seaborn configuration
sns.set_style("dark")
print("Notebbook Color Scheme:")
# `.colors` of the viridis colormap object; the plotting cells index into it
# (e.g. color_palette[0], color_palette[1], color_palette[-1]) to pick colours.
color_palette = sns.color_palette("viridis", as_cmap=True).colors

# Left as the last expression of the cell so the notebook shows it as the cell output.
sns.color_palette("viridis")

In [None]:
class color:
    """ANSI escape sequences used to style text printed by this notebook."""
    END = '\033[0m'     # reset: ends any styling started by the codes below
    BOLD = '\033[1m'    # bold text
    GREEN = '\033[92m'  # green foreground
    
class Config:
    """Run-wide settings plus optional Weights & Biases (wandb) experiment tracking.

    Parameters
    ----------
    project : str or None
        wandb project name, forwarded to ``wandb.init``.
    model : str or None
        Model name; used as prefix of the run id and to build ``MODEL_PATH``.
    disable_wandb : bool
        When True no wandb run is started and wandb is never imported.
    seed : int
        Seed applied to ``PYTHONHASHSEED``, ``random``, NumPy and TensorFlow,
        and stored in ``SEED``.
    """
    import os
    ROOT_LOGDIR = os.path.join(os.curdir, "logdir") # Logs dir

    def __init__(self, project=None, model=None,
                 disable_wandb=False, seed=42):
        import os
        import time
        import random

        self.DISABLE_WANDB = disable_wandb
        self.BATCH_SIZE = 32
        self.FOLD = 4
        self.EPOCHS = 30
        self.MODEL_PATH = f"{model}.h5"
        # Store the seed that is actually applied below (was hard-coded to 42,
        # silently ignoring the `seed` argument).
        self.SEED = seed

        # Attempt to be reproducible
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)

        # Run id: "<model>-run_<timestamp>" or just "run_<timestamp>" when no model name is given
        timestamp_suffix = time.strftime("run_%Y_%m_%d-%H_%M_%S")
        self.run_id = f"{model}-{timestamp_suffix}" if model else timestamp_suffix

        # Weights & Biases experiment tracking
        if not disable_wandb:
            # Imported lazily so the notebook also runs where wandb is not installed
            import wandb
            self.WANDB_RUN = wandb.init(project=project, name=self.run_id, config=vars(self),
                                        sync_tensorboard=True)

        # Log directory of the current run inside ROOT_LOGDIR; set after wandb.init so
        # the config sent to wandb is unchanged, and set even when wandb is disabled.
        self.TENSORBOARD_LOGDIR = os.path.join(Config.ROOT_LOGDIR, self.run_id)

        print(color.BOLD + 'Config created with run ID: ' + color.END + color.BOLD + color.GREEN + self.run_id + color.END)

    def log_artifact(self, artifact_name, type_, file_path):
        """Log a file (e.g. a preprocessing output) to wandb as an artifact.

        Does nothing when wandb tracking is disabled.
        """
        if self.DISABLE_WANDB:
            return
        # `wandb` is not a module-level name in this notebook, so import it here
        import wandb
        artifact = wandb.Artifact(artifact_name, type=type_)
        artifact.add_file(file_path)
        self.WANDB_RUN.log_artifact(artifact)

    def finish(self):
        """Finish this run/experiment; a no-op when no wandb run was started."""
        run = getattr(self, "WANDB_RUN", None)
        if self.DISABLE_WANDB or run is None:
            return
        run.finish()
        
        
# Notebook-wide configuration object (its SEED is read by show_image_species below).
# disable_wandb is left at its default (False), so creating it calls wandb.init();
# the empty model name makes the run id fall back to the bare "run_<timestamp>" form.
config = Config(project='Happywhale', model='')

### Helper Functions

In [7]:
def show_values_on_bars(axs, h_v="v", space=0.4):
    """Write the integer value of every bar next to it on a (seaborn) bar plot.

    Parameters
    ----------
    axs : a single Axes, or a numpy array of Axes (as returned by ``plt.subplots``)
    h_v : "v" if the bars are vertical (value = bar height),
          "h" if they are horizontal (value = bar width).
          Any other value leaves the plot untouched.
    space : horizontal gap, in data units, between the end of a horizontal bar
            and its label. Ignored for vertical bars.

    Bars whose extent is NaN/inf are skipped instead of raising on ``int(nan)``.
    """

    def _annotate(ax):
        for bar in ax.patches:
            if h_v == "v":
                extent = bar.get_height()
                x_pos = bar.get_x() + bar.get_width() / 2
                y_pos = bar.get_y() + extent
                align = "center"
            elif h_v == "h":
                extent = bar.get_width()
                x_pos = bar.get_x() + extent + float(space)
                y_pos = bar.get_y() + bar.get_height()
                align = "left"
            else:
                # Unknown orientation: nothing to draw
                return

            # A bar without a finite extent has no value to print
            if not np.isfinite(extent):
                continue
            ax.text(x_pos, y_pos, format(int(extent), ','), ha=align)

    if isinstance(axs, np.ndarray):
        for ax in axs.flat:
            _annotate(ax)
    else:
        _annotate(axs)
        

def show_image_species(species_name, sample_size):
    """
    Shows a sample of n random images from a certain species.
    ### Parameters:
        `species_name`: string containing the desired species name
        `sample_size`: number of random images to be plotted in a row;
                       capped at the number of images available for the species
    ### Raises:
        `ValueError`: if `train_df` holds no image of `species_name`
    """
    # Get Image Info
    species_rows = train_df[train_df['species'] == species_name]
    if species_rows.empty:
        raise ValueError(f"No images found for species '{species_name}'")

    # Never ask for more images than the species has (DataFrame.sample would raise)
    n_images = min(sample_size, len(species_rows))
    sample = species_rows.sample(n_images, random_state=config.SEED)

    # Plot; squeeze=False keeps `axs` a 2-D array even for a single image
    fig, axs = plt.subplots(1, n_images, figsize=(23, 4), squeeze=False)
    fig.suptitle(f"{species_name}", x=0, y=1, fontsize=15, weight='bold')

    for k, (ax, image_name, path) in enumerate(zip(axs[0], sample["image"], sample["path"])):
        ax.set_title(f"[{k+1}] {image_name}", fontsize=13, weight='bold')
        ax.imshow(plt.imread(path))
        ax.axis("off")
    fig.tight_layout()
    plt.show()

# 2. Data Preparation
## Metadata Cleaning
**Species column typos adjustment**
- `bottlenose_dolpin` -> `bottlenose_dolphin`
- `kiler_whale` -> `killer_whale`
- `beluga` -> `beluga_whale`
- `globis` & `pilot_whale` -> `short_finned_pilot_whale` (due to extreme similarities, [according to this discussion](https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/305909))

In [8]:
# Importing the training data
train_df = pd.read_csv("../input/happy-whale-and-dolphin/train.csv")

# Adjust typos in "species" column and merge the pilot-whale aliases.
# (The previous map sent "bottlenose_dolpin" to yet another misspelling,
#  "bottelenose_dolphin", so bottlenose dolphins stayed split in two groups.)
train_df["species"] = train_df["species"].replace({
    "bottlenose_dolpin": "bottlenose_dolphin",
    "kiler_whale": "killer_whale",
    "beluga": "beluga_whale",
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale"
})

# Create a "class" column: the last "_"-separated token of the species name
# (e.g. "humpback_whale" -> "whale", "dusky_dolphin" -> "dolphin")
train_df["class"] = train_df["species"].str.rsplit("_", n=1).str[-1]

# Add train path (TRAIN_PATH ends with "/", so plain concatenation is enough)
train_df["path"] = TRAIN_PATH + train_df["image"]

print(f"Test set Number of Samples: {len(os.listdir(TEST_PATH))}\n")
print(f"Train Data shape: {train_df.shape}")
print(f"Number of missing values: \n{train_df.isna().sum()}")
train_df.head()

# 3. Data Analysis
## 3.a Individual Analysis
- 

In [None]:
# Number of most frequent individuals included in the histogram (also shown in the title)
TOP_N_INDIVIDUALS = 2000

fig, ax = plt.subplots(figsize=(20,5))
individuals_count = train_df["individual_id"].value_counts()

# Histogram of "number of appearances" for the top individuals. The counts get an
# explicit column name because the column names produced by
# value_counts().reset_index() are not the same across pandas versions.
top_appearances = individuals_count.head(TOP_N_INDIVIDUALS).rename("appearances").to_frame()
sns.histplot(data=top_appearances,
             x="appearances",
             color=color_palette[1],
             ax=ax)
plt.title(f"Distribution of individual appearances in data set (Top {TOP_N_INDIVIDUALS})", size=15, weight='bold')
plt.xlabel("Individual Numbers")

# NOTE(review): the values below are derived from the figure size in pixels but are
# then passed to axvspan/annotate, which interpret them in data coordinates -- the
# placement is therefore an empirical fit to this figure, kept as in the original.
length = fig.get_size_inches()[0]*fig.dpi * 0.3      # only show 30 % of total length
width = fig.get_size_inches()[1]*fig.dpi * 2         # width for some reason is half of fig width
offset = 70
ax.axvspan(offset, length, color=color_palette[0], alpha=0.05)
ax.annotate("",(offset, width/2), (length, width/2), arrowprops={'arrowstyle':'<->'}, color=color_palette[1])
ax.annotate("~95% have less than 10 apparitions", (length/2, width/2 + 20))
plt.show()

In [None]:
# Horizontal bar chart of the 30 most frequent individuals, labelled "<id> <species>"
fig, ax = plt.subplots(figsize=(20, 10))

# Species of the first row recorded for every individual
first_species_by_id = (
    train_df.drop_duplicates("individual_id")
            .set_index("individual_id")["species"]
)

individuals_id_count = individuals_count.head(30).copy()
individuals_id_count.index = individuals_id_count.index.map(
    lambda ind_id: ind_id + " " + first_species_by_id[ind_id])

sns.barplot(x=individuals_id_count.values, y=individuals_id_count.index,
            palette="viridis", ax=ax)
show_values_on_bars(ax, h_v="h", space=0.4)
ax.set_title("Top 30 IDs with most appearces", size=15, weight='bold')
ax.set_ylabel("Individual ID", size=13)
ax.set_xlabel("Frequency", size=13)
ax.set_xticks([])

# Shade the band covering the first bars and caption it
ax.axhspan(-0.5, 10.5, color=color_palette[0], alpha=0.05)
ax.text(200, 5, 'Individuals with high\nnumber of apparitions',
        size=13, color=color_palette[1], weight='bold')
ax.yaxis.set_tick_params(labelsize=12)
plt.show()

## 3.b Species Analysis
- About 70% of the dataset are whales and about 30% are dolphins (whether or not individuals are counted uniquely).
- The species with the most **distinct individuals** are `dusky dolphin`, `humpback` and `blue whale`.
- The most commonly appearing species (counting **recurring occurrences of the same individual**) are `bottlenose dolphin`, `beluga` and `humpback whale`. This is because:
    - Many of the top *20 individuals* with the most appearances belong to the bottlenose dolphin and humpback whale species.
    - For the beluga whale there are 1000 unique individuals, but on average each appears fewer than 10 times within the dataset.

In [None]:
# Image counts: an individual photographed k times is counted k times
class_count = train_df["class"].value_counts()
species_count = train_df["species"].value_counts()

# Individual counts: every (individual, class/species) pair is counted once
ind_class_count = train_df.drop_duplicates(["individual_id", "class"])["class"].value_counts()
ind_species_count = train_df.drop_duplicates(["individual_id", "species"])["species"].value_counts()

# Fixed slice order for the pies, so each label is always attached to its own class
# (value_counts orders by frequency, which is not guaranteed to be whale-first).
CLASS_ORDER = ["whale", "dolphin"]
CLASS_LABELS = {"whale": "Whales", "dolphin": "Dolphins"}


def _plot_class_pie(ax, counts, title):
    """Draw the whale/dolphin pie chart for `counts` (a Series indexed by class) on `ax`."""
    ordered = counts.reindex(CLASS_ORDER, fill_value=0)
    ax.pie(
        ordered,
        startangle=240,
        explode=[0, 0.1],
        autopct='%1.1f%%',
        colors=[color_palette[0], color_palette[-1]],
        labels=[f"{CLASS_LABELS[name]}: {ordered[name]}" for name in CLASS_ORDER],
        wedgeprops={"alpha": 0.7}
    )
    ax.set_title(title, size=15, weight='bold')


def _plot_species_bars(ax, counts, title):
    """Draw a horizontal bar chart of `counts` (a Series indexed by species) on `ax`."""
    # Explicit x/y vectors, like the other bar plots of this notebook
    sns.barplot(x=counts.values, y=counts.index, ax=ax, palette="viridis")
    show_values_on_bars(ax, h_v="h", space=0.4)
    ax.set_title(title, size=15, weight='bold')
    ax.set_ylabel("Species", size=13)
    ax.set_xlabel("")
    ax.set_xticks([])
    ax.yaxis.set_tick_params(labelsize=12)


fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(25, 15))

# Top-left: share of whale / dolphin images
_plot_class_pie(ax1, class_count,
                'Relative % & frequency\nof Whales and Dolphins in dataset')

# Bottom-left: share of whale / dolphin individuals
_plot_class_pie(ax3, ind_class_count,
                'Relative Individual % & frequency\nof Whales and Dolphins in dataset')

# Top-right: images per species (includes recurring appearances of an individual)
_plot_species_bars(ax2, species_count, "Count of Species in dataset")

# Bottom-right: distinct individuals per species
_plot_species_bars(ax4, ind_species_count, "Unique (Individual) count of Species in dataset")

In [None]:
# Species distribution of the images belonging to the 30 most frequent individuals
top_ids = individuals_count.head(30).index
is_top_individual = train_df["individual_id"].isin(top_ids)
top_species = train_df.loc[is_top_individual, "species"].value_counts()

fig = plt.figure(figsize=(20, 7))
species_ax = fig.add_subplot(1, 1, 1)
sns.barplot(x=top_species.values, y=top_species.index, palette="viridis", orient='h',
            ax=species_ax)
show_values_on_bars(species_ax, h_v="h")
species_ax.set_title("Top 30 IDs's Species", size=15, weight='bold')
species_ax.set_ylabel("Species", size = 13)
species_ax.set_xlabel("Frequency", size = 13)
species_ax.yaxis.set_tick_params(labelsize=13)

# 4. Image Analysis
## 4.1 Whales and Dolphin specimen analysis

#### Things to Note:
- **image_size**: the image width and height vary a lot from one picture to another.
- **night view**: not all the pictures were taken during the day; some of them were taken at night.
- **multiple individuals**: some pictures have 2 or more subjects in them.
- **landscape**: some images show the subject in a close-up shot, while in others the landscape dominates.
- **additional noise**: some images have digital markings on them that could pollute the algorithm.

In [None]:
# One row of 4 random sample images for every species present in the training set
for species_name in train_df['species'].unique():
    show_image_species(species_name=species_name, sample_size=4)

In [None]:
# Number of training images of minke whale individual 37c7aba965a5
len(train_df[train_df['species'].eq('minke_whale') &
             train_df['individual_id'].eq('37c7aba965a5')])

## 4.2 Comparing species/Same Individuals
Copy (fork) this notebook and run it to use the interactive widgets.

#### Things to Note:
- **could there be duplicated images?**: there are pictures that were taken mere moments apart.
- **increased noise**: there are many cases where the same individual appears against very different backgrounds, from different angles, or showing different body parts (tail or fin).
- **lighting**: lighting is another pretty noisy aspect and it should be dealt with during the image augmentation phase.

In [None]:
def _individual_images(species, individual_id):
    """Return the `image`/`path` rows of `train_df` for one individual of one species."""
    selected = (train_df['species'] == species) & (train_df['individual_id'] == individual_id)
    return train_df.loc[selected, ['image', 'path']]


def _build_comparison_row():
    """Create the widgets of one comparison row.

    Returns (species dropdown, individual dropdown, image-count slider, output area).
    """
    species_dropdown = widgets.Dropdown(
        options=train_df['species'].unique(),
        value='humpback_whale',
        description='Species:',
        disabled=False,
    )
    ind_dropdown = widgets.Dropdown(
        options=train_df[train_df['species'] == species_dropdown.value]['individual_id'].unique(),
        value=None,
        description='Individual:',
        disabled=False,
    )
    num_slider = widgets.IntSlider(min=1, max=1, step=1, value=1)
    output = widgets.Output()
    return species_dropdown, ind_dropdown, num_slider, output


def _make_row_handlers(species_dropdown, ind_dropdown, num_slider, output):
    """Build the three event handlers of one comparison row.

    Every handler only touches the widgets passed in, so the two rows cannot
    interfere with each other. Returns
    (species-change handler, individual-change handler, image-count handler).
    """

    def _sync_slider_max():
        # Slider upper bound = number of images of the selected individual, but never
        # below the slider minimum (no individual selected -> 0 images -> TraitError).
        available = len(_individual_images(species_dropdown.value, ind_dropdown.value))
        num_slider.max = max(available, num_slider.min)

    def on_species_change(change):
        """Refresh the options of the row's 'Individual' dropdown for the new species."""
        ind_dropdown.options = train_df[train_df['species'] == change.new]['individual_id'].unique()
        _sync_slider_max()

    def on_individual_change(change):
        """Show the first image of the newly selected individual."""
        # The display(...) / clear_output(wait=True) pair around the handler body is
        # kept from the original implementation (re-displays the widget container).
        display(input_widgets)
        _sync_slider_max()
        with output:
            clear_output()
            images = _individual_images(species_dropdown.value, change.new)
            if not images.empty:  # nothing to draw while no individual is selected
                first_image = images.iloc[0]
                fig = plt.figure(figsize=(23, 4))
                plt.imshow(plt.imread(first_image['path']))
                plt.title(f"{first_image['image']}")
                plt.gca().axis('off')
                plt.show()
        clear_output(wait=True)

    def on_count_change(change):
        """Show the first `change.new` images of the selected individual side by side."""
        display(input_widgets)
        with output:
            clear_output()
            images = _individual_images(species_dropdown.value, ind_dropdown.value)\
                .head(change.new).reset_index(drop=True)
            if not images.empty:
                # squeeze=False: `axs` stays a 2-D array even when a single image is shown
                fig, axs = plt.subplots(1, len(images), figsize=(50, 10), squeeze=False)
                for i, row in images.iterrows():
                    ax = axs[0, i]
                    ax.set_title(f"[{i+1}] {row['image']}")
                    ax.imshow(plt.imread(row['path']))
                    ax.axis('off')
                fig.tight_layout()
                plt.show()
        clear_output(wait=True)

    return on_species_change, on_individual_change, on_count_change


# Comparison widget row 1
(first_species_dropdown, first_ind_dropdown,
 first_numOfimg, first_output) = _build_comparison_row()

# Comparison widget row 2
(second_species_dropdown, second_ind_dropdown,
 second_numOfimg, second_output) = _build_comparison_row()

comp1 = widgets.VBox([widgets.HBox([first_species_dropdown, first_ind_dropdown, first_numOfimg]),
                     first_output])
comp2 = widgets.VBox([widgets.HBox([second_species_dropdown, second_ind_dropdown, second_numOfimg]),
                     second_output])
input_widgets = widgets.VBox([comp1, comp2])
display(input_widgets)

# Event handlers for First set of widgets
(on_first_species_change,
 first_ind_dropdown_handler,
 first_numOfimg_handler) = _make_row_handlers(
    first_species_dropdown, first_ind_dropdown, first_numOfimg, first_output)

# Event handlers for Second set of widgets
# (the second individual handler used to update the slider of the FIRST row)
(on_second_species_change,
 second_ind_dropdown_handler,
 second_numOfimg_handler) = _make_row_handlers(
    second_species_dropdown, second_ind_dropdown, second_numOfimg, second_output)


first_species_dropdown.observe(on_first_species_change, names='value')
first_ind_dropdown.observe(first_ind_dropdown_handler, names='value')
first_numOfimg.observe(first_numOfimg_handler, names='value')

second_species_dropdown.observe(on_second_species_change, names='value')
second_ind_dropdown.observe(second_ind_dropdown_handler, names='value')
second_numOfimg.observe(second_numOfimg_handler, names='value')

## 4.3 Image sizes
> 

In [36]:
# Read the (width, height) pair of every training image.
# A plain list of tuples is collected first: building one pd.Series per row inside
# progress_apply adds a large per-row overhead without changing the result.
image_sizes = [imagesize.get(path)
               for path in tqdm(train_df['path'], total=len(train_df))]
train_df[['width', 'height']] = np.asarray(image_sizes)

# Pixel count of each image
train_df["dimension"] = train_df['width'] * train_df['height']
train_df.head(5)

#### Things to note:
- 