# 🐳Happywhale 🐬
Identify and group all images that contain the same individual through time.

# 1. Initial Setup
## Importing libraries

In [134]:
import tensorflow as tf
import tensorflow_addons as tfa
import pandas as pd
import numpy as np
import sklearn

# import cudf, cupy, cuml
# from cuml.neighbors import NearestNeighbors
# from cuml.manifold import TSNE
# from cuml.decomposition import PCA

import os
import imageio
import gc
import random
import warnings
from tqdm.notebook import tqdm

tqdm.pandas()
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.colors import ListedColormap
import matplotlib.patches as patches
from matplotlib import animation, rc
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
import PIL
import plotly
import cv2
from PIL import Image, ImageEnhance
import plotly.io as pio
print(pio.renderers)

%config Completer.use_jedi = False

print(f"TensorFlow version: {tf.__version__}")
print(f"TensorFlow addons version: {tfa.__version__}")
print(f"SkLearn version: {sklearn.__version__}")

In [2]:
# Directories holding the competition's train and test images.
# Both keep their trailing slash because file names are appended by plain
# string concatenation further down the notebook.
TRAIN_PATH, TEST_PATH = (
    "../input/happy-whale-and-dolphin/train_images/",
    "../input/happy-whale-and-dolphin/test_images/",
)

In [3]:
# Matplotlib and Seaborn configuration
sns.set_style("dark")

# Color table of the viridis colormap. Later cells index into it
# (color_palette[0], color_palette[1], color_palette[-1]) to pick the
# individual colors used for spans, annotations and pie wedges.
color_palette = sns.color_palette("viridis", as_cmap=True).colors

print("Notebook Color Scheme:")
# Last expression of the cell, so Jupyter displays the palette itself.
sns.color_palette("viridis")

In [None]:
class color:
    """Escape sequences used to style text printed by this notebook."""

    # Switches the following text to green.
    GREEN = '\033[92m'
    # Switches the following text to bold.
    BOLD = '\033[1m'
    # Ends the styling started by the sequences above.
    END = '\033[0m'
    
class Config:
    """Run-wide hyperparameters, seeding and optional Weights & Biases tracking.

    Creating an instance:
      * stores the training hyperparameters as upper-case attributes,
      * seeds Python's ``random``, NumPy and TensorFlow with ``seed``,
      * builds a timestamped ``run_id`` (prefixed with ``model`` when given),
      * starts a wandb run unless ``disable_wandb`` is true.

    Parameters
    ----------
    project : str, optional
        wandb project name, forwarded to ``wandb.init``.
    model : str, optional
        Model name; used as the ``run_id`` prefix and to build ``MODEL_PATH``.
    disable_wandb : bool, default False
        When true, wandb is never imported or called and ``WANDB_RUN`` is None.
    seed : int, default 42
        Seed applied to all random number generators and stored as ``SEED``.
    """

    # Imported at class scope so ROOT_LOGDIR can be built inside the class body.
    import os
    ROOT_LOGDIR = os.path.join(os.curdir, "logdir") # Logs dir

    def __init__(self, project=None, model=None,
                 disable_wandb=False, seed=42):
        import os
        import random
        import time

        self.DISABLE_WANDB = disable_wandb
        self.BATCH_SIZE = 32
        self.FOLD = 4
        self.EPOCHS = 30
        # NOTE(review): with model=None this becomes "None.h5" and with
        # model='' it becomes ".h5" -- confirm that is intended.
        self.MODEL_PATH = f"{model}.h5"
        # Store the seed that is actually applied below (previously a
        # hard-coded 42 that ignored the `seed` argument).
        self.SEED = seed

        # Attempt to be reproducible
        os.environ['PYTHONHASHSEED'] = str(seed)
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)

        # Run ID: timestamp, prefixed with the model name when one is given.
        timestamp = time.strftime("run_%Y_%m_%d-%H_%M_%S")
        self.run_id = f"{model}-{timestamp}" if model else timestamp

        # Weights & Biases experiment tracking. wandb is imported lazily so the
        # notebook also runs where it is not installed (disable_wandb=True).
        if disable_wandb:
            self.WANDB_RUN = None
        else:
            import wandb
            self.WANDB_RUN = wandb.init(project=project, name=self.run_id, config=vars(self),
                                        sync_tensorboard=True)

        # Per-run log directory inside ROOT_LOGDIR; set whether or not wandb is
        # enabled so the attribute always exists.
        self.TENSORBOARD_LOGDIR = os.path.join(Config.ROOT_LOGDIR, self.run_id)

        print(color.BOLD + 'Config created with run ID: ' + color.END + color.BOLD + color.GREEN + self.run_id + color.END)

    def log_artifact(self, artifact_name, type_, file_path):
        """Log a file (e.g. a preprocessing output) to wandb as an artifact.

        Does nothing when wandb is disabled.
        """
        if self.DISABLE_WANDB:
            return
        # Local import: wandb is not imported at notebook level.
        import wandb
        artifact = wandb.Artifact(artifact_name, type=type_)
        artifact.add_file(file_path)
        self.WANDB_RUN.log_artifact(artifact)

    def finish(self):
        """Finish this run/experiment; a no-op when no wandb run was started."""
        run = getattr(self, "WANDB_RUN", None)
        if self.DISABLE_WANDB or run is None:
            return
        run.finish()
        
        
# Notebook-wide configuration object; later cells read config.SEED from it.
# NOTE(review): disable_wandb is left at its default (False), so creating the
# config starts a wandb run. An empty `model` means the run ID consists of the
# timestamp only.
config = Config(project='Happywhale', model='')

### Helper Functions

In [4]:
def show_values_on_bars(axs, h_v="v", space=0.4):
    """Write each bar's value at the end of the bar of a (seaborn) bar plot.

    Parameters
    ----------
    axs : Axes or numpy.ndarray of Axes
        A single axes object, or an array of axes (each one is annotated).
    h_v : {"v", "h"}, default "v"
        Whether the bars are vertical ("v", value = bar height) or
        horizontal ("h", value = bar width). Any other value annotates nothing.
    space : float, default 0.4
        Gap, in data units, between the end of a horizontal bar and its label.
        Only used when ``h_v == "h"``.

    Values are truncated to ``int`` and printed with a thousands separator.
    Bars whose value is NaN or infinite are skipped, because ``int()`` cannot
    convert such values and would abort the whole plot.
    """

    def _show_on_single_plot(ax):
        for p in ax.patches:
            if h_v == "v":
                extent = p.get_height()
                if not np.isfinite(extent):
                    continue
                _x = p.get_x() + p.get_width() / 2
                _y = p.get_y() + extent
                ax.text(_x, _y, format(int(extent), ','), ha="center")
            elif h_v == "h":
                extent = p.get_width()
                if not np.isfinite(extent):
                    continue
                _x = p.get_x() + extent + float(space)
                _y = p.get_y() + p.get_height()
                ax.text(_x, _y, format(int(extent), ','), ha="left")

    if isinstance(axs, np.ndarray):
        # Grid of subplots: annotate every axes in the array.
        for _, ax in np.ndenumerate(axs):
            _show_on_single_plot(ax)
    else:
        _show_on_single_plot(axs)

# 2. Data Preparation
## Metadata Cleaning
**Species column typos adjustment**
- `bottlenose_dolpin` -> `bottlenose_dolphin`
- `kiler_whale` -> `killer_whale`
- `beluga` -> `beluga_whale`
- `globis` & `pilot_whale` -> `short_finned_pilot_whale` (due to extreme similarities, [according to this discussion](https://www.kaggle.com/c/happy-whale-and-dolphin/discussion/305909))

In [59]:
# Importing the training data
train_df = pd.read_csv("../input/happy-whale-and-dolphin/train.csv")

# Adjust typos in "species" column.
# Every correction must map onto an existing, correctly spelled species name,
# otherwise the misspelled rows end up as yet another separate species.
train_df["species"] = train_df["species"].replace({
    "bottlenose_dolpin": "bottlenose_dolphin",
    # Not listed in the notes above; kept in case this spelling occurs too.
    "bottlenode_dolpin": "bottlenose_dolphin",
    "kiler_whale": "killer_whale",
    "beluga": "beluga_whale",
    "globis": "short_finned_pilot_whale",
    "pilot_whale": "short_finned_pilot_whale"
})

# Create a "class" column: the last token of the species name
# (e.g. "humpback_whale" -> "whale").
train_df["class"] = train_df["species"].str.split("_").str[-1]

# The analysis cells below assume exactly these two classes.
assert set(train_df["class"].dropna().unique()) <= {"whale", "dolphin"}, \
    "unexpected value in the 'class' column - check the species corrections"

# Add train path
train_df["path"] = TRAIN_PATH + train_df["image"]

print(f"Test set Number of Samples: {len(os.listdir(TEST_PATH))}\n")
print(f"Train Data shape: {train_df.shape}")
print(f"Number of missing values: \n{train_df.isna().sum()}")
train_df.head()

# 3. Data Analysis
## 3.a Individual Analysis
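- Most individuals appear only a handful of times (~95% have fewer than 10 images), while a small group of individuals accounts for a very high number of appearances.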

In [186]:
# Number of most frequent individuals included in the histogram. The title is
# built from this value so it cannot drift away from the plotted data.
TOP_N_HIST_INDIVIDUALS = 2000

fig, ax = plt.subplots(figsize=(20,5))

# Images per individual, most frequent first.
individuals_count = train_df["individual_id"].value_counts()

# Name the count column explicitly: the column names produced by
# value_counts().reset_index() differ between pandas versions.
top_appearances = (
    individuals_count
    .head(TOP_N_HIST_INDIVIDUALS)
    .reset_index(name="appearances")
)
sns.histplot(data=top_appearances,
             x="appearances",
             color=color_palette[1],
             ax=ax)
plt.title(f"Distribution of individual appearances in data set (Top {TOP_N_HIST_INDIVIDUALS:,})",
          size=15, weight='bold')
plt.xlabel("Individual Numbers")

# NOTE(review): the span/arrow positions below are derived from the figure's
# pixel size but are then used as data coordinates; this looks like a manual
# layout tweak - confirm visually after changing figsize or dpi.
length = fig.get_size_inches()[0]*fig.dpi * 0.3      # only show 30 % of total length
width = fig.get_size_inches()[1]*fig.dpi * 2         # width for some reason is half of fig width
offset = 70
ax.axvspan(offset, length, color=color_palette[0], alpha=0.05)
ax.annotate("",(offset, width/2), (length, width/2), arrowprops={'arrowstyle':'<->'}, color=color_palette[1])
ax.annotate("~95% have less than 10 apparitions", (length/2, width/2 + 20))
plt.show()

In [187]:
# Number of most frequent individuals shown in the bar chart.
TOP_N_BAR_INDIVIDUALS = 30

# First species recorded for every individual: one pass over the frame
# instead of filtering the whole frame once per individual.
species_by_individual = (
    train_df
    .drop_duplicates("individual_id")
    .set_index("individual_id")["species"]
)

# Recomputed from train_df and stored in a new frame (instead of relabelling
# the index of `individuals_count` in place) so the cell can be re-run safely.
top_counts = train_df["individual_id"].value_counts().head(TOP_N_BAR_INDIVIDUALS)
top_individuals = pd.DataFrame({
    "label": [f"{ind_id} {species_by_individual[ind_id]}" for ind_id in top_counts.index],
    "appearances": top_counts.to_numpy(),
})

fig, ax = plt.subplots(figsize=(20,10))
sns.barplot(data=top_individuals, x="appearances", y="label", palette="viridis", ax=ax)
show_values_on_bars(ax, h_v="h", space=0.4)
ax.set_title(f"Top {TOP_N_BAR_INDIVIDUALS} IDs with most appearances", size=15, weight='bold')
ax.set_ylabel("Individual ID", size=13)
ax.set_xlabel("Frequency", size=13)
ax.set_xticks([])

# Shade the first eleven bars (y positions -0.5 .. 10.5).
ax.axhspan(-0.5, 10.5, color=color_palette[0], alpha=0.05)
ax.text(200, 5, 'Individuals with high\nnumber of apparitions',
        size=13, color=color_palette[1], weight='bold')
# Uses this cell's own axes; `ax2` is only created by a later cell.
ax.yaxis.set_tick_params(labelsize=12)
plt.show()

## 3.b Species Analysis
- The dataset is roughly 70% whales and 30% dolphins, whether or not repeated individuals are counted.
- The species with the most **distinct individuals** are `dusky dolphin`, `humpback` and `blue whale`.
- The most commonly appearing species (counting **recurring occurrences of the same individual**) are `bottlenose dolphin`, `beluga` and `humpback whale`. This is because:
    - Many of the *top 20 individuals* with the most appearances are bottlenose dolphins and humpback whales.
    - For the beluga whale there are 800 unique individuals, but on average each appears fewer than 10 times within the dataset.

In [133]:
# Image counts (recurring individuals included) and distinct-individual counts,
# per class and per species.
class_count = train_df["class"].value_counts()
ind_class_count = train_df.groupby(["individual_id", "class"]).size()\
    .reset_index()["class"].value_counts()

species_count = train_df["species"].value_counts()
ind_species_count = train_df.groupby(["individual_id", "species"]).size()\
    .reset_index()["species"].value_counts()

fig, [[ax1, ax2], [ax3, ax4]] = plt.subplots(2, 2, figsize=(25, 15))

# Left column: pie charts of whales vs. dolphins.
# value_counts() orders by frequency, so the wedges are put into a fixed
# whale/dolphin order before the position-based labels, colors and explode
# offsets are attached to them.
CLASS_ORDER = ["whale", "dolphin"]
pie_specs = [
    (ax1, class_count,
     'Relative % & frequency\nof Whales and Dolphins in dataset'),
    (ax3, ind_class_count,
     'Relative Individual % & frequency\nof Whales and Dolphins in dataset'),
]
for pie_ax, counts, pie_title in pie_specs:
    ordered = counts.reindex(CLASS_ORDER, fill_value=0)
    pie_ax.pie(
        ordered,
        startangle=240,
        explode=[0, 0.1],
        autopct='%1.1f%%',
        colors=[color_palette[0], color_palette[-1]],
        labels=["Whales: " + str(ordered["whale"]),
                "Dolphins: " + str(ordered["dolphin"])],
        wedgeprops={"alpha": 0.7}
    )
    pie_ax.set_title(pie_title, size=15, weight='bold')

# Right column: horizontal bar charts per species.
# Top: all images (recurring individuals included); bottom: distinct individuals.
bar_specs = [
    (ax2, species_count, "Count of Species in dataset"),
    (ax4, ind_species_count, "Unique (Individual) count of Species in dataset"),
]
for bar_ax, counts, bar_title in bar_specs:
    # Pass the values explicitly as x/y vectors; mixing a positional Series
    # with x="species" is interpreted differently across seaborn versions.
    sns.barplot(x=counts.values, y=counts.index, ax=bar_ax, palette="viridis")
    show_values_on_bars(bar_ax, h_v="h", space=0.4)
    bar_ax.set_title(bar_title, size=15, weight='bold')
    bar_ax.set_ylabel("Species", size=13)
    bar_ax.set_xlabel("")
    bar_ax.set_xticks([])
    bar_ax.yaxis.set_tick_params(labelsize=12)

plt.show()

In [64]:
whales = train_df[train_df["class"] == 'whale']["species"].unique()

# One sample image per whale species. The number of rows follows the number
# of species, so the grid neither overruns `whales` nor drops species when
# the species list changes.
WHALE_GRID_COLS = 4
whale_grid_rows = max(1, -(-len(whales) // WHALE_GRID_COLS))  # ceiling division
fig, axes = plt.subplots(whale_grid_rows, WHALE_GRID_COLS,
                         figsize=(20, 3.75 * whale_grid_rows), squeeze=False)

for n, cell_ax in enumerate(axes.flat):
    if n >= len(whales):
        # Unused cell in the last row: leave it blank.
        cell_ax.axis("off")
        continue
    # Fixed random_state, so the same image is picked on every run.
    sample_path = train_df[ train_df['species'] == whales[n] ]\
        .sample(1, random_state=config.SEED)['path'].values[0]
    cell_ax.imshow(mpimg.imread(sample_path))
    cell_ax.set_title(f"{whales[n]}")
    cell_ax.set_xticks([])
    cell_ax.set_yticks([])
plt.tight_layout()

In [56]:
dolphins = train_df[train_df["class"] == 'dolphin']["species"].unique()

# One sample image per dolphin species. The number of rows follows the number
# of species, so the grid neither overruns `dolphins` nor drops species when
# the species list changes.
DOLPHIN_GRID_COLS = 5
dolphin_grid_rows = max(1, -(-len(dolphins) // DOLPHIN_GRID_COLS))  # ceiling division
fig, axes = plt.subplots(dolphin_grid_rows, DOLPHIN_GRID_COLS,
                         figsize=(20, 5 * dolphin_grid_rows), squeeze=False)

for n, cell_ax in enumerate(axes.flat):
    if n >= len(dolphins):
        # Unused cell in the last row: leave it blank.
        cell_ax.axis("off")
        continue
    # Fixed random_state, so the same image is picked on every run.
    sample_path = train_df[ train_df['species'] == dolphins[n] ]\
        .sample(1, random_state=config.SEED)['path'].values[0]
    cell_ax.imshow(mpimg.imread(sample_path))
    cell_ax.set_title(f"{dolphins[n]}")
    cell_ax.set_xticks([])
    cell_ax.set_yticks([])
plt.tight_layout()