In [1]:
import os
import cv2
import glob
import shutil
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mimg
import imgaug.augmenters as iaa
import imgaug as aug
from os import listdir, makedirs, getcwd, remove
from os.path import isfile, join, abspath, exists, isdir, expanduser
from PIL import Image
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
from collections import defaultdict, Counter

In [2]:
# Fix the seed of NumPy's global random number generator so runs are repeatable.
# NOTE(review): only NumPy is seeded here; imgaug may keep its own RNG state — confirm
# whether it needs to be seeded separately before augmenting.
seed=1234
np.random.seed(seed)
# Seaborn's current colour palette; single colours are picked from it by index when plotting.
color=sns.color_palette()
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the line_profiler extension (provides the %lprun magic).
%load_ext line_profiler
# Automatically reload imported modules so edits to local .py files are picked up live.
%load_ext autoreload 
%autoreload 2

In [3]:
# Data root and the train/test sub-directories, defined once for later cells
input_path = Path("../data")
train_dir = input_path.joinpath("train_data")
test_dir = input_path.joinpath("test_data")

In [4]:
# Read the training annotations into a DataFrame and report how many rows it has
train_df = pd.read_csv("../CSVs/train.csv")
nb_train_samples = train_df.shape[0]
print("Number of training samples: ", nb_train_samples)

Number of training samples:  31072


In [5]:
# Number of labels attached to each sample. `Target` holds the label ids as a
# space-separated string. Split on any run of whitespace (str.split() with no
# argument) so the count always agrees with the `labels` column, which is parsed
# the same way; splitting on a literal " " would count empty tokens whenever a
# value contains doubled, leading or trailing spaces.
train_df["nb_labels"] = train_df["Target"].apply(lambda target: len(target.split()))
print(f"Maximum number of labels attached to a single sample: {train_df['nb_labels'].max()}")
print(f"Minimum number of labels attached to a single sample: {train_df['nb_labels'].min()}")
# Distribution of the per-sample label count
print("All counts:")
print(train_df["nb_labels"].value_counts())

Maximum number of labels attached to a single sample: 5
Minimum number of labels attached to a single sample: 1
All counts:
1    15126
2    12485
3     3160
4      299
5        2
Name: nb_labels, dtype: int64


In [6]:
# Labelmap: integer label id -> label name.
# The names are listed in id order, so a name's position in the list is its id.
_label_names = [
    "Nucleoplasm",                    # 0
    "Nuclear membrane",               # 1
    "Nucleoli",                       # 2
    "Nucleoli fibrillar center",      # 3
    "Nuclear speckles",               # 4
    "Nuclear bodies",                 # 5
    "Endoplasmic reticulum",          # 6
    "Golgi apparatus",                # 7
    "Peroxisomes",                    # 8
    "Endosomes",                      # 9
    "Lysosomes",                      # 10
    "Intermediate filaments",         # 11
    "Actin filaments",                # 12
    "Focal adhesion sites",           # 13
    "Microtubules",                   # 14
    "Microtubule ends",               # 15
    "Cytokinetic bridge",             # 16
    "Mitotic spindle",                # 17
    "Microtubule organizing center",  # 18
    "Centrosome",                     # 19
    "Lipid droplets",                 # 20
    "Plasma membrane",                # 21
    "Cell junctions",                 # 22
    "Mitochondria",                   # 23
    "Aggresome",                      # 24
    "Cytosol",                        # 25
    "Cytoplasmic bodies",             # 26
    "Rods & rings",                   # 27
]
labels_dict = dict(enumerate(_label_names))

In [7]:
# Inverse of labels_dict (label name -> label id), kept around for later use
rev_labels_dict = {name: label_id for label_id, name in labels_dict.items()}
rev_labels_dict

{'Nucleoplasm': 0,
 'Nuclear membrane': 1,
 'Nucleoli': 2,
 'Nucleoli fibrillar center': 3,
 'Nuclear speckles': 4,
 'Nuclear bodies': 5,
 'Endoplasmic reticulum': 6,
 'Golgi apparatus': 7,
 'Peroxisomes': 8,
 'Endosomes': 9,
 'Lysosomes': 10,
 'Intermediate filaments': 11,
 'Actin filaments': 12,
 'Focal adhesion sites': 13,
 'Microtubules': 14,
 'Microtubule ends': 15,
 'Cytokinetic bridge': 16,
 'Mitotic spindle': 17,
 'Microtubule organizing center': 18,
 'Centrosome': 19,
 'Lipid droplets': 20,
 'Plasma membrane': 21,
 'Cell junctions': 22,
 'Mitochondria': 23,
 'Aggresome': 24,
 'Cytosol': 25,
 'Cytoplasmic bodies': 26,
 'Rods & rings': 27}

In [8]:
# Parse every whitespace-separated Target string into a list of integer label ids
train_df["labels"] = [
    [int(token) for token in target.split()]
    for target in train_df["Target"]
]
train_df.head()

Unnamed: 0,Id,Target,nb_labels,labels
0,00070df0-bbc3-11e8-b2bc-ac1f6b6435d0,16 0,2,"[16, 0]"
1,000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0,7 1 2 0,4,"[7, 1, 2, 0]"
2,000a9596-bbc4-11e8-b2bc-ac1f6b6435d0,5,1,[5]
3,000c99ba-bba4-11e8-b2b9-ac1f6b6435d0,1,1,[1]
4,001838f8-bbca-11e8-b2bc-ac1f6b6435d0,18,1,[18]


In [9]:
def count_labels(df_labels, plot=False, figsize=(20,10), title=None, xlabel=None, ylabel=None):
    """Count how often each label name occurs across a collection of samples.

    df_labels: iterable whose items are iterables of integer label ids
        (for example the `labels` column of the training frame). Every id
        must be a key of the module-level `labels_dict`.
    plot: when True, also draw a horizontal bar chart of the counts.
    figsize: (width, height) of the figure drawn when `plot` is True.
    title, xlabel, ylabel: optional texts for the chart; each one is only
        applied when it is truthy.

    Returns: a `collections.Counter` mapping label name -> number of
        occurrences, with keys in order of first appearance.

    Raises: KeyError if a label id is not present in `labels_dict`.
    """
    # Translate ids to names while counting; one increment per (sample, label) pair
    sample_labels_count = Counter(
        labels_dict[label_id]
        for sample_labels in df_labels
        for label_id in sample_labels
    )

    if plot:
        # Honour the caller-supplied figsize. It used to be ignored in favour of a
        # hard-coded (20, 15), so the parameter had no effect at all.
        plt.figure(figsize=figsize)
        sns.barplot(
            x=list(sample_labels_count.values()),
            y=list(sample_labels_count.keys()),
            color=color[3],
            orient='h',
        )
        if title:
            plt.title(title, fontsize=14)
        if xlabel:
            plt.xlabel(xlabel, fontsize=14)
        if ylabel:
            plt.ylabel(ylabel, fontsize=14)
        plt.show()

    return sample_labels_count

In [10]:
# Check the distribution for different label counts in each sample
has_single_label = train_df["nb_labels"] == 1
single_labels_df = train_df.loc[has_single_label, "labels"]
# Tally label occurrences among the samples that carry exactly one label
single_labels_count = count_labels(single_labels_df)

In [11]:
# Display the label-name -> count Counter built from the single-label samples
single_labels_count

Counter({'Nuclear bodies': 983,
         'Nuclear membrane': 271,
         'Microtubule organizing center': 314,
         'Nucleoplasm': 2414,
         'Golgi apparatus': 1163,
         'Mitochondria': 1653,
         'Plasma membrane': 1058,
         'Cytosol': 1470,
         'Intermediate filaments': 601,
         'Focal adhesion sites': 157,
         'Actin filaments': 232,
         'Nucleoli': 808,
         'Lipid droplets': 98,
         'Nucleoli fibrillar center': 654,
         'Microtubules': 484,
         'Centrosome': 522,
         'Endoplasmic reticulum': 622,
         'Nuclear speckles': 1077,
         'Cytoplasmic bodies': 134,
         'Cell junctions': 213,
         'Peroxisomes': 31,
         'Aggresome': 122,
         'Cytokinetic bridge': 27,
         'Endosomes': 17,
         'Rods & rings': 1})

In [12]:
# Flip the counter into a count -> label lookup.
# Caveat: counts are the keys, so two labels with the same count would collapse
# into a single entry (the label seen last wins).
rev_single_labels_count = {
    count: label for label, count in single_labels_count.items()
}
rev_single_labels_count

{983: 'Nuclear bodies',
 271: 'Nuclear membrane',
 314: 'Microtubule organizing center',
 2414: 'Nucleoplasm',
 1163: 'Golgi apparatus',
 1653: 'Mitochondria',
 1058: 'Plasma membrane',
 1470: 'Cytosol',
 601: 'Intermediate filaments',
 157: 'Focal adhesion sites',
 232: 'Actin filaments',
 808: 'Nucleoli',
 98: 'Lipid droplets',
 654: 'Nucleoli fibrillar center',
 484: 'Microtubules',
 522: 'Centrosome',
 622: 'Endoplasmic reticulum',
 1077: 'Nuclear speckles',
 134: 'Cytoplasmic bodies',
 213: 'Cell junctions',
 31: 'Peroxisomes',
 122: 'Aggresome',
 27: 'Cytokinetic bridge',
 17: 'Endosomes',
 1: 'Rods & rings'}

In [13]:
# (label, count) pairs ordered from the most to the least frequent label;
# labels with equal counts keep their original relative order
single_labels = single_labels_count.most_common()
single_labels

[('Nucleoplasm', 2414),
 ('Mitochondria', 1653),
 ('Cytosol', 1470),
 ('Golgi apparatus', 1163),
 ('Nuclear speckles', 1077),
 ('Plasma membrane', 1058),
 ('Nuclear bodies', 983),
 ('Nucleoli', 808),
 ('Nucleoli fibrillar center', 654),
 ('Endoplasmic reticulum', 622),
 ('Intermediate filaments', 601),
 ('Centrosome', 522),
 ('Microtubules', 484),
 ('Microtubule organizing center', 314),
 ('Nuclear membrane', 271),
 ('Actin filaments', 232),
 ('Cell junctions', 213),
 ('Focal adhesion sites', 157),
 ('Cytoplasmic bodies', 134),
 ('Aggresome', 122),
 ('Lipid droplets', 98),
 ('Peroxisomes', 31),
 ('Cytokinetic bridge', 27),
 ('Endosomes', 17),
 ('Rods & rings', 1)]

In [14]:
# Just the counts, in the same descending order as `single_labels`
count_array = np.array([count for _label, count in single_labels])
count_array

array([2414, 1653, 1470, 1163, 1077, 1058,  983,  808,  654,  622,  601,
        522,  484,  314,  271,  232,  213,  157,  134,  122,   98,   31,
         27,   17,    1])

`Rods & rings` has only a single training example. Even though we could augment it, I think that from a competition point of view it's not worth it. Let the classifier ignore it for now; we can think about this later.

Before doing data augmentation here are the few points to keep in mind:
* Doing data augmentation on examples that have only one label makes more sense, as you are generating augmented data for that label only. If you instead augment examples where more than one label is present, every label on those examples gets more samples (including the ones that are already frequent), so we might end up making the imbalance much worse


* There are two options for doing data augmentation here
    * **On the fly** This will save a lot of disk space but training will be slower, as we will be checking in every batch for the labels we want to augment. 
    * **Augment and save** The other option is to perform augmentation on the labels you are interested in and save the augmented samples on the disk. We might end up using quite a bit of disk space but in this scenario the training will be blazingly fast
    
    
* Random flips are fine, but apart from that we need to take a look at the augmented samples. One mistake during augmentation and we will be scratching our heads for hours checking what went wrong. **Never make the data worse**


* We need to find the `min_samples` that we want for each class.

In [15]:
# Smallest and largest per-label sample counts
print("Minimum number of samples: ", count_array.min())
print("Maximum number of samples: ", count_array.max())

Minimum number of samples:  1
Maximum number of samples:  2414


In [16]:
def count_pct(arr, nb_count_to_consider, total):
    """
    Percentage of entries in `arr` that lie strictly below a threshold.

    arr: NumPy array holding one sample count per label
    nb_count_to_consider: threshold; entries strictly smaller than it are counted
    total: denominator of the percentage (the number of labels being considered)

    Returns: the percentage, rounded to two decimal places

    """
    below_threshold = arr < nb_count_to_consider
    nb_below = int(below_threshold.sum())
    return round((nb_below / total) * 100, 2)

In [17]:
# Share of labels whose sample count falls below each threshold of interest
total_labels = len(count_array)
for threshold in (50, 75, 100, 200, 500, 700, 1000):
    pct_below = count_pct(count_array, threshold, total_labels)
    print(f"Percemtage of labels with less than {threshold} training examples: ", pct_below)

Percemtage of labels with less than 50 training examples:  16.0
Percemtage of labels with less than 75 training examples:  16.0
Percemtage of labels with less than 100 training examples:  20.0
Percemtage of labels with less than 200 training examples:  32.0
Percemtage of labels with less than 500 training examples:  52.0
Percemtage of labels with less than 700 training examples:  68.0
Percemtage of labels with less than 1000 training examples:  76.0


So, 52% of the labels have fewer than 500 training samples in the training dataset. 

**Note:** We are considering only those labels that occur on their own (as the sole label of a sample) somewhere in the training dataset. This percentage reflects those labels only, not all the labels.

Let us check which are the classes that have less than 500 samples in training data.

In [21]:
# Boolean mask keeps only the counts strictly below 500
less_than_500 = count_array[count_array < 500]
less_than_500

array([484, 314, 271, 232, 213, 157, 134, 122,  98,  31,  27,  17,   1])

In [23]:
# Print the labels that have fewer than 500 single-label samples.
# The names are read straight from the sorted (label, count) pairs instead of
# going through a count -> label dictionary: in such a dictionary two labels that
# share a count collapse into one key, so one label would be printed twice and
# the other would never be printed.
# NOTE: the counts come from single-label samples only, even though the heading
# below says "whole dataset".
print("Classes having less than 500 samples in the whole dataset")
print("")

for label, count in single_labels:
    if count < 500:
        print(label)

Classes having less than 500 samples in the whole dataset

Microtubules
Microtubule organizing center
Nuclear membrane
Actin filaments
Cell junctions
Focal adhesion sites
Cytoplasmic bodies
Aggresome
Lipid droplets
Peroxisomes
Cytokinetic bridge
Endosomes
Rods & rings
