# Exploratory data analysis

In [1]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Ask the inline backend for high-resolution ("retina") figure output.
%config InlineBackend.figure_format = 'retina'
# Load the autoreload extension; mode 2 reloads all modules before each cell
# is executed, so edits to imported project code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

import os
import sys
import tqdm
import warnings
import logging
import random
from pathlib import Path
import numpy as np
from collections import defaultdict

# Silence every warning for the rest of the session to keep cell output readable.
warnings.filterwarnings("ignore")

# Put the directory two levels above the working directory on the import path
# (presumably where the local `hupaic` package lives -- verify against the
# repository layout). The membership check keeps re-runs of this cell from
# appending duplicate entries.
project_root = str(Path(os.getcwd(), "..", "..").resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

# The former `np.warnings.filterwarnings('ignore')` call is intentionally gone:
# `np.warnings` was merely numpy's re-export of the stdlib `warnings` module, so
# it duplicated the call above, and the attribute no longer exists in
# NumPy >= 1.24, where accessing it raises AttributeError.

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.misc import imread
from IPython.core.display import Image, display
import tensorflow as tf

from hupaic.data import Dataset
from hupaic import main

## Get a feel for the dataset

In [3]:
# Project logger, handed to the Dataset helper below.
logger = main.getLogger()

# Root of the competition data, located relative to the working directory,
# plus its train/ and test/ sub-directories.
input_dir = Path(os.getcwd()).joinpath(
    "..", "..", "input", "train", "human-protein-atlas-image-classification"
).resolve()
train_input_dir = input_dir.joinpath("train")
test_input_dir = input_dir.joinpath("test")

# NOTE(review): the recorded run of this cell stopped with FileNotFoundError on
# train.csv -- confirm that the data really is laid out under these paths
# (in particular that both CSV files sit inside the train/ directory).
ds = Dataset(logger, train_input_dir)
train_df = pd.read_csv(train_input_dir.joinpath("train.csv"))
sample_df = pd.read_csv(train_input_dir.joinpath("sample_submission.csv"))

FileNotFoundError: File b'/home/tuatini/workspace/remote/Torchlite/examples-tf/input/train/human-protein-atlas-image-classification/train/train.csv' does not exist

## Look at the training set

In [None]:
train_df.head()

## Number of samples

In [None]:
len(train_df)

In [None]:
# Display names of the 28 target classes, keyed by the integer label that
# appears in the "Target" column. The names are listed in label order, so
# enumerate() assigns keys 0..27. The strings are kept exactly as they were,
# including the uneven spacing after the number prefix.
labels_dict = dict(enumerate([
    "0. Nucleoplasm",
    "1.  Nuclear membrane",
    "2.  Nucleoli",
    "3.  Nucleoli fibrillar center",
    "4.  Nuclear speckles",
    "5.  Nuclear bodies",
    "6.  Endoplasmic reticulum",
    "7.  Golgi apparatus",
    "8.  Peroxisomes",
    "9.  Endosomes",
    "10.  Lysosomes",
    "11.  Intermediate filaments",
    "12.  Actin filaments",
    "13.  Focal adhesion sites",
    "14.  Microtubules",
    "15.  Microtubule ends",
    "16.  Cytokinetic bridge",
    "17.  Mitotic spindle",
    "18.  Microtubule organizing center",
    "19.  Centrosome",
    "20.  Lipid droplets",
    "21.  Plasma membrane",
    "22.  Cell junctions",
    "23.  Mitochondria",
    "24.  Aggresome",
    "25.  Cytosol",
    "26.  Cytoplasmic bodies",
    "27.  Rods & rings",
]))

# Count how many training samples carry each label. A sample's "Target" value
# is a whitespace-separated list of integer labels, so one sample can
# contribute to several counts.
target_count = defaultdict(int)

# Iterate over the "Target" column directly: the previous iterrows() loop built
# a full row object per sample just to read this one field, and its loop
# variable `id` shadowed the builtin id() for every later cell.
for targets in train_df["Target"]:
    for label in targets.split():
        target_count[labels_dict[int(label)]] += 1

# From here on `target_count` is a list of (name, count) pairs, most frequent
# label first; the plotting cell below relies on that list form.
target_count = sorted(target_count.items(), key=lambda item: item[1], reverse=True)
print("Number of targets: ")
for target, count in target_count:
    print(f"{target:40}: {count}")

In [None]:
# Turn the sorted (name, count) pairs into a two-column frame for plotting.
tcount_df = pd.DataFrame(data=target_count, columns=["name", "count"])

# sns.set() changes seaborn's global settings, so this figure size
# (11.7 x 8.27 inches) also applies to figures created in later cells.
sns.set(rc={"figure.figsize": (11.7, 8.27)})

# Horizontal bars: one row per label, bar length is the number of occurrences.
# The trailing semicolon suppresses the Axes repr in the cell output.
sns.barplot(data=tcount_df, x="count", y="name");

## Look at sample submission

In [None]:
sample_df.head()

## Look at the images

Each sample is stored as four separate images, one per filter:

 - a **green filter** for the target protein structure of interest
 - a **blue landmark filter** for the nucleus
 - a **red landmark filter** for the microtubules
 - a **yellow landmark filter** for the endoplasmic reticulum

In [None]:
def load_image(basepath, image_id):
    """Read the four filter images of one sample into a single array.

    Args:
        basepath: Directory containing the PNG files; it must support the
            ``/`` operator with a string (e.g. a ``pathlib.Path``).
        image_id: Sample identifier (a string) used as the file name prefix.

    Returns:
        A float array of shape (4, 512, 512) holding the channels in the
        order green, red, blue, yellow.

    Assumes every file ``<image_id>_<colour>.png`` decodes to a 512x512
    single-channel image -- TODO confirm against the data.

    NOTE(review): ``imread`` is imported from ``scipy.misc`` at the top of the
    notebook; that function was removed in SciPy 1.2, so this notebook needs
    an older SciPy (or a replacement reader) to run.
    """
    channel_order = ("green", "red", "blue", "yellow")
    stacked = np.zeros(shape=(len(channel_order), 512, 512))
    for slot, colour in enumerate(channel_order):
        file_name = image_id + "_" + colour + ".png"
        stacked[slot, :, :] = imread(basepath / file_name)
    return stacked

def make_image_row(image, subax, title):
    """Draw the four channels of one sample on a row of axes.

    Args:
        image: Indexable holding the four channel images at positions 0-3
            (green, red, blue, yellow as produced by ``load_image``).
        subax: Indexable of at least four matplotlib axes; channel ``i`` is
            drawn on ``subax[i]``.
        title: Title for the first (green) panel; the other three panels get
            fixed captions.

    Returns:
        The ``subax`` object that was passed in.
    """
    # One (colormap, caption) pair per channel, in channel order.
    panels = (
        ("Greens", title),
        ("Reds", "stained microtubules"),
        ("Blues", "stained nucleus"),
        ("Oranges", "stained endoplasmatic reticulum"),
    )
    for position, (colormap, caption) in enumerate(panels):
        subax[position].imshow(image[position], cmap=colormap)
        subax[position].set_title(caption)

    # Switch the grid off on every axis that was handed in, not only the four
    # that were drawn on.
    for axis in subax:
        axis.grid(False)
    return subax

def make_title(file_id):
    """Build a multi-line plot title listing the labels of one sample.

    Looks up the "Target" value of the first row in the module-level
    ``train_df`` whose "Id" equals ``file_id`` and translates each label
    number through ``labels_dict``.

    Args:
        file_id: Sample identifier to look up in ``train_df``.

    Returns:
        A string with one `` - <label name> - `` line per label, each line
        terminated by a newline; empty if the target string has no labels.

    Raises:
        IndexError: If no row of ``train_df`` has this id.
    """
    matching_targets = train_df.loc[train_df.Id == file_id, "Target"].values
    label_numbers = matching_targets[0].split()
    return "".join(
        " - " + labels_dict[int(number)] + " - \n" for number in label_numbers
    )


# Pick four training samples at random. random.randrange() excludes its upper
# bound, so the position is always valid for iloc; the previous
# random.randint(0, len(train_df)) could return len(train_df) itself and raise
# IndexError.
file_ids = [train_df.iloc[random.randrange(len(train_df))]["Id"] for _ in range(4)]

# Load the channel images from the training directory, the same location the
# rest of the notebook uses for the training files (previously the dataset
# root `input_dir` was passed here). NOTE(review): confirm the PNGs sit
# directly inside train_input_dir.
images = [load_image(train_input_dir, file_id) for file_id in file_ids]

# One row of four panels per sample. squeeze=False keeps `ax` two-dimensional
# even when there is a single row, so ax[n] is always a row of four axes.
fig, ax = plt.subplots(len(file_ids), 4, figsize=(20, 5 * len(file_ids)), squeeze=False)

for n in range(len(file_ids)):
    make_image_row(images[n], ax[n], make_title(file_ids[n]))

# plt.axis('off') only acts on the current (last created) axes, so hide the
# ticks and frame on every panel explicitly; panel titles stay visible.
for panel in ax.ravel():
    panel.axis('off')
plt.tight_layout()

## Train/Test difference

In [None]:
# Compare the two splits by the number of directory entries. os.listdir()
# returns every entry in the directory, so any non-image file stored there is
# counted as well.
train_files = os.listdir(train_input_dir)
test_files = os.listdir(test_input_dir)

# Size of the test directory relative to the train directory, as a percentage
# rounded to the nearest whole number.
test_to_train_ratio = len(test_files) / len(train_files)
percentage = np.round(test_to_train_ratio * 100)

print(f"The test set size turns out to be {percentage} % compared to the train set.")