# Load and process HAM10000 data

`files.upload()` returns a dictionary of the uploaded files: the keys are the file names and the values are the uploaded file contents.

*I will try to use this to upload my HAM10000 data to Google Drive and access it in my notebook here.*


## Installations before we start

In [None]:
# update pytorch and fastai to latest versions
!pip install torch -U
!pip install torchvision -U
!pip install fastai -U 
!pip install -q kaggle

In [None]:
# import the usual frameworks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import collections
import warnings
import json
import os

from IPython.core.display import display, HTML
    
# import plotly 
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py
import plotly.tools as tls

# for color scales in plotly
import colorlover as cl 

# import deep learning libraries
import torch
import fastai
from fastai import *
from fastai.vision import *

from sklearn.metrics import auc, roc_curve, roc_auc_score

# configure things
warnings.filterwarnings('ignore')

pd.options.display.float_format = '{:,.2f}'.format  
pd.options.display.max_columns = 999

py.init_notebook_mode(connected=True)

%load_ext autoreload
%autoreload 2
%matplotlib inline

## Setting up the Kaggle API

In [None]:
# mount your google drive so you can save to it. You'll need to put in a token.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
from google.colab import files
files.upload()

In [None]:
# Expose Kaggle credentials through environment variables.
# The two values below are placeholders: replace them with your own Kaggle
# username and API token before running this cell.
# NOTE(review): a later cell also copies kaggle.json into ~/.kaggle; presumably
# only one of the two credential mechanisms is needed -- confirm which one the
# kaggle CLI should pick up.
os.environ['KAGGLE_USERNAME'] = "your_username"
os.environ['KAGGLE_KEY'] = "your_token"

In [None]:
##os.mkdir('c_skin')
os.listdir()

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

## Loading the HAM10000 dataset

In [None]:
# get the dataset from kaggle
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000 -p 'c_skin'

In [None]:
! kaggle datasets list

In [None]:
# Extract the downloaded archive into c_skin/, overwriting existing files (-o)
!unzip -o c_skin/skin-cancer-mnist-ham10000.zip -d c_skin

# Disabled: quietly (-q) extract the two image archives into c_skin/ as well
#!unzip -o -q c_skin/HAM10000_images_part_1 -d c_skin
#!unzip -o -q c_skin/HAM10000_images_part_2 -d c_skin

# Report how many entries c_skin/ now contains
!echo files in c_skin: `ls c_skin | wc -l`

In [None]:
!ls c_skin

## Load data into pandas df

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os
from glob import glob

In [None]:
# Directory (relative to the working directory) that holds the extracted dataset
base_skin_dir = 'c_skin'

In [None]:
# Map each image id (JPEG file name without its extension) to the file's path.
# Only files exactly one directory level below base_skin_dir are matched.
# NOTE(review): the original author remarked that this lookup "does not really
# work, because the files are not local" -- verify that the extracted archive
# really places the JPEGs in sub-folders of base_skin_dir.
jpg_pattern = os.path.join(base_skin_dir, '*', '*.jpg')
imageid_path_dict = {
    os.path.splitext(os.path.basename(jpg_path))[0]: jpg_path
    for jpg_path in glob(jpg_pattern)
}

In [None]:
# Map the short diagnosis codes found in the metadata's `dx` column to
# readable lesion names. Note that 'mel' is melanoma; it must not share a
# label with 'df' (dermatofibroma), or the two classes become confusable.
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

In [None]:
# Load the per-image metadata table. The path is built from base_skin_dir so
# the dataset location is defined in a single place.
tile_df = pd.read_csv(os.path.join(base_skin_dir, 'HAM10000_metadata.csv'))
tile_df.head()

In [None]:
tile_df.info(verbose=True)

In [None]:
# Attach the file path of each image by looking up its image_id;
# ids without an entry in imageid_path_dict get None (dict.get's default).
tile_df['path'] = tile_df['image_id'].map(imageid_path_dict.get)
# Show five random rows as a spot check
tile_df.sample(5)

In [None]:
# Translate each diagnosis code in `dx` into its readable lesion name
tile_df['cell_type'] = tile_df['dx'].map(lesion_type_dict.get)
# Encode the lesion names as integer category codes
tile_df['cell_type_idx'] = tile_df['cell_type'].astype('category').cat.codes
tile_df.sample(5)

In [None]:
tile_df.describe(exclude=[np.number])

In [None]:
# Bar chart of how many images fall into each lesion type
fig, ax1 = plt.subplots(figsize=(10, 5))
type_counts = tile_df['cell_type'].value_counts()
type_counts.plot.bar(ax=ax1)

In [None]:
# Read every image file into the dataframe (one decoded array per row).
from skimage.io import imread

# Fail early with a readable message if any metadata row has no image path;
# otherwise imread would be handed a missing value and raise an obscure error.
missing_paths = tile_df['path'].isna()
if missing_paths.any():
    raise FileNotFoundError(
        '{} of {} images have no file path; check that the image files were '
        'extracted under base_skin_dir'.format(int(missing_paths.sum()), len(tile_df))
    )
tile_df['image'] = tile_df['path'].map(imread)

In [None]:
# Count how often each image shape occurs
image_shapes = tile_df['image'].map(np.shape)
image_shapes.value_counts()

## Show images in each category

In [None]:
# Draw a grid with one row per lesion type and up to n_samples random images
# per row, then save it to category_samples.png.
n_samples = 5
# One row of axes per lesion type actually present, rather than a fixed 7.
n_types = tile_df['cell_type'].nunique()
# squeeze=False keeps m_axs two-dimensional even for a single row or column.
fig, m_axs = plt.subplots(n_types, n_samples,
                          figsize=(4*n_samples, 3*n_types), squeeze=False)
for n_axs, (type_name, type_rows) in zip(m_axs,
                                         tile_df.sort_values(['cell_type']).groupby('cell_type')):
    n_axs[0].set_title(type_name)
    # Clamp the sample size so a type with fewer than n_samples rows does not raise.
    picked_rows = type_rows.sample(min(n_samples, len(type_rows)), random_state=2018)
    for c_ax, (_, c_row) in zip(n_axs, picked_rows.iterrows()):
        c_ax.imshow(c_row['image'])
    # Hide the axes of every cell in the row, including cells left without an image.
    for c_ax in n_axs:
        c_ax.axis('off')
fig.savefig('category_samples.png', dpi=300)

## Make an EMNIST-like Dataset

In [None]:
tile_df[['cell_type_idx', 'cell_type']].sort_values('cell_type_idx').drop_duplicates()

Create a df with only the image (pixels) and the label. The EMNIST dataset in the federated example has the following `element_type_structure`:

OrderedDict([('pixels', TensorSpec(shape=(28, 28), dtype=tf.float32, name=None)), ('label', TensorSpec(shape=(), dtype=tf.int32, name=None))])

In [None]:
# Keep only the image pixels and the integer label. The explicit copy makes
# federated_df independent of tile_df, so later column assignments on it are
# unambiguous (pandas' SettingWithCopy case).
federated_df = tile_df[['image', 'cell_type_idx']].copy()
federated_df.head()