## TRASNFER LEARNING: INITIAL TEST

### Introduction

### Libraries

In [1]:
# basic
import pandas as pd
import numpy as np

# tensorflow and keras
from keras_preprocessing.image import ImageDataGenerator

# sklearn
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

### Functions

In [2]:
from utils import transform_diagnosis_to_numerical, add_prefix_to_string

### Globals

In [3]:
from constants import RANDOM_SEED

### Reading data 

In [4]:
dataframe = pd.read_csv("../../../../src/data/zr7vgbcyr2-1/metadata.csv")
image_file_names_and_diagnosis = dataframe[["img_id", "diagnostic"]]

### Data transformations

In [6]:
image_file_names_and_diagnosis["img_id"] = image_file_names_and_diagnosis["img_id"].apply(
    lambda string: add_prefix_to_string(string, "../../../../src/data/zr7vgbcyr2-1/images/all_images/"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  image_file_names_and_diagnosis["img_id"] = image_file_names_and_diagnosis["img_id"].apply(


In [7]:
image_file_names_and_diagnosis.head()

Unnamed: 0,img_id,diagnostic
0,../../../../src/data/zr7vgbcyr2-1/images/all_i...,NEV
1,../../../../src/data/zr7vgbcyr2-1/images/all_i...,BCC
2,../../../../src/data/zr7vgbcyr2-1/images/all_i...,ACK
3,../../../../src/data/zr7vgbcyr2-1/images/all_i...,ACK
4,../../../../src/data/zr7vgbcyr2-1/images/all_i...,BCC


In [8]:
image_file_names_and_diagnosis.img_id[0]

'../../../../src/data/zr7vgbcyr2-1/images/all_images/PAT_1516_1765_530.png'

### Split data in train, test and validation

In [9]:
stratified_shuffle_split_train_test = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
stratified_shuffle_split_train_val = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.1,
    random_state=RANDOM_SEED,
)

In [10]:
for train_index, test_index in stratified_shuffle_split_train_test.split(
    image_file_names_and_diagnosis,
    image_file_names_and_diagnosis["diagnostic"]):
        stratified_train_and_validation_set = image_file_names_and_diagnosis.iloc[train_index]
        stratified_test_set = image_file_names_and_diagnosis.iloc[test_index]

In [11]:
for train_index, validation_index in stratified_shuffle_split_train_val.split(
    stratified_train_and_validation_set,
    stratified_train_and_validation_set["diagnostic"]):
        stratified_train_set = stratified_train_and_validation_set.iloc[train_index]
        stratified_validation_set = stratified_train_and_validation_set.iloc[validation_index]

In [12]:
print("Examples in train set: {0}".format(stratified_train_set.shape[0]))
print("Examples in validation set: {0}".format(stratified_validation_set.shape[0]))
print("Examples in test set: {0}".format(stratified_test_set.shape[0]))

Examples in train set: 1654
Examples in validation set: 184
Examples in test set: 460


### Generate tensorflow objects

In [13]:
# create the ImageDataGenerator object
data_augmentation_generator = ImageDataGenerator(
    featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
)

In [14]:
# generate batches and augment the images
train_data_generator = data_augmentation_generator.flow_from_dataframe(
    stratified_train_set,
    x_col='img_id',
    y_col='diagnostic',
    class_mode='categorical',
    target_size=(224, 224),
)

validation_data_generator = data_augmentation_generator.flow_from_dataframe(
    stratified_validation_set,
    x_col='img_id',
    y_col='diagnostic',
    class_mode='categorical',
    target_size=(224, 224),
)

Found 1654 validated image filenames belonging to 6 classes.
Found 184 validated image filenames belonging to 6 classes.
