In [16]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [1]:
import os
import PIL   # Python Image library
import keras
import tensorflow as tf

In [3]:
# Show the current working directory; the relative '../input/...' paths used below are resolved against it.
os.getcwd()

In [4]:
# Directory that holds the training JPEG files of the competition dataset.
path = '../input/siim-isic-melanoma-classification/jpeg/train'

In [5]:
# Number of entries (image files) in the training directory.
len(os.listdir(path))

There are therefore 33,126 training images.

In [6]:
# Load the training metadata table (one row per training image).
df = pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')

In [10]:
# Peek at the first ten rows of the training metadata.
df.head(10)

In [8]:
# (rows, columns) of the training metadata.
df.shape

In [9]:
# Class distribution of the diagnosis label.
# Uses the Series method: the top-level pd.value_counts() is deprecated in recent pandas.
df['benign_malignant'].value_counts()

There are 584 malignant images and 32,542 benign images.<br>
The data is therefore heavily imbalanced.

In [34]:
# Load the test-set metadata and preview its first rows.
test_df = pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')
test_df.head()

In [35]:
# (rows, columns) of the test metadata.
test_df.shape

## Loading Images

In [12]:
from PIL import Image

In [14]:
# Open a single training image with PIL as a quick sanity check.
img = Image.open('../input/siim-isic-melanoma-classification/jpeg/train/ISIC_0015719.jpg')

In [17]:
#img

In [18]:
# Convert the PIL image to a NumPy array (note: rebinds `img` from a PIL image to an ndarray).
img = np.asarray(img)

In [20]:
# Render the sample image array with matplotlib.
plt.imshow(img)

In [26]:
# Directory listing of the training images (kept because later cells use `images`).
images = os.listdir(path)
plt.figure(figsize=(10,6))

# Preview the first 12 rows of the metadata in a 3x4 grid.
# The file name and the label are taken from the SAME row of `df`, so every
# title is guaranteed to describe the picture shown. os.listdir() returns
# files in arbitrary order, so pairing images[i] with df row i would
# mislabel the pictures.
preview_rows = df.head(12)
for i, (image_name, label) in enumerate(
        zip(preview_rows['image_name'], preview_rows['benign_malignant'])):
    # Context manager closes the file handle once the pixels are copied out.
    with Image.open(path + '/' + image_name + '.jpg') as pil_img:
        img = np.asarray(pil_img)

    plt.subplot(3,4,i+1)    # 3 rows, 4 columns
    plt.title(label)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(img)

plt.show()

In [27]:
# Generators for streaming a large image dataset from disk in batches.
# Built from tf.keras (not the standalone `keras` package) so the data pipeline
# and the model, which is built from tensorflow.keras, come from the same
# Keras implementation.

# Training generator: pixel rescaling plus random augmentation.
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1/255,          # Normalizing images to [0, 1]
    horizontal_flip = True,
    rotation_range = 45,    # In degrees
    shear_range = 0.3,
    zoom_range = 0.2
)

# Evaluation generator: rescaling only, no augmentation.
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale = 1/255)

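There are three ways to pass data to a data generator:
- `flow` (in-memory arrays such as `x_train`, `y_train`)
- `flow_from_directory` (images sorted into one sub-directory per class)
- `flow_from_dataframe` (file paths and labels listed in a DataFrame)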

In [48]:
# Build parallel lists of image paths and labels directly from the metadata
# table. Iterating over `df` itself (rather than over the number of files found
# in the directory) keeps path and label row-aligned and cannot run past the
# end of the table or silently drop rows when the two counts differ.
data_dictionary = {
    'image_path': [path + "/" + image_name + '.jpg' for image_name in df['image_name']],
    'target': df['benign_malignant'].tolist(),
}

In [49]:
# Two-column frame (image_path, target) consumed by the split and the generators below.
final_df = pd.DataFrame(data_dictionary)
final_df.head()

In [36]:
from sklearn.model_selection import train_test_split

In [50]:
# Hold out 25% of the images for validation.
# stratify: keeps the (very small) malignant fraction identical in both splits,
#           which matters because the classes are heavily imbalanced.
# random_state: makes the split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    final_df['image_path'],
    final_df['target'],
    test_size=0.25,
    stratify=final_df['target'],
    random_state=42,
)

In [51]:
# Recombine the training paths and labels into one frame (aligned on the index).
train_df = x_train.to_frame().assign(target=y_train)

In [52]:
# Preview the training split (image_path + target).
train_df.head()

In [53]:
# Recombine the held-out paths and labels into the validation frame (aligned on the index).
val_df = x_test.to_frame().assign(target=y_test)

In [54]:
# Settings shared by the training and validation iterators.
common_flow_args = dict(
    x_col='image_path',
    y_col='target',
    target_size=(299, 299),
    batch_size=256,
    class_mode='binary',
)

# Training batches: read from train_df through the augmenting generator, shuffled.
train_gen = train_datagen.flow_from_dataframe(train_df, shuffle=True, **common_flow_args)

# Validation batches: read from val_df through the rescale-only generator, in fixed order.
test_gen = test_datagen.flow_from_dataframe(val_df, shuffle=False, **common_flow_args)

Training images - 24,844<br>
Validation images - 8,282

## Building Model

In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

In [57]:
# Small CNN: three conv + max-pool stages, then two dense layers and a
# single sigmoid unit for the binary benign/malignant output.
cnn_layers = [
    Conv2D(filters=32, kernel_size=3, activation='relu', input_shape=(299, 299, 3)),
    MaxPooling2D(),
    Conv2D(filters=16, kernel_size=3, activation='relu'),
    MaxPooling2D(),
    Conv2D(filters=16, kernel_size=3, activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(units=512, activation='relu'),
    Dense(units=256, activation='relu'),
    Dense(units=1, activation='sigmoid'),
]
model = Sequential(cnn_layers)

In [58]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [59]:
# Adam + binary cross-entropy for the single sigmoid output.
# Accuracy alone is misleading here: with roughly 98% benign images a model
# that always predicts "benign" already scores ~98%. ROC-AUC is tracked as well
# because it is insensitive to the class ratio.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)

In [60]:
# The classes are heavily imbalanced, so weight each class inversely to its
# frequency in the training iterator ("balanced" weighting:
# n_samples / (n_classes * class_count)). Without this the loss is dominated
# by the benign majority.
class_counts = np.bincount(train_gen.classes)
class_weight = {
    class_index: float(len(train_gen.classes) / (len(class_counts) * count))
    for class_index, count in enumerate(class_counts)
}

# Train for 10 epochs, validating on the held-out split after every epoch.
model_fit = model.fit(
    train_gen,
    epochs = 10,
    validation_data = test_gen,
    class_weight = class_weight,
)

In [None]:
# Caveat: the model may overfit to the majority class because the data is heavily imbalanced