In [None]:
import pandas as pd
from pathlib import Path

In [None]:
from functools import partial
from glob import glob
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from kaggle_datasets import KaggleDatasets


In [None]:
# Shorthand for tf.data's autotune sentinel; passed below as num_parallel_reads,
# num_parallel_calls and the prefetch buffer size.
AUTO = tf.data.experimental.AUTOTUNE
# tf.config.experimental_run_functions_eagerly(True)

# GCS location of the attached Kaggle dataset; the TFRecord globs below are built from it.
GCS_PATH = KaggleDatasets().get_gcs_path()


In [None]:
# Choose the tf.distribute strategy: a TPU strategy when a TPU cluster can be
# resolved, the default strategy otherwise.
try:
    # No arguments are passed to the resolver; per the original author's note it
    # relies on the TPU_NAME environment variable, which is always set on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # Treated as "no TPU available".
    tpu = None

if not tpu:
    # Default TensorFlow strategy; per the original note it works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)


In [None]:
# Competition metadata tables. `train_csv.target` is inspected below; `test_csv`
# supplies the `image_name` column used when the submission frame is built.
train_csv = pd.read_csv('../input/siim-isic-melanoma-classification/train.csv')
test_csv = pd.read_csv('../input/siim-isic-melanoma-classification/test.csv')


In [None]:
# Number of training rows per `target` value (displayed as the cell output).
train_csv.target.value_counts()


In [None]:
# Height/width every decoded image is reshaped to; also the model's input shape.
IMG_SIZE = [1024, 1024]
# NOTE(review): EPOCHS is not referenced by any visible cell (model.fit is called
# with epochs=2) -- confirm which value is intended.
EPOCHS = 12
# Global batch size: 16 examples per replica of the active strategy.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# TFRecord shards matched under the dataset's GCS path.
train_files = tf.io.gfile.glob(GCS_PATH + '/tfrecords/train*.tfrec')
test_files = tf.io.gfile.glob(GCS_PATH + '/tfrecords/test*.tfrec')


In [None]:
%%time
def decode_image(image_data):
    """Decode JPEG bytes into a float32 RGB tensor of shape [*IMG_SIZE, 3].

    Args:
        image_data: Scalar string tensor holding the JPEG-encoded image.

    Returns:
        The decoded image cast to float32 and divided by 255.0, reshaped to
        the static shape [*IMG_SIZE, 3].
    """
    pixels = tf.image.decode_jpeg(image_data, channels=3)
    # Scale the decoded values into the [0, 1] range.
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # An explicit static shape is needed for TPU execution.
    return tf.reshape(scaled, [*IMG_SIZE, 3])

def read_tfrecord(example, labeled):
    """Parse one serialized example into an (image, label) or (image, name) pair.

    Args:
        example: Serialized tf.train.Example (scalar string tensor).
        labeled: When true, the record's int64 "target" feature is read and
            returned as an int32 label; otherwise its "image_name" string
            feature is returned.

    Returns:
        (image, label) if `labeled` is true, else (image, image_name), where
        `image` is the output of `decode_image`.
    """
    # Both record flavours carry the encoded image; only the second feature differs.
    feature_spec = {"image": tf.io.FixedLenFeature([], tf.string)}
    if labeled:
        feature_spec["target"] = tf.io.FixedLenFeature([], tf.int64)
    else:
        feature_spec["image_name"] = tf.io.FixedLenFeature([], tf.string)

    parsed = tf.io.parse_single_example(example, feature_spec)
    image = decode_image(parsed['image'])
    if not labeled:
        return image, parsed['image_name']
    return image, tf.cast(parsed['target'], tf.int32)

def load_dataset(filenames, labeled=True, ordered=False):
    """Build an unbatched tf.data pipeline of parsed records from TFRecord files.

    Args:
        filenames: TFRecord file paths to read.
        labeled: Forwarded to `read_tfrecord`; selects (image, label) pairs when
            true and (image, image_name) pairs when false.
        ordered: When false, deterministic ordering is switched off so elements
            are used as soon as they stream in (faster, arbitrary order).

    Returns:
        A tf.data.Dataset yielding the pairs produced by `read_tfrecord`.
    """
    read_options = tf.data.Options()
    if not ordered:
        # Trade a reproducible element order for read throughput.
        read_options.experimental_deterministic = False

    parse_record = partial(read_tfrecord, labeled=labeled)
    return (
        # num_parallel_reads interleaves reads from several files at once.
        tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTO)
        .with_options(read_options)
        .map(parse_record, num_parallel_calls=AUTO)
    )

def get_train_vald_dataset(vald_split=0.2, ordered=False):
    """Split the labeled TFRecords into a training and a validation pipeline.

    Args:
        vald_split: Fraction of the records held out for validation; must lie
            in [0, 1).
        ordered: Kept for backward compatibility and has no effect. The records
            are always read in deterministic order (required for a leak-free
            split, see below) and the training subset is always shuffled.

    Returns:
        A tuple (n_train, train_dataset, n_vald, vald_dataset): the record
        counts of both subsets, the shuffled, endlessly repeating, batched
        training dataset, and the batched, cached validation dataset.

    Raises:
        ValueError: If `vald_split` is outside [0, 1).
    """
    if not 0 <= vald_split < 1:
        raise ValueError(f"vald_split must be in [0, 1), got {vald_split}")

    # take()/skip() only yield disjoint subsets when the element order is the same
    # on every pass over the data. With non-deterministic reads the order changes
    # between passes and validation records can end up in the training stream, so
    # the source of the split is always read deterministically.
    dataset = load_dataset(train_files, labeled=True, ordered=True)

    # The parsing map in load_dataset is one-to-one, so counting the raw
    # serialized records gives the same total without decoding any image.
    n = sum(1 for _ in tf.data.TFRecordDataset(train_files, num_parallel_reads=AUTO))
    n_vald = int(vald_split * n)
    n_train = n - n_vald

    train_dataset = dataset.take(n_train)
    # Shuffle before repeat so records of different epochs are not mixed inside
    # the shuffle buffer.
    # NOTE(review): the buffer holds up to 2048 decoded images; at the configured
    # IMG_SIZE each is a 1024x1024x3 float32 tensor (~12 MiB) -- confirm the host
    # has enough memory or lower the buffer size.
    train_dataset = train_dataset.shuffle(2048)
    train_dataset = train_dataset.repeat()  # the training dataset must repeat for several epochs
    train_dataset = train_dataset.batch(BATCH_SIZE)
    train_dataset = train_dataset.prefetch(AUTO)  # prefetch next batch while training

    # skip(n_train) leaves exactly n - n_train == n_vald records, so no second
    # counting pass over the data is needed.
    vald_dataset = dataset.skip(n_train)
    vald_dataset = vald_dataset.batch(BATCH_SIZE)
    vald_dataset = vald_dataset.cache()
    vald_dataset = vald_dataset.prefetch(AUTO)
    return n_train, train_dataset, n_vald, vald_dataset

def get_test_dataset(ordered=False):
    """Build the batched pipeline of unlabeled test records.

    Args:
        ordered: Forwarded to `load_dataset`; pass True when the prediction
            order must be reproducible across passes over the dataset.

    Returns:
        A tuple (n, dataset): the number of test records and a dataset of
        (image, image_name) batches of size BATCH_SIZE.
    """
    # The parsing map in load_dataset is one-to-one, so counting the raw
    # serialized records gives the same total without decoding every JPEG.
    n = sum(1 for _ in tf.data.TFRecordDataset(test_files, num_parallel_reads=AUTO))

    dataset = load_dataset(test_files, labeled=False, ordered=ordered)
    dataset = dataset.batch(BATCH_SIZE)
    dataset = dataset.prefetch(AUTO)  # prefetch the next batch while the current one is consumed
    return n, dataset

# Build the pipelines once. The test set is requested with ordered=True so that
# repeated passes over it yield the records in the same order.
n_train, train_dataset, n_vald, vald_dataset = get_train_vald_dataset()
n_test, test_dataset = get_test_dataset(True)
print(f'Dataset: {n_train} training images, {n_vald} validation images, {n_test} unlabeled test images')


In [None]:
def initialize_model(model_name=""):
    """Build and compile a binary classifier on top of a frozen Xception backbone.

    Args:
        model_name: Accepted but not used by this function.

    Returns:
        A compiled tf.keras.Sequential model: Xception (ImageNet weights, no top,
        not trainable) -> global average pooling -> Dense(8, relu) ->
        Dense(1, sigmoid), trained with Adam on binary cross-entropy and
        reporting AUC.
    """
    # Alternatives listed by the original author in commented-out code:
    # tf.keras.applications.MobileNetV2 / VGG16 / ResNet50 / MobileNet, or
    # EfficientNetB0 from the efficientnet.tfkeras library
    # (https://github.com/qubvel/efficientnet).
    backbone = tf.keras.applications.Xception(
        input_shape=[*IMG_SIZE, 3],
        include_top=False,
        weights='imagenet',
    )
    # Freeze the pretrained weights; only the small head below is trained.
    backbone.trainable = False

    head = [
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(8, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
    model = tf.keras.Sequential([backbone, *head])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])
    return model


In [None]:
# Create and compile the model inside the distribution strategy's scope so its
# variables are created under that strategy.
with strategy.scope():
    model = initialize_model()


In [None]:
# Whole batches per pass; floor division leaves out any final partial batch.
TRAIN_STEPS = n_train // BATCH_SIZE
VALID_STEPS = n_vald // BATCH_SIZE


In [None]:
# Train on the repeating dataset, so steps_per_epoch defines the epoch length.
# class_weight gives target 1 twice the loss weight of target 0.
# NOTE(review): epochs is hard-coded to 2 while the EPOCHS constant (12) is unused
# in the visible cells -- confirm which is intended.
history = model.fit(train_dataset, epochs=2, steps_per_epoch=TRAIN_STEPS, class_weight={0: 1, 1: 2},
                    validation_data=vald_dataset, validation_steps=VALID_STEPS)#, callbacks=[lr_callback])


In [None]:
# Persist the trained model to model.h5 in the working directory.
model.save('model.h5')


In [None]:
# Drop the image name from each (image, name) pair so predict() receives images only.
outs = model.predict(test_dataset.map(lambda image, idnum: image))


In [None]:
# Predictions come out in TFRecord order, which is not guaranteed to match the row
# order of test.csv. Read the image names from the same (ordered) test dataset so
# every score is paired with the record it was computed from.
test_names_ds = test_dataset.map(lambda image, idnum: idnum)
test_image_names = [
    name.decode('utf-8')
    for names_batch in test_names_ds
    for name in names_batch.numpy()
]

scores = outs.ravel()
if len(test_image_names) != len(scores):
    raise ValueError(
        f"Got {len(scores)} predictions for {len(test_image_names)} test records"
    )

pred = pd.DataFrame({'image_name': test_image_names, 'target': scores})


In [None]:
# Write the submission file with a header row and without the DataFrame index.
pred.to_csv('submissions.csv', header=True, index=False)
