In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import os
import re

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.applications.efficientnet import EfficientNetB7
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, GlobalAveragePooling2D
from tensorflow.keras.metrics import F1Score
from tensorflow.keras.optimizers import Adamax
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.optimizers.schedules import CosineDecay

from sklearn.utils.class_weight import compute_class_weight

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Pick a distribution strategy: TPUStrategy when a TPU cluster can be resolved,
# otherwise whatever strategy TensorFlow currently has active.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU: ', tpu.master())
except ValueError:
    # A ValueError from the resolver is treated as "no TPU available".
    tpu = None

if tpu:
    # Connect to the TPU workers and initialise them before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # tf.distribute.experimental.TPUStrategy is deprecated; the stable alias is
    # tf.distribute.TPUStrategy.
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

In [None]:
# Global batch size: 16 examples per replica, scaled by the number of replicas
# that the active distribution strategy keeps in sync.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
# Hard-coded split sizes. Later cells use them as batch sizes to pull a whole
# split's labels/ids as one batch.
# NOTE(review): presumably the image counts of each split; confirm against the
# counts parsed from the TFRecord file names.
TRAIN_SIZE = 12753
VAL_SIZE = 3712
TEST_SIZE = 7382

# Reading the Dataset

In [None]:
def decode_image(image):
    """Turn encoded JPEG bytes into a model-ready image tensor.

    The image is decoded as 3-channel RGB, resized to 299x299 (the input
    shape the Xception model in this notebook is built with), cast to
    float32 and rescaled by ``x / 127.5 - 1``.
    """
    pixels = tf.image.decode_jpeg(image, channels=3)
    pixels = tf.image.resize(pixels, [299, 299])
    pixels = tf.cast(pixels, tf.float32)
    return (pixels / 127.5) - 1

In [None]:
def read_labeled_tfrecord(tfrecord):
    """Parse one serialized labeled example into ``(image, one_hot_label)``.

    The record must carry an ``'image'`` bytes feature and an int64
    ``'class'`` feature; the class id is expanded to a one-hot vector of
    depth 104.
    """
    features = tf.io.parse_single_example(
        tfrecord,
        {
            'image': tf.io.FixedLenFeature([], tf.string),
            'class': tf.io.FixedLenFeature([], tf.int64),
        },
    )
    return decode_image(features['image']), tf.one_hot(features['class'], 104)

In [None]:
def read_unlabeled_tfrecord(tfrecord):
    """Parse one serialized unlabeled example into ``(id, image)``.

    The record must carry ``'id'`` and ``'image'`` bytes features. The id is
    returned as-is; the image goes through ``decode_image``.
    """
    features = tf.io.parse_single_example(
        tfrecord,
        {
            'id': tf.io.FixedLenFeature([], tf.string),
            'image': tf.io.FixedLenFeature([], tf.string),
        },
    )
    return features['id'], decode_image(features['image'])

In [None]:
def load_dataset(files, testing_set=False, augment=False):
    """Build an unbatched tf.data pipeline over a list of TFRecord files.

    Args:
        files: TFRecord file paths to read.
        testing_set: When True, records are parsed as unlabeled ``(id, image)``
            pairs and element order stays deterministic, so separate passes
            over the dataset line up. When False, records are parsed as
            ``(image, one_hot_label)`` pairs and ordering is relaxed.
        augment: When True (labeled data only), apply ``data_augmentation`` to
            every example. Ignored when ``testing_set`` is True.

    Returns:
        A ``tf.data.Dataset`` of parsed, unbatched examples.
    """
    options = tf.data.Options()

    if not testing_set:
        # Order does not matter for labeled data, so let tf.data trade
        # determinism for throughput. (`experimental_deterministic` is the
        # deprecated spelling of this option.)
        options.deterministic = False

    # Read several files concurrently instead of one after another.
    dataset = tf.data.TFRecordDataset(files, num_parallel_reads=tf.data.AUTOTUNE)
    dataset = dataset.with_options(options)

    if testing_set:
        dataset = dataset.map(read_unlabeled_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)
    else:
        dataset = dataset.map(read_labeled_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)
        if augment:
            # data_augmentation is defined in a later cell; it is only looked
            # up when this function is called.
            dataset = dataset.map(data_augmentation, num_parallel_calls=tf.data.AUTOTUNE)

    return dataset

In [None]:
def data_augmentation(image, label):
    """Apply a random horizontal flip and a random contrast change.

    The contrast factor is drawn from [0.5, 1.5]. The label is passed
    through untouched.
    """
    flipped = tf.image.random_flip_left_right(image)
    augmented = tf.image.random_contrast(flipped, lower=0.5, upper=1.5)
    return augmented, label

In [None]:
def get_file_names(dataset_type):
    """Return the ``.tfrec`` file paths found in the given split directory.

    ``dataset_type`` is the sub-directory name under the 512x512 JPEG
    TFRecord folder (this notebook passes 'train', 'val' and 'test').
    """
    pattern = f'/kaggle/input/tpu-getting-started/tfrecords-jpeg-512x512/{dataset_type}/*.tfrec'
    return list(tf.io.gfile.glob(pattern))

In [None]:
def load_training_data():
    """Build the training pipeline.

    Augmented examples are repeated indefinitely, shuffled with a
    2048-element buffer, batched to BATCH_SIZE and prefetched.
    """
    return (
        load_dataset(get_file_names('train'), augment=True)
        .repeat()
        .shuffle(2048)
        .batch(BATCH_SIZE)
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

def load_val_data():
    """Build the validation pipeline: batched to BATCH_SIZE, cached, prefetched."""
    batched = load_dataset(get_file_names('val')).batch(BATCH_SIZE)
    return batched.cache().prefetch(tf.data.experimental.AUTOTUNE)
    
def load_test_data():
    """Build the test pipeline of ``(id, image)`` pairs, batched and prefetched."""
    records = load_dataset(get_file_names('test'), testing_set=True)
    return records.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Build each input pipeline once; the cells below reuse these dataset objects
# for label exploration, training, evaluation and inference.
train = load_training_data()
validate = load_val_data()
test = load_test_data()

# Data Exploration:

In [None]:
# Matches the number embedded in names such as "...-<count>.tfrec". A raw string
# avoids the invalid "\d" escape, and the dot is escaped so it matches literally.
pattern = r'-(\d+)\.tfrec'


def count_images(file_names):
    """Sum the numbers embedded in TFRecord file names.

    Each name must contain ``-<number>.tfrec``; this notebook treats that
    number as the count of images stored in the file.

    Raises:
        ValueError: if a file name does not contain the expected suffix.
    """
    total = 0
    for file_name in file_names:
        match = re.search(pattern, file_name)
        if match is None:
            raise ValueError(f'No image count found in file name: {file_name!r}')
        total += int(match.group(1))
    return total


train_files = get_file_names('train')
val_files = get_file_names('val')
test_files = get_file_names('test')

train_files_sum = count_images(train_files)
val_files_sum = count_images(val_files)
test_files_sum = count_images(test_files)

categories = ['train', 'validation', 'test']
values = [train_files_sum, val_files_sum, test_files_sum]

# One colour per bar, spread evenly over the colormap.
cmap = plt.get_cmap('magma')
colors = cmap(np.linspace(0, 1, len(values)))

plt.figure(figsize=(8, 6))
bars = plt.bar(categories, values, color=colors)

# Write the exact count on top of each bar.
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2., height, f'{int(height)}',
             ha='center', va='bottom', color='black')

plt.title('Number of Images')
plt.ylabel('Sum of Numbers in Filenames')
plt.show()

In [None]:
# Collect the labels from a single, un-augmented pass over the training files.
# The `train` pipeline is unsuitable here: it repeats forever and is shuffled
# after repeating, so its first TRAIN_SIZE elements are not exactly one pass
# over the data (some samples can appear twice and others not at all).
train_labels_ds = load_dataset(get_file_names('train')).map(lambda image, label: label)

# Pull the one-hot labels in chunks and stack them into one (num_examples, 104) array.
train_labels = np.concatenate(list(train_labels_ds.batch(1024).as_numpy_iterator()), axis=0)
label_indices = np.argmax(train_labels, axis=1)

unique, counts = np.unique(label_indices, return_counts=True)

plt.figure(figsize=(8, 6))
bars = plt.bar(unique, counts, color=plt.get_cmap('viridis')(np.linspace(0, 1, len(unique))))

# Hide the x ticks; only the shape of the distribution is shown.
plt.xticks([])
plt.tick_params(axis='x', length=0)
plt.xlabel('Labels')
plt.title('Training Labels Distribution')

plt.show()

In [None]:
# Drop the images, flatten the batches, then pull up to VAL_SIZE one-hot labels
# as one batch and convert them to class indices.
val_labels_ds = validate.map(lambda image, label: label).unbatch()
val_labels = next(iter(val_labels_ds.batch(VAL_SIZE))).numpy()
val_label_indices = np.argmax(val_labels, axis=1)

unique, counts = np.unique(val_label_indices, return_counts=True)
bar_colors = plt.get_cmap('viridis')(np.linspace(0, 1, len(unique)))

fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(unique, counts, color=bar_colors)

# Hide the x ticks; only the shape of the distribution is shown.
ax.set_xticks([])
ax.tick_params(axis='x', length=0)
ax.set_xlabel('Labels')
ax.set_title('Validation Labels Distribution')

plt.show()

# Model Training and Evaluation


In [None]:
# Build and compile under the strategy scope so the model's variables are
# created by the active distribution strategy.
with strategy.scope():
    # ImageNet-pretrained Xception backbone without its classification head.
    base_model = Xception(include_top=False, weights='imagenet', input_shape=(299,299,3))

    # Classification head: pooled backbone features -> three dense layers with
    # light dropout -> 104-way softmax.
    model = Sequential([
        base_model,
        BatchNormalization(),
        GlobalAveragePooling2D(),
        Dense(1024, activation='relu'),
        Dropout(0.1),
        Dense(512, activation='relu'),
        Dropout(0.1),
        Dense(256, activation='relu'),
        Dropout(0.1),
        Dense(104, activation='softmax')
    ])

    model.compile(
        optimizer='adamax',
        loss='categorical_crossentropy',  # labels are one-hot vectors
        metrics=[F1Score(average='macro')]
    )

# `train` repeats indefinitely, so Keras needs an explicit epoch length. Derive
# it from the split size and the global batch size rather than hard-coding it,
# because BATCH_SIZE changes with the number of replicas.
STEPS_PER_EPOCH = max(1, TRAIN_SIZE // BATCH_SIZE)

# `validate` is finite, so no validation_steps is passed: every epoch is scored
# on the whole validation set (matching the later model.evaluate(validate)),
# and the dataset's cache can be filled by a complete pass.
train_history = model.fit(
    train,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=100,
    validation_data=validate
)


In [None]:
# Score the trained model on the `validate` dataset; no step limit is passed here.
# The model was compiled with categorical cross-entropy loss and a macro F1 metric.
model.evaluate(validate)

In [None]:
# Two stacked panels: loss on top, F1 score below. Training values are drawn as
# a line, validation values as red dots.
history = train_history.history
fig, (loss_ax, f1_ax) = plt.subplots(2, 1, figsize=(15, 10))

loss_ax.plot(history['loss'])
loss_ax.plot(history['val_loss'], 'ro')
loss_ax.set_title('Loss')
loss_ax.grid(True)

f1_ax.plot(history['f1_score'])
f1_ax.plot(history['val_f1_score'], 'ro')
f1_ax.set_title('F1-Score')
f1_ax.grid(True)

In [None]:
# Keep only the images from the (id, image) test pairs.
test_images = test.map(lambda idx, image: image)

# Use Keras' own inference loop instead of calling the model batch by batch in
# Python: predict() iterates the dataset once and returns a single NumPy array
# of class probabilities, in dataset order.
all_predictions = model.predict(test_images)

# Most probable class per image.
predicted_classes = np.argmax(all_predictions, axis=-1)

In [None]:
# Second pass over the test set, this time keeping only the ids. Up to TEST_SIZE
# ids are pulled as one batch and converted to a unicode string array.
test_ids_ds = test.map(lambda idx, image: idx).unbatch()
ids_batch = next(iter(test_ids_ds.batch(TEST_SIZE)))
test_ids = ids_batch.numpy().astype('U')

# Pair every id with its predicted class and write the submission file.
submission = pd.DataFrame({'id': test_ids, 'label': predicted_classes})
submission.to_csv('submission.csv', index=False)