# Setup

In [None]:
!pip install '../input/pawpularset/Keras_Applications-1.0.8-py3-none-any.whl'
!pip install '../input/pawpularset/efficientnet-1.1.1-py3-none-any.whl'

import gc
import numpy as np
import pandas as pd
import random
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split

import tensorflow as tf
import efficientnet.tfkeras as efn
from tensorflow.keras.layers import Input

In [2]:
# --- Image / batching ---
IMG_SIZE = 384       # images are resized to IMG_SIZE x IMG_SIZE
CHANNELS = 3         # channel count requested from the JPEG decoder
BATCH_SIZE = 16

# --- Cross-validation ---
Q = 30               # number of quantile bins used to build the stratification label
FOLDS = 6            # CatBoost folds trained per feature model
FEATURE_FOLDS = 10   # folds (one weight file each) of the EfficientNet feature model
SEED = 1234

# --- Feature-model training ---
EPOCHS = 8
LR = 5e-6
VERBOSE = 1
TRAIN_FEATURE_MODEL = False   # when False the feature-model training cell is skipped

# --- Paths ---
DATA_DIR = '../input/petfinder-pawpularity-score/'
TRAIN_DIR = f'{DATA_DIR}train/'
TEST_DIR = f'{DATA_DIR}test/'

## Configure XLA JIT and distribution strategy

In [3]:
# Turn on JIT compilation in "autoclustering" mode.
tf.config.optimizer.set_jit("autoclustering")

# Whatever distribution strategy is currently active (the default one when no scope was entered).
strategy = tf.distribute.get_strategy()

# Sentinel handed to tf.data so it picks parallelism / prefetch sizes on its own.
AUTOTUNE = tf.data.experimental.AUTOTUNE

## Training data

In [None]:
# Load the training table, then derive in one step:
#   Id             -> full path of the image file
#   stratify_label -> quantile bin (0 .. Q-1) of the score, used to stratify the folds
#   target_value   -> score rescaled by 1/100 as the network's regression target
train_df = pd.read_csv(DATA_DIR + 'train.csv')
train_df = train_df.assign(
    Id = train_df['Id'].map(lambda image_id: f'{TRAIN_DIR}{image_id}.jpg'),
    stratify_label = pd.qcut(train_df['Pawpularity'], q = Q, labels = range(Q)),
    target_value = train_df['Pawpularity'] / 100.,
)

## Test data

In [None]:
# Load the test table, turn Id into the image file path and add a placeholder score column.
test_df = pd.read_csv(DATA_DIR + 'test.csv')
test_df = test_df.assign(
    Id = test_df['Id'].map(lambda image_id: f'{TEST_DIR}{image_id}.jpg'),
    Pawpularity = 0,
)

## TF Dataset support code

In [6]:
def build_augmenter(is_labelled):
    """Return a map function that applies random image augmentations.

    Args:
        is_labelled: when truthy, the returned function takes ``(img, label)``
            and passes the label through untouched; otherwise it takes only
            the image.

    Returns:
        A callable suitable for ``tf.data.Dataset.map``.
    """
    def augment(image):
        # Geometric jitter first, then mild colour jitter; the order is kept fixed.
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_flip_up_down(image)
        image = tf.image.random_saturation(image, lower = 0.95, upper = 1.05)
        image = tf.image.random_brightness(image, max_delta = 0.05)
        image = tf.image.random_contrast(image, lower = 0.95, upper = 1.05)
        image = tf.image.random_hue(image, max_delta = 0.05)
        return image

    def augment_with_labels(image, label):
        return augment(image), label

    if is_labelled:
        return augment_with_labels
    return augment

def build_decoder(is_labelled):
    """Return a map function that loads an image tensor from its file path.

    The image is read from disk, decoded as JPEG with ``CHANNELS`` channels,
    converted to float32 and divided by 255, then resized to
    ``IMG_SIZE x IMG_SIZE``.

    Args:
        is_labelled: when truthy, the returned function takes ``(path, label)``
            and passes the label through untouched; otherwise it takes only
            the path.

    Returns:
        A callable suitable for ``tf.data.Dataset.map``.
    """
    def decode(path):
        raw = tf.io.read_file(path)
        pixels = tf.image.decode_jpeg(raw, channels = CHANNELS)
        scaled = tf.cast(pixels, tf.float32) / 255.0
        return tf.image.resize(scaled, (IMG_SIZE, IMG_SIZE))

    def decode_with_labels(path, label):
        return decode(path), label

    if is_labelled:
        return decode_with_labels
    return decode

def create_dataset(df, batch_size = 32, is_labelled = False, augment = False, repeat = False, shuffle = False):
    """Build a batched, prefetched ``tf.data.Dataset`` of decoded images from ``df``.

    Args:
        df: DataFrame with an ``Id`` column holding image file paths and, when
            ``is_labelled`` is True, a ``target_value`` column.
        batch_size: number of samples per batch.
        is_labelled: yield ``(image, target_value)`` pairs instead of bare images.
        augment: apply the random augmentations from ``build_augmenter``.
        repeat: repeat the dataset indefinitely.
        shuffle: shuffle the samples, reshuffling on every pass over the data.

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    decode_fn = build_decoder(is_labelled)
    augmenter_fn = build_augmenter(is_labelled)

    if is_labelled:
        dataset = tf.data.Dataset.from_tensor_slices((df['Id'].values, df['target_value'].values))
    else:
        dataset = tf.data.Dataset.from_tensor_slices(df['Id'].values)

    if shuffle:
        # Shuffle the lightweight (path, label) records BEFORE decoding: the buffer then
        # holds strings instead of decoded float32 images, so it can cover the whole frame
        # (a uniform shuffle) at negligible memory cost and without a long warm-up.
        # Shuffling before repeat() also keeps every epoch a complete pass over the data.
        dataset = dataset.shuffle(max(len(df), 1), reshuffle_each_iteration = True)

    dataset = dataset.map(decode_fn, num_parallel_calls = AUTOTUNE)
    if augment:
        dataset = dataset.map(augmenter_fn, num_parallel_calls = AUTOTUNE)
    if repeat:
        dataset = dataset.repeat()

    dataset = dataset.batch(batch_size)
    return dataset.prefetch(AUTOTUNE)

## EfficientNet Feature Model Training

In [None]:
def model_checkpoint(fold):
    """Create the checkpoint callback for one feature-model fold.

    Only the weights are written, to ``feature_model_{fold}.h5``, and only when
    the monitored ``val_rmse`` reaches a new minimum.

    Args:
        fold: fold identifier embedded in the output file name.

    Returns:
        A configured ``tf.keras.callbacks.ModelCheckpoint``.
    """
    weights_path = f'feature_model_{fold}.h5'
    return tf.keras.callbacks.ModelCheckpoint(
        filepath = weights_path,
        monitor = 'val_rmse',
        mode = 'min',
        save_best_only = True,
        save_weights_only = True,
        verbose = 1,
    )

def unfreeze_model(model):
    """Mark every layer of ``model`` trainable except BatchNormalization layers.

    BatchNormalization layers are explicitly set to non-trainable; all other
    layers are set to trainable. The model is modified in place.

    Args:
        model: object exposing a ``layers`` iterable of Keras layers.
    """
    for layer in model.layers:
        layer.trainable = not isinstance(layer, tf.keras.layers.BatchNormalization)

def create_model():
    """Build and compile the EfficientNet-B2 regression model.

    The backbone is created without its classification top, with average
    pooling, and initialised from the local noisy-student weight file. Its
    layers are made trainable except BatchNormalization (see
    ``unfreeze_model``). A Dropout(0.25) and a single sigmoid unit are stacked
    on the pooled output. The model is compiled with Adam at learning rate
    ``LR``, binary cross-entropy loss and an RMSE metric named ``rmse``.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    backbone = efn.EfficientNetB2(
        weights = '../input/pawpularset/efficientnet-b2_noisy-student_notop.h5',
        include_top = False,
        pooling = 'avg',
        input_shape = (IMG_SIZE, IMG_SIZE, CHANNELS),
        classes = None,
    )
    unfreeze_model(backbone)

    # Head: dropout followed by one sigmoid unit. The layer order matters to any
    # code that picks layers by position from the end of the model.
    regularised = tf.keras.layers.Dropout(0.25)(backbone.output)
    score = tf.keras.layers.Dense(1, activation = 'sigmoid')(regularised)

    model = tf.keras.Model(inputs = backbone.input, outputs = score)

    optimizer = tf.keras.optimizers.Adam(learning_rate = LR)
    loss_fn = tf.keras.losses.BinaryCrossentropy()
    rmse_metric = tf.keras.metrics.RootMeanSquaredError('rmse')
    model.compile(optimizer = optimizer, loss = loss_fn, metrics = [rmse_metric])

    return model

In [8]:
# Train one EfficientNet feature model per stratified fold (skipped unless TRAIN_FEATURE_MODEL).
if TRAIN_FEATURE_MODEL:
    # Best validation RMSE reached by each fold's model.
    all_val_rmse = []
    kfold = StratifiedKFold(n_splits = FEATURE_FOLDS, shuffle = True, random_state = SEED)
    for fold, (train_index, val_index) in enumerate(kfold.split(train_df.index, train_df['stratify_label'])):
        # Release the previous fold's Keras state before building a fresh model.
        tf.keras.backend.clear_session()
        gc.collect()

        model = create_model()

        trn = train_df.iloc[train_index]
        val = train_df.iloc[val_index]
        training_dataset = create_dataset(trn, batch_size = BATCH_SIZE, is_labelled = True, augment = True, repeat = True, shuffle = True)
        # The validation set is finite (repeat = False) and no validation_steps is passed,
        # so Keras evaluates it until exhausted. Every validation sample, including the
        # last partial batch, then contributes to the val_rmse that selects the checkpoint.
        validation_dataset = create_dataset(val, batch_size = BATCH_SIZE, is_labelled = True, augment = False, repeat = False, shuffle = False)

        history = model.fit(training_dataset,
                            epochs = EPOCHS,
                            # The training set repeats forever, so the epoch length must be explicit.
                            steps_per_epoch = trn.shape[0] // BATCH_SIZE,
                            callbacks = [model_checkpoint(fold)],
                            validation_data = validation_dataset,
                            verbose = VERBOSE)
        best_val_rmse = min(history.history['val_rmse'])
        all_val_rmse.append(best_val_rmse)

## CatBoost 6-fold CV training on EfficientNet features

In [9]:
# Training targets: the quantile bin drives the stratified split, the raw score is what CatBoost fits.
Y_strat = train_df['stratify_label'].values
Y_pawpularity = train_df['Pawpularity'].values

# Accumulators filled by the CV loop: summed test predictions (one column) and per-feature-model scores.
preds_final = np.zeros((len(test_df), 1))
all_oof_score = []

In [None]:
# Reset the accumulators in place so re-running this cell never double-counts
# test predictions or appends duplicate scores.
preds_final.fill(0.0)
all_oof_score.clear()

# CatBoost settings shared by every model; the seed is added per feature fold below.
cb_base_params = {'loss_function' : 'RMSE',
                  'eval_metric' : 'RMSE',
                  'iterations' : 1200,
                  'grow_policy' : 'SymmetricTree',
                  'depth' : 8,
                  'l2_leaf_reg' : 2.0,
                  'random_strength' : 1.0,
                  'learning_rate' : 0.05,
                  'task_type' : 'CPU',
                  'devices' : '0',
                  'verbose' : 0}

for fold_index in range(FEATURE_FOLDS):
    # Derive the seed locally (SEED, SEED + 1, ...) instead of mutating the global
    # SEED, so the configuration constant keeps its value and re-runs are reproducible.
    fold_seed = SEED + fold_index

    tf.keras.backend.clear_session()
    gc.collect()

    # Rebuild the network, load this fold's trained weights, then cut the model at the
    # third layer from the end, i.e. the output that feeds the Dropout -> Dense head.
    model = create_model()
    model.load_weights(f'../input/pawpularset/feature_model_{fold_index}.h5')
    model = tf.keras.Model(inputs = model.input, outputs = model.layers[-3].output)

    cb_train_set = create_dataset(train_df, batch_size = BATCH_SIZE, is_labelled = True, augment = False, repeat = False, shuffle = False)
    cb_test_set = create_dataset(test_df, batch_size = BATCH_SIZE, is_labelled = False, augment = False, repeat = False, shuffle = False)
    cb_train_features = model.predict(cb_train_set, verbose = VERBOSE)
    cb_test_features = model.predict(cb_test_set, verbose = VERBOSE)

    # NOTE(review): each feature model was presumably trained on part of the rows that
    # land in the CatBoost validation folds here, so these CV scores are likely
    # optimistic -- confirm before relying on them.
    oof_score = 0
    cb_params = {**cb_base_params, 'random_state': fold_seed}

    kfold = StratifiedKFold(n_splits = FOLDS, shuffle = True, random_state = fold_seed)
    for train, val in kfold.split(cb_train_features, Y_strat):

        train_x, train_y = cb_train_features[train], Y_pawpularity[train]
        val_x, val_y = cb_train_features[val], Y_pawpularity[val]

        cb_model = CatBoostRegressor(**cb_params)
        cb_model.fit(train_x, train_y, eval_set = [(val_x, val_y)], early_stopping_rounds = 100, verbose = 250)

        y_pred = cb_model.predict(val_x)
        # Accumulate test predictions as a column vector matching preds_final's (n, 1) shape.
        preds_final += cb_model.predict(cb_test_features).reshape(-1, 1)

        oof_score += np.sqrt(mean_squared_error(val_y, y_pred))

        del cb_model, y_pred
        del train_x, train_y
        del val_x, val_y
        gc.collect()

    # Mean of the per-fold validation RMSEs for this feature model.
    oof_score /= FOLDS
    all_oof_score.append(oof_score)

## Create submission file

In [12]:
# Average the summed predictions into a NEW array: dividing preds_final in place would
# shrink the predictions again every time this cell is re-run.
preds_mean = preds_final / (FOLDS * FEATURE_FOLDS)

# NOTE(review): assumes sample_submission.csv lists the images in the same order as test.csv -- confirm.
submission_df = pd.read_csv(f'{DATA_DIR}sample_submission.csv')
submission_df['Pawpularity'] = preds_mean.ravel()
submission_df.to_csv('submission.csv', index = False)