In [1]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import MissingIndicator, SimpleImputer, IterativeImputer, KNNImputer
from deepctr.inputs import  SparseFeat, DenseFeat, get_feature_names
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam,RMSprop
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import backend as K
from tensorflow.keras import callbacks
from tensorflow.keras import utils
from deepctr.models import DeepFM
import tensorflow.keras as keras
import tensorflow as tf
import pandas as pd
import numpy as np
import re
import warnings
warnings.simplefilter('ignore')

In [2]:
import tensorflow as tf
if tf.test.gpu_device_name():
    print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
    print("Please install GPU version of TF")

Default GPU Device: /device:GPU:0


In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
test["target"] = -1

In [5]:
data = pd.concat([train, test]).reset_index(drop=True)

In [6]:
data['null'] = data.isna().sum(axis=1)

# Sparse and replace value in test set

In [7]:
sparse_features = [feat for feat in train.columns if feat not in ['id','target']]

In [8]:
for col in sparse_features:
    train_unique_values = set(train[col].dropna().unique())
    test_unique_values  = set(test[col].dropna().unique())

    symmetric_difference_values = train_unique_values.symmetric_difference(test_unique_values)
    if symmetric_difference_values:
        print(f'{len(symmetric_difference_values)} values in {col}, {symmetric_difference_values} Replaced with nan')
        data.loc[data[col].isin(symmetric_difference_values), col] = np.nan

1 values in nom_5, {'b3ad70fcb'} Replaced with nan
4 values in nom_6, {'ee6983c6d', 'a885aacec', 'f0732a795', '3a121fefb'} Replaced with nan
2 values in nom_9, {'1065f10dd', '3d19cd31d'} Replaced with nan


# Missing

In [9]:
missing_indicators = MissingIndicator(sparse=False).fit_transform(data[sparse_features]).astype(np.int8)

In [10]:
missing_indicator_cols = [feat+'_ind' for feat in sparse_features]
for col in missing_indicator_cols:
    data[col] = 0
    data[col] = data[col].astype(np.uint8)
data[missing_indicator_cols] = MissingIndicator(sparse=False).fit_transform(data[sparse_features]).astype(np.int8)

# Label

In [11]:
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat].fillna('-1',).astype(str).values)

In [12]:
train = data[data.target != -1].reset_index(drop=True)
test  = data[data.target == -1].reset_index(drop=True)

# Columns

In [13]:
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique()) for feat in (sparse_features+missing_indicator_cols)]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [14]:
def auc(y_true, y_pred):
    def fallback_auc(y_true, y_pred):
        try:
            return roc_auc_score(y_true, y_pred)
        except:
            return 0.5
    return tf.py_function(fallback_auc, (y_true, y_pred), tf.double)

# Model

In [15]:
class CyclicLR(keras.callbacks.Callback):

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn == None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1 / (2. ** (x - 1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma ** (x)
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}
        
        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr != None:
            self.base_lr = new_base_lr
        if new_max_lr != None:
            self.max_lr = new_max_lr
        if new_step_size != None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr - self.base_lr) * np.maximum(0, (1 - x)) * self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr - self.base_lr) * np.maximum(0, (1 - x)) * self.scale_fn(
                self.clr_iterations)

    def on_train_begin(self, logs={}):
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):

        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        K.set_value(self.model.optimizer.lr, self.clr())

In [16]:
target = ['target']
N_Splits = 50
Verbose = 1
Epochs = 15
SEED = 0

In [17]:
import tensorflow_addons as tfa
radam = tfa.optimizers.RectifiedAdam(lr=0.0001,
    total_steps=10000,
    warmup_proportion=0.1,
    min_lr=0.00001,)
ranger = tfa.optimizers.Lookahead(radam, sync_period=6, slow_step_size=0.5)

In [18]:
oof_pred_deepfm = np.zeros((len(train), ))
y_pred_deepfm = np.zeros((len(test),))


skf = StratifiedKFold(n_splits=N_Splits, shuffle=True, random_state=SEED)
for fold, (tr_ind, val_ind) in enumerate(skf.split(train, train[target])):
    X_train, X_val = train[sparse_features+missing_indicator_cols].iloc[tr_ind], train[sparse_features+missing_indicator_cols].iloc[val_ind]
    y_train, y_val = train[target].iloc[tr_ind], train[target].iloc[val_ind]
    train_model_input = {name:X_train[name] for name in feature_names}
    val_model_input = {name:X_val[name] for name in feature_names}
    test_model_input = {name:test[name] for name in feature_names}
    
    model = DeepFM(linear_feature_columns, dnn_feature_columns, dnn_hidden_units=(256, 256), dnn_dropout=0.05, dnn_use_bn=True, task='binary')
    model.compile(ranger, "binary_crossentropy", metrics=[auc], )
    
    es = callbacks.EarlyStopping(monitor='val_auc',min_delta=0.001, patience=4, verbose=Verbose, mode='max', baseline=None, restore_best_weights=True)
    sb = callbacks.ModelCheckpoint(monitor='val_auc',filepath='nn_model.w8', save_weights_only=True, mode='max', save_best_only=True, verbose=Verbose)
    clr = CyclicLR(base_lr=0.00001 / 100, max_lr = 0.0001, 
                       step_size= int(1.0*(test.shape[0])/1024) , mode='exp_range',
                       gamma=1., scale_fn=None, scale_mode='cycle')
    
    history = model.fit(train_model_input, y_train,
                        validation_data=(val_model_input, y_val),
                        batch_size=256, epochs=Epochs, verbose=Verbose,
                        callbacks=[es, sb, clr],)
    
    model.load_weights('nn_model.w8')
    
    val_pred = model.predict(val_model_input, batch_size=256)
    print(f"validation AUC fold {fold+1} : {round(roc_auc_score(y_val, val_pred), 5)}")
    oof_pred_deepfm[val_ind] = val_pred.ravel()
    y_pred_deepfm += model.predict(test_model_input, batch_size=256).ravel() / (N_Splits)
    K.clear_session()

Train on 588000 samples, validate on 12000 samples
Epoch 1/15
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Epoch 00001: val_auc improved from -inf to 0.78420, saving model to nn_model.w8
Epoch 2/15
Epoch 00002: val_auc improved from 0.78420 to 0.78479, saving model to nn_model.w8
Epoch 3/15
Epoch 00003: val_auc did not improve from 0.78479
Epoch 4/15
Epoch 00004: val_auc did not improve from 0.78479
Epoch 5/15

Epoch 00005: val_auc did not improve from 0.78479
Epoch 00005: early stopping
validation AUC fold 1 : 0.7848
Train on 588000 samples, validate on 12000 samples
Epoch 1/15
Epoch 00001: val_auc improved from -inf to 0.78833, saving model to nn_model.w8
Epoch 2/15
Epoch 00002: val_auc improved from 0.78833 to 0.79017, saving model to nn_model.w8
Epoch 3/15
Epoch 00003: val_auc did not improve from 0.79017
Epoch 4/15
Epoch 00004: val_auc did not improve from 0.79017
Epoch 5/15
Epoch 00005: val_auc did not improve from 0.79017
Epoch 6/15

Epoch 000

In [22]:
print(f"OOF AUC : {round(roc_auc_score(train.target.values, oof_pred_deepfm), 5)}")

OOF AUC : 0.78769


In [23]:
test_idx = test.id.values
submission = pd.DataFrame.from_dict({
    'id': test_idx,
    'target': y_pred_deepfm
})
submission.to_csv("submission.csv", index=False)
print("Submission file saved!")

Submission file saved!


In [24]:
np.save('oof_pred_deepfm.npy',oof_pred_deepfm)
np.save('y_pred_deepfm.npy',    y_pred_deepfm)