In [1]:
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics,preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

In [14]:
def create_model(data,catcols):
    """This function returns a compiled tf.keras model for entity embeddings
    :param data: this is a pandas dataframe
    :param catcols: list of categorical column names
    :return: compiled tf.keras model
    """
    
    # initialize list of inputs for embeddings
    inputs=[]
    
    # initialize list of outputs for embeddings
    outputs=[]
    
    # loop over all categorical columns
    for c in catcols:
        #find the number of unique values in the columns
        num_unique_values = int(data[c].nunique())
        # simple dimnesion of embedding calculator
        # min size is half of the number of unique values
        # max size is 50. dpends on no. of unique categories too.
        # 50 is quite sufficient most of the times but if you have
        # millions of unique values, you might need a larger dimension
        embed_dim = int(min(np.ceil((num_unique_values)/2),50))
        
        # simple keras input layer with size 1
        inp = layers.Input( shape=(1,))
        
        # add embedding layer to raw input
        #embedding size is always 1 more than unique values in input
        out = layers.Embedding(
            num_unique_values + 1, embed_dim, name=c
        )(inp)
        
        # 1-d spatial dropout is the standard for embedding layers
        # you can us eit in NLP tasks too
        out = layers.SpatialDropout1D(0.3)(out)
        
        # reshape the input to the dimension of embedding
        # this becomes our output layer for current feature
        out = layers.Reshape(target_shape=(embed_dim,))(out)
        
        # add input to input list
        inputs.append(inp)
        
        # add output to output list
        outputs.append(out)
        
        # concatenate all output layers
        x = layers.Concatenate()(outputs)
        
        # add a batchnorm layer
        # from here everything is upto you
        # you can try different architectures
        # this is one suggested/liked
        # if you have numerical features you should add
        # them here or in concatenate layer
        x = layers.BatchNormalization()(x)
        
        # a bunch of dense layers with dropout
        # start with 1 or 2 layers only
        x = layers.Dense(300,activation="relu")(x)
        x = layers.Dropout(0.3)(x)
        x = layers.BachNormalization()(x)
        
        x = layers.Dense(300,activation="relu")(x)
        x = layers.Dropout(0.3)(x)
        x = layers.BachNormalization()(x)
        
        # using softmax and treating it as two class problem
        # you cna also use sigmoid, then you need to use only one 
        # output class
        y = layers.Dense(2,activation="softmax")(x)
        
        # create final model
        model = Model(inputs=inputs,outputs=y)
        
        # compile the model
        # we use adam and binary cross entropy.
        # feel free to use something else and see how model behaves
        model.compile(loss='binary_crossentropy',optimizer='adam')
        return model

NameError: name 'num_unique_values' is not defined

In [19]:
def run(fold):
    
    df =pd.read_csv("/home/anand/catinthedatii/cat_train_folds.csv")
    
    # all columns are features except id, target and kfold
    features =[
        f for f in df.columns if f not in ("id","target","kfold")
    ]
    
    # fill all NaN values with NONE
    for col in features:
        df.loc[:,col] = df[col].astype(str).fillna("NONE")
        
    # encode all features with label encoder individually
    # in a live setting you need to save all label encoders
    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        df.loc[:,feat] = lbl_enc.fit_transform(df[feat].values)
    
    # get training data using kfolds
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    # get validation data using kfolds
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    
    # create tf.keras model
    model = create_model(df,features)
    
    # our features are lists of lists
    xtrain = [
        df_train[features].values[:,k] for k in range(len(features))
    ]
    
    xvalid = [
        df_valid[features].values[:,k] for k in range(len(features))
    ]
    
    # fetch target columns
    ytrain = df_train.target.values
    yvalid = df_valid.target.values
    
    # convert targets to categories
    # this is just binarization
    ytrain_cat = utils.to_categorical(ytrain)
    yvalid_cat = utils.tocategorical(yvalid)

      # fit the model
    model.fit(xtrain,
              ytrain_cat,
              validation_data=(xvalid,yvalid_cat),
              verbose=1,
              batch_size=1024,
              epochs=3
             )
    
    # genrate validation predicitions
    valid_preds = model.predict(xvalid)[:,1]
    
    #print roc auc score
    print(metrics.roc_auc_score(yvalid,valid_preds))
    
    #clear session to free up some GPU memoru
    K.clear_session()

if __name__ == "__main__":
    run(0)
    run(1)
    run(2)
    run(3)
    run(4)

ValueError: A `Concatenate` layer should be called on a list of at least 2 inputs

In [18]:
print(features)

NameError: name 'features' is not defined

NameError: name 'inp' is not defined