# *Modern Deep Learning for Tabular Data*, Chapter 6

**Applying Attention to Tabular Data**

This notebook contains the complementary code discussed in Chapter 6 of *Modern Deep Learning for Tabular Data*.

External Kaggle links to datasets used in this notebook:
- [TripAdvisor Hotel Reviews](https://www.kaggle.com/datasets/andrewmvd/trip-advisor-hotel-reviews)
- [Daily Reddit News for Stock Market Prediction](https://www.kaggle.com/datasets/aaron7sun/stocknews)
- [Forest Cover Type Dataset](https://www.kaggle.com/datasets/uciml/forest-cover-type-dataset)

You can download these datasets from Kaggle, or import this notebook into Kaggle and attach the datasets there. The Ames Housing data used in the TabTransformer section is read directly from a GitHub URL.

In [None]:
# data management
import numpy as np                   # for linear algebra
import pandas as pd                  # for tabular data manipulation and processing

# machine learning
import sklearn                       # for data prep and classical ML
import tensorflow as tf              # for deep learning
from tensorflow import keras         # for deep learning
import keras.layers as L             # for easy NN layer access
from keras import backend as K       # for accessing Keras backend

# data visualization and graphics
import matplotlib.pyplot as plt      # for visualization fundamentals
import seaborn as sns                # for pretty visualizations
import cv2                           # for image manipulation

# misc
from tqdm.notebook import tqdm       # for progress bars
import math                          # for calculation
import sys                           # for system manipulation
import os                            # for file manipulation

---

## The Attention Mechanism in Keras

Custom attention layer.

In [None]:
class Attention(keras.layers.Layer):
    """Additive attention pooling over axis 1 of a 3-D input.

    For every position along axis 1 a scalar score ``tanh(x . W + b)`` is
    computed, the scores are softmax-normalised across positions, and the
    input is summed along axis 1 weighted by those normalised scores.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create the projection vector ``W`` and the per-position bias ``b``."""
        num_steps, num_features = input_shape[1], input_shape[-1]
        self.W = self.add_weight(name='attention_weight',
                                 shape=(num_features, 1),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(name='attention_bias',
                                 shape=(num_steps, 1),
                                 initializer='zeros',
                                 trainable=True)
        super().build(input_shape)

    def get_alpha(self, x):
        """Return the attention weights for ``x`` with a trailing axis of size 1."""
        # one unnormalised score per position; drop the size-1 axis so that the
        # softmax (which normalises the last axis) runs across positions
        scores = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # restore the trailing axis so the weights broadcast against ``x``
        return K.expand_dims(K.softmax(scores), axis=-1)

    def call(self, x):
        """Return the attention-weighted sum of ``x`` along axis 1."""
        return K.sum(x * self.get_alpha(x), axis=1)

Demonstrating the attention scores of the custom attention layer.

In [None]:
# Synthetic task: each sample is a random (10, 8) sequence and its target is the
# sum of the second-to-last and fourth-to-last time steps of that sequence.
x, y = [], []

NUM_SAMPLES = 10_000

next_element = lambda arr: arr[-2] + arr[-4]

for i in tqdm(range(NUM_SAMPLES)):
    seed = np.random.normal(0, 5, size=(10,8))
    x.append(seed)
    y.append(next_element(seed))

x = np.array(x)   # (NUM_SAMPLES, 10, 8)
y = np.array(y)   # (NUM_SAMPLES, 8)

from sklearn.model_selection import train_test_split as tts
X_train, X_valid, y_train, y_valid = tts(x, y, train_size=0.8)

In [None]:
# Two stacked recurrent layers -> custom Attention pooling -> dense regression head.
# NOTE: despite the `lstm1`/`lstm2` names, both recurrent layers are GRUs.
inp = L.Input((10,8))
lstm1 = L.GRU(16, return_sequences=True)(inp)
lstm2 = L.GRU(16, return_sequences=True)(lstm1)
# keep a handle on the layer object: a later cell calls `attention.get_alpha`
attention = Attention()
attended = attention(lstm2)
dense = L.Dense(16, activation='relu')(attended)
dense2 = L.Dense(16, activation='relu')(dense)
# 8 linear outputs, one per feature of the target vector
out = L.Dense(8, activation='linear')(dense2)

model = keras.models.Model(inputs=inp, outputs=out)

model.compile(optimizer='adam', 
              loss='mse',
              metrics=['mae'])
model.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          epochs=10)

In [None]:
# Re-apply the two trained recurrent layers to a fresh input so their output
# (the tensor the attention layer receives) can be computed on its own.
inp = L.Input((10,8))
rnn1 = model.layers[1](inp)
rnn2 = model.layers[2](rnn1)
submodel = keras.models.Model(inputs=inp, outputs=rnn2)

# TensorFlow is imported as `tf` in this notebook
recurrent_out = tf.constant(submodel.predict(x))

# attention weights for every sample; the trailing axis has size 1
alpha = attention.get_alpha(recurrent_out)

plt.figure(figsize=(10, 5), dpi=400)
# weights of the first sample across its 10 time steps
plt.bar(range(10), np.asarray(alpha[0,:,0]),
        color='red')
plt.ylabel('Alpha Values')
plt.xlabel('Time Step')
plt.show()

Demonstrating Keras' native attention layers.

In [None]:
def sigmoid(x):
    """Logistic function."""
    return 1/(1 + np.exp(-x))


def sigmoid_deriv(x):
    """Derivative of the logistic function, written as sigmoid(x) * sigmoid(-x)."""
    return sigmoid(x) * sigmoid(-x)


def adjusted_sigmoid_deriv(x):
    """Sigmoid derivative shifted to x = 5 and scaled by 4."""
    return 4 * sigmoid_deriv(x - 5)


# one weight per time step, evaluated at 10 evenly spaced points on [0, 10]
weights = adjusted_sigmoid_deriv(np.linspace(0, 10, 10))


def next_element(arr):
    """Target for one sample: weighted sum of its time steps."""
    return np.dot(weights, arr)


NUM_SAMPLES = 10_000

x, y = [], []
for i in tqdm(range(NUM_SAMPLES)):
    seed = np.random.normal(0, 1, size=(10,8))
    x.append(seed)
    y.append(next_element(seed))

x, y = np.array(x), np.array(y)

from sklearn.model_selection import train_test_split as tts
X_train, X_valid, y_train, y_valid = tts(x, y, train_size=0.8)

In [None]:
# Bidirectional LSTM encoder -> Keras Attention -> LSTM -> dense regression head.
inp = L.Input((10,8))
lstm1 = L.Bidirectional(L.LSTM(8, return_sequences=True))(inp)
# self-attention: the same tensor is passed as both elements of the input list
attended = L.Attention(use_scale=True)([lstm1, lstm1])
lstm2 = L.LSTM(16)(attended)
dense = L.Dense(16, activation='relu')(lstm2)
dense2 = L.Dense(16, activation='relu')(dense)
out = L.Dense(8, activation='linear')(dense2)

model = keras.models.Model(inputs=inp, outputs=out)

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# NOTE(review): 1000 epochs with no early-stopping callback — expect a long run
history = model.fit(X_train, y_train, epochs=1000,
                    validation_data=(X_valid, y_valid))

In [None]:
# Re-apply the trained layers to the existing `inp` tensor, this time asking the
# attention layer (model.layers[2]) to also return its attention scores.
lstm1_ = model.layers[1](inp)
_, attn = model.layers[2]([lstm1_, lstm1_], 
                          return_attention_scores=True)
# submodel maps an input sequence straight to its attention-score matrix
submodel = keras.models.Model(inputs=inp, outputs=attn)

scores = submodel.predict(X_train)

In [None]:
# attention-score matrix of the first sample
first_sample_scores = scores[0]

plt.figure(figsize=(12,12), dpi=400)
sns.heatmap(first_sample_scores, cbar=False)
plt.show()

Demonstrating Keras' multi-head attention layer.

In [None]:
def sigmoid(x):
    """Logistic function."""
    return 1/(1 + np.exp(-x))


def sigmoid_deriv(x):
    """Derivative of the logistic function, written as sigmoid(x) * sigmoid(-x)."""
    return sigmoid(x) * sigmoid(-x)


def adjusted_sigmoid_deriv1(x):
    """Sigmoid derivative shifted to x = 2 and scaled by 4."""
    return 4 * sigmoid_deriv(x - 2)


def adjusted_sigmoid_deriv2(x):
    """Sigmoid derivative shifted to x = 8 and scaled by 4."""
    return 4 * sigmoid_deriv(x - 8)


# two-peaked weighting over the 10 time steps: the sum of both shifted bumps
x = np.linspace(0, 10, 10)
weights = adjusted_sigmoid_deriv1(x) + adjusted_sigmoid_deriv2(x)


def next_element(arr):
    """Target for one sample: weighted sum of its time steps."""
    return np.dot(weights, arr)


NUM_SAMPLES = 10_000

# from here on `x` holds the samples rather than the evaluation grid
x, y = [], []
for i in tqdm(range(NUM_SAMPLES)):
    seed = np.random.normal(0, 1, size=(10,8))
    x.append(seed)
    y.append(next_element(seed))

x, y = np.array(x), np.array(y)

from sklearn.model_selection import train_test_split as tts
X_train, X_valid, y_train, y_valid = tts(x, y, train_size=0.8)

In [None]:
# Bidirectional LSTM encoder -> multi-head self-attention -> LSTM -> dense head.
inp = L.Input((10,8))
lstm1 = L.Bidirectional(L.LSTM(8, return_sequences=True))(inp)
# `scores` is a symbolic tensor of the functional graph; it has to be evaluated
# through a model (inputs=inp, outputs=scores) before it can be plotted
attended, scores = L.MultiHeadAttention(num_heads=4, 
                                        key_dim=16)(lstm1,
                                                    lstm1,
                                                    return_attention_scores=True)
lstm2 = L.LSTM(16)(attended)
dense = L.Dense(16, activation='relu')(lstm2)
dense2 = L.Dense(16, activation='relu')(dense)
out = L.Dense(8, activation='linear')(dense2)

model = keras.models.Model(inputs=inp, outputs=out)

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# NOTE(review): 1000 epochs with no early-stopping callback — expect a long run
history = model.fit(X_train, y_train, epochs=1000,
                    validation_data=(X_valid, y_valid))

In [None]:
# `scores` is still a symbolic tensor of the model graph, so it cannot be plotted
# directly. Wrap it in a model and evaluate it on one real sample first.
submodel = keras.models.Model(inputs=inp, outputs=scores)
head_scores = submodel.predict(X_train[:1])   # indexed as [sample, head, row, col]

plt.figure(figsize=(24,24), dpi=400)

# one heatmap per attention head (the layer was built with num_heads=4)
for head in range(4):
    plt.subplot(2, 2, head + 1)
    sns.heatmap(head_scores[0, head, :, :], cbar=False)
    plt.title(f'Head {head + 1}')
plt.show()

Demonstrating a sequence-to-sequence problem.

In [None]:
NUM_SAMPLES = 10_000


def next_element(arr):
    """Sequence target: step i is arr[(i+4)%10] + arr[(i+5)%10] + arr[(i+6)%10].

    Rolling the 10-step sequence by -k along axis 0 puts arr[(i+k)%10] at
    position i, so the three rolled copies are summed in the same order.
    """
    return np.roll(arr, -4, axis=0) + np.roll(arr, -5, axis=0) + np.roll(arr, -6, axis=0)


x, y = [], []
for i in tqdm(range(NUM_SAMPLES)):
    seed = np.random.normal(0, 5, size=(10,8))
    x.append(seed)
    y.append(next_element(seed))

x, y = np.array(x), np.array(y)

from sklearn.model_selection import train_test_split as tts
X_train, X_valid, y_train, y_valid = tts(x, y, train_size=0.8)

In [None]:
# Encoder-decoder stack with attention between decoder and encoder outputs.
inp = L.Input((10,8))
encoder = L.Bidirectional(L.LSTM(16, return_sequences=True))(inp)
encoder2 = L.LSTM(16, return_sequences=True)(encoder)
decoder = L.LSTM(16, return_sequences=True)(encoder2)
# query = decoder states, value = encoder states; `scores` is symbolic and is
# evaluated by a submodel in a later cell
attn, scores = L.Attention(use_scale=True)([decoder, encoder2], 
                                           return_attention_scores=True)
# give every output step both its decoder state and its attention context
concat = L.Concatenate()([decoder, attn])
out = L.TimeDistributed(L.Dense(8, activation='linear'))(concat)

model = keras.models.Model(inputs=inp, outputs=out)

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# NOTE(review): 1000 epochs with no early-stopping callback — expect a long run
history = model.fit(X_train, y_train, epochs=1000,
                    validation_data=(X_valid, y_valid))

In [None]:
# Evaluate the symbolic `scores` tensor on real data. The result gets its own
# name so `scores` stays symbolic and this cell can be re-run; only the samples
# that are actually plotted are predicted.
submodel = keras.models.Model(inputs=inp, outputs=scores)
seq2seq_scores = submodel.predict(X_train[:4])

for i in range(4):    
    plt.figure(figsize=(12,12), dpi=400)
    sns.heatmap(seq2seq_scores[i,:,:], cbar=False)
    plt.show()

---

## Improving Natural Language Models with Attention

### TripAdvisor dataset modeling.

In [None]:
# Load the TripAdvisor reviews from the Kaggle input directory and preview them.
# Later cells read the 'Review' and 'Rating' columns of this frame.
data = pd.read_csv('../input/trip-advisor-hotel-reviews/tripadvisor_hotel_reviews.csv')
data.head()

In [None]:
# SEQ_LEN:       every review is padded/truncated to this many tokens
# MAX_TOKENS:    vocabulary size kept by the vectorizer
# EMBEDDING_DIM: width of the embedding vectors used by the models below
SEQ_LEN, MAX_TOKENS = 128, 2048
EMBEDDING_DIM = 16

# TensorFlow is imported as `tf` in this notebook
vectorize = tf.keras.layers.TextVectorization(max_tokens=MAX_TOKENS,
                                              output_sequence_length=SEQ_LEN)
# learn the vocabulary from the review text
vectorize.adapt(data['Review'])

In [None]:
# shift ratings down by one so they can serve as class ids for the 5-way softmax
# below (assumes ratings run 1-5 — TODO confirm against the dataset)
labels = data['Rating'] - 1

from sklearn.model_selection import train_test_split as tts
X_train, X_valid, y_train, y_valid = tts(data['Review'], labels, train_size=0.8)

# raw text -> integer token ids of length SEQ_LEN
X_train_vec = vectorize(X_train)
X_valid_vec = vectorize(X_valid)

Custom attention layer.

In [None]:
# Embedding -> two stacked LSTMs -> custom Attention pooling -> 5-way classifier.
# Layer order matters: a later cell accesses these layers as model.layers[1..4].
inp = L.Input((SEQ_LEN,))
embed = L.Embedding(MAX_TOKENS, EMBEDDING_DIM)(inp)
rnn1 = L.LSTM(16, return_sequences=True)(embed)
rnn2 = L.LSTM(16, return_sequences=True)(rnn1)
attn = Attention()(rnn2)
dense = L.Dense(16, activation='relu')(attn)
dense2 = L.Dense(16, activation='relu')(dense)
out = L.Dense(5, activation='softmax')(dense2)

model = keras.models.Model(inputs=inp, outputs=out)

In [None]:
# integer class labels + softmax output -> sparse categorical cross-entropy
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# NOTE(review): 200 epochs with no early-stopping callback — expect a long run
history = model.fit(X_train_vec, y_train, epochs=200,
                    validation_data=(X_valid_vec, y_valid))

In [None]:
# Re-apply the trained embedding and both LSTMs to a fresh input so the tensor
# that feeds the attention layer (model.layers[4]) can be computed on its own.
inp = L.Input((SEQ_LEN,))
embed = model.layers[1](inp)
rnn1 = model.layers[2](embed)
rnn2 = model.layers[3](rnn1)
submodel = keras.models.Model(inputs=inp, outputs=rnn2)

for index in range(5):
    
    fig, ax = plt.subplots(figsize=(10, 5), dpi=400)
    # TensorFlow is imported as `tf` in this notebook
    lstm_encodings = tf.constant(submodel.predict(X_train_vec[index:index+1]))
    # attention weight of every time step of this review
    alpha_values = model.layers[4].get_alpha(lstm_encodings)[0,:,0]
    bars = ax.bar(range(SEQ_LEN), alpha_values, color='red', alpha=0.7)
    # label the bars with the raw review words, padded to SEQ_LEN entries
    # NOTE(review): splitting on ' ' may not line up exactly with the
    # vectorizer's own tokenisation — confirm
    text = X_train[X_train.index[index]].split(' ')
    text += ['']*(SEQ_LEN - len(text))
    for i, bar in enumerate(bars):
        height = bar.get_height()
        ax.text(x=bar.get_x() + bar.get_width() / 2 - 0.02, y=height+.0002,
                rotation = 90, size=6,
                s=text[i],
                ha='center')
    ax.set_ylabel('Alpha Values')
    ax.set_xlabel('Time Step')
    ax.axes.yaxis.set_visible(False)
    plt.show()

Keras multi-head attention layer.

In [None]:
# Embedding -> bidirectional GRU -> multi-head self-attention -> two LSTMs -> classifier.
inp = L.Input((SEQ_LEN,))
embed = L.Embedding(MAX_TOKENS, EMBEDDING_DIM)(inp)
rnn1 = L.Bidirectional(L.GRU(16, return_sequences=True))(embed)
# `scores` is symbolic; a later cell evaluates it through a submodel
attn, scores = L.MultiHeadAttention(num_heads=4, key_dim=4)(rnn1, rnn1,
                                    return_attention_scores=True)
rnn2 = L.LSTM(16, return_sequences=True)(attn)
rnn3 = L.LSTM(16)(rnn2)
dense = L.Dense(8, activation='relu')(rnn3)
dense2 = L.Dense(8, activation='relu')(dense)
out = L.Dense(5, activation='softmax')(dense2)

model = keras.models.Model(inputs=inp, outputs=out)

In [None]:
# integer class labels + softmax output -> sparse categorical cross-entropy
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# NOTE(review): 200 epochs with no early-stopping callback — expect a long run
history = model.fit(X_train_vec, y_train, epochs=200,
                    validation_data=(X_valid_vec, y_valid))

In [None]:
# Evaluate the symbolic attention scores on real data. Only the first five
# reviews are visualised, so only those are predicted: scores for the whole
# training set would hold 4 * SEQ_LEN * SEQ_LEN floats per review.
submodel = keras.models.Model(inputs=inp, outputs=scores)
attn_scores = submodel.predict(X_train_vec[:5])
# list mapping token id -> word, used for the heatmap tick labels
vocab = vectorize.get_vocabulary()

In [None]:
# First-head attention heatmap for each of the first five training reviews,
# with both axes labelled by the review's (vectorised) words.
for index in range(5):
    token_ids = X_train_vec[index].numpy()
    words = [vocab[token] for token in token_ids]
    first_head = attn_scores[index, 0]

    plt.figure(figsize=(12,12), dpi=400)
    sns.heatmap(first_head, cbar=False,
                xticklabels=words, yticklabels=words)
    plt.show()

### Reddit News and Stock Forecasting

In [None]:
# Headlines: keep the three top headlines per day plus the join key.
news = pd.read_csv('../input/stocknews/Combined_News_DJIA.csv')
news = news[['Top1', 'Top2', 'Top3', 'Date']]
# Daily DJIA prices.
stock = pd.read_csv('../input/stocknews/upload_DJIA_table.csv')
# Keep only dates present in both tables.
data = news.merge(stock, how='inner', left_on='Date', right_on='Date')
# Price columns scaled down by 1000. Dividing yields a new frame, which avoids
# an in-place `/=` on a slice of `data` (pandas SettingWithCopy hazard).
stock = data[['Open', 'High', 'Low', 'Close']] / 1000

In [None]:
WINDOW_LENGTH = 20

num_windows = len(stock) - WINDOW_LENGTH
num_series = len(stock.columns)

# x_stock[i] is a WINDOW_LENGTH-row price window; y_stock[i] is the row right after it
x_stock = np.zeros((num_windows, WINDOW_LENGTH, num_series))
y_stock = np.zeros((num_windows, num_series))

for i in range(num_windows):
    window_end = i + WINDOW_LENGTH
    # .loc slices by label and includes both endpoints
    x_stock[i] = np.array(stock.loc[i:window_end - 1])
    y_stock[i] = np.array(stock.loc[window_end])

In [None]:
# Drop the leading rows that have no complete price window before them, so the
# remaining rows line up one-to-one with y_stock.
data = data.loc[WINDOW_LENGTH:]

top1_text = data['Top1']
top2_text = data['Top2']
top3_text = data['Top3']

In [None]:
# SEQ_LEN:       every headline is padded/truncated to this many tokens
# MAX_TOKENS:    vocabulary size kept by the vectorizer
# EMBEDDING_DIM: width of the embedding vectors used by the model below
SEQ_LEN, MAX_TOKENS = 32, 10_000
EMBEDDING_DIM = 32

# TensorFlow is imported as `tf` in this notebook
vectorize = tf.keras.layers.TextVectorization(max_tokens=MAX_TOKENS,
                                              output_sequence_length=SEQ_LEN)
# learn a single vocabulary across all three headline columns
vectorize.adapt(pd.concat([top1_text, top2_text, top3_text]))

In [None]:
# Replace each raw headline column with its token-id tensor (in place of the
# text, under the same names).
top1_text, top2_text, top3_text = (
    vectorize(headlines) for headlines in (top1_text, top2_text, top3_text)
)

In [None]:
# Ordered split (no shuffling): the first `train_prop` share of the rows is used
# for training and the remainder for validation. Written as explicit slices
# rather than exec()-generated assignments, so the names are visible to readers
# and tooling.
train_prop = 0.8
train_index = round(train_prop * len(data))

x_stock_train, x_stock_valid = x_stock[:train_index], x_stock[train_index:]
y_stock_train, y_stock_valid = y_stock[:train_index], y_stock[train_index:]
top1_text_train, top1_text_valid = top1_text[:train_index], top1_text[train_index:]
top2_text_train, top2_text_valid = top2_text[:train_index], top2_text[train_index:]
top3_text_train, top3_text_valid = top3_text[:train_index], top3_text[train_index:]

In [None]:
# Multimodal model: three headline inputs (text branch) plus one price-window
# input (stock branch), merged into a 4-value regression output.
top1_inp = L.Input((SEQ_LEN,), name='top1')
top2_inp = L.Input((SEQ_LEN,), name='top2')
top3_inp = L.Input((SEQ_LEN,), name='top3')

# one Embedding layer object applied to all three headlines (shared weights)
embed = L.Embedding(MAX_TOKENS, EMBEDDING_DIM)
top1_embed = embed(top1_inp)
top2_embed = embed(top2_inp)
top3_embed = embed(top3_inp)

# one bidirectional LSTM object applied to all three headlines (shared weights)
lstm1 = L.Bidirectional(L.LSTM(16, return_sequences=True))
top1_lstm1 = lstm1(top1_embed)
top2_lstm1 = lstm1(top2_embed)
top3_lstm1 = lstm1(top3_embed)

# one multi-head self-attention layer object, again applied to all three
attn = L.MultiHeadAttention(num_heads=3, key_dim=4,
                            dropout=0.1)

# NOTE(review): unlike the layers above, each headline gets its own LSTM(32)
# instance here (weights not shared) — confirm this is intended
top1_lstm2 = L.LSTM(32)(attn(top1_lstm1, top1_lstm1))
top2_lstm2 = L.LSTM(32)(attn(top2_lstm1, top2_lstm1))
top3_lstm2 = L.LSTM(32)(attn(top3_lstm1, top3_lstm1))

concat = L.Concatenate()([top1_lstm2, top2_lstm2, top3_lstm2])
concat_dense = L.Dense(16, activation='relu')(concat)

# stock branch: 1-D convolution over the price window, then two LSTMs
stock_inp = L.Input((WINDOW_LENGTH, 4), name='stock')
stock_cnn1 = L.Conv1D(8, 5, activation='relu')(stock_inp)
stock_lstm1 = L.LSTM(8, return_sequences=True)(stock_cnn1)
stock_lstm2 = L.LSTM(8)(stock_lstm1)

# merge the text and stock representations
joint_concat = L.Concatenate()([concat_dense, stock_lstm2])
joint_dense1 = L.Dense(16, activation='relu')(joint_concat)
joint_dense2 = L.Dense(16, activation='relu')(joint_dense1)
# NOTE(review): a ReLU output restricts predictions to values >= 0 — confirm
# this is wanted for the regression head rather than a linear activation
out = L.Dense(4, activation='relu')(joint_dense2)

# inputs are passed as a dict keyed by the Input names, matching model.fit below
model = keras.models.Model(inputs={'top1': top1_inp,
                                   'top2': top2_inp,
                                   'top3': top3_inp,
                                   'stock': stock_inp},
                          outputs=out)

In [None]:
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# feature dicts keyed by the model's input names
train_inputs = {'top1': top1_text_train,
                'top2': top2_text_train,
                'top3': top3_text_train,
                'stock': x_stock_train}
valid_inputs = {'top1': top1_text_valid,
                'top2': top2_text_valid,
                'top3': top3_text_valid,
                'stock': x_stock_valid}

history = model.fit(x=train_inputs,
                    y=y_stock_train,
                    validation_data=(valid_inputs, y_stock_valid),
                    batch_size=128,
                    epochs=300)

In [None]:
# Training vs. validation loss per epoch for the multimodal model.
fig, ax = plt.subplots(figsize=(15, 7), dpi=400)
ax.plot(history.history['loss'], color='red', label='Train')
ax.plot(history.history['val_loss'], color='blue', label='Validation')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [None]:
# Call the shared attention layer once more per headline, this time keeping only
# the attention scores (second element of the returned pair).
attn_top1, attn_top2, attn_top3 = [
    attn(encoded, encoded, return_attention_scores=True)[1]
    for encoded in (top1_lstm1, top2_lstm1, top3_lstm1)
]

# maps the three headline inputs to their three attention-score tensors
submodel = keras.models.Model(
    inputs=[top1_inp, top2_inp, top3_inp],
    outputs=[attn_top1, attn_top2, attn_top3],
)

In [None]:
# vocabulary of the headline vectorizer; indexed by token id in the next cell
vocab = vectorize.get_vocabulary()

In [None]:
# For each of the first five training days, plot per-head attention heatmaps for
# every headline that has more than 25 real (non-padding) tokens.
for index in range(5):
    
    # one score array per headline; named separately so no earlier `scores`
    # variable is overwritten
    headline_scores = submodel.predict([top1_text_train[index:index+1],
                                        top2_text_train[index:index+1],
                                        top3_text_train[index:index+1]])
    top1_scores, top2_scores, top3_scores = headline_scores

    bundles = ((top1_scores, top1_text_train[index]),
               (top2_scores, top2_text_train[index]),
               (top3_scores, top3_text_train[index]))

    for curr_scores, vectorized_text in bundles:

        token_ids = vectorized_text.numpy()
        words = [vocab[token] for token in token_ids]
        # token id 0 is the vectorizer's padding id, so the non-zero count is the
        # number of real tokens (comparing the word strings with 0 is always true)
        if np.count_nonzero(token_ids) > 25:
        
            print('-'* 500)
            # join with spaces so the headline stays readable
            print(' '.join(words).strip())

            for head in range(curr_scores.shape[1]): # number heads
                plt.figure(figsize=(12,12), dpi=400)
                sns.heatmap(curr_scores[0,head,:,:], cbar=False,
                            xticklabels=words, yticklabels=words)
                plt.show()

---

## Direct Attention Modeling

In [None]:
# Forest Cover Type: features are every column except the label; the label is
# shifted down by one before being used as a class id.
data = pd.read_csv('../input/forest-cover-type-dataset/covtype.csv')
y = data['Cover_Type'] - 1
X = data.drop(columns='Cover_Type')

from sklearn.model_selection import train_test_split as tts
X_train, X_valid, y_train, y_valid = tts(X, y, train_size=0.8)

In [None]:
def attn_block(inp, 
               dense_units=8,
               num_heads=4,
               key_dim=4):
    """Two ReLU dense layers, self-attention, then layer normalisation.

    Args:
        inp: input tensor of the block.
        dense_units: width of both dense layers.
        num_heads: NOTE(review) accepted but not used by the body — the block
            uses `L.Attention`, which takes no head count.
        key_dim: NOTE(review) accepted but not used by the body.

    Returns:
        The layer-normalised attention output.
    """
    dense = L.Dense(dense_units, activation='relu')(inp)
    dense2 = L.Dense(dense_units, activation='relu')(dense)
    # self-attention: the same tensor is passed as both elements of the input list
    attn_out = L.Attention(use_scale=True)([dense2, dense2])
    layer_norm = L.LayerNormalization()(attn_out)
    return layer_norm

inp = L.Input((len(X_train.columns),))
# treat every feature as one step of a length-n sequence with a single channel
reshape = L.Reshape((len(X_train.columns),1))(inp)
attn1 = attn_block(reshape)
attn2 = attn_block(attn1)
flatten = L.Flatten()(attn2)
predense = L.Dense(32, activation='relu')(flatten)
# 7 softmax outputs, one per class
out = L.Dense(7, activation='softmax')(predense)

model = keras.models.Model(inputs=inp, outputs=out)

In [None]:
# integer class labels + softmax output -> sparse categorical cross-entropy
model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# features are passed as loaded; no scaling step appears in this notebook section
model.fit(X_train, y_train, epochs=100,
          batch_size=4096,
          validation_data=(X_valid, y_valid))

---

## Attention-Based Deep Learning Research

### TabTransformer

Loading data.

In [None]:
# Ames Housing: drop every column with missing values and the row id, then
# separate the target (sale price in thousands) from the features.
df = (
    pd.read_csv('https://raw.githubusercontent.com/hjhuney/Data/master/AmesHousing/train.csv')
      .dropna(axis=1, how='any')
      .drop('Id', axis=1)
)
y = df['SalePrice'] / 1000
x = df.drop('SalePrice', axis=1)

# A column counts as categorical when its first value is a string or when it has
# at most 5 distinct values; every other column is continuous.
cat_features = [
    colName for colIndex, colName in enumerate(x.columns)
    if type(x.iloc[0, colIndex]) == str or len(x[colName].unique()) <= 5
]
cont_features = [col for col in x.columns if col not in cat_features]

from sklearn.preprocessing import OrdinalEncoder
# one fitted encoder per categorical column, kept by column name
encoders = {}
for cat_feature in cat_features:
    encoder = OrdinalEncoder()
    encoders[cat_feature] = encoder
    column_2d = np.array(x[cat_feature]).reshape(-1, 1)
    x[cat_feature] = encoder.fit_transform(column_2d)

# continuous columns are cast to float32
x = x.astype({cont_feature: np.float32 for cont_feature in cont_features})

from sklearn.model_selection import train_test_split as tts
X_train, X_valid, y_train, y_valid = tts(x, y, train_size=0.8)

Custom architecture.

In [None]:
'''
CONFIG
'''

# Stand-alone architecture demo: these sizes are fixed here and are not derived
# from the Ames data loaded above.
NUM_CONT_FEATS = 8
NUM_CAT_FEATS = 4
# every categorical feature gets the same embedding vocabulary size (32)
NUM_UNIQUE_CLASSES = [32 for i in range(NUM_CAT_FEATS)]

EMBEDDING_DIM = 32

NUM_HEADS = 4
KEY_DIM = 4
NUM_TRANSFORMERS = 6
FF_HIDDEN_DIM = 32

MLP_LAYERS = 4
MLP_HIDDEN = 16

OUT_DIM = 1
OUT_ACTIVATION = 'linear'

'''
ARCHITECTURE
'''

# continuous features: layer-normalised only, they bypass the transformer stack
cont_inp = L.Input((NUM_CONT_FEATS,), name='Cont Feats')
normalize = L.LayerNormalization()(cont_inp)

# categorical features: one scalar input and one embedding table per feature
cat_inps = [L.Input((1,),
                    name=f'Cat Feats {i}') for i in range(NUM_CAT_FEATS)]
zipped = zip(NUM_UNIQUE_CLASSES, cat_inps)
embeddings = [L.Embedding(uqcls, EMBEDDING_DIM)(cat_inp) for uqcls, cat_inp in zipped]
# stack the per-feature embeddings along axis 1
concat_embed = L.Concatenate(axis=1)(embeddings)

def transformer(inp):
    """One transformer block: self-attention and a two-layer dense block.

    Both sub-blocks are followed by a residual Add and a LayerNormalization.
    The second dense layer has EMBEDDING_DIM units so that its output can be
    added to the normalised attention output.
    """
    attention = L.MultiHeadAttention(num_heads=NUM_HEADS,
                                     key_dim=KEY_DIM)(inp, inp)
    add = L.Add()([inp, attention])
    norm = L.LayerNormalization()(add)
    dense1 = L.Dense(FF_HIDDEN_DIM, activation='relu')(norm)
    dense2 = L.Dense(EMBEDDING_DIM, activation='relu')(dense1)
    add2 = L.Add()([norm, dense2])
    norm2 = L.LayerNormalization()(add2)
    return norm2

# pass the categorical embeddings through NUM_TRANSFORMERS stacked blocks
transformed = concat_embed
for i in range(NUM_TRANSFORMERS):
    transformed = transformer(transformed)
contextual_embeddings = L.Flatten()(transformed)

# join the normalised continuous features with the flattened embeddings and
# feed them through the MLP head
all_feat_concat = L.Concatenate()([normalize, contextual_embeddings])
mlp = all_feat_concat
for i in range(MLP_LAYERS):
    mlp = L.Dense(MLP_HIDDEN, activation='relu')(mlp)
out = L.Dense(OUT_DIM, activation=OUT_ACTIVATION)(mlp)

# input order: all categorical inputs first, the continuous input last
all_inps = cat_inps + [cont_inp]
model = keras.models.Model(inputs=all_inps, outputs=out)

Using prebuilt architecture.

In [None]:
# Fetch the prebuilt TabTransformer implementation into ./tab-transformer-keras
# (shell command; needs git and network access).
!git clone https://github.com/CahidArda/tab-transformer-keras.git

In [None]:
# The imports below use `tab_transformer_keras` (underscores) as the package
# name, so rename the cloned directory. Guarded so that re-running the cell
# after a successful rename is a no-op instead of raising.
CLONE_DIR = './tab-transformer-keras'
PACKAGE_DIR = './tab_transformer_keras'
if os.path.isdir(CLONE_DIR) and not os.path.exists(PACKAGE_DIR):
    os.rename(CLONE_DIR, PACKAGE_DIR)

In [None]:
from tab_transformer_keras.tab_transformer_keras import tab_transformer_keras
from tab_transformer_keras import misc

In [None]:
from tab_transformer_keras.tab_transformer_keras.tab_transformer_keras import TabTransformer
from tab_transformer_keras.misc import get_X_from_features

# Arrange the features in the input layout expected by the library's model.
X_train_tt = get_X_from_features(X_train, cont_features, cat_features)
X_valid_tt = get_X_from_features(X_valid, cont_features, cat_features)
# number of distinct values of each categorical column
class_counts = [x[col].nunique() for col in cat_features]
model = TabTransformer(
    categories = class_counts,
    # must be the count of CONTINUOUS columns (not of the categorical ones)
    num_continuous = len(cont_features),
    dim = 16,
    dim_out = 1,
    depth = 6,
    heads = 8,
    attn_dropout = 0.1,
    ff_dropout = 0.1,
    mlp_hidden = [(32, 'relu'), (16, 'relu')] # mlp layer dimensions and activations
)

In [None]:
# regression on the sale price (in thousands): MSE loss, MAE reported as a metric
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# NOTE(review): 500 epochs with no early-stopping callback — expect a long run
history = model.fit(X_train_tt, y_train, epochs=500,
                    validation_data=(X_valid_tt, y_valid))

In [None]:
# Training vs. validation loss per epoch for the prebuilt TabTransformer.
plt.figure(figsize=(10, 5), dpi=400)
plt.plot(history.history['loss'], color='red', label='Training')
# `label` must be a plain string; a list would be rendered literally in the legend
plt.plot(history.history['val_loss'], color='blue', label='Validation',
         linestyle='--')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

### TabNet

In [None]:
from sklearn.model_selection import train_test_split as tts
# Forest Cover Type again, this time as float32 NumPy arrays; the feature names
# are kept in `columns` for labelling the mask plots.
data = pd.read_csv('../input/forest-cover-type-dataset/covtype.csv')
feature_frame = data.drop('Cover_Type', axis=1)
columns = feature_frame.columns
X = feature_frame.to_numpy(dtype=np.float32)
# labels shifted down by one
y = (data['Cover_Type'] - 1).to_numpy(dtype=np.float32)
X_train, X_valid, y_train, y_valid = tts(X, y, train_size=0.8)

In [None]:
!pip install --upgrade tabnet

In [None]:
from tabnet import TabNetClassifier
# feature_columns=None with num_features given: the model is fed the plain
# float32 feature array built above
model = TabNetClassifier(feature_columns=None,
                         num_classes=7,
                         num_features=X.shape[-1],
                         feature_dim=32,
                         output_dim=16,
                         num_decision_steps=8,
                         relaxation_factor=0.7,
                         sparsity_coefficient=1e-6)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=30,
          validation_data=(X_valid, y_valid),
          batch_size=10_000)

In [None]:
# The masks are read from the model after a forward pass, so run one first.
_ = model(X_valid)
fs_masks_orig = model.tabnet.feature_selection_masks
# stack the per-step masks into one array indexed [step, sample, feature]
fs_masks = np.stack([mask.numpy()[0,:,:,0] for mask in fs_masks_orig])

# one heatmap per recorded mask (count taken from the data instead of being
# hard-coded), showing the first 100 validation samples
for i in range(len(fs_masks)):
    plt.figure(figsize=(15, 8), dpi=400)
    sns.heatmap(fs_masks[i,:100,:],
                xticklabels=columns,
                yticklabels=[])
    plt.xlabel('Columns')
    plt.ylabel('Samples')
    plt.title(f'Sample of Mask Values for Layer {i+1}')
    plt.show()

In [None]:
# Aggregate feature-selection mask for the first 100 samples of the last
# forward pass.
agg_mask = model.tabnet.aggregate_feature_selection_mask
agg_mask_sample = agg_mask.numpy()[0,:100,:,0]

fig, ax = plt.subplots(figsize=(15, 8), dpi=400)
sns.heatmap(agg_mask_sample,
            xticklabels=columns,
            yticklabels=[],
            ax=ax)
ax.set_xlabel('Columns')
ax.set_ylabel('Samples')
ax.set_title('Aggregate Feature Mask')
plt.show()

### SAINT

See book for details.

### ARM-Net

See book for details.