# *Modern Deep Learning for Tabular Data*, Chapter 5

**Applying Recurrent Structures to Tabular Data**

This notebook contains the complementary code discussed in Chapter 5 of *Modern Deep Learning for Tabular Data*.

External Kaggle links to datasets used in this notebook:
- [Amazon US Software Reviews](https://www.kaggle.com/datasets/washingtongold/amazon-us-software-reviews)
- [Daily News for Stock Market Prediction](https://www.kaggle.com/datasets/aaron7sun/stocknews)
- [Forest Cover Type Dataset](https://www.kaggle.com/datasets/uciml/forest-cover-type-dataset)
- [Speech Accent Archive](https://www.kaggle.com/datasets/rtatman/speech-accent-archive)

You can download these datasets from Kaggle, or import these notebooks into Kaggle and connect them internally.

---

## Imports

In [None]:
# data management
import numpy as np                   # for linear algebra
import pandas as pd                  # for tabular data manipulation and processing

# machine learning
import sklearn                       # for data prep and classical ML
import tensorflow as tf              # for deep learning
from tensorflow import keras         # for deep learning
import keras.layers as L             # for easy NN layer access

# data visualization and graphics
import matplotlib.pyplot as plt      # for visualization fundamentals
import seaborn as sns                # for pretty visualizations
import cv2                           # for image manipulation
from skimage import io               # for input/output processing

# misc
from tqdm.notebook import tqdm       # for progress bars
import math                          # for calculation
import sys                           # for system manipulation
import os                            # for file manipulation

---

## Natural RNN Applications

### Natural Language

In [None]:
# Load the Amazon software reviews export and strip the first two characters
# from every review body.
REVIEWS_CSV = '../input/amazon-us-software-reviews/data.csv'
data = pd.read_csv(REVIEWS_CSV)
data['data/review_body'] = data['data/review_body'].map(lambda body: body[2:])

Let's try to predict the star rating and whether the review comes from a verified purchase.

In [None]:
# Preview the review text next to the two columns considered as targets.
preview_columns = ['data/review_body', 'data/star_rating', 'data/verified_purchase']
data[preview_columns]

Vectorization

In [None]:
# Text-vectorization hyperparameters for the review classifier.
SEQ_LEN = 128        # token length of every vectorized review
MAX_TOKENS = 2048    # vocabulary size cap
EMBEDDING_DIM = 64   # width of the learned token embeddings

# Learn the vocabulary from the review bodies.
vectorize = tf.keras.layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=SEQ_LEN,
)
vectorize.adapt(data['data/review_body'])

In [None]:
# Random 80/20 train/validation split over the review rows.
TRAIN_SIZE = 0.8
all_indices = data.index.to_numpy()
train_indices = np.random.choice(all_indices, replace=False,
                                 size=round(TRAIN_SIZE * len(data)))
# Vectorized complement of the sampled labels (keeps the original row order).
# The previous per-row `i not in train_indices` scanned the whole array for
# every row, which is quadratic in the number of reviews.
valid_indices = all_indices[~np.isin(all_indices, train_indices)]

# .loc makes it explicit that the sampled values are index labels.
review_text = data['data/review_body']
star_rating = data['data/star_rating']
X_train = vectorize(review_text.loc[train_indices])
X_valid = vectorize(review_text.loc[valid_indices])
# Shift ratings down by one so class ids start at 0 for the sparse
# categorical loss (assumes ratings run 1-5, matching the 5-way softmax).
y_train = star_rating.loc[train_indices] - 1
y_valid = star_rating.loc[valid_indices] - 1

Find a text to token library with auto NLP handling

In [None]:
# Star-rating classifier: token ids -> embedding -> two stacked LSTMs ->
# two dense layers -> 5-way softmax.
inp = L.Input((SEQ_LEN,))
embed = L.Embedding(MAX_TOKENS, EMBEDDING_DIM)(inp)
# return_sequences=True so the second LSTM receives the full sequence.
rnn1 = L.LSTM(32, return_sequences=True)(embed)
rnn2 = L.LSTM(32)(rnn1)
dense = L.Dense(32, activation='relu')(rnn2)
dense2 = L.Dense(32, activation='relu')(dense)
out = L.Dense(5, activation='softmax')(dense2)

model = keras.models.Model(inputs=inp, outputs=out)
# Render the architecture diagram with tensor shapes.
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Train the star-rating classifier; integer labels pair with the sparse
# categorical cross-entropy loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid),
                    epochs=130, batch_size=512)

# Plot training vs. validation loss per epoch.
plt.figure(figsize=(10, 5), dpi=400)
plt.plot(history.history['loss'], color='red', label='Training')
plt.plot(history.history['val_loss'], color='blue', label='Validation', linestyle='--')
plt.legend()
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

# Persist the trained weights to the working directory.
model.save_weights('weights.h5')

### Time Series

In [None]:
def clean_name(filename):
    """Return the part of *filename* that precedes its first ASCII digit.

    For example ``'english12.mp3'`` becomes ``'english'``.  A name with no
    digit is returned unchanged, and an empty string yields an empty string
    (previously the last character was dropped and ``''`` raised an
    ``UnboundLocalError``).
    """
    for i, v in enumerate(filename):
        if v in '0123456789':
            return filename[:i]
    # No digit found: the whole name is the prefix.
    return filename

In [None]:
# List the recordings and derive an accent label from each file name.
directory_path = '../input/speech-accent-archive/recordings/recordings/'
filenames = os.listdir(directory_path)
classes = [clean_name(name) for name in filenames]

# Rank accents by recording count (ties broken by name) and keep the top five.
accent_labels, accent_counts = np.unique(classes, return_counts=True)
ranked_accents = sorted(zip(accent_counts, accent_labels), reverse=True)
top_5_accents = [accent for _, accent in ranked_accents[:5]]

# Keep only recordings of those accents and encode each accent as an integer.
top_5_files = [name for name in filenames if clean_name(name) in top_5_accents]
accent_names = [clean_name(name) for name in top_5_files]
ordinal_encoding = {accent: code
                    for code, accent in enumerate(np.unique(accent_names))}
top_5_classes = [ordinal_encoding[accent] for accent in accent_names]

In [None]:
import librosa  # audio loading/resampling; used below but absent from the notebook's import cell

# Windowing parameters: each clip is resampled to SAMPLE_RATE and cut into
# WINDOW_SEC-second windows taken every SHIFT_SEC seconds.
SAMPLE_RATE = 6_000
WINDOW_SEC = 5
WINDOW_LEN = WINDOW_SEC * SAMPLE_RATE
SHIFT_SEC = 5
SHIFT_LEN = SHIFT_SEC * SAMPLE_RATE

audio, target = [], []
# Pair every file with its encoded accent; total= gives tqdm a real progress bar.
for file, label in tqdm(zip(top_5_files, top_5_classes), total=len(top_5_files)):
    y, sr = librosa.load(os.path.join(directory_path, file),
                         sr=SAMPLE_RATE)
    # The "+ 1" keeps the final window when it ends exactly at the end of the
    # clip (the previous `end < len(y)` test dropped it). Clips shorter than
    # one window produce an empty range and contribute nothing.
    for start in range(0, len(y) - WINDOW_LEN + 1, SHIFT_LEN):
        audio.append(y[start:start + WINDOW_LEN])
        target.append(label)

In [None]:
# Convert the collected windows and their labels into NumPy arrays.
audio, target = np.array(audio), np.array(target)

In [None]:
# Accent classifier on raw audio windows: strided 1-D convolutions shorten the
# sequence before two stacked LSTMs and a 5-way softmax head.
inp = L.Input((WINDOW_LEN,))
# Add a channel axis so Conv1D sees (timesteps, 1).
reshape = L.Reshape((WINDOW_LEN,1))(inp)
conv1 = L.Conv1D(4, 16, strides=8, activation='relu')(reshape)
conv2 = L.Conv1D(4, 16, strides=8, activation='relu')(conv1)
conv3 = L.Conv1D(8, 16, strides=4, activation='relu')(conv2)
conv4 = L.Conv1D(8, 16, strides=4, activation='relu')(conv3)

lstm1 = L.LSTM(16, return_sequences=True)(conv4)
lstm2 = L.LSTM(16)(lstm1)
dense1 = L.Dense(16, activation='relu')(lstm2)
dense2 = L.Dense(16, activation='relu')(dense1)
out = L.Dense(5, activation='softmax')(dense2)

model = keras.models.Model(inputs=inp, outputs=out)

# `tensorflow` is only imported under the alias `tf`; use the already-imported
# `keras` name so this call does not raise NameError.
keras.utils.plot_model(model, show_shapes=True, dpi=400)

In [None]:
# Train the accent classifier on all windows (no validation split is passed).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(audio, target, epochs=100)

---

## Multimodal Learning

### Amazon Software Reviews

In [None]:
# Separate the review frame into its tabular and text modalities plus the target.
tabular_columns = ['data/helpful_votes', 'data/total_votes', 'data/star_rating']
tabular = data[tabular_columns]
body_text, head_text = data['data/review_body'], data['data/review_headline']
target = data['data/verified_purchase']

In [None]:
# Smaller vocabulary and sequence length for the multimodal model.
SEQ_LEN, MAX_TOKENS = 64, 1024
EMBEDDING_DIM = 32

# `tensorflow` is only imported under the alias `tf`; the previous
# `tensorflow.keras...` spelling raised NameError.
vectorize = tf.keras.layers.TextVectorization(max_tokens=MAX_TOKENS,
                                              output_sequence_length=SEQ_LEN)
# One shared vocabulary learned from both review bodies and headlines.
vectorize.adapt(pd.concat([body_text, head_text]))

In [None]:
# Tokenize both text columns with the shared vectorizer.
vec_body_text, vec_head_text = (vectorize(text) for text in (body_text, head_text))

In [None]:
# Random 80/20 train/validation split shared by all three modalities.
TRAIN_SIZE = 0.8
all_indices = data.index.to_numpy()
train_indices = np.random.choice(all_indices, replace=False,
                                 size=round(TRAIN_SIZE * len(data)))
# Vectorized complement (keeps row order); the previous per-row
# `i not in train_indices` scan was quadratic in the number of rows.
valid_indices = all_indices[~np.isin(all_indices, train_indices)]

tabular_train, tabular_valid = tabular.loc[train_indices], tabular.loc[valid_indices]

# Convert each tensor to NumPy once instead of once per split.
# NOTE(review): these arrays are indexed positionally with index *labels*,
# which is only correct while `data` keeps its default 0..n-1 index.
body_text_array = vec_body_text.numpy()
head_text_array = vec_head_text.numpy()
body_text_train, body_text_valid = body_text_array[train_indices], body_text_array[valid_indices]
head_text_train, head_text_valid = head_text_array[train_indices], head_text_array[valid_indices]
target_train, target_valid = target.loc[train_indices], target.loc[valid_indices]

In [None]:
# Multimodal classifier with three named inputs: review body tokens, headline
# tokens, and three tabular features; output is a single sigmoid unit.
body_inp = L.Input((SEQ_LEN,), name='body_inp')
head_inp = L.Input((SEQ_LEN,), name='head_inp')

# A single embedding layer is shared by both text inputs.
embed = L.Embedding(MAX_TOKENS, EMBEDDING_DIM)

body_embed = embed(body_inp)
head_embed = embed(head_inp)

# Despite the "lstm" variable names, these recurrent layers are GRUs.
body_lstm1 = L.GRU(16, return_sequences=True)(body_embed)
body_lstm2 = L.GRU(16)(body_lstm1)

head_lstm = L.GRU(16)(head_embed)

# Tabular branch: small two-layer MLP over the 3 numeric columns.
tab_inp = L.Input((3,), name='tab_inp')
tab_dense1 = L.Dense(8, activation='relu')(tab_inp)
tab_dense2 = L.Dense(8, activation='relu')(tab_dense1)

# Fuse the three branches and classify.
concat = L.Concatenate()([body_lstm2, head_lstm, tab_dense2])
outdense1 = L.Dense(16, activation='relu')(concat)
outdense2 = L.Dense(16, activation='relu')(outdense1)
out = L.Dense(1, activation='sigmoid')(outdense2)

model = keras.models.Model(inputs=[body_inp, head_inp, tab_inp], outputs=out)
keras.utils.plot_model(model, show_shapes=True)

In [None]:
# Train the multimodal model; the dict keys match the Input layer names.
# NOTE(review): binary_crossentropy needs numeric 0/1 targets — confirm that
# 'data/verified_purchase' is already encoded that way.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history = model.fit({'body_inp': body_text_train,
                     'head_inp': head_text_train,
                     'tab_inp': tabular_train},
                    target_train,
                    validation_data=({'body_inp': body_text_valid,
                                      'head_inp': head_text_valid,
                                      'tab_inp': tabular_valid},
                                      target_valid),
                    epochs=110, batch_size=512)

In [None]:
# Training vs. validation loss per epoch for the multimodal model.
fig, ax = plt.subplots(figsize=(10, 5), dpi=400)
loss_curves = history.history
ax.plot(loss_curves['loss'], color='red', label='Training')
ax.plot(loss_curves['val_loss'], color='blue', label='Validation', linestyle='--')
ax.legend()
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
plt.show()

### Stock Market and News Modeling

In [None]:
# Load the daily headlines and the DJIA price table, then join them on date.
news = pd.read_csv('../input/stocknews/Combined_News_DJIA.csv')
news = news[['Top1', 'Top2', 'Top3', 'Date']]
stock = pd.read_csv('../input/stocknews/upload_DJIA_table.csv')
data = news.merge(stock, how='inner', on='Date')
# Scale prices by 1/100 into a new frame. The previous in-place `/=` on a
# column selection of `data` is a chained-assignment hazard (pandas
# SettingWithCopyWarning); a plain division gives the same values.
stock = data[['Open', 'High', 'Low', 'Close']] / 100

In [None]:
# Display the selected headline columns.
news

In [None]:
# Display the scaled Open/High/Low/Close frame.
stock

In [None]:
# Number of consecutive days fed to the model to predict the following day.
WINDOW_LENGTH = 20

# Convert once and slice positionally: the previous label-based
# `stock.loc[i:i+WINDOW_LENGTH-1]` was only correct for a 0..n-1 index and
# rebuilt an array from a DataFrame slice on every iteration.
stock_values = stock.to_numpy()
n_windows = len(stock_values) - WINDOW_LENGTH
n_features = stock_values.shape[1]

x_stock = np.zeros((n_windows, WINDOW_LENGTH, n_features))
y_stock = np.zeros((n_windows, n_features))

for i in range(n_windows):
    x_stock[i] = stock_values[i:i + WINDOW_LENGTH]   # days i .. i+WINDOW_LENGTH-1
    y_stock[i] = stock_values[i + WINDOW_LENGTH]     # the day right after the window

In [None]:
# Keep the rows from index label WINDOW_LENGTH onward, then pull out the
# three headline columns.
data = data.loc[WINDOW_LENGTH:]
top1_text = data['Top1']
top2_text = data['Top2']
top3_text = data['Top3']

In [None]:
# Vectorizer settings for the news headlines.
SEQ_LEN, MAX_TOKENS = 64, 1024
EMBEDDING_DIM = 32

# `tensorflow` is only imported under the alias `tf`; the previous
# `tensorflow.keras...` spelling raised NameError.
vectorize = tf.keras.layers.TextVectorization(max_tokens=MAX_TOKENS,
                                              output_sequence_length=SEQ_LEN)
# One shared vocabulary learned from all three headline columns.
vectorize.adapt(pd.concat([top1_text, top2_text, top3_text]))

In [None]:
# Replace each headline column with its token-id tensor.
top1_text, top2_text, top3_text = (
    vectorize(headlines) for headlines in (top1_text, top2_text, top3_text)
)

In [None]:
# Chronological split: the first 80% of rows train, the remainder validates.
# Written as explicit slices instead of generating assignments with exec(),
# so the resulting names are visible to readers and tooling.
train_prop = 0.8
train_index = round(train_prop * len(data))

x_stock_train, x_stock_valid = x_stock[:train_index], x_stock[train_index:]
y_stock_train, y_stock_valid = y_stock[:train_index], y_stock[train_index:]
top1_text_train, top1_text_valid = top1_text[:train_index], top1_text[train_index:]
top2_text_train, top2_text_valid = top2_text[:train_index], top2_text[train_index:]
top3_text_train, top3_text_valid = top3_text[:train_index], top3_text[train_index:]

In [None]:
# Joint model: three headline inputs plus a window of stock prices predict
# the four price columns.
top1_inp = L.Input((SEQ_LEN,), name='top1')
top2_inp = L.Input((SEQ_LEN,), name='top2')
top3_inp = L.Input((SEQ_LEN,), name='top3')

# The embedding layer is shared across the three headline inputs.
embed = L.Embedding(MAX_TOKENS, EMBEDDING_DIM)
top1_embed = embed(top1_inp)
top2_embed = embed(top2_inp)
top3_embed = embed(top3_inp)

# The first LSTM is also shared (one layer instance applied three times).
lstm1 = L.LSTM(32, return_sequences=True)
top1_lstm1 = lstm1(top1_embed)
top2_lstm1 = lstm1(top2_embed)
top3_lstm1 = lstm1(top3_embed)

# The second LSTMs are separate layer instances, one per headline.
top1_lstm2 = L.LSTM(32)(top1_lstm1)
top2_lstm2 = L.LSTM(32)(top2_lstm1)
top3_lstm2 = L.LSTM(32)(top3_lstm1)

concat = L.Concatenate()([top1_lstm2, top2_lstm2, top3_lstm2])
concat_dense = L.Dense(16, activation='relu')(concat)

# Stock branch: one Conv1D over the price window, then two stacked LSTMs.
stock_inp = L.Input((WINDOW_LENGTH, 4), name='stock')
stock_cnn1 = L.Conv1D(8, 5, activation='relu')(stock_inp)
stock_lstm1 = L.LSTM(8, return_sequences=True)(stock_cnn1)
stock_lstm2 = L.LSTM(8)(stock_lstm1)

joint_concat = L.Concatenate()([concat_dense, stock_lstm2])
joint_dense1 = L.Dense(16, activation='relu')(joint_concat)
joint_dense2 = L.Dense(16, activation='relu')(joint_dense1)
# NOTE(review): a ReLU output on a regression head can get stuck at zero with
# no gradient; a linear activation is the usual choice — confirm intent.
out = L.Dense(4, activation='relu')(joint_dense2)

# Inputs are passed as a dict keyed by the Input layer names.
model = keras.models.Model(inputs={'top1': top1_inp,
                                   'top2': top2_inp,
                                   'top3': top3_inp,
                                   'stock': stock_inp},
                          outputs=out)

In [None]:
# Render the architecture diagram with tensor shapes.
keras.utils.plot_model(model, dpi=400, show_shapes=True)

In [None]:
# Regression setup: mean squared error loss, mean absolute error reported.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
# Fit the joint model; dict keys match the Input layer names.
model.fit(x={'top1': top1_text_train,
             'top2': top2_text_train,
             'top3': top3_text_train,
             'stock': x_stock_train},
          y=y_stock_train,
          validation_data=({'top1': top1_text_valid,
                            'top2': top2_text_valid,
                            'top3': top3_text_valid,
                            'stock': x_stock_valid},
                            y_stock_valid),
         batch_size=128,
         epochs=20)

---

## Direct Modeling

In [None]:
from sklearn.model_selection import train_test_split as tts

In [None]:
# Load the forest cover data and separate the label column from the features.
data = pd.read_csv('../input/forest-cover-type-dataset/covtype.csv')
y = data['Cover_Type']
X = data.drop(columns='Cover_Type')
# Labels are shifted down by one before the 80/20 split.
X_train, X_valid, y_train, y_valid = tts(X, y - 1, train_size=0.8)

### Direct Recurrent Modeling

In [None]:
# Direct recurrent modeling: the 54 tabular features are treated as a
# length-54 sequence of scalars and fed to a SimpleRNN.
inp = L.Input((54,))
reshape = L.Reshape((54,1))(inp)
rnn1 = L.SimpleRNN(32)(reshape)
predense = L.Dense(32, activation='relu')(rnn1)
out = L.Dense(7, activation='softmax')(predense)
model = keras.models.Model(inputs=inp, outputs=out)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=500,
          validation_data=(X_valid, y_valid),
          batch_size = 1024)

### Dense Soft Ordering + Recurrent

In [None]:
# Dense soft ordering: two dense layers map the 54 features to 32 values,
# which are then read as a length-32 scalar sequence by a SimpleRNN.
inp = L.Input((54,))
dense1 = L.Dense(32, activation='relu')(inp)
dense2 = L.Dense(32, activation='relu')(dense1)
reshape = L.Reshape((32,1))(dense2)
rnn1 = L.SimpleRNN(32)(reshape)
predense = L.Dense(32, activation='relu')(rnn1)
out = L.Dense(7, activation='softmax')(predense)
model = keras.models.Model(inputs=inp, outputs=out)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=500,
          validation_data=(X_valid, y_valid),
          batch_size = 1024)

Visualizing results.

In [None]:
# Rebuild the first two trained dense layers as a sub-model so the learned
# "soft ordering" sequence can be inspected.
inp = L.Input((54,))
dense1 = model.layers[1](inp)
dense2 = model.layers[2](dense1)

submodel = keras.models.Model(inputs=inp,
                              outputs=dense2)

i = 0  # which group of 10 consecutive training rows to show

plt.figure(figsize=(10/2.5, 33/2.5), dpi=400)
# predict() returns (samples, 32). Transpose so rows are sequence positions
# and columns are samples, matching the axis labels. The previous
# reshape((32, 10)) re-chunked the row-major data instead of swapping axes,
# so values from different samples were mixed within a column.
sns.heatmap(submodel.predict(X_train[10*i:10*i + 10]).T, cbar=False)
plt.xlabel('Sample')
plt.ylabel('Sequence Index')
plt.show()

### Dense Soft Ordering + Convolutional Preprocessing + Recurrent

In [None]:
# Soft ordering + convolutional preprocessing: the 32-step dense output is
# passed through two Conv1D layers before two stacked LSTMs.
inp = L.Input((54,))
dense1 = L.Dense(32, activation='relu')(inp)
dense2 = L.Dense(32, activation='relu')(dense1)
reshape = L.Reshape((32,1))(dense2)
# Conv1D layers here use the default (no) activation.
conv1 = L.Conv1D(16, 3)(reshape)
conv2 = L.Conv1D(16, 3)(conv1)
rnn1 = L.LSTM(16, return_sequences=True)(conv2)
rnn2 = L.LSTM(16)(rnn1)
predense = L.Dense(16, activation='relu')(rnn2)
out = L.Dense(7, activation='softmax')(predense)
model = keras.models.Model(inputs=inp, outputs=out)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20,
          validation_data=(X_valid, y_valid),
          batch_size = 1024)

Visualizing results.

In [None]:
# Rebuild the trained dense and convolutional layers as a sub-model so the
# sequence fed to the LSTMs can be inspected.
inp = L.Input((54,))
dense1 = model.layers[1](inp)
dense2 = model.layers[2](dense1)
reshape = L.Reshape((32,1))(dense2)
conv1 = model.layers[4](reshape)
conv2 = model.layers[5](conv1)

submodel = keras.models.Model(inputs=inp,
                              outputs=conv2)

j = 0  # convolutional filter
i = 0  # batch number

plt.figure(figsize=(10/2.5, 29/2.5), dpi=400)
# Selecting filter j leaves (samples, 28). Transpose so rows are sequence
# positions and columns are samples, matching the axis labels. The previous
# reshape((28, 10)) re-chunked the row-major data instead of swapping axes.
sns.heatmap(submodel.predict(X_train[10*i:10*i + 10])[:,:,j].T, cbar=False)
plt.xlabel('Sample')
plt.ylabel('Sequence Index')
plt.show()

### Tabular Vector as Hidden State

Ones-vector initialization.

In [None]:
# Tabular vector as hidden state: the features are projected to 32 units and
# used as the GRU's initial state, while the GRU's input sequence is a
# constant all-ones sequence of length 16.
BATCH_SIZE = 1024

init_hidden_vec = L.Input((54,))
init_inp_vec = L.Input((16, 1))
dense1 = L.Dense(32, activation='relu')(init_hidden_vec)
dense2 = L.Dense(32, activation='relu')(dense1)
# The initial-state width (32) must match the GRU's unit count.
rnn1 = L.GRU(32, return_sequences=True)(init_inp_vec, initial_state=[dense2])
rnn2 = L.GRU(32)(rnn1)
predense = L.Dense(16, activation='relu')(rnn2)
out = L.Dense(7, activation='softmax')(predense)
model = keras.models.Model(inputs=[init_hidden_vec, init_inp_vec], outputs=out)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
# The second input is the same ones-sequence for every row.
model.fit([X_train, np.ones((len(X_train), 16, 1))],
           y_train, epochs=20,
          validation_data=([X_valid, np.ones((len(X_valid), 16, 1))],
                           y_valid),
          batch_size = BATCH_SIZE)

Sine wave positional encoding as initial sequence.

In [None]:
# One (32, 4) positional-encoding sequence: four sine waves sampled at 32
# points, spanning a quarter, half, one and two full periods respectively.
wave_end_multiples = (1/2, 1, 2, 4)   # each wave runs over [0, multiple * pi]
sine_waves = [np.sin(np.linspace(0, multiple * np.pi, 32))
              for multiple in wave_end_multiples]
individ_seq = np.stack(sine_waves, axis=1)

# Every row gets the same sequence, so repeat it once per sample.
train_pos_encoding = np.stack([individ_seq for _ in range(len(X_train))])
valid_pos_encoding = np.stack([individ_seq for _ in range(len(X_valid))])

In [None]:
# Same idea as the ones-sequence variant, but the GRU input is the
# (32, 4) sine positional encoding and the initial state has 16 units.
BATCH_SIZE = 1024

init_hidden_vec = L.Input((54,))
init_inp_vec = L.Input((32, 4))
dense1 = L.Dense(16, activation='relu')(init_hidden_vec)
dense2 = L.Dense(16, activation='relu')(dense1)
# The initial-state width (16) must match the GRU's unit count.
rnn1 = L.GRU(16, return_sequences=True)(init_inp_vec, initial_state=[dense2])
rnn2 = L.GRU(16)(rnn1)
predense = L.Dense(16, activation='relu')(rnn2)
out = L.Dense(7, activation='softmax')(predense)
model = keras.models.Model(inputs=[init_hidden_vec, init_inp_vec], outputs=out)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit([X_train, train_pos_encoding],
           y_train, epochs=20,
          validation_data=([X_valid, valid_pos_encoding],
                           y_valid),
          batch_size = BATCH_SIZE)

### Tabular Vector as Hidden State and Initial Sequence

GRU/SimpleRNN

In [None]:
# The tabular vector feeds the GRU twice: once as its input sequence
# (dense -> reshape -> Conv1D) and once as its initial hidden state.
BATCH_SIZE = 1024

init_vec = L.Input((54,))

# Sequence branch: soft ordering followed by two Conv1D layers.
dense1 = L.Dense(32, activation='relu')(init_vec)
dense2 = L.Dense(32, activation='relu')(dense1)
reshape = L.Reshape((32,1))(dense2)
conv1 = L.Conv1D(16, 3)(reshape)
conv2 = L.Conv1D(16, 3)(conv1)

# Hidden-state branch: 16 units to match the GRU's unit count.
hidden_dense1 = L.Dense(16, activation='relu')(init_vec)
hidden_dense2 = L.Dense(16, activation='relu')(hidden_dense1)

rnn1 = L.GRU(16, return_sequences=True)(conv2, 
                                         initial_state=hidden_dense2)
rnn2 = L.GRU(16)(rnn1)

predense = L.Dense(16, activation='relu')(rnn2)
out = L.Dense(7, activation='softmax')(predense)
model = keras.models.Model(inputs=init_vec, outputs=out)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=20,
          validation_data=(X_valid, y_valid),
          batch_size = BATCH_SIZE)

LSTM

In [None]:
# LSTM variant: the tabular vector supplies the input sequence plus both
# initial states (hidden and cell), each from its own dense branch.
BATCH_SIZE = 1024

init_vec = L.Input((54,))

# Sequence branch: soft ordering followed by two Conv1D layers.
dense1 = L.Dense(32, activation='relu')(init_vec)
dense2 = L.Dense(32, activation='relu')(dense1)
reshape = L.Reshape((32,1))(dense2)
conv1 = L.Conv1D(16, 3)(reshape)
conv2 = L.Conv1D(16, 3)(conv1)

# Initial hidden state branch.
hidden_dense1 = L.Dense(16, activation='relu')(init_vec)
hidden_dense2 = L.Dense(16, activation='relu')(hidden_dense1)

# Initial cell state branch.
cell_dense1 = L.Dense(16, activation='relu')(init_vec)
cell_dense2 = L.Dense(16, activation='relu')(cell_dense1)

# An LSTM takes two initial states, passed as [hidden, cell].
rnn1 = L.LSTM(16, return_sequences=True)(conv2, 
                                         initial_state=[hidden_dense2, 
                                                        cell_dense2])
rnn2 = L.LSTM(16)(rnn1)

predense = L.Dense(16, activation='relu')(rnn2)
out = L.Dense(7, activation='softmax')(predense)
model = keras.models.Model(inputs=init_vec, outputs=out)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=20,
          validation_data=(X_valid, y_valid),
          batch_size = BATCH_SIZE)