# TensorFlow Text Classification using RNN

This code started off as the TensorFlow tutorial: [TF RNN](https://www.tensorflow.org/text/tutorials/text_classification_rnn)

I have modified it to support my own datasets and to allow switching between datasets and optimizers.
I also wrote my own code for splitting the datasets into training, validation, and testing sets, batched the datasets,
changed the NN models, added options for regularization of the NN layers, etc.

In [None]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Control-run switch: when True, the notebook uses a control dataset that is
# capable of high accuracy instead of the stock news dataset.
# NOTE: enabling it downloads a dataset from TensorFlow.
controlTest_flag = False
# Retrain switch: when False, the pre-trained model is used instead of
# training a new one.
retrainControlTest_flag = True

In [None]:
def plot_graphs(history, metric):
  """Plot a training metric and its validation counterpart against epochs.

  history: object whose `.history` mapping holds one series per metric name
           (e.g. the value returned by `model.fit`).
  metric:  key of the training series; the validation series is looked up
           under 'val_' + metric.
  """
  val_metric = 'val_'+metric
  series = history.history
  plt.plot(series[metric])
  plt.plot(series[val_metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  # Legend entries are listed in the same order the curves were drawn.
  plt.legend([metric, val_metric])

## Choosing the Dataset

1. Set `tickerSymbol` to a stock ticker from the list below. 
2. Set `textChoice` to 'title' or 'content'. This chooses whether to use just the 'title' or the entire 'content' of the news article. 

The datasets were created by filtering out a selected stock from this news archive dataset: 
[Kaggle - US Equities News Data](https://www.kaggle.com/datasets/gennadiyr/us-equities-news-data?resource=download)

Then the historical price dataset for the stock was downloaded from: 
[Nasdaq - Historical Data](https://www.nasdaq.com/market-activity/quotes/historical)

The two datasets were merged and cleaned up to contain only what was needed. The datasets contain the 
date (which was used for merging), the news article title or content, and a label indicating whether the daily
price increased or stayed the same (1), or decreased (0). 

In [None]:
# Stock selection. Available tickers:
#   'AAPL' 'MSFT' 'AMZN' 'TSLA' 'NFLX' 'GOOGL' 'BA'
# 'Reddit' selects the Reddit dataset instead; it only supports the 'title' option.
tickerSymbol = 'AAPL'
textChoice = 'title' # 'title' 'content'

# Dataset path pattern: ../Data/<tickerSymbol>_<textChoice>_NewsDataset.csv
dataFile = f'../Data/{tickerSymbol}_{textChoice}_NewsDataset.csv'

Change the `SHUFFLE_SEED` to re-shuffle the dataset prior to running. 

The seed makes the shuffle reproducible, so a run can be repeated exactly.

In [None]:
# SHUFFLE_SEED pins the row shuffle so a run can be reproduced;
# pick a different value to get a different ordering.
if not controlTest_flag:
    SHUFFLE_SEED = 12345

    # frac=1 samples every row, i.e. the whole table comes back in shuffled order.
    stockDF = pd.read_csv(dataFile).sample(frac=1, random_state=SHUFFLE_SEED)

`BATCH_SIZE`, `TRAIN_PERCENT`, and `VALID_PERCENT` can be adjusted here.

1. `BATCH_SIZE` is the number of samples (text/label pairs) per batch.
2. `TRAIN_PERCENT` is the percent of the dataset that is used for training vs. testing.
3. `VALID_PERCENT` is the percent of the training dataset that is used for validation.

In [None]:
# Carve the shuffled table into contiguous train / validation / test slices.
if not controlTest_flag:
    BATCH_SIZE = 32
    TRAIN_PERCENT = 0.8
    VALID_PERCENT = 0.2
    N = len(stockDF)

    # validSize is a fraction of the training portion, not of the whole table.
    trainSize = int(N * TRAIN_PERCENT)
    validSize = int(trainSize * VALID_PERCENT)
    # Validation rows are taken from the tail of the training portion.
    validStart = trainSize - validSize

    train_df, valid_df, test_df = (
        stockDF.iloc[:validStart],
        stockDF.iloc[validStart:trainSize],
        stockDF.iloc[trainSize:N],
    )

In [None]:
if not controlTest_flag:
    def _batched_text_label_ds(frame):
        """Wrap a split's 'Text' and 'Label' columns as a dataset batched by BATCH_SIZE."""
        pairs = tf.data.Dataset.from_tensor_slices((frame['Text'], frame['Label']))
        return pairs.batch(BATCH_SIZE)

    # One batched (text, label) dataset per split.
    train_ds = _batched_text_label_ds(train_df)
    valid_ds = _batched_text_label_ds(valid_df)
    test_ds  = _batched_text_label_ds(test_df)

In [None]:
# Control run: the 'imdb_reviews' dataset from TensorFlow Datasets replaces the stock data.
if controlTest_flag:
    import tensorflow_datasets as tfds

    BUFFER_SIZE = 10000
    BATCH_SIZE = 64

    def _load_imdb(split):
        """Load one split of 'imdb_reviews' as (text, label) pairs."""
        return tfds.load('imdb_reviews', split=split, as_supervised=True)

    def _batch_and_prefetch(ds):
        """Batch by BATCH_SIZE and prefetch with an autotuned buffer."""
        return ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

    # The first 90% of 'train' is used for training, the remaining 10% for validation.
    # Training and validation sets are shuffled; the test set is not.
    train_ds = _batch_and_prefetch(_load_imdb('train[:90%]').shuffle(BUFFER_SIZE))
    valid_ds = _batch_and_prefetch(_load_imdb('train[90%:]').shuffle(BUFFER_SIZE))
    test_ds = _batch_and_prefetch(_load_imdb('test'))

## Tokenization

Tokenization is performed using Keras' built-in `TextVectorization` layer. In the cell below, parameters can be adjusted to affect tokenization.

1. `VOCAB_SIZE` is the max vocab size created from the dataset
2. `NGRAMS` allows ngrams to be used for tokenization. If a tuple is used, multiple ngrams are used. 
3. `TOKENIZATION_TYPE` selects the `output_mode` of the `TextVectorization` layer: `'int'`, `'multi_hot'`, `'count'`, or `'tf_idf'`. 

See [TF API Keras Vectorization](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization)

In [None]:
# Upper bound on the vocabulary size (passed to the encoder as max_tokens).
VOCAB_SIZE = 1000
# n-gram setting for the encoder: None, an int (1, 2, ...) or a tuple such as (1, 2, 3).
NGRAMS = None
# Encoder output mode: 'int', 'multi_hot', 'count' or 'tf_idf'.
TOKENIZATION_TYPE = 'int'

In [None]:
# Text-vectorization layer configured from the tokenization settings.
encoder = tf.keras.layers.TextVectorization(
    max_tokens=VOCAB_SIZE,
    ngrams=NGRAMS,
    output_mode=TOKENIZATION_TYPE,
)
# Drop the labels so the vocabulary is fitted on the training text only.
train_text_ds = train_ds.map(lambda text, label: text)
encoder.adapt(train_text_ds)

In [None]:
# Show the first 20 entries of the vocabulary the encoder built from the data.
vocabulary_terms = encoder.get_vocabulary()
vocab = np.array(vocabulary_terms)
vocab[:20]

## NN Model

Below is the model definition. 

1. `L2_REGULARIZATION` can be adjusted to penalize large weights, which makes the model more general and helps avoid overfitting. 

In [None]:
# L2 penalty factor applied to the kernels of the regularized layers. Default = 0.01
L2_REGULARIZATION = 1e-6

In [None]:
def _l2_penalty():
    """Return a fresh L2 kernel regularizer weighted by L2_REGULARIZATION."""
    return tf.keras.regularizers.l2(L2_REGULARIZATION)

# The embedding table needs one row per vocabulary entry known to the encoder.
vocabulary_size = len(encoder.get_vocabulary())
embedding = tf.keras.layers.Embedding(vocabulary_size, 64, mask_zero=True)

# First recurrent layer returns the full sequence so a second one can be stacked on it.
sequence_lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64, return_sequences=True, kernel_regularizer=_l2_penalty())
)
summary_lstm = tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(32, kernel_regularizer=_l2_penalty())
)

hidden = tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=_l2_penalty())
dropout = tf.keras.layers.Dropout(0.5)
# Single output unit with no activation.
output = tf.keras.layers.Dense(1)

# Raw text goes in; the encoder is the first layer of the model.
model = tf.keras.Sequential([
    encoder,
    embedding,
    sequence_lstm,
    summary_lstm,
    hidden,
    dropout,
    output,
])

In [None]:
# Render a diagram of the model's layers; as the last expression in the cell,
# its result is what the notebook displays.
# NOTE(review): plot_model presumably needs pydot/graphviz installed — confirm.
tf.keras.utils.plot_model(model)

## Training

1. `EPOCHS` is the number of complete passes over the training dataset during training.
2. `VALIDATION_STEPS` is the total number of steps (batches of samples) to draw before stopping when performing validation at the end of every epoch.
    1. See [TF Keras Sequential](https://www.tensorflow.org/api_docs/python/tf/keras/Sequential#fit)
3. `LOSS_CLASS` sets the type of loss function used.
4. `OPTIMIZER` selects which optimizer from Keras to use. 
    1. See [TF Keras Optimizers](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers)
5. `LEARNING_RATE` sets the learning rate of the optimizer, e.g. 0.03, 0.01, 0.003, 0.001, etc.


In [None]:
# Number of epochs passed to model.fit.
EPOCHS = 5
# Validation batches per epoch: None, 20, 40, etc. None will use all validation data.
VALIDATION_STEPS = None
# Key into the loss table: 'CrossEntropy' or 'FocalCrossEntropy'.
LOSS_CLASS = 'CrossEntropy'
# Key into the optimizer table: 'Adam', 'Adadelta' or 'SGD'.
OPTIMIZER = 'Adam'
# Learning rate for the optimizer.
LEARNING_RATE = 1e-4

In [None]:
# Loss functions selectable through LOSS_CLASS. Both use from_logits=True
# because the model's final Dense(1) layer has no activation.
loss = {
    'CrossEntropy'      : tf.keras.losses.BinaryCrossentropy(from_logits=True),
    'FocalCrossEntropy' : tf.keras.losses.BinaryFocalCrossentropy(from_logits=True)
}

# Optimizers selectable through OPTIMIZER. Every entry receives LEARNING_RATE,
# so the setting takes effect whichever optimizer is chosen (previously only
# Adam used it; Adadelta and SGD silently fell back to their defaults).
optimizer = {
    'Adam'      : tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    'Adadelta'  : tf.keras.optimizers.Adadelta(learning_rate=LEARNING_RATE),
    'SGD'       : tf.keras.optimizers.SGD(learning_rate=LEARNING_RATE)
}

# Metrics reported during training and evaluation, in this order after the loss.
metrics =   [   
                tf.keras.metrics.BinaryCrossentropy(from_logits=True, name='binary_crossentropy'), 
                tf.keras.metrics.BinaryAccuracy(name='binary_accuracy'),
            ]

In [None]:
# Look up the configured loss and optimizer by name, then compile the model.
selected_loss = loss[LOSS_CLASS]
selected_optimizer = optimizer[OPTIMIZER]
model.compile(
    loss=selected_loss,
    optimizer=selected_optimizer,
    metrics=metrics,
)

In [None]:
# Model training.
# Training is skipped only for a control run that reuses the pre-trained model
# (controlTest_flag True and retrainControlTest_flag False).
if not controlTest_flag or retrainControlTest_flag:
    # Validate on the dedicated validation split. Using test_ds here would
    # expose the test set during training and leave valid_ds unused.
    history = model.fit(train_ds, epochs=EPOCHS,
                        validation_data=valid_ds,
                        validation_steps=VALIDATION_STEPS)

In [None]:
#model.save('../Data/IMDB_RNNControlTestModel', include_optimizer=False)

## Results

Evaluating the model on the test dataset.

In [None]:
# Evaluate on the test split.
# Use the model trained in this session whenever training ran (same condition
# as the training cell). Only a control run with retraining disabled loads the
# pre-trained control model; previously a stock run with
# retrainControlTest_flag False evaluated the IMDB model instead.
if not controlTest_flag or retrainControlTest_flag:
    test_loss, test_bin, test_acc = model.evaluate(test_ds)
else:
    reloaded_model = tf.keras.models.load_model('../Data/IMDB_RNNControlTestModel')
    # Recompile with the configured loss, optimizer and metrics before evaluating.
    # NOTE(review): presumably needed because the model was saved with
    # include_optimizer=False (see the commented-out model.save call) — confirm.
    reloaded_model.compile(loss=loss[LOSS_CLASS],
              optimizer=optimizer[OPTIMIZER],
              metrics=metrics)
    test_loss, test_bin, test_acc = reloaded_model.evaluate(test_ds)

# evaluate() returns the loss followed by the compiled metrics in order.
print('Test Loss:', test_loss)
print('Test Binary Cross Entropy:', test_bin)
print('Test Accuracy:', test_acc)

In [None]:
# Training curves. `history` only exists when training ran in this session
# (same condition as the training cell), so plotting is skipped otherwise
# instead of failing with a NameError.
if not controlTest_flag or retrainControlTest_flag:
    plt.figure(figsize=(16, 8))
    plotted_metrics = ('binary_accuracy', 'loss', 'binary_crossentropy')
    # One panel per metric on a 2x2 grid; subplot positions are 1-based.
    for position, metric_name in enumerate(plotted_metrics, start=1):
        plt.subplot(2, 2, position)
        plot_graphs(history, metric_name)
else:
    print('No training history to plot: the pre-trained control model was used.')