In [None]:
# Mount Google Drive at /content/drive; every data and model path used in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Uncomment and run if fasttext is not already installed.

# ! git clone https://github.com/facebookresearch/fastText.git

# ! pip install /content/fastText

In [4]:
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score

import fasttext
from fasttext.FastText import load_model

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.data.experimental import save, load

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, Flatten, Dense, LSTM, Dropout
from tensorflow.keras.callbacks import Callback, CSVLogger, EarlyStopping
from tensorflow.keras.metrics import Precision, Recall, SensitivityAtSpecificity, SpecificityAtSensitivity, TruePositives, TrueNegatives, FalsePositives, FalseNegatives, PrecisionAtRecall
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Plain-text dump of every post in the ALL_POTENTIAL_TRAIN dataset.
# This is the corpus the fasttext model is trained on.
reddit_text_path = (
    '/content/drive/MyDrive/Programming/Colab Notebooks/General_Assembly/Project_3_NLP'
    '/data/LSTM/fastText/reddit_posts.txt'
)

# Where the trained fasttext word vector model is saved / loaded from.
word_vector_file = (
    '/content/drive/MyDrive/Programming/Colab Notebooks/General_Assembly/Project_3_NLP'
    '/data/LSTM/fastText/fastText_vectors.bin'
)

# CSV holding all potential training data (source of the fasttext training text).
reddit_dataframe_path = (
    '/content/drive/MyDrive/Programming/Colab Notebooks/General_Assembly/Project_3_NLP'
    '/data/iterating_data/5_to_20_words_preprocessed_ALL_POTENTIAL_TRAIN.csv'
)

# Creating a fasttext word vector model

In [6]:
def check_ft_model(wv_file):
    """Smoke-test a saved fasttext word vector model.

    Loads the model stored at ``wv_file`` and prints the nearest neighbors of
    the word "rocket" as a quick check that the word vectors can be loaded
    and queried.

    Parameters
    ----------
    wv_file : str
        Path to the saved fasttext model.

    Returns
    -------
    None
        The result is only printed.
    """
    separator = "======================================================="

    loaded_model = fasttext.FastText.load_model(wv_file)

    print("verifying fastText model loads correctly by printing the nearest neighbors to rocket.")
    print(separator)

    # Query the model only after the header has been printed, so a failing
    # lookup is reported underneath it.
    neighbors = list(loaded_model.get_nearest_neighbors('rocket'))
    print(f"Nearest Neighbors to rocket:\n {neighbors}")
    print(separator)

In [7]:
def build_ft_model(data_file_path=reddit_dataframe_path, text_data_save_path=reddit_text_path, wv_file=word_vector_file):
    """Train fasttext word vectors on the reddit post text and save the model.

    The 'all_text_data' column of the CSV at ``data_file_path`` is written to
    ``text_data_save_path`` (fasttext trains from a file on disk), a skipgram
    model is trained on that file, and the trained model is saved to
    ``wv_file`` so it can be loaded later when building the neural network's
    embedding matrix.

    Parameters
    ----------
    data_file_path : str
        CSV containing an 'all_text_data' column.
    text_data_save_path : str
        Where the raw text is written for fasttext to read.
    wv_file : str
        Where the trained fasttext model is saved.

    Returns
    -------
    The trained fasttext model.
    """
    print("Reading reddit dataframe...\n")
    posts_df = pd.read_csv(data_file_path)

    print("Grabbing text from dataframe... \n")
    post_text = posts_df['all_text_data']

    # fasttext reads its training corpus from disk, so dump the text first.
    print("Saving text data to path...")
    print(f"{text_data_save_path} \n")
    np.savetxt(text_data_save_path, post_text, fmt='%s')

    print("Training fasttext model...")
    trained_model = fasttext.train_unsupervised(text_data_save_path, model='skipgram')

    # Persist the model so later cells can load it instead of retraining.
    print("Saving model to path...")
    print(f"{wv_file} \n")
    trained_model.save_model(wv_file)
    print("Model save complete!")

    return trained_model

In [8]:
# Commented out because the fasttext word vector model has already been created.
# ft_model = build_ft_model()

# Training an RNN

Note: The build_and_train_rnn function allows for the choice of using fasttext word vectors in its embedding layer or having the network learn its own embeddings.

In [9]:
def build_embedding_matrix(training_data_csv_path, max_vocab_size, max_sequence_length, ft_file=word_vector_file, embedding_dim=100):
    """Build an embedding matrix populated with fasttext word vectors.

    The returned matrix can be loaded into the embedding layer of a recurrent
    neural network. The function:

    1) Loads the trained fasttext model saved at ``ft_file``.
    2) Instantiates a Keras TextVectorization layer using ``max_vocab_size``
       and ``max_sequence_length``.
    3) Reads the training data at ``training_data_csv_path`` and lets the
       TextVectorization layer learn the vocabulary of its 'all_text_data'
       column.
    4) Fills row ``i`` of the matrix with the fasttext vector of the ``i``-th
       vocabulary token, for the first ``max_vocab_size`` tokens.

    Parameters
    ----------
    training_data_csv_path : str
        CSV with an 'all_text_data' column.
    max_vocab_size : int
        Number of rows of the matrix / ``max_tokens`` of the vectorizer.
    max_sequence_length : int
        ``output_sequence_length`` of the vectorizer.
    ft_file : str
        Path to the trained fasttext model.
    embedding_dim : int
        Number of columns of the matrix; must equal the dimensionality of the
        fasttext vectors.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_vocab_size, embedding_dim)``; rows without a
        vector stay all zeros.

    Raises
    ------
    ValueError
        If the fasttext model's vector size differs from ``embedding_dim``.
    """
    # Load the trained fasttext model.
    print("Loading fasttext word vector model...\n")
    ft_model = fasttext.FastText.load_model(ft_file)

    # Fail early with a readable message instead of a numpy broadcasting
    # error in the middle of the population loop.
    ft_dim = ft_model.get_dimension()
    if ft_dim != embedding_dim:
        raise ValueError(
            f"embedding_dim={embedding_dim} does not match the dimensionality "
            f"of the fasttext vectors ({ft_dim}) stored in {ft_file}")

    print("Instantiating keras text vectorizer...\n")

    # Instantiate the keras text vectorizer.
    text_vectorization = TextVectorization(
        max_tokens=max_vocab_size,
        output_mode="int",
        output_sequence_length=max_sequence_length,
        split="whitespace")

    print("Reading in the training data file...")
    print(training_data_csv_path, "\n")

    # Load the training text. astype(str) turns missing values into the
    # string 'nan' so the vectorizer only ever sees strings.
    train_df = pd.read_csv(training_data_csv_path)
    X_train = train_df.loc[:, 'all_text_data'].astype(str).to_numpy()

    print("Having the text vectorizer learn the training data vocabulary...\n")

    # Let the keras TextVectorization layer learn the vocabulary of the
    # training data.
    text_vectorization.adapt(X_train)

    print("Getting vocab from text_vectorizer...\n")

    # The position of a token in this list is the integer the vectorizer
    # maps it to, and therefore its row in the embedding matrix.
    vocabulary = text_vectorization.get_vocabulary()

    # Create an empty embedding matrix (filled with all zeros).
    print("Creating empty embedding matrix...\n")
    embedding_matrix = np.zeros((max_vocab_size, embedding_dim))

    print("Populating embedding matrix...\n")

    for index, word in enumerate(vocabulary):

        # Never write past the matrix. The vectorizer is expected to order
        # tokens by frequency, so the rows kept are the most frequent words.
        if index >= max_vocab_size:
            break

        # The lookup and the assignment are both inside the bounds guard, so
        # a stale vector from a previous iteration can never be reused.
        embedding_vector = ft_model.get_word_vector(word)

        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
        else:
            print(f"The fasttext model did not return a vector for word: {word}")

    print("Returning embedding matrix...")
    return embedding_matrix

In [10]:
# After we finish the fasttext section, make sure these get imported in case there was any collision
# with the same names being used in the fasttext library.
from tensorflow.keras.models import save_model, load_model

In [11]:
def load_int_datasets(base_path, train_size, max_tokens, batch_size=32, max_sequence_length=20):
    """Load the saved integer tensorflow datasets used when training the RNN.

    The datasets are read from
    ``{base_path}train{train_size}/train{train_size}_batch_{batch_size}_maxTokens_{max_tokens}_maxLength_{max_sequence_length}_int_{split}_ds``
    for ``split`` in ``train``, ``val`` and ``test``.

    Parameters
    ----------
    base_path : str
        Directory (with trailing slash) that contains the ``train{size}/`` folders.
    train_size : int
        Training set size used in the folder and file names.
    max_tokens : int
        Vocabulary size used in the file names.
    batch_size : int, optional
        Batch size used in the file names (default 32).
    max_sequence_length : int, optional
        Sequence length used in the file names (default 20).

    Returns
    -------
    tuple
        ``(int_train_ds, int_val_ds, int_test_ds)``.
    """
    print("loading the datasets")

    # Everything up to the split name is shared by the three datasets.
    dataset_prefix = (f"{base_path}train{train_size}/train{train_size}_batch_{batch_size}"
                      f"_maxTokens_{max_tokens}_maxLength_{max_sequence_length}_int_")

    # The training split must come from the *train* dataset; loading the test
    # dataset here would train the network on the data it is later scored on.
    int_train_ds = load(dataset_prefix + "train_ds")
    int_val_ds = load(dataset_prefix + "val_ds")
    int_test_ds = load(dataset_prefix + "test_ds")

    print("Finished Loading the  datasets!")
    return int_train_ds, int_val_ds, int_test_ds

In [12]:
def build_and_train_rnn(training_data_csv_path, train_set_size, epochs, checkpoint_save, max_tokens, use_fasttext_embeddings=False, final_save=None, dropout=0.5,
                        optimizer='rmsprop', max_sequence_length=20, metrics=['accuracy'], loss="binary_crossentropy", int_train_ds=None, int_test_ds=None,
                        int_val_ds=None, ft_file=word_vector_file, embedding_dim=100, score_df=None,
                        base_data_and_save_path="/content/drive/MyDrive/Programming/Colab Notebooks/General_Assembly/Project_3_NLP/data/iterating_data/"):
    """Build, train and evaluate a bidirectional LSTM classifier.

    Steps:

    1) Load the tensorflow datasets from the
       ``base_data_and_save_path/train{train_set_size}/`` folder for any of
       ``int_train_ds`` / ``int_val_ds`` / ``int_test_ds`` that was not
       passed in.
    2) Build the file paths for the "best" model (``checkpoint_save``) and,
       optionally, for the model as it exists after the last epoch
       (``final_save``).
    3) Create the embedding layer: frozen fasttext vectors if
       ``use_fasttext_embeddings`` is True, otherwise a trainable embedding.
    4) Build Embedding -> Bidirectional(LSTM(32)) -> Dropout -> Dense(1, sigmoid).
    5) Compile and fit the model, checkpointing with ``save_best_only=True``.
    6) Reload the checkpointed model, evaluate it and, if ``score_df`` is
       given, append the train/val/test scores and save them to a .csv.

    Parameters
    ----------
    training_data_csv_path : str
        Training CSV, used to build the fasttext embedding matrix.
    train_set_size : int
        Size of the training set; used in folder and file names.
    epochs : int
        Number of epochs to train for.
    checkpoint_save : str
        File name (inside the ``train{train_set_size}`` folder) for the best model.
    max_tokens : int
        Vocabulary size (``input_dim`` of the embedding layer).
    use_fasttext_embeddings : bool
        Use frozen fasttext vectors instead of learned embeddings.
    final_save : str or None
        File name for the model after the last epoch; None skips that save.
    dropout : float
        Dropout rate applied between the LSTM and the Dense layer.
    optimizer, metrics, loss
        Passed to ``model.compile``. The first metric is reported as accuracy.
    max_sequence_length : int
        ``input_length`` of the embedding layer.
    int_train_ds, int_val_ds, int_test_ds
        Pre-loaded datasets; any left as None is loaded from disk.
    ft_file : str
        Path to the trained fasttext model.
    embedding_dim : int
        Size of the word embeddings.
    score_df : pandas.DataFrame or None
        Running table of scores to append this run's scores to.
    base_data_and_save_path : str
        Root folder of the datasets and saved models.

    Returns
    -------
    The best model when ``score_df`` is None, otherwise the tuple
    ``(best_model, updated_score_df)``.
    """
    # Load whichever datasets were not passed in. Checking all three avoids
    # fitting without validation data or evaluating on a missing test set.
    if int_train_ds is None or int_val_ds is None or int_test_ds is None:
        loaded_train_ds, loaded_val_ds, loaded_test_ds = load_int_datasets(base_data_and_save_path, train_set_size, max_tokens)
        if int_train_ds is None:
            int_train_ds = loaded_train_ds
        if int_val_ds is None:
            int_val_ds = loaded_val_ds
        if int_test_ds is None:
            int_test_ds = loaded_test_ds

    # Create save paths for the model. Stripping the trailing slash keeps the
    # path free of a doubled "//" when the base path already ends in one.
    base_save_path = base_data_and_save_path.rstrip("/") + f"/train{train_set_size}/"

    # checkpoint_save is where the model is saved each time there is a new "best model".
    checkpoint_save = base_save_path + checkpoint_save

    # Path of the final model (the neural network at the last epoch in training).
    if final_save is not None:
        final_save = base_save_path + final_save

    # If we are using fasttext embeddings in the embedding matrix
    if use_fasttext_embeddings:

        # Build the embedding matrix
        embedding_matrix = build_embedding_matrix(training_data_csv_path=training_data_csv_path,
                                                  max_vocab_size=max_tokens,
                                                  max_sequence_length=max_sequence_length,
                                                  ft_file=ft_file,
                                                  embedding_dim=embedding_dim)

        # Instantiate the embedding layer using the embedding matrix created above.
        # trainable=False means the network should not change the word embeddings it was initialized with.
        embedding_layer = layers.Embedding(input_dim=max_tokens,
                                           output_dim=embedding_dim,
                                           embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                                           input_length=max_sequence_length,
                                           trainable=False,
                                           mask_zero=True)

    # Else the network will need to learn its own embeddings.
    else:

        print("Instatiating the embedding layer.")
        print("This network will learn its own word embeddings.")

        embedding_layer = layers.Embedding(input_dim=max_tokens,
                                           output_dim=embedding_dim,
                                           input_length=max_sequence_length,
                                           mask_zero=True)

    print("Building model architecture")
    #--------------------------------------------------------
    # Model architecture
    #--------------------------------------------------------
    inputs = keras.Input(shape=(None,), dtype="int64")
    embedded = embedding_layer(inputs)
    x = layers.Bidirectional(layers.LSTM(32))(embedded)
    x = layers.Dropout(dropout)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)

    # Compile model
    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=metrics)

    print("============================= Model Summary==================================")
    # summary() prints the table itself and returns None, so it is not wrapped in print().
    model.summary()
    print("===============================================================\n\n")

    # Callback to save current "best" model
    callbacks = [keras.callbacks.ModelCheckpoint(checkpoint_save,
                                                 save_best_only=True)]

    # Fit the model
    print("Fitting model")
    model.fit(int_train_ds, validation_data=int_val_ds, epochs=epochs, callbacks=callbacks)

    # If we want to save the model at the final epoch.
    if final_save is not None:
        save_model(model=model,
                   filepath=final_save,
                   overwrite=True,
                   include_optimizer=True)

    # Load the best model
    model = keras.models.load_model(checkpoint_save)

    # Evaluate the best model on the test set once and reuse the result.
    # evaluate() returns [loss, metric_1, ...]; index 1 is the first compiled
    # metric, reported as accuracy (assumes metrics[0] is accuracy).
    test_accuracy = model.evaluate(int_test_ds)[1]

    # If a dataframe was passed in to record the train, test and validation scores in, update that dataframe here.
    if score_df is not None:
        print("Updating scores...\n")

        # Scores from this round, as a one-row dataframe.
        round_scores_df = pd.DataFrame({"train_accuracy": [model.evaluate(int_train_ds)[1]],
                                        "val_accuracy": [model.evaluate(int_val_ds)[1]],
                                        "test_accuracy": [test_accuracy],
                                        "train_data_size": [train_set_size]})

        # Concatenate this network's scores to the dataframe tracking all scores.
        score_df = pd.concat([score_df, round_scores_df], ignore_index=True)

        # Name the scores file after the embedding type so a learned-embedding
        # run does not overwrite the fasttext scores (the fasttext file name is unchanged).
        embedding_tag = "fasttext" if use_fasttext_embeddings else f"learned_embeddings{embedding_dim}"
        score_df.to_csv(base_save_path + f"BRNN_{embedding_tag}_SCORES{train_set_size}.csv", index=False)

    print(f"Test acc: {test_accuracy:.3f}")

    # If we are using a score dataframe, return it together with the best model.
    if score_df is not None:
        return model, score_df

    return model

In [13]:
def train_rnn_with_multiple_training_set_sizes(train_sizes, epochs, max_tokens, use_fasttext_embeddings=False,
                                               dropout=0.5, optimizer='rmsprop', max_sequence_length=20, metrics=['accuracy'],
                                               loss="binary_crossentropy", ft_file=None, embedding_dim=256, save_model_at_last_epoch=None,
                                               base_data_and_save_path ="/content/drive/MyDrive/Programming/Colab Notebooks/General_Assembly/Project_3_NLP/data/iterating_data/"):
    """Train the RNN once per training set size and collect the scores.

    For every ``size`` in ``train_sizes`` the network is trained with
    ``build_and_train_rnn`` using the training file
    ``{base_data_and_save_path}train{size}/train_{size}.csv``.

    Parameters
    ----------
    train_sizes : iterable of int
        Training set sizes to train with.
    epochs : int
        Number of epochs for every run.
    max_tokens : int
        Vocabulary size.
    use_fasttext_embeddings : bool
        True to use fasttext vectors, False for the network to learn embeddings.
    dropout, optimizer, max_sequence_length, metrics, loss
        Forwarded unchanged to ``build_and_train_rnn``.
    ft_file : str or None
        Path to the trained fasttext model. When None and
        ``use_fasttext_embeddings`` is True, the notebook-level
        ``word_vector_file`` is used.
    embedding_dim : int
        Size of the word embeddings. The fasttext example call in this
        notebook passes 100 rather than the default.
    save_model_at_last_epoch : bool or None
        Truthy to also save the model as it exists after the last epoch.
    base_data_and_save_path : str
        Root folder of the datasets and saved models.

    Returns
    -------
    pandas.DataFrame
        One row per training set size with the columns train_accuracy,
        val_accuracy, test_accuracy and train_data_size.
    """
    # An explicit ft_file=None would override build_and_train_rnn's own
    # default, so resolve the default here when fasttext is used.
    if use_fasttext_embeddings and ft_file is None:
        ft_file = word_vector_file

    # The saved model file names depend on whether fasttext was used.
    if use_fasttext_embeddings:
        model_name_prefix = "fasttext_embeddings"
    else:
        model_name_prefix = f"learned_embeddings{embedding_dim}"

    # Empty dataframe that stores the network's performance for each training set size.
    score_df = pd.DataFrame({"train_accuracy" : [], "val_accuracy" : [], "test_accuracy" : [], 'train_data_size' : []})

    # Single pass over train_sizes, so one-shot iterables work too.
    for train_size in train_sizes:

        file_path = base_data_and_save_path + f"train{train_size}/train_{train_size}.csv"

        # Print the training file we are about to use.
        print(f"Training network with file at path ---> {file_path}")

        # The "best" model is always saved.
        checkpoint_save = f"{model_name_prefix}_BEST_brnn_model_trainsize_{train_size}.keras"

        # Saving the last-epoch model is optional; helpful if we may want to train it more.
        if save_model_at_last_epoch:
            final_save = f"{model_name_prefix}_FINAL_brnn_model_trainsize_{train_size}.keras"
        else:
            final_save = None

        # Train the RNN; the returned best model is not needed here, only the scores.
        _, score_df = build_and_train_rnn(score_df=score_df,                          # Score df for tracking performance across train set sizes
                                          training_data_csv_path=file_path,           # Train data path for building the embedding matrix
                                          train_set_size=train_size,                  # Size of the training set, used in all the file names
                                          max_tokens=max_tokens,                      # Max vocab size, used for building embedding layer
                                          epochs=epochs,                              # Number of epochs to train the network for
                                          dropout=dropout,                            # Dropout rate between LSTM and Dense layer
                                          optimizer=optimizer,                        # Optimizer to use when updating network weights
                                          max_sequence_length=max_sequence_length,    # Length of the longest sequence
                                          metrics=metrics,
                                          loss=loss,
                                          ft_file=ft_file,
                                          embedding_dim=embedding_dim,
                                          base_data_and_save_path=base_data_and_save_path,
                                          checkpoint_save=checkpoint_save,
                                          final_save=final_save,
                                          use_fasttext_embeddings=use_fasttext_embeddings)  # True for fasttext, False to learn embeddings
    return score_df

In [3]:
# Training set sizes to train the network with, from smallest to largest.
train_sizes = [100, 1000, 3000, 5000, 10000, 30000, 50000, 150000, 200000, 300000, 400000, 500000]

# Uncomment the function call below to train and evaluate performance of the RNN with fasttext
# word embeddings using the training set sizes listed above.
# The call is kept inside a string literal so that running this cell does not start training.

'''
train_rnn_with_multiple_training_set_sizes(train_sizes=train_sizes,
                                           epochs=20,
                                           max_tokens=20000,
                                           use_fasttext_embeddings=True,
                                           save_model_at_last_epoch=True,
                                           dropout=0.5,
                                           optimizer='rmsprop',
                                           max_sequence_length=20,
                                           metrics=['accuracy'],
                                           loss="binary_crossentropy",
                                           ft_file=word_vector_file,
                                           embedding_dim=100,
                                           base_data_and_save_path="/content/drive/MyDrive/Programming/Colab Notebooks/General_Assembly/Project_3_NLP/data/iterating_data/")
''';

In [2]:
# Same training set sizes as in the fasttext cell above.
train_sizes = [100, 1000, 3000, 5000, 10000, 30000, 50000, 150000, 200000, 300000, 400000, 500000]

# Uncomment the function call below to train and evaluate performance of the RNN 
# that will learn its own 256 dimensional word embeddings.
# The call is kept inside a string literal so that running this cell does not start training.
# NOTE(review): this call uses epochs=2 while the fasttext call above uses epochs=20 —
# confirm the difference is intentional before comparing the two sets of scores.

'''
train_rnn_with_multiple_training_set_sizes(train_sizes=train_sizes,
                                           epochs=2,
                                           max_tokens=20000,
                                           use_fasttext_embeddings=False,
                                           embedding_dim=256,
                                           save_model_at_last_epoch=True,
                                           dropout=0.5,
                                           optimizer='rmsprop',
                                           max_sequence_length=20,
                                           metrics=['accuracy'],
                                           loss="binary_crossentropy",
                                           ft_file=None,
                                           base_data_and_save_path="/content/drive/MyDrive/Programming/Colab Notebooks/General_Assembly/Project_3_NLP/data/iterating_data/")
''';

### References:

The idea to use a TextVectorization layer for creating the vocabulary, and the RNN architecture I implemented, were inspired by François Chollet's book *Deep Learning with Python*.