In [1]:
import numpy as np
import pandas as pd
import time
import os

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from tensorflow.keras.models import save_model, load_model
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.data.experimental import save, load

In [2]:
# =========================================================================================================
# This function takes as input a dataframe containing reddit posts, and returns a filtered version of that
# dataframe where the length of all posts is between word_count_min and word_count_max.
# =========================================================================================================
def filter_samples_by_word_count(df, word_count_max, word_count_min):
    
    df['all_text_data'] = df['all_text_data'].astype(str)
    
    df['word_count'] = df['all_text_data'].apply(lambda text: len(text.split()))
    
    # create word count filters
    min_word_filter = (df['word_count'] >= word_count_min)
    max_word_filter = (df['word_count'] <= word_count_max)
    
    df = df.loc[(min_word_filter) & (max_word_filter), :].copy(deep=True)
    
    return df

In [3]:
# =========================================================================================================
# This function references the 'created_utc' column to print the data and time of creation for the
# newest and oldest post in the dataset.
# =========================================================================================================
def print_post_times(df):
    
    newest_post_time = np.max(df.loc[:, 'created_utc'].to_numpy())
    oldest_post_time = np.min(df.loc[:, 'created_utc'].to_numpy())
    
    newest_post_string = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(newest_post_time))
    oldest_post_string = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(oldest_post_time))
    
    print("===================================================")
    print(f"Newest Post: {newest_post_string}")
    print(f"Oldest Oldest: {oldest_post_string}")
    print("===================================================\n")

In [4]:
# =========================================================================================================
# This function prints the number of duplicate samples in a dataframe. 
# Only the all_text_data and subreddit columns are considered when finding duplicates.
# =========================================================================================================
def duplicate_check(df):
    
    df = df.loc[:, ['all_text_data', 'subreddit']]
    
    print("===================================================")
    print(f"Total Number of duplicates: {df.duplicated().sum()}")
    print("===================================================\n")

In [5]:
# ===================================================================================================================================
# This function is used to print the number of unique words found in the training set after it is created in the 
# make_train_and_unused_sets function. 
#
# Displaying this information could be helpful when considering what value to choose for max_tokens when building tensorflow datasets.
# ===================================================================================================================================
def print_num_unique_words(df):
    
    all_text = df.loc[:, 'all_text_data'].str.cat(sep=' ')
    num_unique_words = len(list(set(all_text.split())))
    
    print("===================================================")
    print(f"Number of unique words in the training set {num_unique_words}")
    print("===================================================\n")

In [6]:
# ====================================================================================================================================
# NOTE: This function only needs to be run once ever!
# 
# This function is used to allocate the data into different sets to prepare for the process of iteratively retraining models with
# larger and larger training sets and evaluating their performance.
#
# This function splits file containing the fully preprocessed posts (Processed_Through_Lemmatization_2073132_ALL_DATA.csv) into three datsets.
#
# 1) {min_words}_to_{max_words}_words_preprocessed_ALL_POTENTIAL_TRAIN.csv
# 2) {min_words}_to_{max_words}_words_preprocessed_VALIDATION.csv
# 3) {min_words}_to_{max_words}_words_preprocessed_TEST.csv
#
# In the above, min_words and max_words represent the minimum and maximum acceptable word length for any post. All posts outside this
# range are discarded. For my experimentation I used min_words = 5 and max_words = 20 which left me with a little over 1 million
# posts in total to experiement with. This process could be reperformed with a different min_word and max_word value if desired. 
# 
# The validation and test sets will contain 25k samples each. The all_potential_train set will contain the remainder of the
# samples that meet the min_words to max_words criteria.
# 
# The test set will be used to measure and compare the performance of models trained on different size training sets.
# 
# The validation set will be used when training neural networks only. This set will be used as the validation_data
# parameter when calling the tf.keras.fit() method. https://www.tensorflow.org/api_docs/python/tf/keras/Model
# 
# The all_potential_train set will be pulled from to create larger and larger training sets as required to complete the 
# model performance vs training set size experimentation. 
#
# ====================================================================================================================================

def build_val_and_test_sets(max_words=20, min_words=5):
    
    # All preprocessed data.
    full_preprocessed_df = pd.read_csv("./data/Processed/Processed_Through_Lemmatization_2073132_ALL_DATA.csv")
    
    # Shuffle everything just for fun. (The data is initially very unshuffled, first half all crypto second half all wsb).
    full_preprocessed_df = full_preprocessed_df.sample(frac=1, random_state = 42, axis='index').reset_index(drop=True)
    
    # Filter posts to only inlcude those that meet the min_words to max_words criteria.
    filtered_df = filter_samples_by_word_count(full_preprocessed_df, word_count_max=max_words, word_count_min=min_words)
    
    # Only keep the features and target columns, as well as created_utc incase its interesting to look at later.
    filtered_df.drop(columns=['selftext', 'title', 'word_count'], inplace=True)
    
    # Remove any post that is a duplicate (only consider the features and target when finding duplicates).
    filtered_df = filtered_df.loc[ (filtered_df[['all_text_data', 'subreddit']].duplicated() == False), :]
    
    # Map the target to numeric values.
    print("Mapping the target to numeric.... 0 for wsb and 1 for crypto.\n")
    filtered_df['subreddit'] = [0 if reddit == 'wallstreetbets' else 1 for reddit in filtered_df['subreddit']]
    
    # Save a copy of all posts that were between min_words and max_words (no duplicates).
    # This file is not necessarily needed for anything and is saved here only as a convenience. This would be the same data as
    # concatenating the train, val and potential train sets back together.
    filtered_df.to_csv(f"./data/Processed/increment_train_size/{min_words}_to_{max_words}_words_preprocessed_ALL_POSTS.csv", index=False)
    
    # Verify no duplicates are left in the dataset.
    print(f"Duplicates -- all {min_words} to {max_words} word posts.")
    duplicate_check(filtered_df)
    
    # Display the created time for the newest and oldest post.
    print("Post times for the full dataset")
    print_post_times(filtered_df)
    
    X = filtered_df.drop(columns='subreddit')
    y = filtered_df.loc[:, 'subreddit']
    
    # Split the data into a large set and a validation set of size 25k
    X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=25000)
    
    # Put the validation data into a df
    val_df = pd.concat([X_val, y_val], axis=1)
    
    # Save validation data to a .csv
    val_df.to_csv(f"./data/Processed/increment_train_size/{min_words}_to_{max_words}_words_preprocessed_VALIDATION.csv", index=False)
    
    # Split once more, into the "potential training" set and a test set of 25k
    X_TRAIN, X_test, y_TRAIN, y_test = train_test_split(X_train, y_train, stratify=y_train, test_size=25000)
    
    # Put the test set into a dataframe.
    test_df = pd.concat([X_test, y_test], axis=1)
     
    # Save the test set to .csv
    test_df.to_csv(f"./data/Processed/increment_train_size/{min_words}_to_{max_words}_words_preprocessed_TEST.csv", index=False)
    
    # Create the "Potential" training set with the remainder of the data.
    train_df = pd.concat([X_TRAIN, y_TRAIN], axis=1)
    
    # Save the potential training set to .csv
    train_df.to_csv(f"./data/Processed/increment_train_size/{min_words}_to_{max_words}_words_preprocessed_ALL_POTENTIAL_TRAIN.csv", index=False)
    
    # Check for duplicates once more
    print("Train set duplicates:")
    duplicate_check(train_df)
    print("Validation set duplicates:")
    duplicate_check(val_df)
    print("Test set duplicates:")
    duplicate_check(test_df)
    print("")
       
    # Check for post time distributions once more
    print("Train set times:")
    print_post_times(train_df)
    print("Validation set times:")
    print_post_times(val_df)
    print("Test set times:")
    print_post_times(test_df)
    print("")
    
    # Check for class distributions
    print(f"Train Distribution: \n {train_df['subreddit'].value_counts(normalize=True)}\n")
    print(f"Validation Distribution: \n {val_df['subreddit'].value_counts(normalize=True)}\n")
    print(f"Test Distribution: \n {test_df['subreddit'].value_counts(normalize=True)}\n")
    
    # Check dataset sizes:
    print(f"POTENTIAL Train set size {len(train_df.index)}")
    print(f"Validation set size {len(val_df.index)}")
    print(f"Test set size {len(test_df.index)}")

#build_val_and_test_sets()

In [7]:
# ====================================================================================================================================
# This function is used to pull from the {min_words}_to_{max_words}_words_preprocessed_ALL_POTENTIAL_TRAIN.csv file to create
# a dataset of a desired size.
# ====================================================================================================================================
def make_train_and_unused_sets(train_set_size, min_words=5, max_words=20):
    
    # Read in the dataset that contains all potential training data of posts between min_words and max_words.
    potential_train = pd.read_csv(f"./data/Processed/increment_train_size/{min_words}_to_{max_words}_words_preprocessed_ALL_POTENTIAL_TRAIN.csv")
    
    
    # Split the ALL_POTENTIAL_TRAIN data into a dataset of the desired size (train_set_size) and the remainder (the unused posts).
    X = potential_train.drop(columns='subreddit')
    y = potential_train['subreddit']
    X_train, X_unused, y_train, y_unused = train_test_split(X, y, stratify=y, train_size=train_set_size)
    
    # Convert the training and unused posts into dataframes.
    train_df = pd.concat([X_train, y_train], axis=1)
    unused_df = pd.concat([X_unused, y_unused], axis=1)
    
    # Create two .csv files, one for the training set of size train_set_size. The other is the unused posts which will
    # be of size ALL_POTENTIAL_TRAIN - train_set_size.
    train_df.to_csv(f"./data/Processed/increment_train_size/train{train_set_size}/train_{train_set_size}.csv", index=False)
    train_df.to_csv(f"./data/Processed/increment_train_size/train{train_set_size}/unused_{train_set_size}.csv", index=False)
    
    # Check for duplicates once more
    print("Train set duplicates:")
    duplicate_check(train_df)
    print("Unused set duplicates:")
    duplicate_check(unused_df)
    print("")
       
    # Check for post time distributions once more
    print("Train set times:")
    print_post_times(train_df)
    print("Unused set times:")
    print_post_times(unused_df)
    print("")
    
    # Check for class distribution
    print(f"Train Distribution: \n {train_df['subreddit'].value_counts(normalize=True)}\n")
    print(f"Unused Distribution: \n {unused_df['subreddit'].value_counts(normalize=True)}\n")

    # Check dataset size:
    print(f"Train set size {len(train_df.index)}")
    print(f"Unused set size {len(unused_df.index)}")
    
    # Print number of unique words
    print_num_unique_words(train_df)


In [8]:
# ====================================================================================================================================
# This function takes as input a list of desired training set sizes. The function will then create folders for each training set size
# at a path specified by ./data/Processed/increment_train_size/train{train_size}/ and will call the make_train_and_unused_sets
# function to create the dataset.
# ====================================================================================================================================
def make_multiple_training_sets(train_set_sizes, base_path="./data/Processed/increment_train_size/", min_words=5, max_words=20):
    
    for train_size in train_set_sizes:
        
        try:
            os.mkdir(base_path + f"train{train_size}/")
            make_train_and_unused_sets(train_size, min_words=min_words, max_words=max_words)
        except:
            print("Path already exists!")
            print(base_path + f"train{train_size}/")
            print("\n")

In [10]:
train_sizes = [100 * n for n in range(1, 21)] + [250 * n for n in range(9, 20)] + [5000 * n for n in range(1, 101)]

# Run this to make all the training set files.
#make_multiple_training_sets(train_set_sizes=train_sizes)

Train set duplicates:
Total Number of duplicates: 0

Unused set duplicates:
Total Number of duplicates: 0


Train set times:
Newest Post: 2021-06-26 18:59:34
Oldest Oldest: 2014-12-08 08:19:32

Unused set times:
Newest Post: 2021-06-27 08:33:09
Oldest Oldest: 2014-12-05 03:22:31


Train Distribution: 
 0    0.5026
1    0.4974
Name: subreddit, dtype: float64

Unused Distribution: 
 0    0.5026
1    0.4974
Name: subreddit, dtype: float64

Train set size 20000
Unused set size 885525
Number of unique words in the training set 19888

Train set duplicates:
Total Number of duplicates: 0

Unused set duplicates:
Total Number of duplicates: 0


Train set times:
Newest Post: 2021-06-27 08:20:42
Oldest Oldest: 2014-12-05 05:30:09

Unused set times:
Newest Post: 2021-06-27 08:33:09
Oldest Oldest: 2014-12-05 03:22:31


Train Distribution: 
 0    0.5026
1    0.4974
Name: subreddit, dtype: float64

Unused Distribution: 
 0    0.5026
1    0.4974
Name: subreddit, dtype: float64

Train set size 100000
Un

## Tensorflow dataset section

In [24]:
# ====================================================================================================================================
# This function takes as input a list of pandas dataframes.
# This function outputs a list of tensorflow datasets that is the result of return each dataframe into a tf dataset.
# ====================================================================================================================================
def build_tf_datasets(dfs, batch_size=32):
    
    tf_datasets = []
    
    for df in dfs: 
        
        df = df.loc[:, ['all_text_data', 'subreddit']].copy(deep=True)
         
        # Make sure the text is all string datatype
        df['all_text_data'] = df['all_text_data'].astype(str)
        
        # Make sure the subreddit column is all int datatypes
        df['subreddit'] = df['subreddit'].astype('int64')
        
        # Create the tensorflow dataset with the appropriate batch size
        ds = tf.data.Dataset.from_tensor_slices((tf.cast(df['all_text_data'], tf.string),
                                                 tf.cast(df['subreddit'], tf.int64))).batch(batch_size)
        
        tf_datasets.append(ds)
    
    return tf_datasets

In [25]:
# ====================================================================================================================================
# This function takes as input the following values:
#
# 1) max_tokens: The maximum allowed vocab size
#
# 2) max_length: The maximum length of any sequence in the dataset. Using this value, the output integer datasets will have all sequences
#    either padded or truncated to be exactly max_length. This means the tensors in the output dataset will be 
#    of shape [batch_size, output_sequence_length] regardless of how many tokens were in the original sequences. 
#    Note: This is only a valid TextVectorization option when output_mode = 'int' as we are doing here.
#  
# 3) A list of tensorflow string datasets set up as [train_ds, val_ds, test_ds].
#
# 4) A "train_only_train_dataset" which is the same information contained in train_ds except the target is not included.
#
# This function uses the keras TextVectorization the learn the vocabulary of the training dataset, and then creates
# integer versions of the training, validation and test datasets which are then returned.
# 
# ====================================================================================================================================
def build_integer_datasets(max_tokens, max_length, datasets, train_only_train_dataset):
    
    # Unpack the string tensorflow datasets
    train_ds, val_ds, test_ds = datasets

    # Set up keras TextVectorization Layer
    text_vectorization = TextVectorization(
        max_tokens=max_tokens,
        output_mode="int",
        output_sequence_length=max_length)
    
    # Let the text vectorization layer learn the training datasets vocabulary.
    text_vectorization.adapt(train_only_train_dataset)
    
    # Use the textvectorization layer to map strings to integers.
    int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))
    int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))
    int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))
    
    return [int_train_ds, int_val_ds, int_test_ds]

In [26]:
# ====================================================================================================================================
# This function takes the following inputs:
#
# 1 - 3) Dataframes containing training, validation and test datasets that we want to make tensorflow datasets for.
#
# 4) The desired batch size for the tensorflow dataset.
#
# 5) max_tokens: The maximum allowed vocab size.
#
# 6) max_length: The maximum allowed length of any sequence in the dataset.
#
# 7) train_size: The number of training examples in train_df. Used for making appropriate filenames only.
#
# 8 - 9) save_datasets, base_path ---> Optional parameters to determine if the datasets should be saved and where to save them. 
#
# This function returns a dictionary containing all tensorflow datasets that were created. 
# ====================================================================================================================================
def create_tensorflow_datasets_from_pandas(train_df, val_df, test_df, batch_size, max_tokens, max_length, train_size, save_datasets=True,
                                           base_path="./data/Processed/deep/tf_data/"):
    
    print("Creating the string tf datasets.\n")
    
    # Create the tensorflow datasets from the pandas dataframe.
    train_ds, val_ds, test_ds = build_tf_datasets(dfs = [train_df, val_df, test_df], batch_size=batch_size)
    
    # Create a tensorflow dataset that only has the text data (not the target). We will use this for letting the TextVectorization
    # layer the vocabulary for our text data.
    text_only_train_ds = train_ds.map(lambda x, y : x)
    
    print("Creating the integer tf datasets.\n")
    
    # Create the integer tensorflow datasets. This is the result of using the TextVectorization layer to convert the strings in the 
    # tensorflow datasets on line 1 to unique integers. 
    int_train_ds, int_val_ds, int_test_ds = build_integer_datasets(datasets=[train_ds, val_ds, test_ds], train_only_train_dataset=text_only_train_ds,
                                                                   max_tokens=max_tokens, max_length=max_length)
    
    # Create a dictionary of all the tf datasets that have been made.
    datasets = {'train_ds' : train_ds,
                'val_ds' : val_ds,
                'test_ds' : test_ds,
                'int_train_ds' : int_train_ds,
                'int_val_ds' : int_val_ds,
                'int_test_ds' : int_test_ds,
                'text_only_train_ds' : text_only_train_ds}
    
    # Save the tensorflow datasets. 
    if save_datasets:
        print("Saving the datasets")
        save(train_ds, path= base_path + f"train{train_size}_batch_{batch_size}_train_ds")
        save(val_ds, path= base_path + f"train{train_size}_batch_{batch_size}_val_ds")
        save(test_ds, path= base_path + f"train{train_size}_batch_{batch_size}_test_ds")
        save(int_train_ds, path= base_path + f"train{train_size}_batch_{batch_size}_maxTokens_{max_tokens}_maxLength_{max_length}_int_train_ds")
        save(int_val_ds, path= base_path + f"train{train_size}_batch_{batch_size}_maxTokens_{max_tokens}_maxLength_{max_length}_int_val_ds")
        save(int_test_ds, path= base_path + f"train{train_size}_batch_{batch_size}_maxTokens_{max_tokens}_maxLength_{max_length}_int_test_ds")
        save(text_only_train_ds, path= base_path + f"train{train_size}_batch_{batch_size}_text_only_train_ds")
    
    return datasets

In [27]:
# ====================================================================================================================================
# This function is used to efficiently call the create_tensorflow_datasets_from_pandas function multiple times.
#
# When passed a list of training_set_sizes (train_set_sizes) this function will attempt to read each sized dataset into a pandas
# dataframe and then will call the create_tensorflow_dataset using the pandas dataframes as input.
#
# If datasets of sizes specified have train_set_sizes have not yet been saved as .csv files, then an error is printed and the
# function returns.
# ====================================================================================================================================
def build_multiple_tf_datasets(train_set_sizes, save_datasets=True, batch_size=32, max_tokens=10000, max_length=20):
    
    # Loop over the list of train_set_sizes to create a tensorflow datasets for each. 
    for size in train_set_sizes:
        try: 
            train_df = pd.read_csv(f"./data/Processed/increment_train_size/train{size}/train_{size}.csv")
            val_df = pd.read_csv(f"./data/Processed/increment_train_size/5_to_20_words_preprocessed_VALIDATION.csv")
            test_df = pd.read_csv(f"./data/Processed/increment_train_size/5_to_20_words_preprocessed_TEST.csv")
        except:
            print(f"A dataset of size {size} has not been made yet.")
            return -1
        
        
        tf_datasets = create_tensorflow_datasets_from_pandas(train_df, val_df, test_df, batch_size=batch_size, max_tokens=max_tokens, max_length=max_length,
                                                             save_datasets=True, base_path=f"./data/Processed/increment_train_size/train{size}/",
                                                             train_size=size) 
    
    return 

In [28]:
# train_sizes = [100, 500, 1000, 3000, 5000, 10000, 30000, 50000, 150000, 200000, 300000, 400000, 500000]
build_multiple_tf_datasets(train_set_sizes=train_set_sizes, save_datasets=True, batch_size=32, max_tokens=20000, max_length=20)

Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
Creating the string tf datasets.

Creating the integer tf datasets.

Saving the datasets
