In [1]:
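# Uncomment and run these lines if spaCy or the en_core_web_lg model has not been installed on your machine yet.
# (%pip installs into the environment used by the running kernel.)
#%pip install spacy
#!python -m spacy download en_core_web_lg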

Collecting en-core-web-lg==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.0.0/en_core_web_lg-3.0.0-py3-none-any.whl (778.8 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_lg')


In [2]:
import os
import time
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

import en_core_web_lg



# Module-level state used to track how long spaCy is taking to perform lemmatization.
# A `global` statement does nothing at module scope, so the names are simply assigned
# here; any function that rebinds them must declare `global` itself.

# Number of posts that have been passed through lemmatize_text so far.
lem_counter = 0

# Timestamp (seconds since the epoch) taken when this cell was executed.
notebook_clock = time.time()

In [3]:
# Load the output of the 01_Data_Cleaning.ipynb notebook and preview its first rows.
PREPROCESSED_CSV_PATH = "./data/Processed/wsb_crypto_preprocessed_2073132.csv"
reddit_df = pd.read_csv(PREPROCESSED_CSV_PATH)
reddit_df.head()

Unnamed: 0,subreddit,selftext,title,created_utc,all_text_data
0,CryptoCurrency,,All Nodes Ethereum 2.0 Services,1624735574,all nodes ethereum 20 services
1,CryptoCurrency,,Is the Largest Difficulty Adjustment In Bitcoi...,1624735572,is the largest difficulty adjustment in bitcoi...
2,CryptoCurrency,,Binance leaving Ontario: Binance will handle E...,1624735503,binance leaving ontario binance will handle et...
3,CryptoCurrency,I own some purely because of the doge effect. ...,What are your thoughts on shib?,1624735442,what are your thoughts on shib i own some pure...
4,CryptoCurrency,,Miami Beach's most expensive penthouse just so...,1624735282,miami beachs most expensive penthouse just sol...


In [9]:
# Extra words to add to the default stop word list spaCy uses: the single letters
# b-z (plus upper-case 'I' and 'M') and the fragments 'es' and 'ing'.
words = [*"bcdefghi", "I", *"jklm", "M", *"nopqrstuvwxyz", "es", "ing"]

In [11]:
# ================================================================================================================
# This function uses the spacy model to perform lemmatization on each word in a single post.
# ================================================================================================================
def lemmatize_text(text, spaCy_nlp, verbose):
    """Lemmatize a single post and drop its stop words.

    Parameters
    ----------
    text : the post to process. It is converted with str() before being parsed.
    spaCy_nlp : callable that turns a string into an iterable of tokens exposing
        the `lemma_` and `is_stop` attributes (the loaded spaCy model).
    verbose : when truthy, print the original and the lemmatized text whenever
        the two differ. Intended for verifying the implementation only.

    Returns
    -------
    str : the lower-cased lemmas of all non-stop-word tokens, joined by single
        spaces and stripped of leading/trailing whitespace.

    Side effects
    ------------
    Increments the global `lem_counter` on every call. On every 10,000th call a
    progress message is printed together with the seconds elapsed since the
    global `notebook_clock`, because lemmatizing a large dataset is slow.
    """
    global lem_counter
    lem_counter += 1

    hit_progress_checkpoint = (lem_counter % 10000 == 0)
    if hit_progress_checkpoint:
        print(f"We successfully lemmatized {lem_counter} posts...")
        print(f"The notebook clock has been running {time.time() - notebook_clock} seconds...\n\n")

    # Parse the post; str() guards against non-string cells.
    parsed_post = spaCy_nlp(str(text))

    # Keep the lower-cased lemma of every token that is not flagged as a stop word.
    kept_lemmas = []
    for token in parsed_post:
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_.lower())

    lemmatized_text = " ".join(kept_lemmas).strip()

    # Debug aid: show the before/after pair, but only when something changed.
    if verbose and (lemmatized_text != text):
        print("\n===========================================================================")
        print(f"Original: {text}\n")
        print(f"Lemmatized: {lemmatized_text}\n")
        print("===========================================================================\n")

    return lemmatized_text

# Functions to build a numeric dataset

The four functions listed below let the `preprocess` function automatically create a numeric version of the dataset directly after lemmatization and stop word removal are performed. Creating the numeric dataset in this way is disabled by default because it is **not** the preferred method for creating such datasets. The best option for creating numeric datasets is the Keras `TextVectorization` layer (shown in later notebooks): https://keras.io/api/layers/preprocessing_layers/core_preprocessing_layers/text_vectorization/. The numeric dataset capability here can serve as a backup if, for whatever reason, the Keras `TextVectorization` layer is not available.

build_word_frequency_dict

make_frequency_based_word_map

replace_words_with_numbers

transform_texts_to_numeric

In [14]:
def build_word_frequency_dict(df, time_stats):
    """Count how often each word occurs across all posts.

    Parameters
    ----------
    df : DataFrame with an 'all_text_data' column of whitespace-separated words.
    time_stats : when truthy, print progress and timing messages.

    Returns
    -------
    dict : maps each unique word to its number of occurrences, in order of
        first appearance.

    Side effects
    ------------
    Saves the counts to ./support_data/nice_to_have/word_frequencies.csv
    (columns 'word' and 'frequency'), creating the directory if needed.
    """
    start = time.time()

    # Progress output is opt-in, like every other message in this function.
    if time_stats:
        print("Starting to build all_text list.")

    # Convert the texts for all posts into one giant string.
    # Note: Series.str.cat leaves missing values out of the result by default.
    all_text = df.loc[:, 'all_text_data'].str.cat(sep=' ')

    if time_stats:
        print("\n=================================================")
        print(f"Finished building all text list. That took {time.time()-start}")
        print("Going to split all_text into a list of words.")
        print("=================================================\n")

    # Split the giant string into one giant list
    word_list = all_text.split()

    if time_stats:
        print("\n=================================================")
        print("Finished building word list... going to start building the freq_dict.")
        print(f"The length of the word list is {len(word_list)}")
        print(f"Total time in this function is now {time.time() - start}")
        print("=================================================\n")

    # Map each unique word to the number of times it shows up across all reddit posts.
    # Counter keeps first-seen order; dict() keeps the return type a plain dict.
    freq_dict = dict(Counter(word_list))

    if time_stats:
        print("\n=================================================")
        print("Finished building freq_dict... saving to csv and exiting.")
        print(f"Total time in this function is now {time.time() - start}")
        print("=================================================\n")

    # Save the frequency count data to a .csv as a convenience, in case there is any need to review it later.
    # Create the target directory first so a fresh checkout does not fail here.
    output_path = "./support_data/nice_to_have/word_frequencies.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    freq_df = pd.DataFrame({'word': list(freq_dict.keys()), 'frequency': list(freq_dict.values())})
    freq_df.to_csv(output_path, index=False)

    return freq_dict



In [15]:
def make_frequency_based_word_map(frequency_dict, time_stats):
    """Assign every word a unique integer id based on how frequent it is.

    Parameters
    ----------
    frequency_dict : dict mapping word -> number of occurrences.
    time_stats : when truthy, print progress and timing messages.

    Returns
    -------
    dict : maps each word to an id starting at 2 (most frequent word first; ties
        keep the order of `frequency_dict`), plus two reserved entries:
        'mask' -> 0 (padding token) and '<unk>' -> 1 (unknown / out-of-vocab).
        If a reserved name is itself a word in `frequency_dict`, the word keeps
        its own id and the reserved entry is stored under the name wrapped in
        angle brackets (e.g. '<mask>'), so ids 0 and 1 are never given to a word.

    Side effects
    ------------
    Saves the map to ./support_data/nice_to_have/map_words_to_unique_numbers.csv
    (columns 'word' and 'unique_value'), creating the directory if needed.
    """
    start = time.time()

    # Use the frequency dict to create a list of tuples sorted by frequency.
    # The most frequent word will be the first item in the list.
    sorted_frequencies = sorted(frequency_dict.items(), key=lambda pair: pair[1], reverse=True)

    if time_stats:
        print("\n=========================================================")
        print("Turning freq_dict into a list of tuples, sorted by frequency")
        print(f"Time to perform sorting: {time.time() - start}")
        print("Next, using the sorted tuple list to create a list of words only")
        print("=========================================================\n")

    # Create a list of just the words, still ordered most frequent ---> least frequent
    ranked_words = [word for (word, _count) in sorted_frequencies]

    if time_stats:
        print("\n=========================================================")
        print("Finished creating the word list")
        print(f"Total time in this function is now: {time.time() - start}")
        print("Next, mapping each unique word to a unique number...")
        print("=========================================================\n")

    # Give each word a unique number. The most common word gets the smallest number.
    # We start at 2 because 0 and 1 are reserved for the mask (padding) token and
    # the unknown / out-of-vocab token respectively.
    unique_word_map = {word: word_id for (word_id, word) in enumerate(ranked_words, start=2)}

    # Add the reserved tokens without clobbering a real word: plainly assigning
    # unique_word_map['mask'] = 0 would remap the actual word "mask" to the
    # padding id. When the preferred name is taken, wrap it in angle brackets
    # until it is free.
    for preferred_name, reserved_id in (('mask', 0), ('<unk>', 1)):
        token_name = preferred_name
        while token_name in unique_word_map:
            token_name = f"<{token_name}>"
        unique_word_map[token_name] = reserved_id

    if time_stats:
        print("\n=========================================================")
        print("Saving and exiting the make_frequency_based_word_map function...")
        print(f"total time in this function: {time.time() - start}")
        print("=========================================================\n")

    # Save the word ---> unique value map to a .csv in case it needs to be referenced later.
    # Create the target directory first so a fresh checkout does not fail here.
    output_path = "./support_data/nice_to_have/map_words_to_unique_numbers.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    unique_map_df = pd.DataFrame({
        'word': list(unique_word_map.keys()),
        'unique_value': list(unique_word_map.values()),
    })
    unique_map_df.to_csv(output_path, index=False)

    return unique_word_map

In [17]:
def replace_words_with_numbers(text, word_to_num_map):
    """Convert a post into a space-separated string of word ids.

    Parameters
    ----------
    text : the post to convert. Non-string values are converted with str();
        None and NaN are treated as a post with no words.
    word_to_num_map : dict mapping word -> integer id. Words missing from the
        map are encoded with the id stored under '<unk>' (1 if that key is
        absent) instead of raising KeyError.

    Returns
    -------
    str : the ids of the words in `text`, in order, joined by single spaces.
        Empty string when `text` has no words.
    """
    # A missing value (None / NaN cell) must not be encoded as the literal
    # word "None" / "nan"; it simply contains no words.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    unknown_id = word_to_num_map.get('<unk>', 1)

    return " ".join(str(word_to_num_map.get(word, unknown_id)) for word in str(text).split())

In [19]:
# Standalone version of the numeric-dataset stage of the preprocess function.
# Left here in case it is ever needed separately.
def transform_texts_to_numeric(df):
    """Add a numeric encoding of 'all_text_data' to df and save it to .csv.

    Builds the word-frequency dict and the word -> id map from `df` (both with
    timing output enabled), then stores the encoded posts in a new
    'all_text_data_numeric' column. The column is added to `df` itself, and the
    same DataFrame is returned.

    Side effects: writes
    ./data/Processed/Processed_Numeric_Lemmatized_<row count>_ALL_DATA.csv,
    in addition to the files written by the two helper functions.
    """
    vocabulary = make_frequency_based_word_map(
        build_word_frequency_dict(df, time_stats=True),
        time_stats=True,
    )

    def encode_post(post):
        return replace_words_with_numbers(post, vocabulary)

    df['all_text_data_numeric'] = df['all_text_data'].apply(encode_post)

    # The row count is only used to label the output file.
    row_count = len(df.index)
    df.to_csv(f"./data/Processed/Processed_Numeric_Lemmatized_{row_count}_ALL_DATA.csv", index=False)

    return df

# Preprocessing

In [16]:
# Extends spaCy's default stop words with the additional words defined at the
# start of this notebook (or any other iterable passed as new_stop).
def update_stop_words(nlp, new_stop=words):
    """Flag every word in `new_stop` as a stop word in the model's vocabulary.

    Parameters
    ----------
    nlp : loaded spaCy model; its `vocab` is modified in place.
    new_stop : iterable of words to flag. Defaults to the notebook-level `words` list.

    Returns
    -------
    The same `nlp` object that was passed in.
    """
    vocabulary = nlp.vocab

    for extra_word in new_stop:
        entry = vocabulary[extra_word]
        entry.is_stop = True

    return nlp

In [20]:
# ================================================================================================================
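# This function loads the spaCy model, adds the additional stop words, lemmatizes every post in the
# dataframe and saves the result to .csv. It can optionally build and save a numeric version of the data.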
# ================================================================================================================
def preprocess(df, additional_stopwords=words, verbose=False, time_stats=True, build_numeric_dataset=False):
    """Lemmatize every post in df, drop stop words, and save the result to .csv.

    Parameters
    ----------
    df : DataFrame with an 'all_text_data' column. The column is overwritten
        with the lemmatized text on `df` itself.
    additional_stopwords : words flagged as stop words on top of spaCy's defaults.
    verbose : forwarded to lemmatize_text; prints each post before/after.
    time_stats : when truthy, print a timing banner after every stage.
    build_numeric_dataset : when truthy, also build a word -> id map and save a
        copy of the data with an extra 'all_text_data_numeric' column.

    Returns
    -------
    The lemmatized `df` (the numeric copy is only written to disk).

    Side effects: writes
    ./data/Processed/NOTREAL_Processed_Through_Lemmatization_<row count>_ALL_DATA.csv and,
    for the numeric option, ./data/Processed/Processed_Numeric_Lemmatized_<row count>_ALL_DATA.csv.
    """
    rule = "========================================================================"
    start_time = time.time() if time_stats else None

    def report(*headlines, closing="preprocess function has been running for: {}"):
        # Print one timing banner: the headline(s), then the elapsed time.
        if not time_stats:
            return
        print("\n" + rule)
        for headline in headlines:
            print(headline)
        print(closing.format(time.time() - start_time))
        print(rule + "\n")

    # Used for file names
    row_count = len(df.index)

    # Load the spaCy model and extend its stop words.
    nlp = en_core_web_lg.load()
    report("Finished loading the spaCy model... About to update stop words.")

    nlp = update_stop_words(nlp, new_stop=additional_stopwords)
    report("Finished updating stop words... about to lemmatize.")

    # Perform lemmatization on every word in every post.
    def lemmatize_post(post):
        return lemmatize_text(post, nlp, verbose=verbose)

    df['all_text_data'] = df['all_text_data'].apply(lemmatize_post)
    report("!!!!!!!!!!!!!!!!!!!!!! LEMMATIZATION COMPLETE !!!!!!!!!!!!!!!!!!!!!!!!!")

    # Save the lemmatized file to .csv
    df.to_csv(f"./data/Processed/NOTREAL_Processed_Through_Lemmatization_{row_count}_ALL_DATA.csv", index=False)

    # Everything below is the optional numeric-dataset stage.
    if not build_numeric_dataset:
        return df

    report("Saved lem file to .csv... about to build word_freq dict.")

    # Map each word to the number of times it occurs across all posts.
    word_frequencies = build_word_frequency_dict(df, time_stats=time_stats)
    report("Finished building word_freq dict... now building word-->unique_num map.")

    # Map each unique word to a unique number based on its frequency. More common
    # words get smaller numbers; 0 and 1 are reserved, so the most common word maps to 2.
    word_to_unique_num_map = make_frequency_based_word_map(word_frequencies, time_stats=time_stats)
    report(
        "Finished building word-->unique_num map... about to use the map",
        "On the lemmatized posts to create a numeric version of the data...",
    )

    # Work on a deep copy so the returned df does not receive the numeric column.
    number_df = df.copy(deep=True)

    def encode_post(post):
        return replace_words_with_numbers(post, word_to_unique_num_map)

    number_df['all_text_data_numeric'] = number_df['all_text_data'].apply(encode_post)
    report(
        "Finished building numeric data! Preprocessing is now complete!",
        closing="Preprocessing function took a total of: {} seconds.",
    )

    # Save the numeric data to .csv
    number_df.to_csv(f"./data/Processed/Processed_Numeric_Lemmatized_{row_count}_ALL_DATA.csv", index=False)

    return df

In [None]:
# Run lemmatization and stop word removal over the dataframe read in at the start of this notebook.
# The result is bound back to the same name; timing banners are printed while it runs.
reddit_df = preprocess(df=reddit_df, time_stats=True)