In [1]:
# Run this if the emoji-data library has not yet been installed.
# ! pip install emoji-data



In [2]:
# Module-level registry of emojis seen during preprocessing for which no description
# was found. The two lists are appended to together by get_description(): 'emoji'
# receives the character and 'code' receives its codepoint string.
emojis_missing_descriptions = {'emoji': [], 'code': []}

In [3]:
import os
import string
import time

import numpy as np
import pandas as pd

from emoji_data import EmojiSequence

In [4]:
# Load the cleaned r/wallstreetbets posts and tag the frame with a display name;
# review_posts() prints df.name in its report header.
WSB_POSTS_CSV = "./data/Final/Clean_wallstreetbets_data_1125000_posts.csv"
wsb_df = pd.read_csv(WSB_POSTS_CSV)
wsb_df.name = "WallStreetBets_DataFrame"

# Preview the first rows.
wsb_df.head()

Unnamed: 0,subreddit,selftext,title,created_utc
0,wallstreetbets,"Monday, June 28, 2021\r\n\r\n09:0...",Market Events June 28 - July 2,1624808078
1,wallstreetbets,,This made me super bullish for $AMZN. Also RIP...,1624807989
2,wallstreetbets,,AMC might be up to a big move this week!,1624807950
3,wallstreetbets,,$COCP BLOCKBUSTER COVID ANTIVIRAL BIOTECH PLAY...,1624807898
4,wallstreetbets,,I dont normally publish DD - but this made me ...,1624807864


In [6]:
# Load the cleaned r/CryptoCurrency posts and tag the frame with a display name;
# review_posts() prints df.name in its report header.
CRYPTO_POSTS_CSV = "./data/Final/Clean_CryptoCurrency_data_950000_posts.csv"
crypto_df = pd.read_csv(CRYPTO_POSTS_CSV)
crypto_df.name = "Crypto_DataFrame"

# Preview the first rows.
crypto_df.head()

Unnamed: 0,subreddit,selftext,title,created_utc
0,CryptoCurrency,[removed],All Nodes Ethereum 2.0 Services,1624735574
1,CryptoCurrency,,Is the Largest Difficulty Adjustment In Bitcoi...,1624735572
2,CryptoCurrency,,Binance leaving Ontario: Binance will handle E...,1624735503
3,CryptoCurrency,I own some purely because of the doge effect. ...,What are your thoughts on shib?,1624735442
4,CryptoCurrency,,Miami Beach's most expensive penthouse just so...,1624735282


In [7]:
# =======================================================================================
# Helper to the review posts function. Calculates several statistics related to the text
# in a given set of posts.
# =======================================================================================
def calculate_statistics(df, column='selftext'):
    """Print and return length statistics for one text column of a posts dataframe.

    A value counts as "valid" when it is a string other than the literal
    '[removed]'. All length statistics are computed over valid values only.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe that contains ``column``.
    column : str, default 'selftext'
        Name of the text column to summarize.

    Returns
    -------
    dict
        Keys: 'column_name', 'str_text_removed', 'num_valid_texts',
        'longest_text', 'shortest_text', 'avg_text_length',
        'median_text_length', '25th_perc_length', '75th_perc_length',
        '90th_perc_length' and 'num_missings'. The seven length statistics are
        NaN when the column holds no valid text.
    """
    stats = {'column_name': column}

    # Everything in the column (strings and missing values alike).
    texts = df.loc[:, column].to_numpy()

    # Number of times the text is just the string '[removed]'. The isinstance
    # check comes first so missing values are never compared against a string.
    num_texts_removed = sum(1 for text in texts if isinstance(text, str) and text == '[removed]')
    stats['str_text_removed'] = num_texts_removed

    # Lengths of every string that is neither missing nor '[removed]'.
    valid_text_lengths = [len(text) for text in texts if isinstance(text, str) and text != '[removed]']

    num_valid_texts = len(valid_text_lengths)
    stats['num_valid_texts'] = num_valid_texts

    if valid_text_lengths:
        longest_text_string = np.max(valid_text_lengths)
        shortest_text_string = np.min(valid_text_lengths)
        avg_length_text_string = np.mean(valid_text_lengths)
        median_length = np.median(valid_text_lengths)
        twenty_fifth_percentile, seventy_fifth_percentile, ninetieth_percentile = np.quantile(
            valid_text_lengths, [0.25, 0.75, 0.90]
        )
    else:
        # np.max / np.min raise on an empty sequence; report NaN instead so a
        # column with no usable text does not abort the whole review.
        longest_text_string = shortest_text_string = np.nan
        avg_length_text_string = median_length = np.nan
        twenty_fifth_percentile = seventy_fifth_percentile = ninetieth_percentile = np.nan

    stats['longest_text'] = longest_text_string
    stats['shortest_text'] = shortest_text_string
    stats['avg_text_length'] = avg_length_text_string
    stats['median_text_length'] = median_length
    stats['25th_perc_length'] = twenty_fifth_percentile
    stats['75th_perc_length'] = seventy_fifth_percentile
    stats['90th_perc_length'] = ninetieth_percentile

    num_text_missings = df[column].isna().sum()
    stats['num_missings'] = num_text_missings

    print(f"========================== {column} ==================================")
    print(f"Number of missings in {column} strings: {num_text_missings}")
    print(f"Number of {column} strings that are '[removed]': {num_texts_removed}")
    print(f"Number of valid {column} strings: {num_valid_texts}\n")

    print(">>>>>>>>>>>>>>> Stats below are for valid texts only <<<<<<<<<<<<<<<\n")

    print(f"Avg Length {column} string: {avg_length_text_string}")
    print(f"Shortest {column} string: {shortest_text_string}")
    print(f"25th percentile : {twenty_fifth_percentile}")
    # Label uses the actual column name (it used to always say "Self Text").
    print(f"Median Length {column} string: {median_length}")
    print(f"75th percentile: {seventy_fifth_percentile}")
    print(f"90th percentile: {ninetieth_percentile}")
    print(f"Longest {column} string: {longest_text_string}")
    print("=========================================================================\n")

    return stats

In [9]:
# This function concatenates the 'title' and 'selftext' columns into a
# new column called 'all_text_data'
def combine_columns(df):
    """Return a copy of ``df`` with a new 'all_text_data' column.

    'all_text_data' is ``title + " " + selftext`` after missing values in both
    columns, and the literal '[removed]' in 'selftext', have been replaced by
    the empty string. The separating space is always inserted, so a row with
    an empty selftext ends in a space. The cleaned 'selftext' and 'title'
    columns are stored in the returned copy; the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'title' and 'selftext' columns.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``df`` with cleaned text columns plus 'all_text_data'.
    """
    combined = df.copy(deep=True)

    # Assign the result back instead of calling fillna(inplace=True) on a column
    # selection: that chained in-place form does not update the frame under pandas
    # Copy-on-Write, which would leave NaN in the concatenated column.
    selftext = combined['selftext'].fillna("")
    combined['selftext'] = selftext.mask(selftext == '[removed]', "")

    combined['title'] = combined['title'].fillna("")

    combined['all_text_data'] = combined['title'] + " " + combined['selftext']

    return combined

In [10]:
# =======================================================================================
# This function has two separate purposes:
# 
# 1. Call the "combine_columns" function to create a new "all_text_data" column that is
#    the result of concatenating the 'title' and 'selftext' fields together.
# 
# 2. Print various statistics that describe and help the user get some insight into
#    the text data that is about to be preprocessed. 
# =======================================================================================
def review_posts(df):
    """Print a review of a posts dataframe and return a reduced copy with 'all_text_data'.

    The report covers row/column counts, duplicate counts, the newest and oldest
    'created_utc' values (also formatted in the machine's local time zone), and
    text-length statistics for 'selftext', 'title' and the combined text.

    Note that 'all_text_data' is always rebuilt from 'title' and 'selftext':
    only the four base columns are carried into the working copy, so an
    'all_text_data' column already present in ``df`` is ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'subreddit', 'selftext', 'title' and 'created_utc'. If the
        frame carries an ad-hoc ``name`` attribute it is used in the report
        header; otherwise a generic label is printed.

    Returns
    -------
    pandas.DataFrame
        Copy holding the four base columns plus 'all_text_data'.
    """
    # pandas does not propagate ad-hoc attributes such as ``name`` to derived
    # frames, so fall back to a generic label instead of raising AttributeError.
    df_name = getattr(df, 'name', 'DataFrame')

    print(f"======================= Starting {df_name} Review ===============================================\n")

    print(f"Number of rows in full dataframe {len(df.index)}")
    print(f"Number of columns in full dataframe: {len(df.columns)}\n")

    # Keep only 'subreddit', 'selftext', 'title' and 'created_utc'. (If the file came
    # from the 00_Data_Collection notebook the other columns may already be removed.)
    clean_df = df.loc[:, ['subreddit', 'selftext', 'title', 'created_utc']].copy(deep=True)

    print("Creating smaller df that conly contains columns ---> subreddit, selftext, title, created_utc")
    print(f"Number of columns in the smaller df ----> {len(clean_df.columns)}\n")

    # Newest and oldest post times as epoch seconds.
    created_utc = clean_df.loc[:, 'created_utc'].to_numpy()
    newest_post_time = np.max(created_utc)
    oldest_post_time = np.min(created_utc)

    # https://stackoverflow.com/questions/12400256/converting-epoch-time-into-the-datetime
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    newest_post_string = time.strftime(timestamp_format, time.localtime(newest_post_time))
    oldest_post_string = time.strftime(timestamp_format, time.localtime(oldest_post_time))

    print("========================== Duplicates ==================================")
    print(f"Number of duplicates in full dataframe: {df.duplicated().sum()}")
    print(f"Number of duplicates in smaller dataframe: {clean_df.duplicated().sum()}")
    print("=========================================================================\n")

    print("========================== Post Times ==================================")
    print(f"Newest Post (Epoch Time): {newest_post_time}")
    print(f"Newest Post (Local Time): {newest_post_string}\n")
    print(f"Oldest Post (Epoch Time): {oldest_post_time}")
    print(f"Oldest Post (Local Time): {oldest_post_string}")
    print("=========================================================================\n")

    # The statistics helpers print their own report; the returned dicts are not needed here.
    calculate_statistics(clean_df, column='selftext')
    calculate_statistics(clean_df, column='title')

    # Add 'all_text_data' (title + " " + selftext) and report on it as well.
    clean_df = combine_columns(clean_df)
    calculate_statistics(clean_df, column='all_text_data')

    return clean_df

In [11]:
# Print the review report for the r/wallstreetbets posts and keep the returned
# frame, which holds the four base columns plus 'all_text_data'.
clean_wsb_df = review_posts(wsb_df)


Number of rows in full dataframe 1125000
Number of columns in full dataframe: 4

Creating smaller df that conly contains columns ---> subreddit, selftext, title, created_utc
Number of columns in the smaller df ----> 4

Number of duplicates in full dataframe: 538
Number of duplicates in smaller dataframe: 538

Newest Post (Epoch Time): 1624808078
Newest Post (Local Time): 2021-06-27 08:34:38

Oldest Post (Epoch Time): 1597869510
Oldest Post (Local Time): 2020-08-19 13:38:30

Number of missings in selftext strings: 482284
Number of selftext strings that are '[removed]': 460917
Number of valid selftext strings: 181799

>>>>>>>>>>>>>>> Stats below are for valid texts only <<<<<<<<<<<<<<<

Avg Length selftext string: 479.7788051639448
Shortest selftext string: 1
25th percentile : 9.0
Median Length Self Text String: 119.0
75th percentile: 408.0
90th percentile: 1085.0
Longest selftext string: 39822

Number of missings in title strings: 1
Number of title strings that are '[removed]': 3
Numbe

In [12]:
# Print the review report for the r/CryptoCurrency posts and keep the returned
# frame, which holds the four base columns plus 'all_text_data'.
clean_crypto_df = review_posts(crypto_df)


Number of rows in full dataframe 948853
Number of columns in full dataframe: 4

Creating smaller df that conly contains columns ---> subreddit, selftext, title, created_utc
Number of columns in the smaller df ----> 4

Number of duplicates in full dataframe: 140
Number of duplicates in smaller dataframe: 140

Newest Post (Epoch Time): 1624735574
Newest Post (Local Time): 2021-06-26 12:26:14

Oldest Post (Epoch Time): 1417761426
Oldest Post (Local Time): 2014-12-04 22:37:06

Number of missings in selftext strings: 491072
Number of selftext strings that are '[removed]': 240957
Number of valid selftext strings: 216824

>>>>>>>>>>>>>>> Stats below are for valid texts only <<<<<<<<<<<<<<<

Avg Length selftext string: 738.22324558167
Shortest selftext string: 1
25th percentile : 228.0
Median Length Self Text String: 400.0
75th percentile: 775.0
90th percentile: 1577.0
Longest selftext string: 40398

Number of missings in title strings: 0
Number of title strings that are '[removed]': 0
Number

In [14]:
# ========================================================================================================
# This function uses the emoji's codepoint value to return its associated description, which is stored in 
# the desc_df dataframe. If no description is available, the emojis_missing_descriptions dictionary is updated
# to track this. 
#
# The verbose parameter is currently disabled (if False) as it is almost always undesirable to print
# every emoji that gets replaced. This could be enabled again by replacing if False with if verbose. If 
# this is a desired function I think it is a better idea to reimplement the verbose functionality with
# various levels, and make emoji printing the most extreme option (user wants a ton of messages). 
# ========================================================================================================
def get_description(char, desc_df, verbose):
    """Return the space-padded description for a single emoji character.

    The character's codepoint is formatted as uppercase hexadecimal without a
    prefix (e.g. '1F680') and compared against ``desc_df['codepoint']``. The
    first matching row's 'description' is returned surrounded by one space on
    each side. When no usable description exists, a notice is printed, the
    emoji and its code are appended to the module-level
    ``emojis_missing_descriptions`` lists, and "" is returned.

    Parameters
    ----------
    char : str
        A single character (``ord`` is applied to it).
    desc_df : pandas.DataFrame
        Lookup table with 'codepoint' and 'description' columns.
    verbose : bool
        Currently unused: per-emoji printing is disabled (``if False`` below).

    Returns
    -------
    str
        ``" <description> "`` or "" when no description is available.

    Raises
    ------
    KeyError
        If ``desc_df`` lacks the 'codepoint' or 'description' column. This used
        to be swallowed by a bare ``except`` and reported as a missing emoji.
    """
    global emojis_missing_descriptions

    # Convert the emoji to its uppercase hex codepoint string.
    code = format(ord(char), 'X')

    # Look the code up explicitly rather than relying on a catch-all except.
    matches = desc_df.loc[desc_df['codepoint'] == code, 'description'].to_numpy()

    # A non-string cell (e.g. an empty spreadsheet cell read as NaN) is treated
    # the same as a missing row.
    if len(matches) > 0 and isinstance(matches[0], str):
        description = " " + matches[0] + " "
    else:
        print("========================= Emojo Description Error =========================")
        print(f"There is no description for emoji {char}")
        print(f"Replacing the emoji with an empty string")
        print("===========================================================================")
        emojis_missing_descriptions['emoji'].append(char)
        emojis_missing_descriptions['code'].append(code)
        description = ""

    # This could be changed back to if verbose to allow each emoji that gets replaced to be printed out
    # during preprocessing. I would only recommend doing this for small datasets, because printing each
    # replacement for a large dataset will cause a significant slow down.
    if False:
        print("\n>>>>>>>>>>>>>>>>>>>>>>> Replacing Emoji >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
        print(f"Emoji: {char}    Code:{code}")
        print(f"Description: {description}")
        print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n")

    return description

In [15]:
# ========================================================================================================
# This function checks each character in a single posts text to determine if it is an emoji. Non-emoji
# characters are passed through, and emoji characters are replaced with their associated description
# via a call to the get_description() function
# ========================================================================================================
def emoji_processor(text, desc_df, verbose):
    """Replace every emoji character in one post's text with its description.

    Characters are examined one at a time. A character that tests as a member of
    ``EmojiSequence`` is swapped for the string returned by get_description();
    any other character is kept unchanged. The rebuilt string is stripped of
    leading and trailing whitespace before being returned.
    """
    pieces = []

    for char in text:
        if char in EmojiSequence:
            # Emoji: substitute its description (or "" when none is available).
            pieces.append(get_description(char, desc_df, verbose=verbose))
        else:
            # Not an emoji: pass through.
            pieces.append(char)

    return "".join(pieces).strip()

In [16]:
# ========================================================================================================
# This function applies the emoji_processor to each post (row) in the dataframe to replace each emoji
# with its associated description.
# ========================================================================================================
def replace_emojis_with_description(df, verbose):
    """Replace the emojis in every post's 'all_text_data' with their descriptions.

    Reads the emoji description table from ./support_data/COMPLETE_EMOJIS.xlsx
    and applies emoji_processor() to each row. The 'all_text_data' column of
    ``df`` is overwritten in place and the same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'all_text_data' column of strings.
    verbose : bool
        When True a start banner is printed; the flag is also forwarded to
        emoji_processor().

    Returns
    -------
    pandas.DataFrame
        The input frame, with 'all_text_data' updated.
    """
    if verbose:
        print("\n =) :) 8) <3 Replacing Emojis with descriptions <3 (8 (: (= \n")

    # Read 'codepoint' as text: get_description() compares this column against an
    # uppercase hex *string*, so a value the spreadsheet holds as a number
    # (possible for all-digit codes such as 2721 -- TODO confirm against the file)
    # would otherwise never match and the emoji would be reported as missing.
    descriptions_df = pd.read_excel("./support_data/COMPLETE_EMOJIS.xlsx", dtype={'codepoint': str})

    # Replace emojis with their description, one post at a time.
    df['all_text_data'] = df['all_text_data'].apply(emoji_processor, args=(descriptions_df, verbose))

    return df

In [17]:
# ========================================================================================================
# This function processes each word in a given post, and replaces any contractions with their 
# expanded form.
#
# Note: similar to the get_description, the verbose option has been disabled in this function. If this is
# a desired functionality, simply replace if False with if verbose. Since printing contraction replacements
# can lead to a significant slow down in preprocessing, a better option would be to reimplement the structure
# of the "verbose" functionality to account for various levels. This should be included in the most
# extreme option (user wants a lot of messages printed).
# ========================================================================================================
def contraction_processor(text, cont_df, verbose):
    """Expand the known contractions in a single post.

    The text is split on whitespace and every word that exactly equals an entry
    of ``cont_df['Contraction']`` is replaced by the lowercased value of
    'Expanded_Word' from the first matching row. If at least one word changed,
    the words are re-joined with single spaces; otherwise the original text is
    returned untouched (including its original whitespace).

    Parameters
    ----------
    text : str
        Text of one post.
    cont_df : pandas.DataFrame
        Table with 'Contraction' and 'Expanded_Word' columns.
    verbose : bool
        Currently unused: the per-post report is disabled (``if False`` below).

    Returns
    -------
    str
        The text with contractions expanded.
    """
    # Build the lookup once per call instead of scanning the dataframe for every
    # word. setdefault keeps the first row when a contraction is listed twice,
    # matching a "first match wins" dataframe lookup.
    expansions = {}
    for contraction, expanded in zip(cont_df['Contraction'].to_numpy(), cont_df['Expanded_Word'].to_numpy()):
        expansions.setdefault(contraction, expanded)

    # List of words in the post
    words = text.split()

    # Replace any word that is a known contraction with its expanded form.
    processed_words = [expansions[word].lower() if word in expansions else word for word in words]

    # If any changes were made, join the processed words back together into a string to return.
    if words != processed_words:
        new_text = " ".join(processed_words).strip()

    # If no changes were made, just return the original text.
    else:
        new_text = text

    # Disabled verbose functionality, see note above.
    if False and (words != processed_words):

        # The words we replaced are the ones in words but not in processed words
        replaced_words = [word for word in words if (word not in processed_words)]

        # The expanded forms they were replaced with
        replacement_words = [expansions[word].lower() for word in replaced_words]

        # Raw strings: "\/" is not a valid escape sequence in a normal literal.
        print(r"/\/\/\/\/\/\/\//\/\/\/\/\/ Replacing Contractions /\/\/\/\/\/\/\//\/\/\/\/\/")
        print(f"Replaced Words: {replaced_words}")
        print(f"Replacement Words: {replacement_words}")
        print(r"/\/\/\/\/\/\/\//\/\/\/\/\/\/\/\/\/\/\/\/\//\/\/\/\/\/\/\/\/\/\/\/\/\//\/\/\/\/\/\/" " \n")

    return new_text

In [18]:
# ========================================================================================================
# This function calls the contraction_processor on each post to replace contractions with their expanded form
# ========================================================================================================
def replace_contractions_with_expanded_form(df, verbose):
    """Expand contractions in the 'all_text_data' column of every post.

    Loads the contraction table from ./support_data/contractions.csv, runs
    contraction_processor() on each row, stores the result back into
    ``df['all_text_data']`` and returns the same frame.
    """
    if verbose:
        print("\n ~~~~~~~~~~~~~~~~~~~ Starting to replace contractions ~~~~~~~~~~~~~~~~~~~ \n")

    # Table of known contractions and their expanded forms.
    contraction_table = pd.read_csv("./support_data/contractions.csv")

    # Expand contractions one post at a time.
    expanded_posts = df['all_text_data'].apply(contraction_processor, args=(contraction_table, verbose))
    df['all_text_data'] = expanded_posts

    return df

In [19]:
# ========================================================================================================
# This function processes the text for a single post to replace any instances of sms_speak with
# their standardized form. 
# ========================================================================================================
def sms_speak_processor(text, sms_df, verbose):
    """Replace "SMS speak" words in a single post with their standard wording.

    The text is split on whitespace. For each word, trailing '!', '.' and '?'
    characters are stripped and the remainder is looked up in
    ``sms_df['SMS_Wording']``; on a match the whole word (including the
    stripped punctuation) is replaced by the lowercased 'Correct_Wording' of
    the first matching row. If at least one word changed, the words are
    re-joined with single spaces; otherwise the original text is returned
    untouched.

    Parameters
    ----------
    text : str
        Text of one post.
    sms_df : pandas.DataFrame
        Table with 'SMS_Wording' and 'Correct_Wording' columns.
    verbose : bool
        When True, print the replaced words and their replacements for every
        post in which something changed.

    Returns
    -------
    str
        The text with SMS speak replaced.
    """
    # Build the lookup once per call instead of scanning the dataframe for every
    # word. setdefault keeps the first row when a term is listed twice.
    corrections = {}
    for sms_word, correct_word in zip(sms_df['SMS_Wording'].to_numpy(), sms_df['Correct_Wording'].to_numpy()):
        corrections.setdefault(sms_word, correct_word)

    # Split this post into words
    words = text.split()

    processed = []
    replaced_words = []
    replacement_words = []

    for word in words:
        # Match on the word without trailing sentence punctuation ("lol!" -> "lol").
        key = word.rstrip('!.?')

        if key in corrections:
            replacement = corrections[key].lower()
            processed.append(replacement)

            # Record the change while the stripped key is known. Looking the raw
            # word up again afterwards fails for words such as "lol!".
            if replacement != word:
                replaced_words.append(word)
                replacement_words.append(replacement)
        else:
            processed.append(word)

    # Join the processed words back together to a single string.
    if words != processed:
        new_text = " ".join(processed).strip()
    else:
        new_text = text

    # For ensuring the accuracy of the implementation only, inspect the changes that are being made.
    if verbose and replaced_words:
        print("\n============================ Replacing SMS Speak ============================")
        print(f"Replaced Words: {replaced_words}")
        print(f"Replacement Words: {replacement_words}")
        print("==================================================================================== \n")

    return new_text

In [20]:
# ========================================================================================================
# This function applies the sms_speak_processor to each post in the dataframe to replace all instances of
# sms_speak with their standardized forms.
# ========================================================================================================
def replace_sms_speak(df, verbose):
    """Standardize SMS speak in the 'all_text_data' column of every post.

    Loads the SMS-speak table from ./support_data/sms_speak.csv, runs
    sms_speak_processor() on each row, stores the result back into
    ``df['all_text_data']`` and returns the same frame.
    """
    if verbose:
        print("\n>>>>>>>>>>>>>>> Starting to replace SMS Speak <<<<<<<<<<<<<<<<<<<<\n")

    # Table of known SMS-speak terms and their standard wording.
    sms_table = pd.read_csv("./support_data/sms_speak.csv")

    # Replace SMS speak one post at a time.
    standardized_posts = df['all_text_data'].apply(sms_speak_processor, args=(sms_table, verbose))
    df['all_text_data'] = standardized_posts

    return df

In [21]:
# ========================================================================================================
# This function can be used to remove any characters with unicode value greater than 127 
# (i.e. not on the english keyboard) from a single post. 
# ========================================================================================================
def remove_high_ord(text, verbose=False):
    """Drop every character whose codepoint is greater than 127 from one post.

    Parameters
    ----------
    text : str
        Text of one post.
    verbose : bool, default False
        When True and at least one non-whitespace character was dropped, print
        the original text, the cleaned text and the dropped characters.

    Returns
    -------
    str
        The remaining characters joined together, with leading and trailing
        whitespace stripped.
    """
    kept_chars = []
    removed_chars = []

    # A single pass with one threshold keeps the "kept" and "removed" sets
    # complementary. The verbose report used to test ord >= 127 and so listed
    # the DEL character (127) as removed although it is kept.
    for char in text:
        if ord(char) <= 127:
            kept_chars.append(char)
        else:
            removed_chars.append(char)

    # Create a single string containing all characters that were not discarded.
    clean_text = "".join(kept_chars).strip()

    # If the user wants messages displayed, print anything that this function removed.
    if verbose:
        removed_text = "".join(removed_chars).strip()

        if removed_text != "":

            print("========================================================================")
            print(f"ORIGINAL TEXT:\n {text}\n")
            print(f"CLEAN TEXT:\n {clean_text}\n")
            print(f"REMOVED TEXT:\n {removed_text}\n")
            print("========================================================================\n")

    return clean_text

In [22]:
# Use the global list of emojis that are missing descriptions to update the full list of unique emojis.
# Only needed during development of the emoji list, not needed for normal preprocessing.
def missing_emojis():
    """Append the emojis that lacked a description to the known-emoji list and save it.

    Builds a frame from the module-level ``emojis_missing_descriptions`` dict,
    marks every row's description as "Not_Available", removes duplicate rows,
    concatenates the result below the contents of
    ./support_data/COMPLETE_EMOJIS.xlsx and writes the combined frame to
    ./support_data/ALL_UNIQUE_EMOJIS_IN_WORK.xlsx.

    Returns
    -------
    pandas.DataFrame
        The combined frame that was written to disk.
    """
    global emojis_missing_descriptions

    # Unique emojis without a description. The columns are renamed to
    # 'unique_emojis' / 'codepoint' / 'description' -- presumably the column names
    # of COMPLETE_EMOJIS.xlsx; confirm against the file.
    missing_df = (
        pd.DataFrame(emojis_missing_descriptions)
        .assign(description="Not_Available")
        .drop_duplicates(ignore_index=True)
        .rename(columns={'emoji': 'unique_emojis', 'code': 'codepoint'})
    )

    # Emojis that already have a description.
    known_df = pd.read_excel("./support_data/COMPLETE_EMOJIS.xlsx")

    # Known emojis first, newly found ones after them.
    combined_df = pd.concat([known_df, missing_df], ignore_index=True)
    combined_df.to_excel("./support_data/ALL_UNIQUE_EMOJIS_IN_WORK.xlsx", index=False)

    return combined_df

In [23]:
# ========================================================================================================
# This function takes in two dataframes containing posts from subreddits and performs the following:
#
# 1. Combine to a single dataframe
# 2. Drop any duplicate rows.
# 3. Replace emojis with a text that describes the emojis meaning
# 4. Replace SMS Speak
# 5. Lowercase all text
# 6. Replace contractions with expanded form
# 7. Remove punctuation
# 8. Remove excessively long words (currently defined as > 25 chars), but this could become a parameter if desired.
# 9. Remove any remaining characters with unicode value > 127.
# 10. Save the processed dataframe to .csv
# ========================================================================================================
def preprocess(crypto_df, wsb_df, verbose, time_stats=True):
    """Combine two subreddit dataframes and preprocess their 'all_text_data' column.

    Steps, in order: concatenate, drop duplicate rows, replace emojis with
    descriptions, replace SMS speak, lowercase, expand contractions, remove
    ASCII punctuation, drop words longer than 25 characters, remove characters
    with codepoint > 127, save to CSV, and finally print a review report.

    Parameters
    ----------
    crypto_df, wsb_df : pandas.DataFrame
        Frames that contain an 'all_text_data' column (plus the columns
        review_posts() needs). Neither input is modified; the work is done on
        their concatenation.
    verbose : bool
        Forwarded to the replacement helpers and to remove_high_ord().
    time_stats : bool, default True
        When True, print progress banners and elapsed times for every step.

    Returns
    -------
    pandas.DataFrame
        The combined, preprocessed frame. It is also written to
        ./data/Processed/wsb_crypto_preprocessed_<rows>.csv.
    """
    # If user wants to see outputs indicating how long each part of preprocessing took.
    if time_stats:
        start_time = time.time()
        print(f"Preprocess start time {start_time}\n\n")
        print(">>>>>>>>>>>>> Combining DataFrames >>>>>>>>>>>>>>>")
        print(f"Crypto_df rows: {len(crypto_df.index)}")
        print(f"WSB df rows: {len(wsb_df.index)}")

    # 1. Combine crypto and stocks into a single df
    combined_df = pd.concat([crypto_df, wsb_df], ignore_index=True)

    # Name the frame; review_posts() prints this name in its report header.
    combined_df.name = "All_Reddit_Posts_Df"

    if time_stats:
        print(f"Rows after concat: {len(combined_df.index)}")
        print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n")

        print("~~~~~~~~~~~~~~~~~~~~ Dropping Duplicate Rows ~~~~~~~~~~~~~~~~~~~~")
        print(f"Duplicate before dropping: {combined_df.duplicated().sum()}")

    # 2. Drop duplicate rows. Done in place on purpose: the same object is kept, and
    # with it the ad-hoc ``name`` attribute set above.
    combined_df.drop_duplicates(inplace=True, ignore_index=True)

    if time_stats:
        print(f"Duplicates after dropping: {combined_df.duplicated().sum()}")
        print(f"Rows after dropping duplicates: {len(combined_df.index)}")
        print(f"Total Elapsed Time {time.time() - start_time}")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

        print("====================================================================")
        print("Starting to replace emojis...")

    # 3. Replace emojis with a text that describes the emojis meaning
    combined_df = replace_emojis_with_description(combined_df, verbose=verbose)

    if time_stats:
        print(f"Finished replacing emojis, elapsed time is {time.time() - start_time}")
        print("====================================================================\n")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("Starting to replace sms speak....")

    # 4. Replace SMS Speak (runs before lowercasing, so matching is case sensitive here)
    combined_df = replace_sms_speak(combined_df, verbose=verbose)

    if time_stats:
        print(f"Finished replacing sms speak, elapsed time is {time.time() - start_time}")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
        print("------------------------- lower casing everything -------------------------------")

    # 5. Lowercase everything
    combined_df['all_text_data'] = combined_df['all_text_data'].str.lower()

    if time_stats:
        print(f"Finished lowercasing, elapsed time is {time.time() - start_time}")
        print("---------------------------------------------------------------------------------\n")
        print("***********************************************************************************")
        print("Starting to replace contractions...")

    # 6. Replace contractions with expanded form
    combined_df = replace_contractions_with_expanded_form(combined_df, verbose=verbose)

    if time_stats:
        print(f"Finished replacing contractions, elapsed time is {time.time() - start_time}")
        print("***********************************************************************************\n")
        print(" !!!!!!!!!!!!!!!!!!!!!!!!!!!! Removing Punctuation !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

    # 7. Remove punctuation. A translation table deletes every character of
    # string.punctuation in one pass per post.
    punctuation_table = str.maketrans('', '', string.punctuation)
    combined_df['all_text_data'] = combined_df['all_text_data'].apply(lambda text: text.translate(punctuation_table))

    if time_stats:
        print(f"Finished removing punctuation, elapsed time is {time.time() - start_time}")
        print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n")
        print("len len len len -----> Removing execessively long words (25 chars+) <----- len len len len")

    # 8. Remove excessively long words. They are probably websites and not actual words.
    # If they are too long, they are also probably too rare to be useful.
    combined_df['all_text_data'] = combined_df['all_text_data'].apply(lambda text: " ".join([word for word in text.split() if len(word) <= 25]))

    if time_stats:
        print(f"Finished removing long words, elapsed time is {time.time() - start_time}")
        print("----------------------------------------------------------------------------------\n")
        print("128+ 128+ 128+ Removing any high ord characters that are left 128+ 128+ 128+ \n")

    # 9. Remove any characters left that have high ord (ord > 127). All emojis should be replaced already.
    # This is going to be things like the euro symbol or special font apostrophes that are not in string.punctuation.
    combined_df['all_text_data'] = combined_df['all_text_data'].apply(lambda text: remove_high_ord(text, verbose=verbose))

    if time_stats:
        print("Finished removing high ord characters!")
        print(f"PREPROCESSING COMPLETE! total elapsed time {time.time() - start_time}")
        print("-----------------------------------------------------------------------------------------\n")

    # Get the total number of samples in the dataframe for use in the .csv filename.
    num_samples = len(combined_df.index)

    # 10. Save the processed dataframe to .csv. Create the target directory first so a
    # missing folder cannot discard the result of the whole run.
    output_dir = "./data/Processed"
    os.makedirs(output_dir, exist_ok=True)
    combined_df.to_csv(f"{output_dir}/wsb_crypto_preprocessed_{num_samples}.csv", index=False)

    # Update the file containing the list of unique emojis to include any that were identified
    # to be missing descriptions in this round of processing.
    # Commented out because this is only needed if there is a desire to add descriptions for
    # additional new emojis. It is not needed for preprocessing to function.
    # emoji_df = missing_emojis()

    # Print the review report; the frame it returns is not needed here.
    # NOTE(review): review_posts() rebuilds 'all_text_data' from 'title' and 'selftext'
    # on its own copy, so the statistics it prints describe the raw combined text
    # rather than the preprocessed column -- confirm this is the intended report.
    review_posts(combined_df)

    return combined_df

In [24]:
# Run the full preprocessing pipeline on the reviewed crypto and WSB frames.
# verbose=False turns off the per-post SMS-speak and high-ord removal messages;
# notices about emojis without a description are still printed.
preprocessed_df = preprocess(crypto_df=clean_crypto_df, wsb_df=clean_wsb_df, verbose=False)

Preprocess start time 1625095479.124827


>>>>>>>>>>>>> Combining DataFrames >>>>>>>>>>>>>>>
Crypto_df rows: 948853
WSB df rows: 1125000
Rows after concat: 2073853
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

~~~~~~~~~~~~~~~~~~~~ Dropping Duplicate Rows ~~~~~~~~~~~~~~~~~~~~
Duplicate before dropping: 721
Duplicates after dropping: 0
Rows after dropping duplicates: 2073132
Total Elapsed Time 14.119234323501587
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Starting to replace emojis...
There is no description for emoji ✡
Replacing the emoji with an empty string
There is no description for emoji ☸
Replacing the emoji with an empty string
There is no description for emoji 📗
Replacing the emoji with an empty string
There is no description for emoji ✡
Replacing the emoji with an empty string
There is no description for emoji ✡
Replacing the emoji with an empty string
There is no description for emoji ♐
Replacing the emoji with an empty string
There is no descriptio

In [25]:
# Preview the first rows of the preprocessed frame, including 'all_text_data'.
preprocessed_df.head()

Unnamed: 0,subreddit,selftext,title,created_utc,all_text_data
0,CryptoCurrency,,All Nodes Ethereum 2.0 Services,1624735574,all nodes ethereum 20 services
1,CryptoCurrency,,Is the Largest Difficulty Adjustment In Bitcoi...,1624735572,is the largest difficulty adjustment in bitcoi...
2,CryptoCurrency,,Binance leaving Ontario: Binance will handle E...,1624735503,binance leaving ontario binance will handle et...
3,CryptoCurrency,I own some purely because of the doge effect. ...,What are your thoughts on shib?,1624735442,what are your thoughts on shib i own some pure...
4,CryptoCurrency,,Miami Beach's most expensive penthouse just so...,1624735282,miami beachs most expensive penthouse just sol...
