<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/01_Data_Cleaning_With_Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime. The CSV paths used later in this notebook all live
# under /content/drive, so this cell must run before any file is read or written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import spacy
!python -m spacy download en_core_web_md

In [None]:
import nltk
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import en_core_web_md

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.symbols import ORTH
from spacy.tokenizer import _get_regex_pattern

In [None]:
# Let pandas display up to 1000 rows before it switches to a truncated view.
pd.set_option('display.max_rows', 1000)

In [None]:
# Read in the tweet dataset output by the 00_Emoji_Data_Cleaning notebook. Emojis are now clean, but the rest of each tweet has not been preprocessed yet.
filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/train_tweets_with_emojis_clean.csv"

# index_col=0 uses the first CSV column as the DataFrame index.
tweet_df = pd.read_csv(filename, index_col=0)

# Preview the first 10 rows.
tweet_df.head(10)

Unnamed: 0_level_0,label,tweet,tweet_emoji_cleaned
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty,bihday your majesty
4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation,factsguide: society now #motivation
6,0,[2/2] huge fan fare and big talking before the...,[2/2] huge fan fare and big talking before the...
7,0,@user camping tomorrow @user @user @user @use...,@user camping tomorrow @user @user @user @use...
8,0,the next school year is the year for exams.ð...,the next school year is the year for exams. su...
9,0,we won!!! love the land!!! #allin #cavs #champ...,we won!!! love the land!!! #allin #cavs #champ...
10,0,@user @user welcome here ! i'm it's so #gr...,@user @user welcome here ! i'm it's so #gr...


# Part 1 - Load and set up spaCy

In [None]:
# Load the trained spaCy model from the en_core_web_md package imported above.
# The resulting `nlp` object is customized and then used by every cleaning step below.
nlp = en_core_web_md.load()

# View the standard pipeline components that spaCy uses for this model.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
# Keep one tweet around as a plain Python string for exploring tokenization behavior.
# The row whose index value equals 1 is selected with a boolean mask, and the first match is taken.
first_tweet = str(tweet_df.loc[tweet_df.index == 1, 'tweet_emoji_cleaned'].to_numpy()[0])

first_tweet

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [None]:
#-------------------------------------------------------------------------------------------------------------------------------------------------------------
# Remove "#" from the list of default prefixes spaCy looks for during tokenization. This allows us to create a token_match regular
# expression that matches a hashtag and its text (e.g. #hashTagsAreCool) as a single token.
#
# If '#' is not removed from the prefix list then hashtags will always be a separate token from the text, regardless of whether or not we have a
# token_match rule that would match a #text style string. This is because spaCy processes prefix rules before token_match rules during tokenization.
#
# Reference the spaCy tokenization chart for a visual of how rules are processed: https://spacy.io/usage/linguistic-features#tokenization
#-------------------------------------------------------------------------------------------------------------------------------------------------------------

# --- Step 1: rebuild the prefix rules without '#' ---

# Filtering (rather than list.remove) does not raise if '#' is absent from the default prefixes.
default_prefixes = [prefix for prefix in nlp.Defaults.prefixes if prefix != '#']
prefix_regex = spacy.util.compile_prefix_regex(default_prefixes)    # Compile the new prefix regex (that now has # removed)
nlp.tokenizer.prefix_search = prefix_regex.search                   # Update the "nlp" trained model to use the new prefix regex


# --- Step 2: extend token_match so that '#text' is kept as a single token ---

# Regex for a hashtag followed by its text. A raw string is used so that "\w" reaches the regex
# engine unchanged instead of being treated as an (invalid) string escape sequence.
hashtag_token_regex = r"#\w+"

# Get the current regex that nlp is using for token matching.
# Note: this reads the tokenizer's *current* token_match, so re-running this cell without reloading
# `nlp` wraps the already-extended pattern a second time.
nlp_token_matching_regex_pre_update = spacy.tokenizer._get_regex_pattern(nlp.tokenizer.token_match)

if nlp_token_matching_regex_pre_update is None:
  # NOTE(review): assumes the helper returns None when no token_match is configured - confirm for the
  # installed spaCy version. Interpolating None into the f-string below would otherwise build a
  # pattern that matches the literal text "None".
  updated_token_matching_regex = hashtag_token_regex
else:
  # Combine the current regex with the term that treats hashtags as a single token.
  updated_token_matching_regex = f"({nlp_token_matching_regex_pre_update}|{hashtag_token_regex})"

# Update the token matching regex used by nlp to the regex created above.
nlp.tokenizer.token_match = re.compile(updated_token_matching_regex).match


# --- Debugging examples (kept for reference, intentionally not executed) ---

# nlp.tokenizer.explain reports the rules being applied to get to the final tokenized result:
#
#   tweet_doc = nlp.tokenizer.explain(first_tweet)
#   for token in tweet_doc:
#     print(token)
#
# Turning a string into a doc object (sequence of tokens) shows spaCy's behavior and can be used to
# verify that the rule created above keeps hashtags as a single token:
#
#   tweet_doc = nlp(first_tweet)
#
#   print(f"Token\t\tLemma\t\tStopword")
#   print("="*40)
#   for token in tweet_doc:
#       print(f"{token}\t\t{token.lemma_}\t\t{token.is_stop}")

# Part 2 - Tweet cleaning with spaCy

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
# Build a lookup table from a csv file of common contractions and their "expanded" counterparts.
# Keys are the lowercased contractions and values are the expanded forms, so the expansion for any
# contraction can be found with a single dictionary lookup during the tweet cleaning process.
#-----------------------------------------------------------------------------------------------------------------------------------------------------------

contraction_file = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/support_data/contractions.csv"

contractions_df = pd.read_csv(contraction_file)

# Pull the two columns out as plain Python lists.
expanded_words = list(contractions_df['Expanded_Word'].to_numpy())
contractions = list(contractions_df['Contraction'].to_numpy())

# Pair each contraction with its expansion (row order is preserved).
contraction_to_expansion = zip(contractions, expanded_words)

# If the same lowercased contraction appears more than once, the last row wins.
contraction_map = {
  contraction.lower(): expansion
  for contraction, expansion in contraction_to_expansion
}

In [None]:
#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# "SMS language" is a term that refers to abbreviated or slang language that does not consist of proper dictionary words, but is nonetheless
# commonly understood and widely used in digital forms of communication. See wikipedia for more info on "SMS language" https://en.wikipedia.org/wiki/SMS_language
#
# This cell reads a csv file that contains a list of common "SMS language" terms and the corresponding "expanded" form. For example SMS speak '<3' would be
# converted to 'love'. A dictionary is created that maps the SMS term to the associated 'correct wording'.
#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

sms_speak_file = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/support_data/sms_speak.csv"

sms_speak_df = pd.read_csv(sms_speak_file)

# Pull the two columns out as plain Python lists.
sms_abbreviations = list(sms_speak_df['SMS_Wording'].to_numpy())
correct_wordings = list(sms_speak_df['Correct_Wording'].to_numpy())

# Pair each SMS term with its correct wording (row order is preserved).
sms_and_correct_wordings = zip(sms_abbreviations, correct_wordings)

# Keys are coerced with str() before lowercasing, so non-string cells still produce a usable key.
# NOTE(review): a missing SMS_Wording cell would presumably become the key "nan" - confirm against the csv.
sms_map = {
  str(abbreviation).lower(): wording
  for abbreviation, wording in sms_and_correct_wordings
}

In [None]:
# --------------------------------------------------------------------------------------------------------------------------------------
# Sometimes hashtags get stuck together, as in: #hashtagone#hashtagtwo#hashtagthree. This function separates hashtags that are
# stuck together so each one gets treated as its own entity.
# --------------------------------------------------------------------------------------------------------------------------------------
def split_chained_hashtags(word):
  """Split a run of chained hashtags into whitespace-separated hashtags.

  A word is treated as a chain only when it starts with '#' and contains more than one '#'.
  For such a word, every non-empty piece between '#' symbols is re-prefixed with '#' and the
  pieces are joined by single spaces (e.g. '#one#two' -> '#one #two').

  Any other word is returned as-is, apart from trailing whitespace, which is always stripped.
  """
  is_hashtag_chain = word.startswith('#') and word.count("#") > 1

  # Guard clause: nothing to split.
  if not is_hashtag_chain:
    return word.rstrip()

  # Empty pieces come from the leading '#' and from consecutive '#' symbols; skip them.
  separated_tags = ["#" + piece for piece in word.split("#") if piece != ""]

  return " ".join(separated_tags).rstrip()

In [None]:
# ---------------------------------------------------------------------------------------------------------------------------------------------------------
# This is a helper function for the fix_contractions_punctuation_abbreviation function.
#
# When the parent function is ready to add a word to the output tweet, this function checks whether the word is a contraction or an abbreviation
# that needs to be transformed to its "expanded" form first.
# ---------------------------------------------------------------------------------------------------------------------------------------------------------
def evaluate_word(word, contraction_mapping=contraction_map, sms_mapping=sms_map):
  """Return the text that should be added to the output tweet for `word`.

  Lookups are done on the lowercased word, in this order:
    1. contraction_mapping -> the expanded contraction
    2. sms_mapping         -> the expanded abbreviation
    3. otherwise           -> the word itself (or "" for an empty word)
  In all three non-empty cases the result carries one leading space.

  Two adjustments are then applied:
    - A word that starts with '#' and contains more than one '#' is replaced by the output of
      split_chained_hashtags(word), overriding the lookup result above.
    - If the original word contains an apostrophe, all apostrophes are removed from the result.
  """
  lookup_key = word.lower()

  if lookup_key in contraction_mapping:
    output_word = " " + contraction_mapping[lookup_key]
  elif lookup_key in sms_mapping:
    output_word = " " + sms_mapping[lookup_key]
  elif word != "":
    output_word = " " + word
  else:
    output_word = ""

  # Multiple hashtags stuck together (e.g. #text1#text2) are split apart.
  if word.startswith('#') and word.count("#") > 1:
    output_word = split_chained_hashtags(word)

  # Make sure no apostrophes are left in the word(s).
  if "'" in word:
    output_word = output_word.replace("'", "")

  return output_word

In [None]:
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# This function is used to preprocess tweets to remove contractions, punctuation and abbreviations before spaCy performs the tokenization and lemmatization process.
# ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

def fix_contractions_punctuation_abbreviation(input_tweet, contraction_mapping=contraction_map, sms_mapping=sms_map):
  """Expand contractions / SMS abbreviations and strip punctuation from a tweet.

  The tweet is scanned one character at a time. A space or any character in `punctuation`
  ends the current word; the finished word is passed through evaluate_word and the
  delimiter itself is discarded. '#', '@' and the apostrophe are not in the punctuation
  list, so hashtags, @mentions and contractions reach evaluate_word intact. Only the
  space character is treated as whitespace.

  Parameters:
    input_tweet:         The tweet text to preprocess (iterated character by character).
    contraction_mapping: Dict of lowercased contraction -> expanded form.
    sms_mapping:         Dict of lowercased SMS term -> expanded form.

  Returns:
    The cleaned tweet as a single string with words separated by single spaces and
    no leading or trailing whitespace.
  """
  # Every backslash is escaped explicitly. The character set is: . & \ ( ) * + , - / : ; < = > " ! ? [ ] ^ _ ` { | } ~
  punctuation = ".&\\()*+,-./:;<=>\"!?[\\]^_`{|}~"
  word_delimiters = " " + punctuation

  processed_words = []                                                          # Collected pieces; joined once at the end to avoid repeated string concatenation.
  word = ""

  for char in input_tweet:                                                      # Iterate through every character in the tweet.
    if char in word_delimiters:                                                 # A space or punctuation mark means the current word is over.
      # Forward the mappings this function was given, so a caller-supplied mapping is actually used.
      processed_words.append(evaluate_word(word, contraction_mapping=contraction_mapping, sms_mapping=sms_mapping))
      word = ""
    else:                                                                       # Otherwise keep building up the current word.
      word = word + char

  # If the final character was not a delimiter there is still one last word to add.
  if word != "":
    processed_words.append(evaluate_word(word, contraction_mapping=contraction_mapping, sms_mapping=sms_mapping))

  output_tweet = " ".join(processed_words).strip()                              # Strip any excess white space on the left and right edges.
  output_tweet = re.sub(" +", " ", output_tweet)                                # Collapse runs of spaces (e.g. from removed punctuation or multi-word expansions) into one.

  return output_tweet

In [None]:
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Inputs: An unprocessed tweet string
# Outputs: A list of clean tweet tokens
#
# This function performs cleaning, tokenization and lemmatization on each tweet one at a time. This function can be used to clean the entire dataset of tweets
# all at once by calling it with a pandas .apply function. The process for cleaning a single tweet is as follows:
#
# Step 1: Preprocessing
#         - Ensure common contractions are replaced by their expanded word form.
#         - Ensure common "SMS speak" slang is replaced by its expanded word form.
#         - Ensure all punctuation marks are removed (retain hashtag symbols connected to text).
#         - Ensure any instances of multiple hashtags stuck together are split apart with white space.
#
# Step 2: Tokenization with spaCy.
#         - Convert the clean tweet string to a spaCy doc object to allow access to tokens and lemmas.
#         - Use spaCy to take the "lemma" of each individual token.
#         - Reduce the number of unique words by changing all "lemmas" to lowercase.
#         - Filter out any '@User' tokens, as they are too generic to provide any sentiment insight.
#         - Filter out any lemmatized tokens that are in the list of "stop words" (words that are too generic to provide sentiment insight).
#         - Add the lemmatized token to the output list.
#
# Note: Many of the above steps are designed to minimize the number of unique tokens without losing valuable sentiment information.
#       Minimizing the number of unique tokens is desirable because this will lead to a matrix with lower dimensionality during the
#       modeling phases (i.e. fewer unique words helps minimize impacts from the curse of dimensionality, as significantly more data
#       is needed to generalize well in higher dimensional spaces).
#
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

def tweet_cleaner(dirty_tweet, nlp):
  """Clean, tokenize and lemmatize a single tweet.

  Parameters:
    dirty_tweet: The unprocessed tweet string.
    nlp:         The loaded spaCy language object used for tokenization and lemmatization.

  Returns:
    A list of lowercase, stripped lemma strings. Tokens whose lemma is "-PRON-" are kept
    as "pron". '@user' tokens, stop words and empty/whitespace-only lemmas are left out.
  """
  # Preprocess before tokenization.
  partially_clean_tweet = fix_contractions_punctuation_abbreviation(dirty_tweet)

  # Get the list of default stop words in the pretrained spaCy model.
  stop_words = nlp.Defaults.stop_words

  # Turn the cleaned tweet into a spaCy doc object so we can access the tokens and lemmas.
  tweet_doc = nlp(partially_clean_tweet)

  clean_tweet_tokens = []
  for token in tweet_doc:
    lemma = token.lemma_

    # -PRON- indicates that spaCy's lemmatizer flagged this word as a generic pronoun.
    # It is kept, with the dashes stripped and lowercased, as the token "pron".
    if lemma == "-PRON-":
      clean_tweet_tokens.append("pron")
      continue

    # Normalize BEFORE filtering so the checks below are not defeated by capitalization
    # (e.g. '@User') or by lemmas that consist only of whitespace such as a newline.
    token_txt = lemma.lower().strip()

    if token_txt == "" or token_txt == "@user" or token_txt in stop_words:
      continue

    clean_tweet_tokens.append(token_txt)

  return clean_tweet_tokens

In [None]:
#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# When performed using a pandas .apply function, the "tweet_cleaner" function shown above will create a new DataFrame column that contains a list of clean, lemmatized tokens.
#
# Having the clean tweet tokens stored in a python list is desirable for situations where the list will be accessed within the same notebook. However, if the intent is to save the DataFrame as a .csv
# file for use in a later notebook this is not a good plan, because when a DataFrame is saved to .csv all of its contents get converted to strings. This means a DataFrame column that contains a list
# will become a string formatted as '[item1, item2, item3]'. This is clearly undesirable as the important information becomes unnecessarily cluttered with commas and brackets.
#
# This function can be used to convert the DataFrame column containing a list of clean lemmatized tokens (created by tweet_cleaner) into a single string. This string can then easily be retokenized
# in a following notebook by splitting at white spaces.
#
# Additionally, this function looks for and handles rare circumstances where a clean lemmatized token consists only of a hashtag with no attached text. Without the text portion these tokens
# do not have any valuable sentiment information, and are therefore removed.
#
#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def create_clean_tweet_string(clean_tokens):
  """Join a list of clean tokens into one space-separated string.

  Tokens that are exactly "#" (a hashtag symbol with no text attached) are skipped.
  Leading and trailing whitespace is stripped from the final string.
  """
  # A hashtag with no words attached is not sentiment laden, so it is filtered out here.
  kept_tokens = (token for token in clean_tokens if token != "#")

  return " ".join(kept_tokens).strip()

In [None]:
# Run every emoji-cleaned tweet through tweet_cleaner (with the customized `nlp` object) to get a
# new column holding a list of clean, lemmatized tokens per tweet.
tweet_df['Fully_Clean_Tweet_Tokenized'] = tweet_df['tweet_emoji_cleaned'].apply(lambda tweet: tweet_cleaner(tweet, nlp))

# Collapse each token list into a single string, stored in its own column.
tweet_df['Clean_Tweet'] = tweet_df['Fully_Clean_Tweet_Tokenized'].map(create_clean_tweet_string)

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# After all of the above data cleaning steps, there are several tweets where no text is left at all. (Everything in the tweet was deemed too generic to provide valuable training information
# for hate speech identification, e.g. stop words). 
#
# Note: Several of these do contain emojis, so if the emoji sentiment dictionary were ever expanded there is a chance that some of these tweets may not be empty after data cleaning.
#
# For now, since an empty string tweet is not useful training or testing data, these rows are being dropped. Additionally, this dataset contains a significantly higher number of 
# non hate speech examples as compared to hate speech examples. All of these rows being dropped are non-hate speech examples, which is even further justification that removing them
# will not have a negative impact on model performance.
#
#---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


# Show every row whose tweet string is empty after the cleaning steps.
tweet_df[tweet_df['Clean_Tweet'] == ""]


Unnamed: 0_level_0,label,tweet,tweet_emoji_cleaned,Fully_Clean_Tweet_Tokenized,Clean_Tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2261,0,always be ð½ð¢ð®ð¢ð¢ð§,always be,[],
2283,0,well done @user,well done @user,[],
2742,0,to show - -,to show - -,[],
4800,0,@user @user @user @user @user @user @user @us...,@user @user @user @user @user @user @user @us...,[],
5096,0,i am really â¢,i am really,[],
5236,0,@user @user @user @user @user @user @user @us...,@user @user @user @user @user @user @user @us...,[],
5585,0,@user always be ð¡ð¶ð±ð´ð±ð¡ð...,@user always be,[],
6128,0,no @user hasnt,no @user hasnt,[],
6646,0,@user well said,@user well said,[],
7820,0,@user always be ð½ð¢ð®ð¢ð¢ð§,@user always be,[],


In [None]:
# Collect the index labels of the rows whose clean tweet is an empty string, then drop those rows in place.
index_values_to_drop = tweet_df.index[tweet_df['Clean_Tweet'] == ""]
tweet_df.drop(index=index_values_to_drop, inplace=True)

In [None]:
# Sanity check: this selection should now be empty, since rows with an empty clean tweet were dropped.
tweet_df[tweet_df['Clean_Tweet'] == ""]

Unnamed: 0_level_0,label,tweet,tweet_emoji_cleaned,Fully_Clean_Tweet_Tokenized,Clean_Tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [None]:
# Fixing index after dropping rows, so the index does not skip numbers.
# drop=True discards the previous index values instead of keeping them as a column.
tweet_df = tweet_df.reset_index(drop=True)

In [None]:
# Save the DataFrame with the clean tweet string to a .csv file for use in later notebooks.
# index=False leaves the (freshly reset) index out of the file.
tweet_df.to_csv(path_or_buf="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/train_tweets_spacy_clean.csv", index=False)

# Preview the first rows of the saved DataFrame.
tweet_df.head()

Unnamed: 0,label,tweet,tweet_emoji_cleaned,Fully_Clean_Tweet_Tokenized,Clean_Tweet
0,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...,"[father, dysfunctional, significant, selfish, ...",father dysfunctional significant selfish pron ...
1,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...,"[thank, #lyft, credit, use, cause, pron, offer...",thank #lyft credit use cause pron offer wheelc...
2,0,bihday your majesty,bihday your majesty,"[bihday, pron, majesty]",bihday pron majesty
3,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...,"[#model, love, pron, pron, time, pron, happy, ...",#model love pron pron time pron happy love hap...
4,0,factsguide: society now #motivation,factsguide: society now #motivation,"[factsguide, society, #motivation]",factsguide society #motivation
