<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/01_Data_Cleaning_With_Spacy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV paths used later in this notebook live under that mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import spacy
# Download the medium English spaCy model; it is imported below as the en_core_web_md package.
!python -m spacy download en_core_web_md

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_md')


In [None]:
import nltk
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import en_core_web_md

from spacy.lang.en.stop_words import STOP_WORDS
from spacy.symbols import ORTH
from spacy.tokenizer import _get_regex_pattern

In [None]:
# Allow up to 1000 rows to be shown when a DataFrame is displayed.
pd.set_option('display.max_rows', 1000)

In [None]:
# Read in the dirty twitter data (with emojis already cleaned by the previous file) and store it in a dataframe.
filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/train_tweets_with_emojis_clean.csv"

# index_col=0 uses the first CSV column as the dataframe index.
tweet_df = pd.read_csv(filename, index_col=0)

# Display the first ten rows as a sanity check.
tweet_df.head(10)

Unnamed: 0_level_0,label,tweet,tweet_emoji_cleaned
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty,bihday your majesty
4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation,factsguide: society now #motivation
6,0,[2/2] huge fan fare and big talking before the...,[2/2] huge fan fare and big talking before the...
7,0,@user camping tomorrow @user @user @user @use...,@user camping tomorrow @user @user @user @use...
8,0,the next school year is the year for exams.ð...,the next school year is the year for exams. su...
9,0,we won!!! love the land!!! #allin #cavs #champ...,we won!!! love the land!!! #allin #cavs #champ...
10,0,@user @user welcome here ! i'm it's so #gr...,@user @user welcome here ! i'm it's so #gr...


# Part 1 - Load and set up spaCy

In [None]:
# Load the trained spaCy model from the installed en_core_web_md package.
nlp = en_core_web_md.load()

# Display the names of the pipeline components of the loaded model.
nlp.pipe_names

['tagger', 'parser', 'ner']

In [None]:
# Keep one tweet around as a plain string for exploring tokenization behavior.
# The rows whose index equals 1 are selected with a boolean mask, and the first match is used.
rows_with_id_one = tweet_df[tweet_df.index == 1]

first_tweet = str(rows_with_id_one['tweet_emoji_cleaned'].iloc[0])

first_tweet

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [None]:
#-------------------------------------------------------------------------------------------------------------------------------------------------------------
# Remove "#" from the list of default prefixes spacy looks for during tokenization. This will allow us to add a token_match regular
# expression that matches a hashtag and text (e.g. #hashTagsAreCool) as a single token. If '#' is not removed from the prefix list
# then hashtags will always be a separate token from the text, this is because prefixes are processed before token_match rules during
# tokenization. Reference the spacy tokenization chart for a visual of how rules are processed: https://spacy.io/usage/linguistic-features#tokenization
#-------------------------------------------------------------------------------------------------------------------------------------------------------------

# Work on a copy of the default prefixes so nlp.Defaults itself is left untouched.
default_prefixes = list(nlp.Defaults.prefixes)                      # The default prefixes spacy will look for during tokenization
default_prefixes.remove('#')                                        # Remove hashtags from the default prefix list.
prefix_regex = spacy.util.compile_prefix_regex(default_prefixes)    # Compile the reduced prefix list into a single regex
nlp.tokenizer.prefix_search = prefix_regex.search                   # Update the tokenizer to use the new prefixes

# Get the current regex that nlp is using for token matching.
nlp_token_matching_regex_pre_update = spacy.tokenizer._get_regex_pattern(nlp.tokenizer.token_match)

# Create a new regex that combines the current regex and a term that will treat hashtags as a single token.
# A raw f-string is used so that "\w" reaches the regex engine as written instead of relying on Python
# passing an invalid string escape sequence through unchanged (which emits a warning).
# NOTE(review): the pattern is applied with .match, which only anchors at the start of the string, so the
# "#\w+" alternative has no end anchor - confirm that is the intended behavior for text containing punctuation.
updated_token_matching_regex = rf"({nlp_token_matching_regex_pre_update}|#\w+)"

# Update the token matching regex used by nlp with the regex created in the line above.
nlp.tokenizer.token_match = re.compile(updated_token_matching_regex).match

# This is kept as an example of how to use nlp.tokenizer.explain to debug the tokenizer
# tokenizer.explain tells you the rules being applied to get to the final result.
'''
tweet_doc = nlp.tokenizer.explain(first_tweet)   
for token in tweet_doc: 
  print(token)
'''

# Turn the first_tweet string into a spacy doc object (sequence of tokens).
tweet_doc = nlp(first_tweet)

# Commented out but kept as an example of displaying Spacys behavior when turning 
# strings into doc objects (tokens). This output can also be used to verify that
# the rule we created to keep hashtags as a single token worked as intended.
'''
print(f"Token\t\tLemma\t\tStopword")
print("="*40)
for token in tweet_doc:
    print(f"{token}\t\t{token.lemma_}\t\t{token.is_stop}")
''';

# Part 2 - Tweet cleaning with spacy

In [None]:
#-----------------------------------------------------------------------------------------------------------------------------------------------------------
# Build a lookup table from contractions to their "expanded" counterparts.
# The pairs come from a csv file with a 'Contraction' column and an 'Expanded_Word' column. Keys are stored lowercased so that
# lookups can be done on the lowercased word. The resulting dictionary is used during the tweet cleaning process to convert
# contractions to their expanded forms.
#-----------------------------------------------------------------------------------------------------------------------------------------------------------

contraction_file = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/support_data/contractions.csv"

contractions_df = pd.read_csv(contraction_file)

contractions = list(contractions_df['Contraction'].to_numpy())
expanded_words = list(contractions_df['Expanded_Word'].to_numpy())

contraction_to_expansion = zip(contractions, expanded_words)

# If the same (lowercased) contraction appears more than once, the last row wins.
contraction_map = {
  contraction.lower(): expansion
  for contraction, expansion in contraction_to_expansion
}

In [None]:
#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# "SMS language" is a term that refers to abbreviated or slang language that does not consist of proper dictionary words, but is nonetheless 
# commonly understood and widely used in digital forms of communication. See wikipedia for more info on "SMS language" https://en.wikipedia.org/wiki/SMS_language
#
# This cell reads a csv file with an 'SMS_Wording' column and a 'Correct_Wording' column and builds a dictionary that maps each
# lowercased SMS wording to its correct wording.
#-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

sms_speak_file = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/support_data/sms_speak.csv"

sms_speak_df = pd.read_csv(sms_speak_file)

sms_abbreviations = list(sms_speak_df['SMS_Wording'].to_numpy())
correct_wordings = list(sms_speak_df['Correct_Wording'].to_numpy())

sms_and_correct_wordings = zip(sms_abbreviations, correct_wordings)

# str() is applied to every key before lowercasing, so a non-string cell still yields a string key.
# NOTE(review): presumably this guards against cells pandas did not parse as text - confirm against the csv.
sms_map = {
  str(abbreviation).lower(): correct_wording
  for abbreviation, correct_wording in sms_and_correct_wordings
}

In [None]:
# --------------------------------------------------------------------------------------------------------------------------------------
# Sometimes hashtags get stuck together, as in: #hashtagone#hashtagtwo#hashtagthree. This function separates hashtags that are
# stuck together so they get treated as their own entity.
# --------------------------------------------------------------------------------------------------------------------------------------
def split_chained_hashtags(word):
  """Separate chained hashtags with single spaces.

  Args:
    word: A string, e.g. "#one#two".

  Returns:
    If word starts with '#' and contains more than one '#', a string in which every
    non-empty piece between '#' characters is prefixed with '#' and the pieces are
    joined by single spaces ("#one#two" -> "#one #two"). Otherwise word itself.
    Trailing whitespace is removed in both cases.
  """
  is_chained = word.startswith('#') and word.count("#") > 1

  # Anything that is not a chain of hashtags passes through (minus trailing whitespace).
  if not is_chained:
    return word.rstrip()

  # Empty pieces come from the leading '#' and from doubled '#' characters; skip them.
  pieces = ["#" + tag for tag in word.split("#") if tag != ""]
  return " ".join(pieces).rstrip()

In [None]:
# ---------------------------------------------------------------------------------------------------------------------------------------------------------
# Helper for the fix_contractions_punctuation_abbreviation function. When the parent function is ready to add a word to the output tweet,
# this function decides what text should actually be added: an expanded contraction, an expanded abbreviation, or the word itself.
# ---------------------------------------------------------------------------------------------------------------------------------------------------------
def evaluate_word(word, contraction_mapping=contraction_map, sms_mapping=sms_map):
  """Return the text that should be added to the output tweet for a single word.

  Args:
    word: The word to evaluate (may be the empty string).
    contraction_mapping: Dict of lowercased contraction -> expanded form.
    sms_mapping: Dict of lowercased abbreviation -> expanded form.

  Returns:
    The replacement text. Lookups are done on the lowercased word, contractions taking
    priority over abbreviations. Expanded or unchanged words are returned with one leading
    space; an empty word gives an empty string. A word that is a chain of hashtags is
    returned as space-separated hashtags instead. If the input word contained an
    apostrophe, all apostrophes are removed from the result.
  """
  lowered = word.lower()

  if lowered in contraction_mapping:          # Contraction that can be expanded.
    output_word = " " + contraction_mapping[lowered]
  elif lowered in sms_mapping:                # Abbreviation that can be expanded.
    output_word = " " + sms_mapping[lowered]
  elif word != "":                            # Ordinary, non-empty word: keep it as is.
    output_word = " " + word
  else:                                       # Nothing to add for an empty word.
    output_word = ""

  # Chained hashtags override whatever was chosen above.
  if word.startswith('#') and word.count("#") > 1:
    output_word = split_chained_hashtags(word)

  # The check is on the input word; the removal is applied to the chosen output.
  if "'" in word:
    output_word = output_word.replace("'", "")

  return output_word

In [None]:
def fix_contractions_punctuation_abbreviation(input_tweet, contraction_mapping=contraction_map, sms_mapping=sms_map):
  """Strip punctuation from a tweet and expand its contractions and abbreviations.

  The tweet is scanned character by character. A space or a punctuation character ends the
  current word; each finished word is passed to evaluate_word, which decides what text is
  added to the output. The punctuation characters themselves are dropped.

  Args:
    input_tweet: The tweet text as a string.
    contraction_mapping: Dict of lowercased contraction -> expanded form, forwarded to evaluate_word.
    sms_mapping: Dict of lowercased abbreviation -> expanded form, forwarded to evaluate_word.

  Returns:
    The transformed tweet with leading/trailing whitespace removed and runs of spaces
    collapsed to a single space.
  """
  # Characters that end a word and are discarded. '#', '@' and the apostrophe are deliberately
  # absent from this set, so hashtags, mentions and contractions reach evaluate_word intact.
  # The backslash is written as "\\"; the set of characters is unchanged from the previous literal.
  punctuation = frozenset(".&\\()*+,-/:;<=>\"!?[]^_`{|}~")

  pieces = []                                                                   # Text produced for each finished word, in order.
  word = ""

  for char in input_tweet:                                                      # Iterate through every character in the tweet.
    if char == " " or char in punctuation:                                      # A space or punctuation mark means the current word is over.
      # Forward the mappings so that caller-supplied dictionaries are actually used.
      pieces.append(evaluate_word(word, contraction_mapping, sms_mapping))
      word = ""
    else:                                                                       # Otherwise keep building up the word.
      word = word + char

  # If the final character was not a space or punctuation mark there is still one last word to add.
  if word != "":
    pieces.append(evaluate_word(word, contraction_mapping, sms_mapping))

  # Joining with spaces and stripping gives the same text as prefixing every piece with a space.
  output_tweet = " ".join(pieces).strip()
  output_tweet = re.sub(" +", " ", output_tweet)

  return output_tweet

In [None]:
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Tweet Cleaner Function. When applied to a tweet, this function will do the following:
#
# - Remove punctuation marks.
# - Convert common contractions and abbreviations to their expanded forms.
# - Convert the tweet to a Spacy doc object (a set of tokens).
# - Use Spacys lemmatization process to group words together and reduce the overall number of unique tokens
#   Note: Reducing the number of unique tokens is desired because that will help reduce the dimensionality of the matrix we use when building models, 
#         and will therefore help mitigate the negative effects of working in high dimensional spaces (curse of dimensionality).
#
# - Convert all words to lowercase, to further reduce the number of unique words in the output.
# - Remove all stop words as defined in Spacys list of default stop words. Use nlp.Defaults.stop_words for a list.
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

def tweet_cleaner(dirty_tweet, nlp):
  """Turn a raw tweet into a list of cleaned, lemmatized, lowercase tokens.

  Args:
    dirty_tweet: The tweet text as a string.
    nlp: The loaded Spacy language object used to tokenize and lemmatize the tweet.

  Returns:
    A list of strings. Tokens whose lemma is "-PRON-" are kept as "pron". All other lemmas
    are lowercased and stripped, and are dropped if they are empty, equal to "@user", or
    found in nlp.Defaults.stop_words.
  """
  partially_clean_tweet = fix_contractions_punctuation_abbreviation(dirty_tweet)
  stop_words = nlp.Defaults.stop_words

  clean_tweet_tokens = []
  for token in nlp(partially_clean_tweet):
    if token.lemma_ == "-PRON-":                                # -PRON- indicates that Spacys lemmatizer flagged this word as a generic pronoun.
      clean_tweet_tokens.append("pron")                         # Same as lowercasing "-PRON-" and stripping off the "-"'s.
      continue

    # Normalize BEFORE filtering: the emitted token is the lowercased lemma, so the stop word
    # and @user checks must look at that same lowercased text. Otherwise a capitalized stop word
    # would pass the filter and still end up in the output in its lowercase form.
    token_txt = token.lemma_.lower().strip()

    # Ignore empty/whitespace-only tokens, @user tokens and stopwords. They are generic and are unlikely to help provide insight.
    if token_txt and token_txt != '@user' and token_txt not in stop_words:
      clean_tweet_tokens.append(token_txt)

  return clean_tweet_tokens

In [None]:
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Creates a new column holding the fully clean, tokenized version of each tweet, then saves the dataframe to a csv file.
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
clean_tweets_file = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/train_tweets_spacy_clean.csv"

# Run the tweet cleaner on every emoji-cleaned tweet, handing it the configured spacy pipeline.
tweet_df['Fully_Clean_Tweet_Tokenized'] = tweet_df['tweet_emoji_cleaned'].apply(lambda tweet: tweet_cleaner(tweet, nlp))

tweet_df.to_csv(clean_tweets_file)

tweet_df.head()

Unnamed: 0_level_0,label,tweet,tweet_emoji_cleaned,Fully_Clean_Tweet_Tokenized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...,"[father, dysfunctional, significant, selfish, ..."
2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...,"[thank, #lyft, credit, use, cause, pron, offer..."
3,0,bihday your majesty,bihday your majesty,"[bihday, pron, majesty]"
4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...,"[#model, love, pron, pron, time, pron, happy, ..."
5,0,factsguide: society now #motivation,factsguide: society now #motivation,"[factsguide, society, #motivation]"


In [None]:
# Display up to the first 25 rows to spot-check the cleaned, tokenized column against the original tweets.
tweet_df.head(25)

Unnamed: 0_level_0,label,tweet,tweet_emoji_cleaned,Fully_Clean_Tweet_Tokenized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...,"[father, dysfunctional, significant, selfish, ..."
2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...,"[thank, #lyft, credit, use, cause, pron, offer..."
3,0,bihday your majesty,bihday your majesty,"[bihday, pron, majesty]"
4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...,"[#model, love, pron, pron, time, pron, happy, ..."
5,0,factsguide: society now #motivation,factsguide: society now #motivation,"[factsguide, society, #motivation]"
6,0,[2/2] huge fan fare and big talking before the...,[2/2] huge fan fare and big talking before the...,"[2, 2, huge, fan, fare, big, talking, pron, le..."
7,0,@user camping tomorrow @user @user @user @use...,@user camping tomorrow @user @user @user @use...,"[camping, tomorrow, danny]"
8,0,the next school year is the year for exams.ð...,the next school year is the year for exams. su...,"[school, year, year, exam, surprise, amazed, t..."
9,0,we won!!! love the land!!! #allin #cavs #champ...,we won!!! love the land!!! #allin #cavs #champ...,"[pron, win, love, land, #allin, #cavs, #champi..."
10,0,@user @user welcome here ! i'm it's so #gr...,@user @user welcome here ! i'm it's so #gr...,"[welcome, pron, pron, significant, #gr8]"
