<a href="https://colab.research.google.com/github/BradenAnderson/Twitter-Sentiment-Analysis/blob/main/00_Emoji_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths used below live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Raise the pandas display limit so DataFrame output shows up to 1000 rows before truncating.
pd.set_option('display.max_rows', 1000)

In [None]:
# Read in the dirty twitter data and store it in a dataframe.
# index_col=0 makes the first csv column the dataframe index.
filename = "/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/train_test_data/train_twitter_sentiment.csv"
tweet_df = pd.read_csv(filename, index_col=0)

# Read in the custom file that contains a sentiment string for various emojis.
emoji_file = '/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/support_data/emoji_partial.csv'
emoji_df = pd.read_csv(emoji_file)

# Display the first ten tweets as this cell's output.
tweet_df.head(10)

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation
6,0,[2/2] huge fan fare and big talking before the...
7,0,@user camping tomorrow @user @user @user @use...
8,0,the next school year is the year for exams.ð...
9,0,we won!!! love the land!!! #allin #cavs #champ...
10,0,@user @user welcome here ! i'm it's so #gr...


In [None]:
# View the table that cross-references each emoji code point with its associated sentiment string.
emoji_df.head()

Unnamed: 0,Codepoint,Emoji,Emoji_Description,Sentiment
0,U+1F600,😀,Grinning Face,happy
1,U+1F603,😃,Grinning Face with Big Eyes,happy joy
2,U+1F604,😄,Grinning Face with Smiling Eyes,happy joy
3,U+1F601,😁,Beaming Face with Smiling Eyes,happy joy
4,U+1F606,😆,Grinning Squinting Face,happy embarassed


# Replacing emojis with their sentiments

Emojis are replaced with text that describes the sentiment of the emoji if possible. If the sentiment for that particular emoji is unknown, the emoji is replaced with an empty string.

In [None]:
# -------------------------------------------------------------------------------------------------------------------
# Module-level accumulator of every emoji code seen in a tweet for which no sentiment could be found.
# getEmojiSentiment() appends to this list; a later cell de-duplicates it and writes it to a csv so that
# sentiment strings can be created manually for the missing emojis if time allows.
#
# No `global` statement is needed here: a name assigned at module (cell) level is already global.
# NOTE: re-running this cell resets the list to empty.
# -------------------------------------------------------------------------------------------------------------------
unknown_emoji_list = []

In [None]:
# ----------------------------------------------------------------------------------------------------------------
# Emoji codes are often written in the form "U+XXXXX". This function takes such a code and reformats it into
# the "latin" encoding, which is how the emojis are displayed in the twitter dataset tweets.
# ----------------------------------------------------------------------------------------------------------------
def codePointToLatinUnicode(code_point):
  """Convert "U+XXXXX" code point notation to its UTF-8-bytes-read-as-latin-1 string.

  Process:
    1. Strip the "U" / "+" markers, leaving one or more hexadecimal numbers.
    2. Parse each hexadecimal number as an integer code point.
    3. Convert each code point to its unicode character with chr().
    4. Encode the characters as 'utf-8' and decode the bytes as 'latin-1', so each
       byte of the UTF-8 encoding becomes one character in the returned string.

  Args:
    code_point (str): A single code point such as "U+1F600", or a sequence of code
      points such as "U+1F468 U+200D U+1F469" (multi-code-point emoji).

  Returns:
    str: The latin-1 decoded UTF-8 bytes of the character(s).

  Raises:
    ValueError: If `code_point` contains no hexadecimal value, or a value that is
      not valid hexadecimal / not a valid code point.
  """

  # Replace the notation markers with spaces (rather than deleting them) so that
  # several code points in one string stay separated and are parsed individually.
  hex_values = code_point.replace("U", " ").replace("u", " ").replace("+", " ").split()

  if not hex_values:
    raise ValueError("No code point found in {!r}".format(code_point))

  # Base-16 parse of each value, then map each integer to its unicode character.
  characters = "".join(chr(int(hex_value, 16)) for hex_value in hex_values)

  # Get the "latin" representation: UTF-8 bytes re-interpreted one byte per character.
  return characters.encode('utf-8').decode('latin-1')

In [None]:
# Use the function above to convert the emoji codes formatted as "U+XXXXX" to the "latin" format that is used in the twitter dataset.
# The result is stored in a new 'Latin_Unicode' column on emoji_df (emoji_df is modified in place).
emoji_df['Latin_Unicode'] = emoji_df['Codepoint'].apply(codePointToLatinUnicode)

# Display the first rows to check the new column.
emoji_df.head()

Unnamed: 0,Codepoint,Emoji,Emoji_Description,Sentiment,Latin_Unicode
0,U+1F600,😀,Grinning Face,happy,ð
1,U+1F603,😃,Grinning Face with Big Eyes,happy joy,ð
2,U+1F604,😄,Grinning Face with Smiling Eyes,happy joy,ð
3,U+1F601,😁,Beaming Face with Smiling Eyes,happy joy,ð
4,U+1F606,😆,Grinning Squinting Face,happy embarassed,ð


In [None]:
# ------------------------------------------------------------------------------------------------------------------------
# Builds the lookup table used to translate an emoji (as it appears in the tweets) into its sentiment string.
# -------------------------------------------------------------------------------------------------------------------------
def get_emoji_map(emoji_dataframe=emoji_df):
  """Return a dict mapping latin unicode emoji codes to their sentiment strings.

  Args:
    emoji_dataframe: DataFrame with a 'Codepoint' column (codes formatted as "U+XXXXX")
      and a 'Sentiment' column. Defaults to the emoji_df that exists when this
      function is defined.

  Returns:
    dict: {latin unicode code: sentiment}. If the same code occurs in several rows,
    the sentiment from the last of those rows is kept.
  """

  # Reformat every "U+XXXXX" code into the representation used in the tweets.
  latin_codes = [
      codePointToLatinUnicode(raw_code)
      for raw_code in emoji_dataframe.loc[:, 'Codepoint'].to_numpy()
  ]

  # Sentiment strings, in the same row order as the codes above.
  sentiments = emoji_dataframe.loc[:, 'Sentiment'].to_numpy()

  # Pair each code with the sentiment from the same row.
  return dict(zip(latin_codes, sentiments))

In [None]:
# Call the function above and save the resulting emoji map (latin unicode code --> sentiment string).
emoji_map = get_emoji_map()

In [None]:
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# This function takes in an "unknown emoji code". An "unknown emoji code" may be:
#
# 1. Truly unknown (it is an emoji that a custom sentiment string was not created for).
# 2. Appears unknown because it is one or more emojis stuck together.
#
# In the case of 2 above, if any known emojis are found, this function will return the sentiment string for the emoji(s).
# Otherwise, the function will return an empty string, indicating no known emojis were found.
# --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
def emoji_search(unknown_emoji_code, emoji_map=emoji_map):
  """Collect the sentiments of every known emoji code contained in `unknown_emoji_code`.

  Args:
    unknown_emoji_code (str): A run of emoji characters that is not itself a key of `emoji_map`.
    emoji_map (dict): Maps emoji code (str) to sentiment (str).

  Returns:
    str: "" when no key of `emoji_map` occurs in `unknown_emoji_code`. Otherwise the
    matched sentiments joined by single spaces, followed by one trailing space and
    preceded by one leading space per match (the spacing this function has always
    produced). Sentiments appear in `emoji_map` iteration order, not in the order the
    emojis occur in the input, and each known code contributes at most once even if
    it occurs several times.
  """

  # A plain substring test replaces the previous r'.*<code>.*' regex. The two agree for
  # every code without regex metacharacters, but the regex was built from unescaped data
  # and raised re.error for codes such as "*" (U+002A).
  matched_sentiments = [
      sentiment
      for code, sentiment in emoji_map.items()
      if code in unknown_emoji_code
  ]

  if not matched_sentiments:
    return ""

  # Reproduce the historical output format exactly so downstream text is unchanged.
  return " " * len(matched_sentiments) + " ".join(matched_sentiments) + " "
  

In [None]:
#----------------------------------------------------------------------------------------------------------------
# Uses the emoji_map dictionary to convert the emoji unicode representation to the desired sentiment.
# Some extra logic is included for handling situations where an emoji is not in the emoji_map dictionary
# or situations where multiple emojis are stuck together and therefore their codes run together as one.
#----------------------------------------------------------------------------------------------------------------
def getEmojiSentiment(emoji_code, emoji_map=emoji_map):
  """Return the sentiment text for `emoji_code`, padded with spaces.

  Args:
    emoji_code (str): One emoji code, or several emoji codes run together.
    emoji_map (dict): Maps emoji code (str) to sentiment (str).

  Returns:
    str: " <sentiment> " when `emoji_code` is a key of `emoji_map`; otherwise whatever
    emoji_search() finds embedded in it, which is "" when nothing is known.

  Side effects:
    Appends `emoji_code` to the module-level `unknown_emoji_list` when no sentiment
    was found at all.
  """

  # If we have a sentiment for this exact code, return it.
  if emoji_code in emoji_map:
    return " " + emoji_map[emoji_code] + " "

  # Either this is multiple emojis stuck together, or an emoji we don't have a sentiment for.
  # Forward the caller's emoji_map so the fallback search uses the same mapping as the
  # lookup above (previously emoji_search always fell back to its own default map).
  sentiment = emoji_search(unknown_emoji_code=emoji_code, emoji_map=emoji_map)

  # If the emoji code is truly unknown, record it in the list of unknown emojis.
  if sentiment == "":
    unknown_emoji_list.append(emoji_code)

  return sentiment

In [None]:
#---------------------------------------------------------------------------------------------------------------------------------
# This function takes in a string containing a single tweet, and parses the tweet character by character. Since we know all tweets
# are written in english, and know that the Unicode code point value for all characters on a standard U.S. keyboard will be
# 127 or less, we can identify which characters must be part of an emoji by simply using the python ord() function to identify
# characters that have a Unicode code point value of 128 or greater.
#---------------------------------------------------------------------------------------------------------------------------------
def convertEmojiToSentiment(input_tweet):
  """Replace every run of non-ASCII characters in a tweet with its emoji sentiment text.

  Characters with ord() <= 127 are copied to the output unchanged. Each maximal run of
  consecutive characters with ord() > 127 is treated as one emoji code and replaced by
  the string returned from getEmojiSentiment() (which may be empty).

  Note that *any* non-ASCII character is treated as part of an emoji, so accented
  letters or other non-ASCII symbols are also routed through getEmojiSentiment().

  Args:
    input_tweet (str): The raw tweet text.

  Returns:
    str: The tweet with emoji codes replaced by sentiment text.
  """

  output_parts = []   # Pieces of the output tweet, joined once at the end.
  emoji_code = ""     # Non-ASCII characters collected for the emoji currently being built.

  for char in input_tweet:

    if ord(char) > 127:
      # Part of an emoji: keep building the code until an ASCII character ends it.
      emoji_code = emoji_code + char
      continue

    # ASCII character (ord <= 127). The boundary value 127 is included here; it was
    # previously matched by no branch at all and silently dropped from the tweet.
    if emoji_code:
      # The emoji code is now complete, so emit its sentiment and reset for the next one.
      output_parts.append(getEmojiSentiment(emoji_code))
      emoji_code = ""

    # This character was not part of an emoji, so pass it through to the output tweet.
    output_parts.append(char)

  # Handle an emoji code that runs to the very end of the tweet.
  if emoji_code:
    output_parts.append(getEmojiSentiment(emoji_code))

  return "".join(output_parts)

In [None]:
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
# Creates a new column in tweet_df where all emoji codes are removed. If a sentiment for that emoji exists in the emoji_map dictionary,
# then the sentiment is added in place of the emoji. If no sentiment exists, the emoji is replaced with an empty string.
#
# Side effect: while this runs, getEmojiSentiment() appends every emoji code it could not resolve to unknown_emoji_list.
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------
tweet_df['tweet_emoji_cleaned'] = tweet_df['tweet'].apply(convertEmojiToSentiment)

# Save the dataframe (including the new column) as an intermediate output file.
tweet_df.to_csv(path_or_buf="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/intermediate_output_files/train_tweets_with_emojis_clean.csv")

# Display the first rows to compare the original and cleaned tweets.
tweet_df.head()

Unnamed: 0_level_0,label,tweet,tweet_emoji_cleaned
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty,bihday your majesty
4,0,#model i love u take with u all the time in ...,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation,factsguide: society now #motivation


In [None]:
# Save the latin unicode to sentiment emoji map to a csv in case it is needed in a later file.
#
# The frame is built from the (code, sentiment) pairs. Passing the dict itself together with
# columns=['Unicode', 'Sentiment'] makes pandas treat the dict keys as column labels; none of
# them is named 'Unicode' or 'Sentiment', so the resulting frame (and the csv) would be empty.
map_df = pd.DataFrame(list(emoji_map.items()), columns=['Unicode', 'Sentiment'])

map_df.to_csv(path_or_buf="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/support_data/latin_enicode_to_sentiment_map.csv")

In [None]:
# ------------------------------------------------------------------------------------------------------------------------------------
# This cell outputs the list of emojis we didn't have sentiments for to a csv.
# ------------------------------------------------------------------------------------------------------------------------------------

# Every unique emoji code in the global list of unknown emojis.
# Sorted so the row order of the csv is the same on every run (set iteration order is not).
unknown_emojis = sorted(set(unknown_emoji_list))

# Create a dictionary of all emoji codes that we currently do not have sentiments for.
unlisted_emoji_dict = {'Code_From_Tweet' : [], 'Code_Converted' : []}

for emoji in unknown_emojis:

  unlisted_emoji_dict['Code_From_Tweet'].append(emoji)

  try:

    # Reverse the tweet encoding: latin-1 characters back to bytes, then decode those bytes as UTF-8.
    converted = emoji.encode('latin-1').decode('utf-8')

  except UnicodeError:

    # Use an empty string if the code cannot be converted. UnicodeError covers both failure modes:
    # characters above U+00FF that latin-1 cannot encode, and byte sequences that are not valid UTF-8.
    converted = ""

  unlisted_emoji_dict['Code_Converted'].append(converted)

emojis_no_sentiment_df = pd.DataFrame(unlisted_emoji_dict)

emojis_no_sentiment_df.to_csv(path_or_buf="/content/drive/MyDrive/Programming/Colab Notebooks/Coding_Dojo/Twitter_Sentiment_Project/support_data/unknown_emoji.csv")

In [None]:
# Preview the unknown emoji codes next to their converted (readable) form.
emojis_no_sentiment_df.head()

Unnamed: 0,Code_From_Tweet,Code_Converted
0,ð¦,🐦
1,â¡â¡,♡♡
2,ð¦,👦
3,ð£ð¼ð£ð¼ð£ð¼,🚣🏼🚣🏼🚣🏼
4,â¡ï¸â¡ï¸â¡ï¸,⚡️⚡️⚡️


In [None]:
# Number of distinct emoji codes that still have no sentiment string.
len(emojis_no_sentiment_df.index)

1348