In [None]:
import os
import sys
import tqdm

from collections import defaultdict
import json
import re

import matplotlib.pyplot as plt

sys.path.insert(0, '..')
from src import emojilib

# nlp
import nltk
from nltk.corpus import stopwords
import preprocessor as p
import emoji
import string

## Config

In [None]:
# --- paths -------------------------------------------------------------------
# Every location is derived from root_path (the parent of the working directory).
root_path = './../'
raw_tweets_file = 'tweet_by_ID_30_4_2022__08_11_07.txt'

raw_data_folder_path = os.path.join(root_path, 'data', 'raw_data')
clean_data_folder_path = os.path.join(root_path, 'data', 'clean_data')
raw_tweets_path = os.path.join(raw_data_folder_path, raw_tweets_file)

# --- labels ------------------------------------------------------------------
# Target emojis. Each emoji maps to its class id, stored as a string and
# assigned in listing order starting from '0'.
mapping = {
    symbol: str(class_id)
    for class_id, symbol in enumerate([
        '❤', '😍', '😂', '💕',
        '🔥', '😊', '😎', '✨',
        '💙', '😘', '📷', '🇺🇸',
        '☀', '💜', '😉', '💯',
        '😁', '🎄', '📸', '😜',
    ])
}

## Utility Methods

In [None]:
def compress_tweet(tweet):
    """Collapse every run of a repeated emoji into a single occurrence.

    A run is the same emoji repeated back to back, with or without spaces in
    between: both ``"Hilarious 😂😂😂"`` and ``"Hilarious 😂 😂 😂"`` become
    ``"Hilarious 😂 "``.

    Args:
        tweet: the text to compress.

    Returns:
        The text with each run replaced by one space-padded emoji and every
        sequence of spaces squeezed to a single space. Spaces left at the
        start or end by the padding are not stripped.
    """
    handled = set()
    for entry in emojilib.emoji_list(tweet):
        emoji_code = entry['code']
        if emoji_code in handled:
            continue
        handled.add(emoji_code)

        # The whole emoji is escaped and grouped before it is quantified:
        #  - a bare "<emoji>+" only repeats the LAST code point of a
        #    multi-code-point emoji (e.g. flags), so such runs never matched;
        #  - allowing optional spaces inside the repeated group is what
        #    actually merges "<emoji> <emoji> <emoji>" into one.
        run = re.compile(f'(?:{re.escape(emoji_code)} *)+')
        # A callable replacement keeps the emoji from being interpreted as a
        # replacement template.
        tweet = run.sub(lambda _match, code=emoji_code: f' {code} ', tweet)

    # remove the extra spaces introduced by the padding
    return re.sub(' +', ' ', tweet)

In [None]:
def parse_tweet(text):
    """Clean a raw tweet and derive its emoji-tokenized variants.

    Steps, in order:
      1. ``p.clean`` with the URL, MENTION, HASHTAG, RESERVED, NUMBER and
         SMILEY options (emojis are kept).
      2. Drop every character in ``string.punctuation`` and the '…' character.
      3. Collapse repeated emojis with ``compress_tweet``.
      4. Replace emojis with the ``<EMOJI>`` token (``p.tokenize`` plus a
         manual substitution for the 🇺🇸 flag).

    Args:
        text: raw tweet text.

    Returns:
        A 4-tuple ``(emoji_list, cleaned_tweet, tokenized_tweet,
        untokenized_tweet)``:
          * ``emoji_list`` - result of ``emojilib.emoji_list`` on the cleaned tweet.
          * ``cleaned_tweet`` - the cleaned, compressed text with real emojis.
          * ``tokenized_tweet`` - the cleaned text with ``<EMOJI>`` in place of emojis.
          * ``untokenized_tweet`` - the tokenized text with every ``<EMOJI>``
            token removed and whitespace normalized to single spaces.
    """
    # clean everything except emoji
    p.set_options(p.OPT.URL, p.OPT.MENTION,
                  p.OPT.HASHTAG, p.OPT.RESERVED,
                  p.OPT.NUMBER, p.OPT.SMILEY)
    cleaned_tweet = p.clean(text)

    # remove all punctuation
    cleaned_tweet = "".join(ch for ch in cleaned_tweet if ch not in string.punctuation)

    # remove the horizontal-ellipsis character (occurs in many tweets and is
    # not part of string.punctuation)
    cleaned_tweet = cleaned_tweet.replace('…', '')

    # compress multiple consecutive emojis to one
    cleaned_tweet = compress_tweet(cleaned_tweet)

    # get the list of emojis that survive cleaning
    emoji_list = emojilib.emoji_list(cleaned_tweet)

    # same options as above plus EMOJI, so that p.tokenize also handles emojis
    p.set_options(p.OPT.URL, p.OPT.MENTION,
                  p.OPT.HASHTAG, p.OPT.RESERVED,
                  p.OPT.NUMBER, p.OPT.SMILEY,
                  p.OPT.EMOJI)

    # the flag emoji was observed not to be tokenized by p.tokenize, so it is
    # substituted by hand first
    flag_tokenized = cleaned_tweet.replace('🇺🇸', '<EMOJI>')

    # p.tokenize marks each emoji as $EMOJI$; rename that to <EMOJI>
    tokenized_tweet = p.tokenize(flag_tokenized).replace('$EMOJI$', '<EMOJI>')

    # Version of the tweet without any emoji token. Removing only '<EMOJI> '
    # (token followed by a space) left the token in place whenever it was the
    # last thing in the tweet, so strip the bare token and re-normalize spaces.
    untokenized_tweet = ' '.join(tokenized_tweet.replace('<EMOJI>', ' ').split())

    return emoji_list, cleaned_tweet, tokenized_tweet, untokenized_tweet

In [None]:
# Sanity check: run parse_tweet on a sample containing repeated emojis,
# repeated punctuation and the '…' character. As the last expression of the
# cell, the returned (emoji_list, cleaned_tweet, tokenized_tweet,
# untokenized_tweet) tuple is what gets displayed.
parse_tweet("😂😂😂 Oh my god... That was hilarious!!! I am not sleeping … tonight wow 😂😂😂 …")

In [None]:
def parse_tweets(clean_data_folder_path, raw_tweets_file):
    """Parse the raw tweet dump and write the line-aligned ``tweets.*`` files.

    Each line of ``raw_tweets_file`` is a JSON object with at least ``id`` and
    ``text``. A tweet is saved only if, after ``parse_tweet``:
      * it contains exactly one emoji,
      * that emoji is one of the target emojis in ``mapping``, and
      * the ``<EMOJI>`` token can be located in the tokenized tweet.

    For every saved tweet exactly one line is appended to each of
    ``tweets.text``, ``tweets.tokenized``, ``tweets.notoken``, ``tweets.ids``
    and ``tweets.labels`` (format: ``label,char_location,word_location,name``),
    so line *i* of every file describes the same tweet.

    Args:
        clean_data_folder_path: folder that receives the ``tweets.*`` files.
        raw_tweets_file: path to the raw tweets file (one JSON object per line).

    Returns:
        dict mapping each label (the string stored in ``mapping``) to the
        number of saved tweets with that label.
    """

    def _open_output(file_name):
        # All outputs live in the clean-data folder; UTF-8 so emojis can be
        # written regardless of the platform's default encoding.
        return open(os.path.join(clean_data_folder_path, file_name), 'w', encoding='utf-8')

    label_counts = defaultdict(int)
    count = 0
    unknown_emoji = 0

    # The input is opened first so a missing raw file does not truncate existing
    # outputs; the `with` closes (and flushes) every handle even on error.
    with open(raw_tweets_file, encoding='utf-8') as f_in, \
            _open_output("tweets.text") as out_text, \
            _open_output("tweets.labels") as out_labels, \
            _open_output("tweets.ids") as out_ids, \
            _open_output("tweets.tokenized") as out_tokenized, \
            _open_output("tweets.notoken") as out_notoken:
        for line in tqdm.tqdm(f_in):
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            # each line is a JSON document with a lot of information
            json_data = json.loads(line)
            tweet_id = json_data['id']

            # NOTE(review): newlines are deleted, not replaced by a space, so
            # words on adjacent lines get joined - confirm this is intended.
            tweet_text = json_data['text'].replace("\n", "")

            # see parse_tweet() for the details of each returned value
            emoji_list, cleaned_tweet, tokenized_tweet, untokenized_tweet = parse_tweet(tweet_text)

            # ignore all tweets that have 0 emojis or more than 1 emoji
            if len(emoji_list) != 1:
                continue

            entry = emoji_list[0]
            code = entry['code']

            # skip emojis outside the target set instead of raising KeyError
            # after some of the output files were already written
            if code not in mapping:
                unknown_emoji += 1
                continue

            # Locate the token BEFORE writing anything: bailing out after a
            # partial write would leave the parallel files misaligned.
            try:
                # +1 to account for the start token
                emoji_word_location = tokenized_tweet.split(' ').index('<EMOJI>') + 1
            except ValueError:
                print(f"tokenized_tweet: {tokenized_tweet}")
                print(f"emoji_list: {emoji_list}")
                continue

            label = mapping[code]
            char_location = entry['location'][0]
            name = entry['name']

            # clean tweet (still contains the real emoji)
            out_text.write(cleaned_tweet + "\n")
            # clean tweet with the <EMOJI> token instead of the emoji itself
            out_tokenized.write(tokenized_tweet + "\n")
            # clean tweet with the <EMOJI> token removed
            out_notoken.write(untokenized_tweet + "\n")
            # tweet id
            out_ids.write(str(tweet_id) + "\n")
            # label line: class id, character offset, word offset, emoji name
            out_labels.write(f"{label},{char_location},{emoji_word_location},{name}\n")

            label_counts[label] += 1
            count += 1

    if unknown_emoji:
        print(f"Tweets skipped (emoji not in mapping): {unknown_emoji}")
    print(f"Total tweets saved: {count}")
    return dict(label_counts)

## Parse tweets (run only once; the generated data is available through the data.zip drive folder)

In [None]:
# Run the full pipeline: read raw_tweets_path and write the tweets.* files into
# clean_data_folder_path. The visualisation section below does not rely on the
# value bound here; it rebuilds emoji_freq from tweets.labels.
emoji_freq = parse_tweets(clean_data_folder_path, raw_tweets_path)

## Visualize the distribution

In [None]:
# Count how many saved tweets carry each class id. The class id is the first
# comma-separated field of every line in tweets.labels.
emoji_freq = defaultdict(int)
with open(os.path.join(clean_data_folder_path, 'tweets.labels'), 'r', encoding='utf-8') as f:
    for line in f:
        # Named label_line (not `emoji`) so the imported `emoji` module is not
        # overwritten at notebook scope.
        label_line = line.strip()
        if not label_line:
            continue  # a blank line would otherwise crash int('')
        emoji_freq[int(label_line.split(',')[0])] += 1

# Plot in class-id order rather than in first-seen order.
label_ids = sorted(emoji_freq)
fig, ax = plt.subplots()
ax.bar(range(len(label_ids)), [emoji_freq[label_id] for label_id in label_ids], align='center')
ax.set_xticks(range(len(label_ids)))
ax.set_xticklabels(label_ids)
ax.set_xlabel('emoji class id')
ax.set_ylabel('number of tweets')
ax.set_title('Tweets per emoji class')
plt.show()