In [None]:
import os
import sys
import tqdm

from collections import defaultdict
import json
import re

import matplotlib.pyplot as plt

sys.path.insert(0, '..')
from src import emojilib

# nlp
import nltk
from nltk.corpus import stopwords
import preprocessor as p
import emoji

## Config

In [None]:
# Locations: every path below is derived from root_path.
root_path = './'
raw_tweets_file = 'tweet_by_ID_30_4_2022__08_11_07.txt'
raw_data_folder_path, clean_data_folder_path = (
    os.path.join(root_path, 'data', folder) for folder in ('raw_data', 'clean_data')
)
raw_tweets_path = os.path.join(raw_data_folder_path, raw_tweets_file)

# target emojis: each symbol maps to its label id, which is the symbol's
# position in the list below, stored as a string.
mapping = {
    symbol: str(label_id)
    for label_id, symbol in enumerate([
        '❤', '😍', '😂', '💕',
        '🔥', '😊', '😎', '✨',
        '💙', '😘', '📷', '🇺🇸',
        '☀', '💜', '😉', '💯',
        '😁', '🎄', '📸', '😜',
    ])
}

## Utility Methods

In [None]:
def parse_tweet(text):
    """Clean one tweet and list the emojis found in the cleaned text.

    The preprocessor ``p`` is configured with the URL, MENTION, HASHTAG,
    RESERVED, NUMBER and SMILEY options before ``p.clean`` is applied, so those
    are the token categories the cleaning step acts on. The emoji lookup runs
    on the cleaned text, not on the raw input.

    Args:
        text: Raw tweet text.

    Returns:
        A ``(emoji_list, cleaned_tweet)`` tuple: the result of
        ``emojilib.emoji_list`` for the cleaned text, followed by the cleaned
        text itself.
    """
    # NOTE(review): set_options is re-applied on every call; presumably it
    # configures module-wide state in the preprocessor -- confirm before hoisting.
    p.set_options(
        p.OPT.URL,
        p.OPT.MENTION,
        p.OPT.HASHTAG,
        p.OPT.RESERVED,
        p.OPT.NUMBER,
        p.OPT.SMILEY,
    )

    stripped = p.clean(text)
    return emojilib.emoji_list(stripped), stripped

In [None]:
def parse_tweets(clean_data_folder_path, raw_tweets_file):
    """Clean a file of JSON-encoded tweets and write three line-aligned files.

    Every non-blank line of ``raw_tweets_file`` is parsed as a JSON object that
    must provide ``id`` and ``text``. For each tweet exactly one line is
    appended to each of the following files inside ``clean_data_folder_path``
    (``<name>`` is the base name of ``raw_tweets_file``):

    * ``<name>.text``   -- the cleaned tweet text,
    * ``<name>.ids``    -- the tweet id,
    * ``<name>.labels`` -- space separated ``(label,position,name)`` triplets,
      one per emoji reported by ``parse_tweet``; the line is empty when no
      emoji was reported. ``label`` is ``mapping[code]`` and ``position`` is
      the first element of the emoji's ``location``.

    Existing output files are overwritten.

    Args:
        clean_data_folder_path: Folder receiving the three output files; it is
            created when missing.
        raw_tweets_file: Path of the input file with one JSON tweet per line.

    Returns:
        None.

    Raises:
        KeyError: If a tweet lacks ``id``/``text`` or an emoji code is not a
            key of ``mapping``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    raw_tweets_file_name = os.path.basename(raw_tweets_file)
    os.makedirs(clean_data_folder_path, exist_ok=True)
    out_prefix = os.path.join(clean_data_folder_path, raw_tweets_file_name)

    # The input is opened first so that a missing input file does not truncate
    # previously generated outputs. The `with` block guarantees that all
    # handles are flushed and closed, even when a line fails to parse.
    # UTF-8 is explicit because the text contains emojis and the platform
    # default encoding is not guaranteed to represent them.
    with open(raw_tweets_file, encoding='utf-8') as f_in, \
            open(out_prefix + ".text", 'w', encoding='utf-8') as out_text, \
            open(out_prefix + ".labels", 'w', encoding='utf-8') as out_labels, \
            open(out_prefix + ".ids", 'w', encoding='utf-8') as out_ids:
        for line in tqdm.tqdm(f_in):
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing in json.loads.
            if not line.strip():
                continue

            json_data = json.loads(line)
            tweet_id = json_data['id']
            # Newlines are removed so that every tweet occupies one output line.
            tweet_text = json_data['text'].replace("\n", "")

            emoji_list, cleaned_tweet = parse_tweet(tweet_text)

            # dump clean tweet
            out_text.write(cleaned_tweet + "\n")

            # dump the tweet id
            out_ids.write(str(tweet_id) + "\n")

            # dump the emoji data as space separated triplets (code, location, name)
            # NOTE(review): mapping[code] assumes emoji_list only reports the
            # target emojis -- confirm against emojilib.
            for emoji_info in emoji_list:
                location = emoji_info['location']
                code = emoji_info['code']
                name = emoji_info['name']
                out_labels.write(f"({mapping[code]},{location[0]},{name}) ")
            out_labels.write("\n")

## Parse tweets (Run only once)

In [None]:
# parse_tweets(clean_data_folder_path, raw_tweets_path)  # uncomment to regenerate the clean data files; returns nothing

## Visualize the distribution

In [None]:
# Tally how often each emoji label id occurs in the ".labels" file written by
# parse_tweets() for the configured raw tweets file, then plot the counts.
labels_path = os.path.join(clean_data_folder_path, raw_tweets_file + '.labels')

emoji_freq = defaultdict(int)
with open(labels_path, 'r', encoding='utf-8') as labels_file:
    for labels_line in labels_file:
        # split() without an argument returns [] for a tweet without emojis,
        # so empty lines contribute nothing while tweets carrying exactly one
        # emoji are still counted.
        for triplet in labels_line.split():
            # A triplet looks like "(label,position,name)": keep the label id.
            # The loop variable is deliberately not called `emoji`, which would
            # rebind the imported `emoji` module for the rest of the notebook.
            label_id = int(triplet.split(',')[0][1:])
            emoji_freq[label_id] += 1

# Draw the bars in ascending label-id order so the figure does not depend on
# the order in which labels happen to appear in the file.
label_ids = sorted(emoji_freq)
bar_positions = range(len(label_ids))
plt.bar(bar_positions, [emoji_freq[label_id] for label_id in label_ids], align='center')
plt.xticks(bar_positions, label_ids)
plt.xlabel('emoji label id')
plt.ylabel('occurrences')
plt.title('Emoji label distribution')
plt.show()