In [1]:
import os
import csv
import emoji
import pandas as pd

In [2]:
# Directory holding the raw tweet exports (read as tab-separated files by the cells below).
path = './input/tweets'
# Directory where the consolidated dataset is written.
output_path = './input/data'
# Full path of the consolidated CSV that the append and load cells use.
dataset_path = os.path.join(output_path, 'tweets_dataset.csv')

In [3]:
def createDirectory(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    Calling it for a directory that already exists is a no-op.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    # exist_ok=True folds the existence check into the creation itself, so
    # there is no gap between "check" and "create" in which another process
    # could create the directory and make makedirs fail.
    os.makedirs(path, exist_ok = True)
    return

In [4]:
def createDataset(path, output_path):
    """Create (or reset) the consolidated CSV so it holds only the header row.

    The header is the column list of the first eligible source file found in
    ``path`` plus an extra ``file_source`` column. An existing dataset file
    is truncated.

    Parameters
    ----------
    path : str
        Directory containing the tab-separated source files.
    output_path : str
        Directory in which ``tweets_dataset.csv`` is written; it is created
        when missing.

    Raises
    ------
    FileNotFoundError
        If ``path`` contains no eligible source file.
    """
    createDirectory(output_path)
    dataset_path = os.path.join(output_path, 'tweets_dataset.csv')

    # Apply the same eligibility rule populateDataset uses (regular files
    # without 'BR' in the name) and sort the listing: os.listdir returns
    # names in arbitrary order, so taking element 0 unfiltered could pick a
    # sub-directory or a file that is never appended to the dataset.
    candidates = [
        name for name in sorted(os.listdir(path))
        if 'BR' not in name and os.path.isfile(os.path.join(path, name))
    ]
    if not candidates:
        raise FileNotFoundError('No source file to read the header from in: ' + str(path))
    filename = os.path.join(path, candidates[0])

    # Only the column names are needed, so a few rows are enough.
    data = pd.read_csv(filename, encoding = 'utf-8', sep = '\t', nrows = 3)
    header = list(data.columns)
    header.append('file_source')

    # newline='' lets the csv module control line endings itself; without it
    # an extra '\r' is written on platforms that translate '\n' to '\r\n'.
    with open(dataset_path, 'w', encoding = 'UTF8', newline = '') as f:
        writer = csv.writer(f)
        writer.writerow(header)

    return

In [5]:
# (Re)creates the dataset file with only the header row; an existing file is truncated.
createDataset(path, output_path)

In [6]:
def appendData(data, output_file = None):
    """Append the rows of ``data`` to a CSV file, without header or index.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to append. Columns are written in the frame's own order, so
        they must already match the layout of the target file.
    output_file : str, optional
        File to append to. Defaults to the notebook-level ``dataset_path``,
        which keeps existing ``appendData(data)`` calls working unchanged.
    """
    # Resolve the default at call time so a later change of dataset_path is honoured.
    target = dataset_path if output_file is None else output_file
    data.to_csv(target, mode = 'a', encoding = 'utf-8', header = False, index = False)
    print('File data saved succesfully')
    return

In [7]:
def populateDataset(directory):
    """Append every eligible source file in ``directory`` to the dataset.

    A file is eligible when it is a regular file whose name does not contain
    ``'BR'``. Each file is read as tab-separated UTF-8, tagged with its own
    name in a ``file_source`` column and handed to ``appendData``.

    Caveats
    -------
    * ``appendData`` writes rows without a header, so columns are matched by
      position: every source file must share the same column layout.
    * ``appendData`` appends, so calling this function twice without
      resetting the dataset file duplicates all rows.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of the resulting dataset reproducible across runs and machines.
    for filename in sorted(os.listdir(directory)):
        file = os.path.join(directory, filename)
        if 'BR' in filename or not os.path.isfile(file):
            continue
        data = pd.read_csv(file, encoding = 'utf-8', sep = '\t')
        data['file_source'] = filename
        appendData(data)

    return

In [9]:
# NOTE: appendData opens the dataset in append mode, so running this cell again
# without first re-running createDataset(path, output_path) duplicates every row.
populateDataset(path)

File data saved succesfully
File data saved succesfully
File data saved succesfully
File data saved succesfully
File data saved succesfully
File data saved succesfully
File data saved succesfully
File data saved succesfully
File data saved succesfully


### Check emojis

In [42]:
from collections import Counter
import numpy as np
import re

In [11]:
# Load the consolidated dataset written by the cells above and preview its first rows.
df = pd.read_csv(dataset_path, encoding = 'utf-8')
df.head()

Unnamed: 0,tweet.created_at,tweet_text,tweet.retweet_count,tweet.source,tweet.user.created_at,tweet.user.followers_count,tweet.user.friends_count,tweet.user.location,tweet.user.name,file_source
0,2020-03-14 23:59:55,#CORONAVIRUS La Universidad de Buenos Aires #U...,0,Instagram,2013-09-23 02:17:08,202,804,Don Torcuato.,En el medio,all_AR.csv
1,2020-03-14 23:59:53,RT @UBARectorado: Suspensión temporal de la cu...,3630,Twitter for Android,2012-08-08 18:14:00,311,179,Ciudad Autónoma de Buenos Aire,Ro🌿💚,all_AR.csv
2,2020-03-14 23:59:52,Aplausos en balcones y terrazas de toda España...,0,Instagram,2015-10-17 01:26:37,3442,1927,"Buenos Aires, Argentina",F.C Desde Lejos Arg,all_AR.csv
3,2020-03-14 23:59:51,Emanuel Más sos mas malo que el #coronavirus .,0,Twitter for iPhone,2011-06-03 01:04:21,624,514,Capital Federal,RBQ,all_AR.csv
4,2020-03-14 23:59:51,RT @RubinsteinOk: En un escenario tan cambiant...,1902,Twitter Web App,2015-09-21 21:38:47,248,381,,Jose Maria,all_AR.csv


In [12]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

(3236245, 10)

In [13]:
# Keep only rows that have both a tweet text and a user name. Rebinding `df`
# (rather than inplace=True) yields the same frame for the cells below
# without mutating the object in place.
df = df.dropna(subset=['tweet_text', 'tweet.user.name'])
# Concatenate each column into one big string for character-level scanning.
# astype(str) guards against values that read_csv parsed as numbers (for
# example an all-digit user name), which would make str.join raise TypeError.
all_texts = ' '.join(df['tweet_text'].astype(str))
all_users = ' '.join(df['tweet.user.name'].astype(str))

In [31]:
%%time
emoji_list = [c for c in all_texts if c in emoji.UNICODE_EMOJI['es']]

Wall time: 39.9 s


In [63]:
# Peek at the first three keys of the Spanish emoji table (iterating a dict yields its keys).
list(emoji.UNICODE_EMOJI['es'])[:3]

['😀', '😃', '😄']

In [None]:
# Tally how often each emoji character occurs in the tweets.
count_emojis = Counter(emoji_list)
# most_common() lists (emoji, count) pairs from most to least frequent, keeping
# first-seen order among ties; dict() preserves that ordering.
order_count_emojis = dict(count_emojis.most_common())

In [None]:
# Display the emoji -> count mapping, most frequent first.
order_count_emojis

In [None]:
# Alternative scan: every character that is not a word character, whitespace
# or a comma. This is broader than emojis (punctuation and symbols match too).
# Stored under its own name so it does not overwrite `emoji_list`, which the
# Counter cell above reads.
non_word_chars = re.findall(r'[^\w\s,]', all_texts)
set(non_word_chars)

In [None]:
# Alternative scan: only code points in the range U+1F600-U+1F64F, so emojis
# outside that range are not matched. Stored under its own name so it does
# not overwrite `emoji_list`, which the Counter cell above reads.
emoticon_matches = re.findall(u"[\\U0001F600-\\U0001F64F]", all_texts)
set(emoticon_matches)

In [None]:
# Distinct emoji characters that appear in user names.
{c for c in all_users if c in emoji.UNICODE_EMOJI['es']}

### Vectorize emojis

In [51]:
# Work on an independent copy of the first 1000 rows while prototyping.
subdata = df.head(1000).copy()
subdata.shape

(1000, 10)

In [52]:
def extract_emojis(text):
    """Count the emoji characters in ``text`` and report the first one.

    Parameters
    ----------
    text : str
        Text to scan. Missing values (``None`` or a float such as NaN) are
        treated as empty text instead of raising ``TypeError``.

    Returns
    -------
    tuple
        ``(emoji_count, emotion, first_emoji)``. ``first_emoji`` is ``''``
        when no emoji is found; ``emotion`` is always ``'happy'``.
    """
    # pandas represents missing values in object columns as None / float NaN,
    # neither of which can be iterated character by character.
    if text is None or isinstance(text, float):
        return 0, 'happy', ''

    # Look the table up once per call rather than once per character.
    known_emojis = emoji.UNICODE_EMOJI['es']
    emoji_list = [c for c in text if c in known_emojis]
    first_emoji = emoji_list[0] if emoji_list else ''
    # NOTE(review): the emotion is hard-coded - presumably a placeholder until
    # a real emoji-to-emotion mapping exists; confirm before relying on it.
    emotion, emoji_icon = 'happy', first_emoji
    return len(emoji_list), emotion, emoji_icon

In [53]:
# Element-wise wrapper around extract_emojis. Declaring the three output types
# (count, emotion, emoji) up front lets the wrapper handle empty input, which
# np.vectorize rejects when it has to infer the types, and avoids the extra
# probing call on the first element.
vfunc = np.vectorize(extract_emojis, otypes = [int, str, str])

In [55]:
# vfunc returns one array per element of extract_emojis' result tuple:
# emoji counts, emotions and first emojis, in that order.
count, emotions, icons = vfunc(subdata['tweet_text'])

In [62]:
# Attach the three per-tweet arrays as new columns and preview the result.
# NOTE(review): the saved output of this cell also lists emoji_count / emotion /
# emoji_icon columns that no visible cell creates - presumably left over from
# an earlier kernel state; confirm with Restart & Run All.
subdata = subdata.assign(count=count, emotions=emotions, icons=icons)
subdata.head(5)

Unnamed: 0,tweet.created_at,tweet_text,tweet.retweet_count,tweet.source,tweet.user.created_at,tweet.user.followers_count,tweet.user.friends_count,tweet.user.location,tweet.user.name,file_source,emoji_count,emotion,emoji_icon,count,emotions,icons
0,2020-03-14 23:59:55,#CORONAVIRUS La Universidad de Buenos Aires #U...,0,Instagram,2013-09-23 02:17:08,202,804,Don Torcuato.,En el medio,all_AR.csv,,,,0,happy,
1,2020-03-14 23:59:53,RT @UBARectorado: Suspensión temporal de la cu...,3630,Twitter for Android,2012-08-08 18:14:00,311,179,Ciudad Autónoma de Buenos Aire,Ro🌿💚,all_AR.csv,,,,0,happy,
2,2020-03-14 23:59:52,Aplausos en balcones y terrazas de toda España...,0,Instagram,2015-10-17 01:26:37,3442,1927,"Buenos Aires, Argentina",F.C Desde Lejos Arg,all_AR.csv,,,,0,happy,
3,2020-03-14 23:59:51,Emanuel Más sos mas malo que el #coronavirus .,0,Twitter for iPhone,2011-06-03 01:04:21,624,514,Capital Federal,RBQ,all_AR.csv,,,,0,happy,
4,2020-03-14 23:59:51,RT @RubinsteinOk: En un escenario tan cambiant...,1902,Twitter Web App,2015-09-21 21:38:47,248,381,,Jose Maria,all_AR.csv,,,,0,happy,
