In [4]:
import pandas as pd
import numpy as np
import re,os,json,pprint
import datetime
import emoji
import csv

In [2]:
def check_empty_and_unexist_files(file_path_list):
    good_file_path_list = []
    bad_file_path_list = []
    for file_path in file_path_list:
        if os.path.isfile(file_path) and (os.stat(file_path).st_size > 0):
            good_file_path_list.append(file_path)
        else:
            bad_file_path_list.append(file_path)
    if len(bad_file_path_list):
        print('These files are either empty or non-exist: ', bad_file_path_list)
    return good_file_path_list


def load_tweets_from_json(file_path):
    tweets = []
    for line in open(file_path,'r'):
        tweets.append(json.loads(line))
    return tweets


def get_value_by_two_steps(tweet,keys,targets,text):
    original_tweet_text = tweet['text']
    try:
        for key in keys:
            tweet = tweet[key]
    except KeyError:
        return None, text
    if len(tweet):
        if isinstance(targets,str):
            targets = (targets,)
        result = []
        for single_item in tweet:
            single_value = tuple((single_item[temp_target] for temp_target in targets))
            if len(single_value) == 1:
                single_value = single_value[0]
            result.append(single_value)
            #remove targets
            indices = single_item['indices']
            target_string = original_tweet_text[indices[0]:indices[1]]
            text = text.replace(target_string,'')
        if len(result) == 1:
            result = result[0]
        return result,text
    else:
        return None,text
    
def extract_emojis(txt):
    return ''.join(c for c in txt if c in emoji.UNICODE_EMOJI)

def extract_no_emojis_text(txt):
    txt =  ''.join(c for c in txt if c not in emoji.UNICODE_EMOJI)
    return ' '.join(word for word in txt.split() if '@' not in word)

In [3]:
dirPath = '../data/state_id_2014-06_unzip/'
file_name_list = os.listdir(dirPath)
file_name_list = [file_name for file_name in file_name_list if re.findall('.+\.json$',file_name)]
file_path_list = [dirPath + file_name for file_name in file_name_list]

file_path_list = check_empty_and_unexist_files(file_path_list)

In [10]:
header = ['Tweet ID', 'timestamp', 'week', 'user_id', 'state', 'original text', 'with_emoji_text', 'without_emoji_text','in_reply_to_status_id_str',  'emoji',
          'hashtag', 'media(type, url)', 'user_mentions', 'language']
final_result = []
fp = open("output.csv", "a") 
wr = csv.writer(fp, dialect='excel')

for file_path in file_path_list:
    state_id = file_path.split('/')[-1].split('_')[1]
    tweets = load_tweets_from_json(file_path)
    i = 0
    for tweet in tweets:
        text = tweet['text']
        print('---------------------------------------------------------')
        print('original text:',text)
        print('emojies: ',extract_emojis(text))
        tweet_id = tweet['id_str']
        user_id = tweet['user']['id']
        language = tweet['lang']
        timestamp = datetime.datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y').timestamp()
        in_reply_to_status_id_str = tweet.get('in_reply_to_status_id_str',None)
        week = datetime.datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y').strftime('%A')
        hashtags,text = get_value_by_two_steps(tweet,['entities','hashtags'],'text',text)
        media,text = get_value_by_two_steps(tweet,['entities','media'],('type','media_url_https'),text)
        symbols,text = get_value_by_two_steps(tweet,['entities','symbols'],'text',text)
        urls,text = get_value_by_two_steps(tweet,['entities','urls'],'url',text)
        user_mentions, text = get_value_by_two_steps(tweet,['entities','user_mentions'],'id_str',text)
        text = re.sub( '\s+', ' ', text ).strip()
        print('\nhashtags: ',hashtags,'\nmedia: ',media,'\nsymbols: ',symbols,
              '\nurls: ',urls,'\nuser_mentions: ',user_mentions)
        print('\n with emoji',text)
        print('\n without emoji',extract_no_emojis_text(text))
        list_a_data = ([tweet_id, timestamp, week, user_id, state_id, tweet['text'], text, extract_no_emojis_text(text),in_reply_to_status_id_str,extract_emojis(text),
                            hashtags, media, user_mentions, language])
        wr.writerow(list_a_data)
        i = i +1
        if i > 100:
            break
        
    break

---------------------------------------------------------
original text: I just made a fabulous grilled cheese omg 😍
emojies:  😍

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  None

 with emoji I just made a fabulous grilled cheese omg 😍

 without emoji I just made a fabulous grilled cheese omg
---------------------------------------------------------
original text: "What the frick" - @ash_morrison12
emojies:  

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  1102590956

 with emoji "What the frick" -

 without emoji "What the frick" -
---------------------------------------------------------
original text: @paigeschwarzroc comin
emojies:  

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  1871717250

 with emoji comin

 without emoji comin
---------------------------------------------------------
original text: Glora-"Bryannnnnnn"
Me-"Wuuuuuuuut"
Gloria-"Youre gonna make me ugly cry"
emojies:  

hasht

 without emoji id be a cool Asian blonde right..??
---------------------------------------------------------
original text: When bae lookin fine http://t.co/EKkY6Gli5o
emojies:  

hashtags:  None 
media:  ('photo', 'https://pbs.twimg.com/media/Bq37qO5CUAAKQk2.jpg') 
symbols:  None 
urls:  None 
user_mentions:  None

 with emoji When bae lookin fine

 without emoji When bae lookin fine
---------------------------------------------------------
original text: So that's Armenian? #Sideways
emojies:  

hashtags:  Sideways 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  None

 with emoji So that's Armenian?

 without emoji So that's Armenian?
---------------------------------------------------------
original text: I miss @derekklejeski
emojies:  

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  1428991230

 with emoji I miss

 without emoji I miss
---------------------------------------------------------
original text: I just really want steak and pota

In [21]:
pd.DataFrame(final_result,columns = header)

Unnamed: 0,Tweet ID,timestamp,week,user_id,state,original text,with_emoji_text,without_emoji_text,in_reply_to_status_id_str,emoji,hashtag,"media(type, url)",user_mentions,language
0,481316532782247938,1.403604e+09,Tuesday,83000504,27,I just made a fabulous grilled cheese omg 😍,I just made a fabulous grilled cheese omg 😍,I just made a fabulous grilled cheese omg,,😍,,,,en
1,481315959127293952,1.403604e+09,Tuesday,175445210,27,"""What the frick"" - @ash_morrison12","""What the frick"" -","""What the frick"" -",,,,,1102590956,en
2,481316674315251712,1.403604e+09,Tuesday,1506708626,27,@paigeschwarzroc comin,comin,comin,481315964794179584,,,,1871717250,en
3,481315955700547585,1.403604e+09,Tuesday,378156754,27,"Glora-""Bryannnnnnn""\nMe-""Wuuuuuuuut""\nGloria-""...","Glora-""Bryannnnnnn"" Me-""Wuuuuuuuut"" Gloria-""Yo...","Glora-""Bryannnnnnn"" Me-""Wuuuuuuuut"" Gloria-""Yo...",,,,,,en
4,481315991519895553,1.403604e+09,Tuesday,1332657980,27,I just want my big brother to come home ):,I just want my big brother to come home ):,I just want my big brother to come home ):,,,,,,en
5,481316289915269120,1.403604e+09,Tuesday,1191518725,27,Naked? 😍,Naked? 😍,Naked?,,😍,,,,en
6,481316442827403264,1.403604e+09,Tuesday,24752078,27,Lol workin (@ Brainerd Dispatch) http://t.co/a...,Lol workin (@ Brainerd Dispatch),Lol workin Brainerd Dispatch),,,,,,en
7,481316407645184000,1.403604e+09,Tuesday,246501877,27,“@FunnyVines: Blarghablabla.. https://t.co/fxV...,“: Blarghablabla.. ” this is us,“: Blarghablabla.. ” this is us,476055025878183936,,,,"[1396161031, 1234948957]",en
8,481316359255896065,1.403604e+09,Tuesday,126849745,27,@CallMeKarizma Then we're all dead,Then we're all dead,Then we're all dead,481313267315245056,,,,461676724,en
9,481316162413031424,1.403604e+09,Tuesday,434321517,27,@4orelllaaaa @LilMont_ nigga look like a Wendy...,nigga look like a Wendy's chicken nugget yo fa...,nigga look like a Wendy's chicken nugget yo fa...,481315646039281665,,,,"[39367535, 729907855940157440]",en
