In [1]:
import pandas as pd
import numpy as np
import re,os,json,pprint
import datetime
import emoji

In [2]:
def check_empty_and_unexist_files(file_path_list):
    good_file_path_list = []
    bad_file_path_list = []
    for file_path in file_path_list:
        if os.path.isfile(file_path) and (os.stat(file_path).st_size > 0):
            good_file_path_list.append(file_path)
        else:
            bad_file_path_list.append(file_path)
    if len(bad_file_path_list):
        print('These files are either empty or non-exist: ', bad_file_path_list)
    return good_file_path_list


def load_tweets_from_json(file_path):
    tweets = []
    for line in open(file_path,'r'):
        tweets.append(json.loads(line))
    return tweets


def get_value_by_two_steps(tweet,keys,targets,text):
    original_tweet_text = tweet['text']
    try:
        for key in keys:
            tweet = tweet[key]
    except KeyError:
        return None, text
    if len(tweet):
        if isinstance(targets,str):
            targets = (targets,)
        result = []
        for single_item in tweet:
            single_value = tuple((single_item[temp_target] for temp_target in targets))
            if len(single_value) == 1:
                single_value = single_value[0]
            result.append(single_value)
            #remove targets
            indices = single_item['indices']
            target_string = original_tweet_text[indices[0]:indices[1]]
            text = text.replace(target_string,'')
        if len(result) == 1:
            result = result[0]
        return result,text
    else:
        return None,text
    
def extract_emojis(txt):
    return ''.join(c for c in txt if c in emoji.UNICODE_EMOJI)

def extract_no_emojis_text(txt):
    return ''.join(c for c in txt if c not in emoji.UNICODE_EMOJI)

In [6]:
dirPath = '../data/state_id_2014-06_unzip/'
file_name_list = os.listdir(dirPath)
file_name_list = [file_name for file_name in file_name_list if re.findall('.+\.json$',file_name)]
file_path_list = [dirPath + file_name for file_name in file_name_list]

file_path_list = check_empty_and_unexist_files(file_path_list)

In [15]:
header = ['Tweet ID', 'timestamp', 'week', 'user_id', 'state', 'original text', 'with_emoji_text', 'with_emoji_text', 'emoji',
          'hashtag', 'media(type, url)', 'user_mentions', 'language']
final_result = []
for file_path in file_path_list:
    state_id = file_path.split('/')[-1].split('_')[1]
    tweets = load_tweets_from_json(file_path)
    i = 0
    for tweet in tweets:
        text = tweet['text']
        print('---------------------------------------------------------')
        print('original text:',text)
        print('emojies: ',extract_emojis(text))
        tweet_id = tweet['id_str']
        user_id = tweet['user']['id']
        language = tweet['lang']
        timestamp = datetime.datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y').timestamp()
        week = datetime.datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y').strftime('%A')
        hashtags,text = get_value_by_two_steps(tweet,['entities','hashtags'],'text',text)
        media,text = get_value_by_two_steps(tweet,['entities','media'],('type','media_url_https'),text)
        symbols,text = get_value_by_two_steps(tweet,['entities','symbols'],'text',text)
        urls,text = get_value_by_two_steps(tweet,['entities','urls'],'url',text)
        user_mentions, text = get_value_by_two_steps(tweet,['entities','user_mentions'],'id_str',text)
        text = re.sub( '\s+', ' ', text ).strip()
        print('\nhashtags: ',hashtags,'\nmedia: ',media,'\nsymbols: ',symbols,
              '\nurls: ',urls,'\nuser_mentions: ',user_mentions)
        print('\n with emoji',text)
        print('\n without emoji',extract_no_emojis_text(text))
        final_result.append([tweet_id, timestamp, week, user_id, state_id, tweet['text'], text, extract_no_emojis_text(text),extract_emojis(text),
                            hashtags, media, user_mentions, language])
        i = i +1
        if i > 100:
            break
        
    break

---------------------------------------------------------
original text: @Herc_Thurman you have xbox or ps3
emojies:  

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  393133331

 with emoji you have xbox or ps3

 without emoji you have xbox or ps3
---------------------------------------------------------
original text: OMG. @Redneck_lovee just followed me! 🙌🙌
emojies:  🙌🙌

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  None

 with emoji OMG. @Redneck_lovee just followed me! 🙌🙌

 without emoji OMG. @Redneck_lovee just followed me! 
---------------------------------------------------------
original text: The hardest choice in life is usually between what you want, and what you deserve.
emojies:  

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  None

 with emoji The hardest choice in life is usually between what you want, and what you deserve.

 without emoji The hardest choice in life is usually betwee

---------------------------------------------------------
original text: @AskMyselfWhy I back this 100%.
emojies:  

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  819311696804249601

 with emoji I back this 100%.

 without emoji I back this 100%.
---------------------------------------------------------
original text: If I text u and u don't text back I'm deleting your number
emojies:  

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  None

 with emoji If I text u and u don't text back I'm deleting your number

 without emoji If I text u and u don't text back I'm deleting your number
---------------------------------------------------------
original text: “@_PURPleBomb: “@_4EverDOPE_: @_PURPleBomb who” KP 😂😂✊💁”😂😂 I had a feeling u was talking about him
emojies:  😂😂✊💁😂😂

hashtags:  None 
media:  None 
symbols:  None 
urls:  None 
user_mentions:  ['388380241', '322430565', '388380241']

 with emoji “: “: who” KP 😂😂✊💁”😂😂 I had a fe

In [17]:
pd.DataFrame(final_result,columns = header)

Unnamed: 0,Tweet ID,timestamp,week,user_id,state,original text,with_emoji_text,with_emoji_text.1,emoji,hashtag,"media(type, url)",user_mentions,language
0,473707609397415936,1.401790e+09,Tuesday,289326876,21,@Herc_Thurman you have xbox or ps3,you have xbox or ps3,you have xbox or ps3,,,,393133331,en
1,473707099709378560,1.401790e+09,Tuesday,2220624454,21,OMG. @Redneck_lovee just followed me! 🙌🙌,OMG. @Redneck_lovee just followed me! 🙌🙌,OMG. @Redneck_lovee just followed me!,🙌🙌,,,,en
2,473706257212514304,1.401790e+09,Tuesday,408782163,21,The hardest choice in life is usually between ...,The hardest choice in life is usually between ...,The hardest choice in life is usually between ...,,,,,en
3,473706548376903680,1.401790e+09,Tuesday,415882070,21,"@EmreeWitt ACTUALLY, there was soap...S.E. gir...","ACTUALLY, there was soap...S.E. girl...","ACTUALLY, there was soap...S.E. girl...",,,,423639698,en
4,473707041513832448,1.401790e+09,Tuesday,41456280,21,It's 2am my butt bout to be kO'd. He better hu...,It's 2am my butt bout to be kO'd. He better hu...,It's 2am my butt bout to be kO'd. He better hu...,,,,,en
5,473706964045033473,1.401790e+09,Tuesday,408782163,21,"""@SnapchatProbbz: I LOVE NOT WEARING MAKEUP BC...",""": I LOVE NOT WEARING MAKEUP BC I CAN JUST RUB...",""": I LOVE NOT WEARING MAKEUP BC I CAN JUST RUB...",,,,971277218,en
6,473706239055380480,1.401790e+09,Tuesday,300627056,21,I'm too obsessed with shows about Big Foot.,I'm too obsessed with shows about Big Foot.,I'm too obsessed with shows about Big Foot.,,,,,en
7,473707335769006082,1.401790e+09,Tuesday,35908587,21,Got me listening to this like http://t.co/e4cK...,Got me listening to this like via,Got me listening to this like via,,,,10228272,en
8,473707250213998592,1.401790e+09,Tuesday,408782163,21,"""@ThSpoiledBitch: Truth is I don't think you'l...",""": Truth is I don't think you'll ever change""",""": Truth is I don't think you'll ever change""",,,,481792176,en
9,473706324904398848,1.401790e+09,Tuesday,357007553,21,Who do you love ?,Who do you love ?,Who do you love ?,,,,,en
