In [16]:
import numpy as np
import pandas as pd
from datetime import datetime
import json
import os
from glob import glob 
import itertools

In [10]:

def fix_datetime(df, timevar='created_at_h'):
    df[timevar] = pd.to_datetime(df[timevar])

def fix_token_counter(df):
    df.token_counter = df.token_counter.apply(lambda x: Counter(x))  

def fix_RT_id(df):
    df.RT_id = df.RT_id.astype(str) 


def convert_floats(df, float_dtype='float32'):
    floats = df.select_dtypes(include=['float64']).columns.tolist()
    df[floats] = df[floats].astype(float_dtype)
    return df

def tw_data_files_to_df_csv(files):
    '''append and concat data files into a pandas.DataFrame'''
    df = []
    [ df.append(pd.read_csv(file)) for file in files ]
    df = pd.concat(df, ignore_index=True)
    return df

def tw_data_files_to_df_csv2(files, frac=0.05, float_dtype=None):
    '''append and concat a sample of data into a pandas.DataFrame'''
    df = []
    [ df.append(pd.read_csv(file, low_memory=True)
        .sample(frac=frac, replace=True)) for file in files ]
    df = pd.concat(df, ignore_index=True)
    if float_dtype is None: return df
    return convert_floats(df, float_dtype)


def tw_data_files_to_df_json(files, lines=False):
    '''append and concat data files into a pandas.DataFrame'''
    df = []
    [ df.append(pd.read_json(file, orient='records', lines=lines)) for file in files ]
    df = pd.concat(df, ignore_index=True)
    return df


def tw_data_files_to_df_json3(files, lines=False, frac=0.05, float_dtype=None, verbose=False):
    '''append and concat a sample of data into a pandas.DataFrame'''
    df = []
    for file in files:
        if verbose: print('loading ' + file)
        df.append(pd.read_json(file, orient='records', lines=lines)
                 .sample(frac=frac, replace=True)) 
    df = pd.concat(df, ignore_index=True)
    if float_dtype is None: return df
    return convert_floats(df, float_dtype)

def keep_recent_files(files, base_timestamp, file_type= '.json', days = 14):
    timestamps = [pd.Timestamp(file.split('created_at_',1)[1]
                               .replace(file_type,'').replace('_',' ')) for file in files ]
    keep_idx1 = [(base_timestamp - timestamp) <= pd.Timedelta(days, unit='d') for timestamp in timestamps]
    return(list(itertools.compress(files,keep_idx1)))



In [172]:
data_path = '/Users/kotaminegishi/big_data_training/python/dash_demo1/'
data_dest = '/Users/kotaminegishi/big_data_training/python/dash_demo1/'

process_datatime = pd.to_datetime(datetime(2020,7,13))
process_datatime_d = process_datatime.floor('d')
#_dt = latest_datatime.floor('d').to_pydatetime()


In [157]:
df = pd.read_json('/Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/retweet/created_at_2020-07-04_00:00:00.json', orient='records', lines=True)
idx = [mark_tokens_contain_keyword(df, keyword) for keyword in ['Los Angeles','L.A.','L. A.']]



In [164]:
pd.DataFrame(idx).agg(max).astype(bool)


0    False
1    False
2    False
3    False
4    False
5    False
dtype: bool

In [247]:
def mark_tokens_contain_keyword(df, keyword):
    # returns an index indicating whether variable 'tokens' contains keyword
    return df.tokens.apply(lambda x: keyword.lower() in x)

def mark_tokens_contain_keywords(df, keywords):
    idx = [mark_tokens_contain_keyword(df, keyword) for keyword in keywords]
    return pd.DataFrame(idx).agg(max).astype(bool)
    
def mark_tokens_contain_keyword_jointly(df, keywords):
    # returns an index indicating whether variable 'tokens' contains keyword
    idx = [mark_tokens_contain_keyword(df, keyword) for keyword in keywords]
    return pd.DataFrame(idx).agg(min).astype(bool) 
    
def get_columns_json(file):
    chunk1 = pd.read_json(file, chunksize=1, orient='records', lines=True)
    data1 = [d.iloc[0] for d in chunk1]
    return list(data1[0].keys())

def get_columns_csv(file):
    chunk1 = pd.read_csv(file, chunksize=1)
    return list(chunk1.read(1).keys())


def tw_data_files_to_df_json_filter(files, filter_word, lines=True, float_dtype=None, verbose=False):
    '''append and concat filtered data into a pandas.DataFrame'''
    if type(filter_word) != list: raise ValueError("filter_word must be a list")

    df = []
    for file in files:
        if verbose: print('loading ' + file)  
        if file==files[0]:
            columns = get_columns_json(file)
            df_null = pd.DataFrame(columns=columns)
            
        df_file = pd.read_json(file, orient='records', lines=lines)
        if (len(filter_word) >1): idx = mark_tokens_contain_keywords(df_file, filter_word)
        else: idx = mark_tokens_contain_keyword(df_file, filter_word[0])
        df_file_filtered = df_file[idx]
        if len(df_file_filtered)>0:
            df.append(df_file_filtered)
    
    if len(df)==0: return df_null
    df = pd.concat(df, ignore_index=True)
    if float_dtype is None: return df
    return convert_floats(df, float_dtype)


In [565]:
def get_columns_csv(file):
    chunk1 = pd.read_csv(file, chunksize=1)
    return list(chunk1.read(1).keys())


In [566]:
get_columns_csv(files_sentiments[0])
#chunk1 = pd.read_csv(files_sentiments[0], chunksize=1)
#[d for d in chunk1]

['id', 'created_at_h', 'neg', 'neu', 'pos', 'compound']

In [173]:
cum_retweet = pd.read_json(data_dest + "data_cumulative/retweet/2020_all_retweets.json",
         lines=True, orient='records')

In [312]:
cum_retweet.RT_id.astype(str)
cum_retweet.user_id.astype(str)
cum_retweet.created_at.astype(str)

0                  30799939
1                 319901933
2                 422798683
3       1163289131394334720
4                2473243064
               ...         
8316    1115637931094360064
8317               68568238
8318             1654567320
8319               49330477
8320               19598680
Name: user_id, Length: 8321, dtype: object

In [314]:
def df_vars_convert_to_str(df, vars):
    for var in vars:
        df[var] = df[var].astype(str)
        
        

In [318]:
df_vars_convert_to_str(cum_retweet, ['RT_id','user_id','created_at','created_at_h'])


In [319]:
cum_retweet.created_at

0       2020-06-30 13:52:34
1       2020-06-19 16:29:04
2       2020-06-29 23:07:04
3       2020-05-29 23:37:51
4       2020-06-03 00:44:20
               ...         
8316    2020-05-27 22:59:23
8317    2020-06-01 15:32:15
8318    2020-06-05 13:37:39
8319    2020-06-02 18:09:41
8320    2020-06-08 23:37:51
Name: created_at, Length: 8321, dtype: object

In [179]:
tmp_idx = mark_tokens_contain_keyword(cum_retweet, '#LosAngeles')
filtered_retweet2 = cum_retweet[tmp_idx]
filtered_retweet2

Unnamed: 0,RT_id,created_at,user_id,user_name,followers_count,following_count,user_description,text,retweet_count,t_co,tags,urls,lang,created_at_h,tokens
4019,1267092045870567424,2020-05-31 07:54:19,887275420990795776,Mkilic76,1913,169,#DirençliMadde #ResilientMatter Mutlak hiçlik ...,"You are the police, you secure the people.\n\n...",220,https://t.co/Qq0fN4A80b,"[#LosAngeles, #GeorgeFloydProtests, #Anonymous...",[https://t.co/ewvOus9OlL],en,2020-05-31 07:00:00,"[police,, secure, people., cuff, batons., kill..."
6468,1282640691944779776,2020-07-13 05:39:05,2940882906,BLMLA,133202,436,Official Twitter for #BlackLivesMatter-Los Ang...,"Today, July 13th, is the 7 year anniversary of...",101,https://t.co/K3MnLNwuCj,"[#BlackLivesMatter,, #LosAngeles, #TrayvonMart...",[https://t.co/yJXpWOOeN7],en,2020-07-13 05:00:00,"[today,, july, 13th,, 7, year, anniversary, #b..."


In [176]:
tmp_idx = mark_tokens_contain_keyword_jointly(cum_retweet, ['Los', 'Angeles'])
filtered_retweet3 = cum_retweet[tmp_idx]


In [177]:
filtered_retweet3

Unnamed: 0,RT_id,created_at,user_id,user_name,followers_count,following_count,user_description,text,retweet_count,t_co,tags,urls,lang,created_at_h,tokens
442,1266978310556389376,2020-05-31 00:22:23,1703320855,ArianaToday,371394,196,Your ultimate fan-source for the latest @Arian...,Another photo of Ariana at the #BlackLivesMatt...,14705,https://t.co/IpeD7QznPp,[#BlackLivesMatter],[],en,2020-05-31 00:00:00,"[another, photo, ariana, protest, los, angeles]"
2590,1267239801448783872,2020-05-31 17:41:27,1012800001,PontoffelPock,151,467,"You pull on the pull-em, and push on the push-...","PROTEST! But, keep this in mind before you des...",346,https://t.co/f6hYFeEDVO,"[#BlackLivesMatter, #protests2020, #Minneapoli...",[https://t.co/WX4oUqXLWA],en,2020-05-31 17:00:00,"[protest!, but,, keep, mind, destroy, people's..."
2793,1279828664448778240,2020-07-05 11:25:06,297254650,calexity,6489,2304,📈 Growth Designer. Designing for impact @lexro...,"""Under DA Jackie Lacey's watch, 609 people in ...",100,https://t.co/4r1njMRpBE,"[#BlackLivesMatter, #JackieLaceyMustGo]",[],en,2020-07-05 11:00:00,"[""under, da, jackie, lacey's, watch,, 609, peo..."
2815,1279826150752223232,2020-07-05 11:15:07,109326781,ricci_sergienko,3856,2323,my views 100% represent my employer | we are g...,Full page ad in today’s LA times:\n\nOver 609 ...,100,https://t.co/Y888pWdEoC,"[#BlackLivesMatter, #JackieLaceyMustGo]",[https://t.co/Rogj5X6pba],en,2020-07-05 11:00:00,"[full, page, ad, today’s, la, times:, 609, peo..."
2927,1270078331149357056,2020-06-08 13:40:45,525589542,6AMGroup,20422,7959,Digital platform ➕ online resource inspiring &...,".@yakooza (IG) captured the 20,000 peaceful pr...",226,https://t.co/XtJVcLrVyH,"[#BlackLivesMatter, #losangelesprotest, #Equal...",[https://t.co/QKmvQWy2xY],en,2020-06-08 13:00:00,"[(ig), capture, 20,000, peaceful, protestors, ..."
3088,1276746284074823680,2020-06-26 23:16:49,1276318554992726016,ConcernedLACity,365,26,Current and former City of LA staff in support...,"In support of #BlackLivesMatter , Los Angeles ...",137,https://t.co/punnSM08Ub,"[#BlackLivesMatter, #PeoplesBudgetLA.]",[https://t.co/xNgZBUDFsj],en,2020-06-26 23:00:00,"[support, los, angeles, city, workers, call, m..."
4205,1267577663536852992,2020-06-01 16:04:00,2940882906,BLMLA,131744,436,Official Twitter for #BlackLivesMatter-Los Ang...,We need everyone to show up...virtually...to t...,2234,https://t.co/4gQFpymC6w,"[#GeorgeFloyd...It's, #BlackLivesMatter]",[https://t.co/p3sevLDwUQ],en,2020-06-01 16:00:00,"[need, everyone, show, up...virtually...to, te..."
4931,1267600885389066240,2020-06-01 17:36:16,1187773045949247488,BTWthisShit,101,31,#BlackLiveMatters✊🏻✊🏼✊🏽✊🏾✊🏿,Halsey and YUNGBLUD are on the streets of Los ...,847,https://t.co/r3q9tjIr7A,"[#PizzaGate, #BlackLivesMatter, #Anonymuos]",[https://t.co/x9Tib7VSu5],en,2020-06-01 17:00:00,"[halsey, yungblud, street, los, angeles, medic..."
4978,1269458708611899392,2020-06-06 20:38:36,4429003533,PopCrave,529264,2297,"Your Go-to Source for Pop Culture News, Chart ...",Billie Eilish at a #BlackLivesMatter protest ...,2623,https://t.co/9to3ANQCDu,[#BlackLivesMatter],[],en,2020-06-06 20:00:00,"[billie, eilish, protest, los, angeles, today.]"
5479,1267930507494465536,2020-06-02 15:26:04,766674511475474432,BestAffleck,5014,130,"The best of @BenAffleck . Director, actor, wri...",Ben Affleck and Ana de Armas \nattend the #Bla...,362,https://t.co/mclwU1GXoQ,[#BlackLivesMatter],[],en,2020-06-02 15:00:00,"[ben, affleck, ana, de, armas, attend, protest..."


In [244]:
files_retweet = glob.glob(data_path + "data_cumulative/retweet/*")

In [245]:
filtered_retweet = tw_data_files_to_df_json_filter(files_retweet, ['Minneapolis'])
filtered_retweet

                    RT_id          created_at              user_id  \
339   1271228625933619200 2020-06-11 17:51:37           2835451658   
564   1277286903641866240 2020-06-28 11:05:03             98956941   
635   1266515656037588992 2020-05-29 17:43:57  1258993079287197696   
1561  1266752338221506560 2020-05-30 09:24:27            175065805   
2232  1266782245869805568 2020-05-30 11:23:17   743998433455808512   
2277  1267058623911428096 2020-05-31 05:41:31  1219534043210973184   
2682  1267218415736872960 2020-05-31 16:16:28            195271137   
2905  1267001484387840000 2020-05-31 01:54:28           1538581122   
2914  1265744876823609344 2020-05-27 14:41:09  1187684601315115008   
3163  1266286100743520256 2020-05-29 02:31:47           2835451658   
3505  1276695049628155904 2020-06-26 19:53:14  1090715513586679808   
3932  1266224353420759040 2020-05-28 22:26:25   793282577733971968   
4122  1267249755949658112 2020-05-31 18:21:00            373157754   
4290  12653241978464

Unnamed: 0,RT_id,created_at,user_id,user_name,followers_count,following_count,user_description,text,retweet_count,t_co,tags,urls,lang,created_at_h,tokens
0,1271228625933619200,2020-06-11 17:51:37,2835451658,MrAndyNgo,457035,667,"Editor-at-large - @TPostMillennial. ""Unmasked""...",This is what rioters did to the Minneapolis Sh...,21050,https://t.co/GTAGV805i6,[#BlackLivesMatter],[https://t.co/GDYvNRmYi2],en,2020-06-11 17:00:00,"[rioter, minneapolis, sheraton, take, riots., ..."
1,1277286903641866240,2020-06-28 11:05:03,98956941,afbranco,44179,2197,Nationally syndicated Political Cartoonist (Cr...,A.F. Branco Cartoon - Alpha News: Welcome to M...,207,https://t.co/Web3mRCuEP,"[#DefundThePolice, #AntifaTerrorist, #Democrat...",[https://t.co/zqUYrv0CPE],en,2020-06-28 11:00:00,"[a.f., branco, cartoon, alpha, news:, welcome,..."
2,1266515656037588992,2020-05-29 17:43:57,1258993079287197696,NedWhat,825,203,#GLROfficial\nGreat Lakes Region \nContent of ...,Support from the #Amish community in Minneapol...,100842,https://t.co/IBcGGubFAo,"[#Amish, #AllLivesMatter, #BlackLivesMatter, #...",[https://t.co/qEkVkmOhGA],en,2020-05-29 17:00:00,"[support, #amish, community, minneapolis, #all..."
3,1266752338221506560,2020-05-30 09:24:27,175065805,dviyer,14843,6822,"South Asian American activist, lawyer. Senior ...",The story of the #GandhiMahal restaurant in Mi...,198,https://t.co/cHGBMwtDRh,"[#GandhiMahal, #solidarity, #BlackLivesMatter]",[https://t.co/sseVxOiFCe],en,2020-05-30 09:00:00,"[story, #gandhimahal, restaurant, minneapolis,..."
4,1266782245869805568,2020-05-30 11:23:17,743998433455808512,JoeyMillsXXX,273650,474,14x award winning pornstar— Your mothers worst...,All proceeds for the next 48 hours that I rece...,406,https://t.co/c0EzZoJYvE,"[#BlackLivesMatter, #JusticeForGeorgeFlyod, #J...","[https://t.co/uqQog24F7V, https://t.co/4jXkhvd...",en,2020-05-30 11:00:00,"[proceeds, next, 48, hour, receive, onlyfans, ..."
5,1267058623911428096,2020-05-31 05:41:31,1219534043210973184,ilcanhavhav,1570,276,fırat aydınus fan club,A man shoots a black protester with a shotgun ...,1783,https://t.co/7qU01ryYV0,[#BlackLivesMatter],[],en,2020-05-31 05:00:00,"[man, shoot, black, protester, shotgun, minnea..."
6,1267218415736872960,2020-05-31 16:16:28,195271137,larryelder,796374,98,Sage from South Central; Larry Elder Show; Sal...,"Black lives matter. Black businesses, not so m...",435,https://t.co/aZPPHBLtSZ,"[#BlackLivesMatter, #BlackBusinessMatters, #Ge...","[https://t.co/nl7Hzo919z, https://t.co/wtjSmVE...",en,2020-05-31 16:00:00,"[black, live, matter., black, businesses,, muc..."
7,1267001484387840000,2020-05-31 01:54:28,1538581122,Zionocracy,19074,290,Founder of #PalestineRemainsForEver & #Palesti...,Dear America\nWhat Happened to #GeorgeFloyd in...,2568,https://t.co/T9y8UT4btT,"[#GeorgeFloyd, #BlackLivesMatter, #Palestinian...",[https://t.co/a3Z50ffn1c],en,2020-05-31 01:00:00,"[dear, america, happened, #georgefloyd, minnea..."
8,1265744876823609344,2020-05-27 14:41:09,1187684601315115008,selenesrat,532,434,Ｉｍｍａ Ｓｅｌｅｎａ Ｓｔａｎ ｉｎ ｔｈｅ ｐｕｒｅｓｔ ｆｏｒｍ💖\n\n𝓘 𝓪𝓵𝓼𝓸...,Please call either (612) 348-5550 or (844) 278...,135,https://t.co/t2pBxgo176,"[#1087), #7162), #GeorgeFloyd, #BlackLivesMatter]",[https://t.co/0VFZORnGcf],en,2020-05-27 14:00:00,"[call, either, (612), 348-5550, (844), 278-283..."
9,1266286100743520256,2020-05-29 02:31:47,2835451658,MrAndyNgo,462908,665,"Editor-at-large - @TPostMillennial. ""Unmasked""...",Person at Minneapolis BLM race riot makes it c...,3140,https://t.co/XL6Dil4vX1,"[#BlackLivesMatter, #GeorgeFloyd, #antifa]",[https://t.co/5JNhq0qags],en,2020-05-29 02:00:00,"[person, minneapolis, blm, race, riot, make, c..."


In [246]:
print(len(filtered_retweet))
#filtered_retweet.head()

36


In [248]:
filtered_retweet

Unnamed: 0,RT_id,created_at,user_id,user_name,followers_count,following_count,user_description,text,retweet_count,t_co,tags,urls,lang,created_at_h,tokens
0,1271228625933619200,2020-06-11 17:51:37,2835451658,MrAndyNgo,457035,667,"Editor-at-large - @TPostMillennial. ""Unmasked""...",This is what rioters did to the Minneapolis Sh...,21050,https://t.co/GTAGV805i6,[#BlackLivesMatter],[https://t.co/GDYvNRmYi2],en,2020-06-11 17:00:00,"[rioter, minneapolis, sheraton, take, riots., ..."
1,1277286903641866240,2020-06-28 11:05:03,98956941,afbranco,44179,2197,Nationally syndicated Political Cartoonist (Cr...,A.F. Branco Cartoon - Alpha News: Welcome to M...,207,https://t.co/Web3mRCuEP,"[#DefundThePolice, #AntifaTerrorist, #Democrat...",[https://t.co/zqUYrv0CPE],en,2020-06-28 11:00:00,"[a.f., branco, cartoon, alpha, news:, welcome,..."
2,1266515656037588992,2020-05-29 17:43:57,1258993079287197696,NedWhat,825,203,#GLROfficial\nGreat Lakes Region \nContent of ...,Support from the #Amish community in Minneapol...,100842,https://t.co/IBcGGubFAo,"[#Amish, #AllLivesMatter, #BlackLivesMatter, #...",[https://t.co/qEkVkmOhGA],en,2020-05-29 17:00:00,"[support, #amish, community, minneapolis, #all..."
3,1266752338221506560,2020-05-30 09:24:27,175065805,dviyer,14843,6822,"South Asian American activist, lawyer. Senior ...",The story of the #GandhiMahal restaurant in Mi...,198,https://t.co/cHGBMwtDRh,"[#GandhiMahal, #solidarity, #BlackLivesMatter]",[https://t.co/sseVxOiFCe],en,2020-05-30 09:00:00,"[story, #gandhimahal, restaurant, minneapolis,..."
4,1266782245869805568,2020-05-30 11:23:17,743998433455808512,JoeyMillsXXX,273650,474,14x award winning pornstar— Your mothers worst...,All proceeds for the next 48 hours that I rece...,406,https://t.co/c0EzZoJYvE,"[#BlackLivesMatter, #JusticeForGeorgeFlyod, #J...","[https://t.co/uqQog24F7V, https://t.co/4jXkhvd...",en,2020-05-30 11:00:00,"[proceeds, next, 48, hour, receive, onlyfans, ..."
5,1267058623911428096,2020-05-31 05:41:31,1219534043210973184,ilcanhavhav,1570,276,fırat aydınus fan club,A man shoots a black protester with a shotgun ...,1783,https://t.co/7qU01ryYV0,[#BlackLivesMatter],[],en,2020-05-31 05:00:00,"[man, shoot, black, protester, shotgun, minnea..."
6,1267218415736872960,2020-05-31 16:16:28,195271137,larryelder,796374,98,Sage from South Central; Larry Elder Show; Sal...,"Black lives matter. Black businesses, not so m...",435,https://t.co/aZPPHBLtSZ,"[#BlackLivesMatter, #BlackBusinessMatters, #Ge...","[https://t.co/nl7Hzo919z, https://t.co/wtjSmVE...",en,2020-05-31 16:00:00,"[black, live, matter., black, businesses,, muc..."
7,1267001484387840000,2020-05-31 01:54:28,1538581122,Zionocracy,19074,290,Founder of #PalestineRemainsForEver & #Palesti...,Dear America\nWhat Happened to #GeorgeFloyd in...,2568,https://t.co/T9y8UT4btT,"[#GeorgeFloyd, #BlackLivesMatter, #Palestinian...",[https://t.co/a3Z50ffn1c],en,2020-05-31 01:00:00,"[dear, america, happened, #georgefloyd, minnea..."
8,1265744876823609344,2020-05-27 14:41:09,1187684601315115008,selenesrat,532,434,Ｉｍｍａ Ｓｅｌｅｎａ Ｓｔａｎ ｉｎ ｔｈｅ ｐｕｒｅｓｔ ｆｏｒｍ💖\n\n𝓘 𝓪𝓵𝓼𝓸...,Please call either (612) 348-5550 or (844) 278...,135,https://t.co/t2pBxgo176,"[#1087), #7162), #GeorgeFloyd, #BlackLivesMatter]",[https://t.co/0VFZORnGcf],en,2020-05-27 14:00:00,"[call, either, (612), 348-5550, (844), 278-283..."
9,1266286100743520256,2020-05-29 02:31:47,2835451658,MrAndyNgo,462908,665,"Editor-at-large - @TPostMillennial. ""Unmasked""...",Person at Minneapolis BLM race riot makes it c...,3140,https://t.co/XL6Dil4vX1,"[#BlackLivesMatter, #GeorgeFloyd, #antifa]",[https://t.co/5JNhq0qags],en,2020-05-29 02:00:00,"[person, minneapolis, blm, race, riot, make, c..."


In [94]:
#filtered_retweet2 = tw_data_files_to_df_json_filter(files_retweet, 'sometown')

In [96]:
#print(len(filtered_retweet2))
#filtered_retweet2.head()

0


Unnamed: 0,RT_id,created_at,user_id,user_name,followers_count,following_count,user_description,text,retweet_count,t_co,tags,urls,lang,created_at_h,tokens,token_counter


In [168]:
filtered_retweet3 = tw_data_files_to_df_json_filter(files_retweet, ['L.A.', 'Los Angeles','LA'])
print(len(filtered_retweet3))

5


In [169]:
filtered_retweet3

Unnamed: 0,RT_id,created_at,user_id,user_name,followers_count,following_count,user_description,text,retweet_count,t_co,tags,urls,lang,created_at_h,tokens,token_counter
0,1267294687657816064,2020-05-31 21:19:33,46764346,MisterPreda,245059,522,a little bit of everything | producer • creato...,NUMBERS TO CALL INCASE OF UNLAWFUL ARRESTS AT ...,2505,https://t.co/SqKVx7ezfP,[#BlackLivesMatter],[],en,2020-05-31 21:00:00,"[numbers, call, incase, unlawful, arrests, pro...",
1,1279931800828227584,2020-07-05 18:14:55,38354090,ScottHech,86230,4474,Public defender. Imagining new ways to amplify...,609. LA District Attorney has declined to pros...,100,https://t.co/3SKkYWRbAl,[],[],en,2020-07-05 18:00:00,"[609., la, district, attorney, decline, prosec...","{'609.': 1, 'la': 1, 'district': 1, 'attorney'..."
2,1279850015477776384,2020-07-05 12:49:56,23402579,CristineDeBerry,748,526,Mom. Immigrant. Wife. Lawyer. Chief of Staff @...,609 cases of police killing civilians and not ...,222,https://t.co/Y1oPM8mu2I,"[#BlackLivesMatter, #LADA2020, #PoliceBrutality]",[https://t.co/gP6H58FV1J],en,2020-07-05 12:00:00,"[609, case, police, kill, civilian, 1, warrant...",
3,1271797825412702208,2020-06-13 07:33:25,91597950,zoeyy227,13823,1173,"part-time beauty enthusiast, part-time nerd, p...","She also owns La Face products, if anyone know...",2765,https://t.co/cSPBdXCDV7,[],[],en,2020-06-13 07:00:00,"[also, la, face, products,, anyone, know, them...","{'also': 1, 'la': 1, 'face': 1, 'products,': 1..."
4,1268052021069807616,2020-06-02 23:28:55,373157754,YourAnonCentral,6459014,725,We support the weak against the powerful. #Bla...,LA Sheriff Villanueva admitting that curfews a...,6705,https://t.co/6gmVPHEQXs,"[#ICantBreathe, #GeorgeFloyd, #BlackLivesMatter]",[https://t.co/GrFs301mtU],en,2020-06-02 23:00:00,"[la, sheriff, villanueva, admit, curfew, use, ...","{'la': 1, 'sheriff': 1, 'villanueva': 1, 'admi..."


In [180]:
files_original = keep_recent_files(glob.glob(data_path + "data_cumulative/original/*"),
        base_timestamp = process_datatime_d, days=7)

In [230]:
def tw_data_files_to_df_json_filter(files, filter_word, lines=True, float_dtype=None, verbose=False):
    '''append and concat filtered data into a pandas.DataFrame'''
    if type(filter_word) != list: raise ValueError("filter_word must be a list")

    df = []
    for file in files:
        if verbose: print('loading ' + file)  
        if file==files[0]:
            columns = get_columns_json(file)
            df_null = pd.DataFrame(columns=columns)
            
        df_file = pd.read_json(file, orient='records', lines=lines)
        if (len(filter_word) >1): idx = mark_tokens_contain_keywords(df_file, filter_word)
        else: idx = mark_tokens_contain_keyword(df_file, filter_word[0])
        df_file_filtered = df_file[idx]
        print(df_file_filtered)
        if len(df_file_filtered)>0:
            df.append(df_file_filtered)
    
    if len(df)==0: return df_null
    df = pd.concat(df, ignore_index=True)
    if float_dtype is None: return df
    return convert_floats(df, float_dtype)

In [229]:

type(['abc']) == list
['abc'][0]

'abc'

In [191]:
len(files_original)

162

In [213]:
tmp_data = pd.read_json(files_original[10],
         lines=True, orient='records')

tmp_idx = mark_tokens_contain_keyword(tmp_data, 'Minneapolis')

#tmp_filtered_original = tw_data_files_to_df_json_filter(files_original[:30], 'Minneapolis')

In [None]:
sum(tmp_idx)
tmp_filtered_original = tmp_data[tmp_idx]


In [231]:
tmp_filtered_original = tw_data_files_to_df_json_filter(files_original[:5], ['Minneapolis'])

                       id          created_at  is_retweet  \
6291  1281278415501512704 2020-07-09 11:25:53       False   
8052  1281274310817087488 2020-07-09 11:09:35        True   
8953  1281273077410566144 2020-07-09 11:04:41        True   

                    RT_id  RT_retweet_count              user_id  \
6291                                      0  1182301315738882048   
8052  1277269898494914560            258299           2844074260   
8953  1277269898494914560            258299           4489866973   

         user_name  followers_count  following_count  \
6291   _gHOST3301_             2938                8   
8052  GracieJazmin             1399              944   
8953       ha_jpeg              134              316   

                                                   text  \
6291  Minneapolis Police Department Officer Thomas L...   
8052                                                      
8953                                                      

                    

In [282]:
#tmp_filtered_original

In [233]:
len(tmp_filtered_original)

18

In [234]:
tmp_filtered_original.head()

Unnamed: 0,id,created_at,is_retweet,RT_id,RT_retweet_count,user_id,user_name,followers_count,following_count,text,quoted_text,RT_text,t_co,tags,urls,lang,created_at_h,tokens
0,1281278415501512704,2020-07-09 11:25:53,False,,0,1182301315738882048,_gHOST3301_,2938,8,Minneapolis Police Department Officer Thomas L...,,,[https://t.co/gWH1SsAec5],"[#GeorgeFloyd, #GhostSec, #Anonymous, #GhostsI...",[],en,2020-07-09 11:00:00,"[minneapolis, police, department, officer, tho..."
1,1281274310817087488,2020-07-09 11:09:35,True,1.2772698984949143e+18,258299,2844074260,GracieJazmin,1399,944,,Don’t let the #BlackLivesMatter protests disa...,,[https://t.co/TqSMix6Mpx],[#BlackLivesMatter],[],en,2020-07-09 11:00:00,"[don’t, let, protest, disappear, tls., minneap..."
2,1281273077410566144,2020-07-09 11:04:41,True,1.2772698984949143e+18,258299,4489866973,ha_jpeg,134,316,,Don’t let the #BlackLivesMatter protests disa...,,[https://t.co/TqSMix6Mpx],[#BlackLivesMatter],[],en,2020-07-09 11:00:00,"[don’t, let, protest, disappear, tls., minneap..."
3,1281978012880171008,2020-07-11 09:45:50,False,,0,2439158484,FairTax4America,16514,14818,"If you believe #BlackLivesMatter, support more...",The Minneapolis police officers' claims come a...,,"[https://t.co/jQM34Nne6G, https://t.co/xMmEJWQ...","[#BlackLivesMatter,, #GOP, #KAG2020, #MAGA, #M...",[],en,2020-07-11 09:00:00,"[believe, #blacklivesmatter,, support, funding..."
4,1281980549599551488,2020-07-11 09:55:55,True,1.281778843611693e+18,3,284198929,BernadeiaJ,3781,437,,150 Minneapolis officers engage in a sick-out ...,Now that’s the headline Javier!,[],[#BlackLivesMatter],[],en,2020-07-11 09:00:00,"[150, minneapolis, officer, engage, sick-out, ..."


In [235]:
tmp_filtered_original.text[0]

'Minneapolis Police Department Officer Thomas Lane, Implicated In The Killing Of George Floyd , Attorney, Seeks Dismissal Of Charges In #GeorgeFloyd Killing. Has Filed A Motion To Dismiss. #GhostSec #Anonymous #GhostsInYourWires #protests #BlackLivesMatter #BlackLivesMattters https://t.co/gWH1SsAec5'

In [236]:
tmp_filtered_original.quoted_text[0]

''

In [237]:
'minneapolis' in tmp_filtered_original.tokens[0]

True

In [249]:
def mark_var_in_valuelist(df, var, valuelist):
    # returns an index indicating whether variable var is in valuelist
    return df[var].apply(lambda x: x in valuelist)

In [273]:
tmp1 =  pd.read_json(files_original[], orient='records', lines=True)

In [274]:
tmp2 = mark_var_in_valuelist(tmp1, 'RT_id', filtered_retweet.RT_id.astype(str))

In [275]:
sum(tmp2)


0

In [277]:
def tw_data_files_to_df_json_match_id(files, varname_id, list_ids,
                                      lines=True, float_dtype=None, verbose=False):
    '''append and concat filtered data into a pandas.DataFrame'''
    if type(list_ids) != list: raise ValueError("list_ids must be a list")

    df = []
    for file in files:
        if verbose: print('loading ' + file)  
        if file==files[0]:
            columns = get_columns_json(file)
            df_null = pd.DataFrame(columns=columns)
            
        df_file = pd.read_json(file, orient='records', lines=lines)
        idx = mark_var_in_valuelist(df_file, varname_id, list_ids)
        df_file_filtered = df_file[idx]
        if len(df_file_filtered)>0:
            df.append(df_file_filtered)
    
    if len(df)==0: return df_null
    df = pd.concat(df, ignore_index=True)
    if float_dtype is None: return df
    return convert_floats(df, float_dtype)

    
    

In [280]:
tmp_matched_original = tw_data_files_to_df_json_match_id(files_original[:20], 
                                  'RT_id', list(filtered_retweet.RT_id.astype(str)))

In [460]:
#tmp_matched_original

In [431]:
cities = ['Minneapolis','LosAngeles','Denver']
city_filterwords = {'Minneapolis': ['Minneapolis', '#Minneapolis','mlps', ['St.', 'Paul']],
                    'LosAngeles':['LosAngeles','LA', 'L.A.', '#LA', ['Los', 'Angeles']],
                    'Denver': ['Denver', '#Denver']}

In [346]:
type(city_filterwords['Minneapolis'][0])
type(city_filterwords['LosAngeles'][3])
idx ={}
idx[str('a')] = [True,False,True]
idx[str(['a','b'])] = [False,False,True]

pd.DataFrame(idx).agg(max, axis=1).astype(bool)

0     True
1    False
2     True
dtype: bool

In [357]:
def mark_var_contain_filterwords(df, varname, filterwords):
    if type(filterwords) != list: raise ValueError("filterwords must be a list")
    idx = {}
    for word in filterwords:
        if type(word)==str:
            idx[str(word)] = df[varname].apply(lambda x: word.lower() in x)
        elif type(word)==list:
            # assess whether all components of 'word' are jointly present 
            loc_idx = [df[varname].apply(lambda x: w.lower() in x) for w in word]
            idx[str(word)] = pd.DataFrame(loc_idx).agg(min).astype(bool)
        else: raise ValueError('each item in filterwords must be str or list')
        # assess whether any component of 'filterwords' are present 
    return pd.DataFrame(idx).agg(max, axis=1).astype(bool)    

In [354]:
idx1 = mark_var_contain_filterwords(cum_retweet, 'tokens', city_filterwords['Minneapolis'])
print(sum(idx1))

idx2 = mark_var_contain_filterwords(cum_retweet, 'tokens', city_filterwords['LosAngeles'])
print(sum(idx2))

idx3 = mark_var_contain_filterwords(cum_retweet, 'tokens', city_filterwords['Denver'])
print(sum(idx3))


48
43
5


In [365]:
def retweet_files_by_city_json(files, cities, city_filterwords, data_path,
                               lines=True, float_dtype='float16', verbose=False):
    city_df = {}

    for file in files:
        if verbose: print('loading ' + file)  
        if file==files[0]:
            columns = get_columns_json(file)
            df_null = pd.DataFrame(columns=columns)
            for city in cities:
                city_df[city] = []
        
        df_file = pd.read_json(file, orient='records', lines=lines)
        df_vars_convert_to_str(df_file, ['RT_id','user_id','created_at','created_at_h'])
        convert_floats(df_file, float_dtype)
        
        for city in cities:
            filter_word = city_filterwords[city]    
            idx = mark_var_contain_filterwords(df_file, 'tokens', filter_word)
            if sum(idx)>0: city_df[city].append(df_file[idx])
    
    for city in cities:
        if len(city_df[city])==0: city_data = df_null
        else: city_data = pd.concat(city_df[city], ignore_index=True)
        filename = 'data_cumulative/city_date/' + city + '/retweet/2020_all_retweets' + '.json'
        city_data.to_json(data_path + filename, 
                          orient='records', lines=lines)
        print('updated: ', filename)


In [366]:
retweet_files_by_city_json(files_retweet, cities, city_filterwords, data_path)

updated:  data_cumulative/city_date/Minneapolis/2020_all_retweets.json
updated:  data_cumulative/city_date/LosAngeles/2020_all_retweets.json
updated:  data_cumulative/city_date/Denver/2020_all_retweets.json


In [459]:
city = 'Minneapolis'
filename = 'data_cumulative/city_date/' + city + '/retweet/2020_all_retweets' + '.json'
RT_id = pd.read_json(data_path + filename, orient='records', lines=True).RT_id.astype(str)


In [434]:
def get_unique_dates(df, varname):
    tmp = pd.to_datetime(df[varname]).dt.floor('d')
    dates = tmp.unique()
    dates_str = [str(date)[:10] for date in dates]
    return dates, dates_str

def filter_df_by_date(df, varname, date, var_as_string=True):
    tmp_df = df
    varname_d = varname + '_d'
    tmp_df[varname_d] = pd.to_datetime(tmp_df[varname]).dt.floor('d')
    filtered_df = tmp_df[tmp_df[varname_d] == pd.to_datetime(date)].drop(columns = [varname_d])
    if var_as_string: filtered_df[varname] = filtered_df[varname].astype(str)
    return filtered_df

def append_to_json(filename, df, lines=True):
    df0 = pd.read_json(filename, orient='records', lines=lines)
    return df0.append(df)


def original_files_by_city_date_json(files, cities, city_filterwords, data_path,
                               lines=True, float_dtype='float16', verbose=False):
    city_df = {}
    city_RT_ids = {}
    
    for city in cities:
        # retrieve relevant RT_id to match 
        filename = 'data_cumulative/city_date/' + city + '/retweet/2020_all_retweets' + '.json'
        RT_id = pd.read_json(data_path + filename, 
                             orient='records', lines=True).RT_id.astype(str)
        city_RT_ids[city] = list(RT_id)
    
    for file in files:
        if verbose: print('loading ' + file)  
        if file==files[0]:
            columns = get_columns_json(file)
            df_null = pd.DataFrame(columns=columns)
            for city in cities:
                city_df[city] = []
        
        df_file = pd.read_json(file, orient='records', lines=lines)
        df_vars_convert_to_str(df_file, ['id','RT_id','created_at','created_at_h'])
        convert_floats(df_file, float_dtype)
        
        for city in cities:
            if verbose: print('processing data for ' + city)  
            filter_word = city_filterwords[city]
            # idx1: 'tokens' containing filter_word
            idx1 = mark_var_contain_filterwords(df_file, 'tokens', filter_word)
            # idx2: relevant retweet's that are matched  
            idx2 = mark_var_in_valuelist(df_file, 'RT_id', city_RT_ids[city])
            # idx: either idx1 or idx2 being True
            idx = pd.DataFrame(data={'idx1':idx1, 'idx2': idx2}).agg(max, axis=1)
            print(sum(idx1),sum(idx2), sum(idx))
            if sum(idx)>0: city_df[city].append(df_file[idx])
    
    for city in cities:
        if len(city_df[city])==0: city_data = df_null
        else: city_data = pd.concat(city_df[city], ignore_index=True)
        dates, dates_str = get_unique_dates(city_data,'created_at_h')
        for date in dates_str:
            if verbose: print('processing date of ' + date)  
            df_date = filter_df_by_date(city_data, 'created_at_h', date)
            filename = 'data_cumulative/city_date/' + city + '/original/records_'+ date + '.json'
            new_file = glob.glob(data_path + filename)==[]
            if new_file:
                df_date.to_json(data_path + filename, 
                              orient='records', lines=lines)
                print('created: ', filename)
            else:
                df_date = append_to_json(data_path + filename, df_date)
                df_date.to_json(data_path + filename, 
                              orient='records', lines=lines)
                print('appended: ', filename)

In [494]:
original_files_by_city_date_json(files_original[:3], cities, city_filterwords, data_path, verbose=True)



loading /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/original/created_at_2020-07-09_11:00:00.json
processing data for Minneapolis
3 11 14
processing data for LosAngeles
35 0 35
processing data for Denver
0 0 0
loading /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/original/created_at_2020-07-11_09:00:00.json
processing data for Minneapolis
3 0 3
processing data for LosAngeles
4 0 4
processing data for Denver
0 0 0
loading /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/original/created_at_2020-07-06_09:00:00.json
processing data for Minneapolis
7 1 8
processing data for LosAngeles
192 3 195
processing data for Denver
1 0 1
processing date of 2020-07-09
created:  data_cumulative/city_date/Minneapolis/original/records_2020-07-09.json
processing date of 2020-07-11
created:  data_cumulative/city_date/Minneapolis/original/records_2020-07-11.json
processing date of 2020-07-06
created:  data_cumulative/city_date/Mi

In [492]:
files_words = keep_recent_files(glob.glob(data_path + "data_cumulative/words/*"),
        base_timestamp = process_datatime_d, days=7)

In [493]:
files_words[:3]

['/Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/words/created_at_2020-07-09_11:00:00.json',
 '/Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/words/created_at_2020-07-11_09:00:00.json',
 '/Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/words/created_at_2020-07-06_09:00:00.json']

In [517]:
def keep_recent_files(files, base_timestamp, file_type= '.json', days = 14,
                      prefix = 'created_at_'):
    timestamps = [pd.Timestamp(file.split(prefix,1)[1]
                               .replace(file_type,'').replace('_',' ')) for file in files ]
    keep_idx1 = [(base_timestamp - timestamp) <= pd.Timedelta(days, unit='d') for timestamp in timestamps]
    return(list(itertools.compress(files,keep_idx1)))


In [509]:
tmp2 = pd.read_json(files_words[0], orient='records', lines=True)
tmp2 

Unnamed: 0,id,created_at_h,token_counter
0,1281271899784228864,2020-07-09 11:00:00,"{'peggy': 1, 'shepard,': 1, 'shed': 1, 'light'..."
1,1281271900015071232,2020-07-09 11:00:00,{}
2,1281271899985793024,2020-07-09 11:00:00,{}
3,1281271900291776512,2020-07-09 11:00:00,{}
4,1281271900593938432,2020-07-09 11:00:00,"{'yet': 1, 'countless': 1, 'black': 1, 'baby':..."
...,...,...,...
21630,1281285912421261312,2020-07-09 11:00:00,
21631,1281285913792913408,2020-07-09 11:00:00,
21632,1281285915944640512,2020-07-09 11:00:00,
21633,1281285916091461632,2020-07-09 11:00:00,


In [520]:

def words_files_by_city_date_json(files_words, cities, data_path,
                                  process_datetime, process_days = 14,
                                  lines=True, verbose=False):
    city_df = {}
    city_ids = {}
    for city in cities:
        files_city_original = keep_recent_files(
            glob.glob(data_path + "data_cumulative/city_date/" + city  + "/original/*"),
            prefix = 'records_', base_timestamp = process_datatime, days=7)
        tmp_ids = []
        for file in files_city_original:
            # retrieve relevant id to match
            if verbose: print('reading ids from ' + file)
            ids = pd.read_json(file, orient='records', lines=True).id.astype(str)
            tmp_ids.append(ids)
        city_ids[city] = list(pd.concat(tmp_ids, ignore_index=True))
    
    for file in files_words:
        if verbose: print('loading ' + file)  
        if file==files_words[0]:
            columns = get_columns_json(file)
            df_null = pd.DataFrame(columns=columns)
            for city in cities:
                city_df[city] = []
        
        df_file = pd.read_json(file, orient='records', lines=lines)
        df_vars_convert_to_str(df_file, ['id','created_at_h'])
        
        for city in cities:
            if verbose: print('processing data for ' + city)  
            # idx: relevant original tweet's that are matched  
            idx = mark_var_in_valuelist(df_file, 'id', city_ids[city])
            print(sum(idx))
            if sum(idx)>0: city_df[city].append(df_file[idx])
    
    for city in cities:
        if len(city_df[city])==0: city_data = df_null
        else: city_data = pd.concat(city_df[city], ignore_index=True)
        dates, dates_str = get_unique_dates(city_data, 'created_at_h')
        for date in dates_str:
            if verbose: print('processing date of ' + date)  
            df_date = filter_df_by_date(city_data, 'created_at_h', date)
            filename = 'data_cumulative/city_date/' + city + '/words/records_'+ date + '.json'
            new_file = glob.glob(data_path + filename)==[]
            if new_file:
                df_date.to_json(data_path + filename, 
                              orient='records', lines=lines)
                print('created: ', filename)
            else:
                df_date = append_to_json(data_path + filename, df_date)
                df_date.to_json(data_path + filename, 
                              orient='records', lines=lines)
                print('appended: ', filename)

In [521]:
words_files_by_city_date_json(files_words[:5], cities, data_path,
                                  process_datetime, process_days = 14,
                                  verbose=True)

reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-11.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-06.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-09.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-11.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-06.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-09.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Denver/original/records_2020-07-0

In [542]:
files_sentiments = keep_recent_files(glob.glob(data_path + "data_cumulative/sentiments/*"),
        base_timestamp = process_datetime,  file_type= '.csv', days=7)

tmp2 = pd.read_csv(files_sentiments[0])
tmp2 


Unnamed: 0,id,created_at_h,neg,neu,pos,compound
0,1278538162915037184,2020-07-01 21:00:00,0.000,0.909,0.091,0.0258
1,1278538167474442240,2020-07-01 21:00:00,0.347,0.583,0.069,-0.8750
2,1278538169319936000,2020-07-01 21:00:00,0.527,0.473,0.000,-0.7003
3,1278538170456473600,2020-07-01 21:00:00,0.000,0.901,0.099,0.0258
4,1278538183693795328,2020-07-01 21:00:00,0.000,1.000,0.000,0.0000
...,...,...,...,...,...,...
214321,1278815756894035968,2020-07-02 16:00:00,0.389,0.491,0.121,-0.7269
214322,1278815757963591680,2020-07-02 16:00:00,,,,
214323,1278815761662988288,2020-07-02 16:00:00,,,,
214324,1278815761855844352,2020-07-02 16:00:00,,,,


In [545]:
tmp2.id

0         1278538162915037184
1         1278538167474442240
2         1278538169319936000
3         1278538170456473600
4         1278538183693795328
                 ...         
214321    1278815756894035968
214322    1278815757963591680
214323    1278815761662988288
214324    1278815761855844352
214325    1278815762933784576
Name: id, Length: 214326, dtype: int64

In [578]:

def files_id_matched_by_city_date_json(
    files, cities, data_path, folder, process_datetime, process_days = 14,
    file_type ='.json', float_dtype='float16', lines=True, verbose=False):

    '''
    Looks for recent files in /city_date/[city]/original/*, extract relevant ids,
    generate data matched with those ids by city, and create data files  
    '''
    if file_type not in ['.json', '.csv'] :
        raise ValueError('file_type must be either json or csv')
            
    city_df = {}
    city_ids = {}
    for city in cities:
        files_city_original = keep_recent_files(
            glob.glob(data_path + "data_cumulative/city_date/" + city  + "/original/*"),
            prefix = 'records_', file_type= '.json', 
            base_timestamp = process_datetime, days=7)
        tmp_ids = []
        for file in files_city_original:
            # retrieve relevant id to match
            if verbose: print('reading ids from ' + file)
            ids = pd.read_json(file, orient='records', lines=True).id.astype(str)
            tmp_ids.append(ids)
        city_ids[city] = list(pd.concat(tmp_ids, ignore_index=True))
    
    for file in files:
        if verbose: print('loading ' + file)  
        if file==files[0]:
            columns = get_columns_json(file) if file_type =='.json' else get_columns_csv(file)
            df_null = pd.DataFrame(columns=columns)
            for city in cities:
                city_df[city] = []
        
        if file_type =='.json': 
            df_file = pd.read_json(file, orient='records', lines=lines)
        elif file_type =='.csv': 
            df_file = pd.read_csv(file)

        df_vars_convert_to_str(df_file, ['id','created_at_h'])
        convert_floats(df_file, float_dtype)

        for city in cities:
            if verbose: print('processing data for ' + city)  
            # idx: relevant original tweet's that are matched  
            idx = mark_var_in_valuelist(df_file, 'id', city_ids[city])
            print(sum(idx))
            if sum(idx)>0: city_df[city].append(df_file[idx])
    
    for city in cities:
        if len(city_df[city])==0: city_data = df_null
        else: city_data = pd.concat(city_df[city], ignore_index=True)
        dates, dates_str = get_unique_dates(city_data, 'created_at_h')
        for date in dates_str:
            if verbose: print('processing date of ' + date)  
            df_date = filter_df_by_date(city_data, 'created_at_h', date)
            filename = 'data_cumulative/city_date/' + city + '/' + folder + '/records_'+ date + file_type
            new_file = glob.glob(data_path + filename)==[]
            if file_type =='.json': 
                if not new_file:
                    df_date = append_to_json(data_path + filename, df_date)
                df_date.to_json(data_path + filename, 
                              orient='records', lines=lines)
            if file_type =='.csv':
                mode = 'a' if new_file else 'w'
                df_date.to_csv(data_path + filename, index=False, mode=mode)
            if new_file: print('created: ', filename)
            else: print('appended: ', filename)

In [579]:
files_id_matched_by_city_date_json(
    files_sentiments[:10], cities, data_path, 'sentiments', 
    process_datetime, process_days = 14,
    file_type='.csv', verbose=True)

reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-11.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-06.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-09.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-11.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-06.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-09.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Denver/original/records_2020-07-0

In [581]:
files_emotions = keep_recent_files(glob.glob(data_path + "data_cumulative/emotions/*"),
        base_timestamp = process_datetime,  file_type= '.csv', days=7)

tmp2 = pd.read_csv(files_emotions[0])
tmp2 


  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,created_at_h,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy
0,1278538162915037184,2020-07-01 21:00:00,False,True,True,True,True,True,True,False,True,True
1,1278538167474442240,2020-07-01 21:00:00,False,False,False,False,False,False,True,False,False,False
2,1278538169319936000,2020-07-01 21:00:00,False,False,True,False,False,True,True,False,False,False
3,1278538170456473600,2020-07-01 21:00:00,False,False,True,True,True,True,True,False,False,False
4,1278538183693795328,2020-07-01 21:00:00,True,True,True,False,True,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...
214321,1278815756894035968,2020-07-02 16:00:00,True,True,False,False,False,False,True,False,False,False
214322,1278815757963591680,2020-07-02 16:00:00,,,,,,,,,,
214323,1278815761662988288,2020-07-02 16:00:00,,,,,,,,,,
214324,1278815761855844352,2020-07-02 16:00:00,,,,,,,,,,


In [582]:
files_id_matched_by_city_date_json(
    files_emotions[:10], cities, data_path, 'emotions', 
    process_datetime, process_days = 14,
    file_type='.csv', verbose=True)

reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-11.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-06.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-09.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-11.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-06.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-09.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Denver/original/records_2020-07-0

  if (await self.run_code(code, result,  async_=asy)):


processing data for Minneapolis
0
processing data for LosAngeles
0
processing data for Denver
0
loading /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/emotions/created_at_2020-07-12_09:41:01.csv
processing data for Minneapolis
0
processing data for LosAngeles
0
processing data for Denver
0
loading /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/emotions/created_at_2020-07-07_15:02:42.csv
processing data for Minneapolis
0
processing data for LosAngeles
0
processing data for Denver
0
loading /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/emotions/created_at_2020-07-11_10:19:49.csv
processing data for Minneapolis
4
processing data for LosAngeles
5
processing data for Denver
0
loading /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/emotions/created_at_2020-07-08_13:24:14.csv
processing data for Minneapolis
0
processing data for LosAngeles
0
processing data for Denver
0
loading /Users/kotam

In [583]:
files_id_matched_by_city_date_json(
    files_words[:3], cities, data_path, 'words', 
    process_datetime, process_days = 14,
    file_type='.json', verbose=True)


reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-11.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-06.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Minneapolis/original/records_2020-07-09.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-11.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-06.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/LosAngeles/original/records_2020-07-09.json
reading ids from /Users/kotaminegishi/big_data_training/python/dash_demo1/data_cumulative/city_date/Denver/original/records_2020-07-0

In [611]:
glob.glob(data_dest + "data_cumulative/city_date/Minneapolis/retweet/*")[:3]
cum_data_path = data_dest + "data_cumulative/city_date/" + 'Denver'
tmp4= keep_recent_files(glob.glob(cum_data_path + "/sentiments/*"),
                            base_timestamp=process_datatime, prefix='records_',
                            file_type = '.csv', days=14) 

len(tmp4)

0

In [None]:
def tw_data_files_to_df_csv(files):
    '''append and concat data files into a pandas.DataFrame'''
    df = []
    [ df.append(pd.read_csv(file)) for file in files ]
    df = pd.concat(df, ignore_index=True)
    return df


def tw_data_files_to_df_json(files, lines=False):
    '''append and concat data files into a pandas.DataFrame'''
    df = []
    [ df.append(pd.read_json(file, orient='records', lines=lines)) for file in files ]
    df = pd.concat(df, ignore_index=True)
    return df

In [590]:
from summarizing_helpers import *

In [673]:
def get_columns_json(file):
    chunk1 = pd.read_json(file, chunksize=1, orient='records', lines=True)
    for d in chunk1:
        data1 = d.iloc[0]
        break
    return list(data1.keys())

def get_columns_csv(file):
    chunk1 = pd.read_csv(file, chunksize=1)
    return list(chunk1.read(1).keys())


def load_null_df(data_path):
    
    null_stat_sentiments = pd.DataFrame(columns = get_columns_csv(
                     glob(data_path + 'data_cumulative/sentiments/*')[0]))
    
    null_stat_emotions = pd.DataFrame(columns = get_columns_csv(
                     glob(data_path + 'data_cumulative/emotions/*')[0]))
    
    null_cum_words = pd.DataFrame(columns = get_columns_json(
                     glob(data_path + 'data_cumulative/words/*')[0]))
    
    null_cum_original = pd.DataFrame(columns = get_columns_json(
                     glob(data_path + 'data_cumulative/original/*')[0]))
    
    null_cum_retweet = pd.DataFrame(columns = get_columns_json(
                     glob(data_path + 'data_cumulative/retweet/*')[0]))
    
    return null_stat_sentiments, null_stat_emotions, null_cum_words, null_cum_original, null_cum_retweet




In [674]:
null_stat_sentiments, null_stat_emotions, null_cum_words, null_cum_original, null_cum_retweet = load_null_df(data_path)

In [693]:
def fix_datetime(df, timevar='created_at_h'):
    df[timevar] = pd.to_datetime(df[timevar])

def fix_token_counter(df):
    df.token_counter = df.token_counter.apply(lambda x: Counter(x))  

def fix_RT_id(df):
    df.RT_id = df.RT_id.astype(str) 

In [623]:
#get_columns_json(glob.glob(data_path + 'data_cumulative/original/*'))
from glob import glob 
glob(data_path + 'data_cumulative/original/*')[0]
glob(data_path + 'data_cumulative/retweet/*')[0]
glob(data_path + 'data_cumulative/words/*')[0]

pd.DataFrame(columns = get_columns_csv(glob(data_path + 'data_cumulative/sentiments/*')[0]))

Unnamed: 0,id,created_at_h,neg,neu,pos,compound


In [694]:
def get_city_data(city, data_path, base_timestamp):

        # load recent cumulative data
        print('  Loading cumulative data: sentiments and emotions...')
        cum_data_path = data_path + "data_cumulative/city_date/" + city
        curr_data_path = data_path + "data_current/city/" + city

        files_sentiments = keep_recent_files(glob(cum_data_path + "/sentiments/*"),
                            base_timestamp=base_timestamp, prefix='records_',
                            file_type = '.csv', days=14) 
        if len(files_sentiments)>0: 
            cum_sentiments = tw_data_files_to_df_csv(files_sentiments)
            cum_sentiments = cum_sentiments.drop_duplicates(subset = 'id')
            fix_datetime(cum_sentiments)
            stat_sentiments = calc_stat_sentiments(cum_sentiments)
        else:
            stat_sentiments = null_stat_sentiments
            
        files_emotions = keep_recent_files(glob(cum_data_path + "/emotions/*"),
                            base_timestamp=base_timestamp, prefix='records_',
                            file_type = '.csv', days=14)
        if len(files_emotions)>0:
            cum_emotions = tw_data_files_to_df_csv(files_emotions)
            cum_emotions = cum_emotions.drop_duplicates(subset = 'id')
            fix_datetime(cum_emotions)    
            stat_emotions = calc_stat_emotions(cum_emotions)
        else:
            stat_emotions = null_stat_emotions

        print('  Loading cumulative data: words...')
        files_words = keep_recent_files(glob(cum_data_path + "/words/*"),
                                        base_timestamp=base_timestamp, prefix='records_',
                                        file_type = '.json', days=7) 
        
        if len(files_words)>0:
            cum_words = tw_data_files_to_df_json(files_words, lines=True)
            fix_datetime(cum_words)
            fix_token_counter(cum_words)
        else:
            cum_words = null_cum_words

        print('  Loading cumulative data: original tweets and retweets...')   
        # load recent cumulative data     
        files_original = keep_recent_files(glob(cum_data_path + "/original/*"),
            base_timestamp = base_timestamp, days=7,
            prefix='records_', file_type = '.json')
        if len(files_original)>0:
            cum_original = tw_data_files_to_df_json(files_original, lines=True)
            fix_datetime(cum_original)        
            fix_RT_id(cum_original)
        else:
            cum_original = null_cum_original

        files_retweet = cum_data_path + "/retweet/2020_all_retweets.json"
        try: 
            cum_retweet = pd.read_json(files_retweet, orient='records', lines=True)
            fix_datetime(cum_retweet)
            fix_RT_id(cum_retweet)
        except:
            cum_retweet = null_cum_retweet


        latest_datatime = cum_original.created_at_h.max()
        time_now =  min([latest_datatime, base_timestamp])

        cum_data = cumulative_data(cum_ori = cum_original, 
                                  cum_rt = cum_retweet,
                                  cum_words = cum_words,
                                  now = time_now
                                  )

        cum_data.add_words_subsets()
        cum_data.add_tweet_subsets()
        cum_data.add_user_subsets()

        return stat_sentiments, stat_emotions, cum_data.stat_words, cum_data.top_tweets, cum_data.top_users
        


def update_current_data_city(cities, data_path, base_timestamp):
    for city in cities:
        print('\nUpdating current city data files for ' + city)

        stat_sentiments, stat_emotions, stat_words, top_tweets, top_users = get_city_data(city, data_path, base_timestamp)
        
        curr_data_path = data_path + "data_current/city/" + city

        # update current data: recent cumulative files
        stat_sentiments.to_csv(curr_data_path + '/stat_sentiments.csv', index = False)
        stat_emotions.to_csv(curr_data_path +  '/stat_emotions.csv', index = False)
        print('  Updated current data: stat_sentiments and stat_emotions.')

        stat_words.to_json(curr_data_path + '/stat_words.json', orient='records', lines=True)
        top_users.to_csv(curr_data_path + '/top_users.csv', index=False)
        top_tweets.to_csv(curr_data_path + '/top_tweets.csv', index=False)
        print('  Updated current city data: stat_words, top_users, and top_tweets.')

        

In [695]:
update_current_data_city(cities, data_dest, process_datatime)


Updating current city data files for Minneapolis
  Loading cumulative data: sentiments and emotions...
  Loading cumulative data: words...
  Loading cumulative data: original tweets and retweets...
IN time_subsets():
IN time_subsets():
IN add_words_subsets():
Empty DataFrame
Columns: [id, created_at_h, token_counter, created_at_d]
Index: []
IN calc_stat_words():
IN calc_stat_words():
IN calc_stat_words():
IN calc_stat_words():
                                         token_counter  count
0                                                   {}      0
sum  {'police': 3, 'minneapolis': 2, 'believe': 1, ...      4
0                                                   {}      0
sum  {'painting': 33, 'hate': 33, 'minneapolis': 25...     45
IN add_tweet_subsets():
IN get_top_tweets():
IN get_top_tweets():
IN get_top_tweets():
IN get_top_tweets():
       subset                RT_id      user_name followers_count  \
0      now_1h                                                       
0       toda

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


IN time_subsets():
IN time_subsets():
IN add_words_subsets():
Empty DataFrame
Columns: [id, created_at_h, token_counter, created_at_d]
Index: []
IN calc_stat_words():
IN calc_stat_words():
IN calc_stat_words():
IN calc_stat_words():
                                         token_counter  count
0                                                   {}      0
sum  {'racist': 4, 'join': 2, 'us': 2, 'rest': 2, '...      5
0                                                   {}      0
sum  {'la': 205, 'police': 192, '#policebrutality':...    408
IN add_tweet_subsets():
IN get_top_tweets():
IN get_top_tweets():
IN get_top_tweets():
IN get_top_tweets():
       subset                RT_id user_name followers_count  \
0      now_1h                                                  
0       today                                                  
0   yesterday                                                  
0  seven_days  1279846328621690880     BLMLA          132113   

                            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


IN time_subsets():
IN add_words_subsets():
Empty DataFrame
Columns: [id, created_at_h, token_counter, created_at_d]
Index: []
IN calc_stat_words():
IN calc_stat_words():
IN calc_stat_words():
IN calc_stat_words():
                                         token_counter  count
0                                                   {}      0
sum  {'look': 2, 'i'm': 1, 'support': 1, 'st.': 1, ...      1
0                                                   {}      0
sum  {'look': 2, 'i'm': 1, 'support': 1, 'st.': 1, ...      1
IN add_tweet_subsets():
IN get_top_tweets():
IN get_top_tweets():
IN get_top_tweets():
IN get_top_tweets():
       subset RT_id user_name followers_count text t_co tags retweet_timespan  \
0      now_1h                                                                   
0       today                                                                   
0   yesterday                                                                   
0  seven_days                               

In [417]:
def append_to_json(filename, df, lines=True):
    df0 = pd.read_json(filename, orient='records', lines=lines)
    return df0.append(df)
    


## the following was used to retroactively change retweet data structure

In [436]:
cum_retweet = pd.read_json(data_dest + "data_cumulative/retweet/2020_all_retweets.json",
         lines=True, orient='records')

In [454]:
import re
import string
import nltk
#nltk.download('vader_lexicon') 
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')
#nltk.download('punkt')
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import sys
from nrclex import NRCLex
from collections import Counter
import itertools 



def tw_data_format_created_at(df):
    # assumes variable 'created_at' exists
    # uses CST as timestamp
    df['created_at'] = pd.to_datetime(df.created_at, unit='s') + pd.DateOffset(hours=-6) # CST
    df['created_at_h'] =  df['created_at'].dt.floor("h")
    return df

def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence

def remove_noise(tweet_tokens, stop_words = ()):

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

def clean_tokens(tweet_texts):
    mytokens = [tw.split() for tw in tweet_texts]

    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(['#blacklivesmatter', '&amp;', 'please','retweet'])

    cleaned_tokens_list = []

    for tokens in mytokens:
        cleaned_tokens_list.append(remove_noise(tokens, stopwords))
    return cleaned_tokens_list


def join_token(token):
    return " ".join(str(word) for word in token)



def get_df_and_ids(self):
    '''used in assign_sentiments() and assign_emotions'''
    if (self.type=='original'):
        df = self.df[[ len(token) > 0 for token in self.df.tokens] ].reset_index() 
        ids = df.id
    elif (self.type=='retweet'):
        df = self.df
        ids = df.RT_id
    return df, ids


class tmp_new_tw_data():
    
    def __init__(self, df, type='retweet'):
        if (type in ['original', 'retweet']):
            self.type = type 
            if self.type=='original': self.id = 'id'
            else: self.id = 'RT_id'
        
        else:
            print('value error: type must be either "original" or "retweet".')
            sys.exit()
            
        df = tw_data_format_created_at(df)
        if hasattr(df, 'quoted_text'):
            df['tokens'] = clean_tokens(df.text + df.quoted_text)
        else:
            df['tokens'] = clean_tokens(df.text)
        self.df = df

            
    def assign_sentiments(self):
        if self.df is None:
            self.df_sentiments = None
            return
        
        df, ids = get_df_and_ids(self)
        if len(df)==0: 
            self.df_sentiments = None
            return
    
        idx_ini = 0
        idx_end = len(ids) # 50
        try:
            tweets = df.tokens[idx_ini:idx_end]
            ids = ids[idx_ini:idx_end]
            created_at_h = df.created_at_h[idx_ini:idx_end]

            sid = SentimentIntensityAnalyzer()
            score_1 = sid.polarity_scores(join_token(tweets[0]))

            sentiments = []

            for i, tweet in enumerate(tweets):
                    score = sid.polarity_scores(join_token(tweet))
                    sentiments.append([score[key] for key in score])

            df_sentiments = pd.DataFrame(data = sentiments, columns=score_1.keys())
            df_sentiments = df_sentiments.set_index([pd.Index(ids), pd.Index(created_at_h)])
        
        except Exception as e:
            print(e.__doc__)
            
        self.df_sentiments = df_sentiments
        
        
    def assign_emotions(self):
        if self.df is None: 
            self.df_emotions = None
            return
        df, ids = get_df_and_ids(self)
        if len(df)==0: 
            self.df_emotions = None
            return

        idx_ini = 0
        idx_end = len(ids) # 50
        try:
            tweets = df.tokens[idx_ini:idx_end]
            ids = ids[idx_ini:idx_end]
            created_at_h = df.created_at_h[idx_ini:idx_end]

            nrc_1 = NRCLex(join_token(tweets[0]))
            emo_labels = nrc_1.affect_frequencies.keys()
            top_emotions = []

            for i, tweet in enumerate(tweets):
                nrc = NRCLex(join_token(tweet))
                emos = [ i[0] for i in nrc.top_emotions]
                top_emotions.append([ i in emos for i in emo_labels])

            df_top_emotions = pd.DataFrame(data = top_emotions, 
             columns = emo_labels)

            df_top_emotions = df_top_emotions.set_index([pd.Index(ids), pd.Index(created_at_h)])
        
        except Exception as e:
            print(e.__doc__)
        
        self.df_top_emotions = df_top_emotions


    def count_words(self):
        if self.df is None: 
            self.df_words = None
            return
        df = self.df
        try:
            df['token_counter'] = [ Counter(token) for token in df['tokens'] ]
            if self.type == 'original':
                df = df.set_index(['id', 'created_at_h'])
            elif self.type == 'retweet':
                df = df.set_index(['RT_id', 'created_at_h'])
        
        except Exception as e:
            print(e.__doc__)
        
        self.df_words = df[['token_counter']]
        self.df = self.df.drop(columns = ['token_counter']) 



In [455]:
rt = tmp_new_tw_data(cum_retweet)

In [484]:
rt.assign_sentiments()
rt.assign_emotions()
rt.count_words()

In [488]:
file_loc = 'data_cumulative/retweet/'

rt.df_words = rt.df_words.reset_index()
rt.df_words.created_at_h = rt.df_words.created_at_h.astype(str)

rt.df_sentiments = rt.df_sentiments.reset_index()
rt.df_top_emotions = rt.df_top_emotions.reset_index()

rt.df_words.to_json(data_dest + file_loc + '2020_all_words.json', orient='records', lines=True)
rt.df_sentiments.to_csv(data_dest + file_loc + '2020_all_sentiments.csv', index=False)
rt.df_top_emotions.to_csv(data_dest + file_loc + '2020_all_emotions.csv', index=False)    

In [471]:
#file_loc = 'data_cumulative/'
#tmp2 = pd.read_json()
file1 = 'created_at_2020-07-09_11:00:00'

ori_df = pd.read_json(data_path + 'data_cumulative/original/' + file1 + '.json', 
                     orient = 'records', lines=True)
ori_df_words = pd.read_json(data_path + 'data_cumulative/words/' + file1 + '.json', 
                     orient = 'records', lines=True)

file2 = 'created_at_2020-07-09_13:55:53'
ori_df_sentiments = pd.read_csv(data_path + 'data_cumulative/sentiments/' + file2 + '.csv')
ori_df_emotions = pd.read_csv(data_path + 'data_cumulative/emotions/' + file2 + '.csv')


In [478]:
def fix_datetime(df, timevar='created_at_h'):
    df[timevar] = pd.to_datetime(df[timevar])
    return df

fix_datetime(ori_df_sentiments)
fix_datetime(ori_df_emotions)
fix_datetime(ori_df_words)
fix_token_counter(ori_df_words)
fix_datetime(ori_df)
fix_RT_id(ori_df)

Unnamed: 0,RT_id,created_at_h,token_counter
0,1278053836381392896,2020-06-29 07:00:00,{'sit': 1}
1,1274106956609777664,2020-06-18 10:00:00,"{'liberate': 1, 'washington': 1, 'state!': 1, ..."
2,1277830992473047040,2020-06-28 17:00:00,"{'breaking:': 1, 'three': 1, 'aurora': 1, 'pol..."
3,1266604715355312128,2020-05-28 17:00:00,"{'oh': 1, 'bitch': 1, 'got': 1, 'amish': 1, 'c..."
4,1268070997321814016,2020-06-01 18:00:00,"{'end': 1, 'stream!': 1, 'raise': 1, '$14,871...."
...,...,...,...
8316,1265870261699534848,2020-05-26 16:00:00,"{'""feeling': 1, 'uncomfortable""': 1, 'show': 1..."
8317,1267569674012651520,2020-05-31 09:00:00,"{'must.': 1, 'sign.': 1, 'petition.': 1}"
8318,1268990386447532032,2020-06-04 07:00:00,"{'blm': 1, 'feel': 1, 'free': 1, '#blm': 1, '#..."
8319,1267971682440019968,2020-06-01 12:00:00,"{'🎉my': 1, 'shop': 1, 'open!': 1, '🎉': 1, 'rem..."


In [481]:

ref_words = rt.df_words
fix_RT_id(ref_words)
fix_datetime(ref_words)
fix_token_counter(ref_words)


def merge_datasets(or_df, or_data, ref_data):
    if or_data is None: return()
    
    col_data = [*or_data.columns]
    
    ref_data['RT_id'] = ref_data['RT_id'].astype(str) 
    
    or_non_empty = or_df[or_df.RT_id != '']
    
    if len(or_non_empty)>0:
        retweeted_data = (or_non_empty[['id','RT_id','created_at_h']]
             .join(ref_data.set_index('RT_id'), 
                   on='RT_id', rsuffix='_rt')
            )

        df_merged = (
            or_data.reset_index()[['id','created_at_h', *col_data]] 
            .append(retweeted_data[['id','created_at_h', *col_data]])
            )
        
        print('Num rows = {} + {} = {}'
          .format(len(or_data), len(retweeted_data), len(df_merged)))
        return df_merged
   
    else: 
        print('Num rows = {}'.format(len(or_data)))
        return or_data.reset_index()[['id','created_at_h', *col_data]] 


new_words = merge_datasets(or_df = ori_df, 
                           or_data = ori_df_words, 
                           ref_data = ref_words)

Num rows = 21635 + 9658 = 31293


In [482]:
ref_words.head()


Unnamed: 0,RT_id,created_at_h,token_counter
0,1278053836381392896,2020-06-29 07:00:00,{'sit': 1}
1,1274106956609777664,2020-06-18 10:00:00,"{'liberate': 1, 'washington': 1, 'state!': 1, ..."
2,1277830992473047040,2020-06-28 17:00:00,"{'breaking:': 1, 'three': 1, 'aurora': 1, 'pol..."
3,1266604715355312128,2020-05-28 17:00:00,"{'oh': 1, 'bitch': 1, 'got': 1, 'amish': 1, 'c..."
4,1268070997321814016,2020-06-01 18:00:00,"{'end': 1, 'stream!': 1, 'raise': 1, '$14,871...."


In [486]:
ref_sentiments 

Unnamed: 0_level_0,Unnamed: 1_level_0,neg,neu,pos,compound
RT_id,created_at_h,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1278053836381392896,2020-06-29 07:00:00,0.000,1.000,0.000,0.0000
1274106956609777664,2020-06-18 10:00:00,0.000,0.860,0.140,0.4738
1277830992473047040,2020-06-28 17:00:00,0.079,0.921,0.000,-0.0516
1266604715355312128,2020-05-28 17:00:00,0.487,0.513,0.000,-0.5859
1268070997321814016,2020-06-01 18:00:00,0.232,0.582,0.186,-0.1756
...,...,...,...,...,...
1265870261699534848,2020-05-26 16:00:00,0.221,0.606,0.173,-0.2732
1267569674012651520,2020-05-31 09:00:00,0.000,1.000,0.000,0.0000
1268990386447532032,2020-06-04 07:00:00,0.000,0.548,0.452,0.5106
1267971682440019968,2020-06-01 12:00:00,0.154,0.614,0.232,0.3147


In [489]:
ref_sentiments = rt.df_sentiments

new_sentiments = merge_datasets(or_df = ori_df, 
                           or_data = ori_df_sentiments, 
                           ref_data = ref_sentiments)

new_sentiments

Num rows = 4247 + 9658 = 13905


Unnamed: 0,id,created_at_h,id.1,created_at_h.1,neg,neu,pos,compound
0,1280987491906932736,2020-07-08 16:00:00,1280987491906932736,2020-07-08 16:00:00,0.072,0.833,0.094,0.1531
1,1280987496403341312,2020-07-08 16:00:00,1280987496403341312,2020-07-08 16:00:00,0.000,0.778,0.222,0.6458
2,1280987497871421440,2020-07-08 16:00:00,1280987497871421440,2020-07-08 16:00:00,0.000,0.488,0.512,0.6369
3,1280987504821272576,2020-07-08 16:00:00,1280987504821272576,2020-07-08 16:00:00,0.130,0.582,0.288,0.5707
4,1280987507480551424,2020-07-08 16:00:00,1280987507480551424,2020-07-08 16:00:00,0.000,0.748,0.252,0.4660
...,...,...,...,...,...,...,...,...
11972,1281285912421261312,2020-07-09 11:00:00,1281285912421261312,2020-07-09 11:00:00,,,,
11973,1281285913792913408,2020-07-09 11:00:00,1281285913792913408,2020-07-09 11:00:00,,,,
11974,1281285915944640512,2020-07-09 11:00:00,1281285915944640512,2020-07-09 11:00:00,,,,
11975,1281285916091461632,2020-07-09 11:00:00,1281285916091461632,2020-07-09 11:00:00,,,,


In [491]:
ref_emotions = rt.df_top_emotions

new_emotions = merge_datasets(or_df = ori_df, 
                           or_data = ori_df_emotions, 
                           ref_data = ref_emotions)

new_emotions

Num rows = 4247 + 9658 = 13905


Unnamed: 0,id,created_at_h,id.1,created_at_h.1,fear,anger,anticip,trust,surprise,positive,negative,sadness,disgust,joy
0,1280987491906932736,2020-07-08 16:00:00,1280987491906932736,2020-07-08 16:00:00,False,False,False,False,False,True,False,False,False,False
1,1280987496403341312,2020-07-08 16:00:00,1280987496403341312,2020-07-08 16:00:00,False,False,False,True,False,False,False,False,False,False
2,1280987497871421440,2020-07-08 16:00:00,1280987497871421440,2020-07-08 16:00:00,True,False,True,False,True,True,True,True,True,True
3,1280987504821272576,2020-07-08 16:00:00,1280987504821272576,2020-07-08 16:00:00,False,False,False,True,False,False,False,False,False,False
4,1280987507480551424,2020-07-08 16:00:00,1280987507480551424,2020-07-08 16:00:00,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11972,1281285912421261312,2020-07-09 11:00:00,1281285912421261312,2020-07-09 11:00:00,,,,,,,,,,
11973,1281285913792913408,2020-07-09 11:00:00,1281285913792913408,2020-07-09 11:00:00,,,,,,,,,,
11974,1281285915944640512,2020-07-09 11:00:00,1281285915944640512,2020-07-09 11:00:00,,,,,,,,,,
11975,1281285916091461632,2020-07-09 11:00:00,1281285916091461632,2020-07-09 11:00:00,,,,,,,,,,
