In [1]:
import argparse
import json
import os
import pickle
from collections import Counter
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from littlebird import TweetReader, TweetWriter
from tqdm import tqdm

In [None]:
# Folder containing the filtered-tweet JSON files.
input_folder = '/export/b03/achinta3/files/filtered_tweets'
tweets = []
# Parse every file in the folder as JSON and append its items to one flat list.
# NOTE(review): assumes each file holds a JSON array of tweet objects — confirm.
for file in tqdm(os.listdir(input_folder)):
    with open(os.path.join(input_folder,file), 'r') as f:
        tweets.extend(json.load(f))   

In [None]:
df = pd.DataFrame(tweets)
df

In [None]:
# Print the unique country codes.
# The `place` column is expanded into its own frame, one row per tweet.
# NOTE(review): presumably every `place` value is a dict (no None) — verify.
places = df.place
df_places = pd.DataFrame.from_records(places)
df_places.country_code.unique()

In [None]:
#sort countries based on number of tweets
df_places.groupby('country_code')['country_code'].count().sort_values(ascending=False)

In [None]:
# Show full cell contents and (effectively) all rows when a frame is displayed.
# `None` means "no limit"; the legacy `-1` sentinel was deprecated in pandas 1.0
# and is rejected by current releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 1000000)

In [None]:
#full country names for reference
df_places.groupby('country')['country'].count().sort_values(ascending=False)

In [None]:
#ACLED Dataset
acled = pd.read_csv('/home/aadelucia/files/minerva/data/2014-01-01-2020-01-01_acled_reduced_only_fatalities.csv') 
acled

In [None]:
#Sort countries based on number of ACLED recorded events
acled.groupby('country')['country'].count().sort_values(ascending=False)

In [None]:
#Are fatalities a good indicator of a major event?
acled[acled['fatalities']>10 ][['event_date','event_type', 'sub_event_type', 'actor1','assoc_actor_1','region', 
                                 'country','location','source','source_scale', 'notes', 'fatalities']]

# Civil Unrest Over Time

In [64]:
from datetime import datetime, timedelta
 
#function to plot percentage of tweets that discuss civil unrest over time    
def show_plot(datestr, country, title=None):
    """Plot the daily percentage of tweets flagged as civil unrest around a date.

    For each of the 60 days starting 30 days before ``datestr`` (that is,
    ``date - 30`` through ``date + 29``) the number of filtered tweets is divided
    by the number of raw tweets for ``country`` and shown as a percentage.
    Days whose filtered file is missing or unreadable, and days with no raw
    tweets, are left out of the plot.

    Args:
        datestr: Centre date of the window, formatted ``MM/DD/YYYY``.
        country: Country code used in the data file names (e.g. ``'ZA'``).
        title: Optional figure title. When omitted, the title names ``country``
            instead of always saying "South Africa".

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    centre = datetime.strptime(datestr, "%m/%d/%Y")
    start_date = centre - timedelta(days=30)
    civil_unrest_ratios = []
    dates = []
    ratio_str = []

    for day in (start_date + timedelta(n) for n in range(60)):
        file_name = day.strftime('%Y_%m_%d')+f'_{country}.gz'
        try:
            with open(f'/export/b03/achinta3/files/filtered_tweets/{file_name}_filtered.txt', 'r') as f:
                tweets = json.load(f)
        except (OSError, ValueError):
            # Missing or malformed filtered file: skip the day, but do not
            # swallow unrelated errors or a kernel interrupt.
            continue

        reader = TweetReader(f'/home/aadelucia/files/minerva/raw_tweets_deduplicated/tweets_en/{file_name}')
        # Only the count is needed, so avoid holding every raw tweet in memory.
        n_all = sum(1 for _ in reader.read_tweets())
        if n_all == 0:
            # No raw tweets that day: the ratio is undefined.
            continue

        civil_unrest_ratios.append(len(tweets)/n_all)
        ratio_str.append(str(len(tweets))+'/'+str(n_all))
        dates.append(day.strftime('%m/%d/%Y'))

    # Hover text shows the raw "filtered/all" counts behind each percentage.
    scatter = go.Scatter(x=dates,y=[i*100 for i in civil_unrest_ratios],text=ratio_str)
    fig = go.Figure(data=[scatter])

    fig.update_layout(
        title={
            'text': title if title else f'Percentage of Tweets Discussing Civil Unrest in {country}',
            'y':0.9,
            'x':0.5,
            'font_size': 24,
            'xref': 'paper',
            'xanchor': 'center',
            'yanchor': 'top'},
        yaxis_title='Percentage',
        yaxis_title_font_size=18,
        # Fixed y-range keeps figures for different events comparable.
        yaxis_range=[0,35]
    )
    fig.update_xaxes(
        nticks=15,
        tickfont_size=15,
        tickangle=75
    )
    fig.show()

In [63]:
#source: https://en.wikipedia.org/wiki/Burayu_massacre
show_plot('09/16/2018','ET')

In [65]:
#source: https://en.wikipedia.org/wiki/2019_Johannesburg_riots
show_plot('09/03/2019','ZA')

In [None]:
show_plot('01/13/2019','BD')

In [None]:
show_plot('06/19/2019','NA')
#https://monitor.civicus.org/updates/2019/11/04/namibians-take-streets-separate-protests/

In [None]:
show_plot('06/15/2017','TZ')
#no clear peaks

In [None]:
show_plot('12/12/2015','NG')

In [None]:
show_plot('01/03/2018','NG')

In [None]:
acled_full = pd.read_csv('/home/aadelucia/files/minerva/data/2014-01-01-2020-01-01_acled_reduced_all.csv') 
acled_full[(acled_full.country=='Ethiopia') & (acled_full.event_date=='13 September 2018') ][['event_date','event_type', 'sub_event_type', 
                                 'country','location','notes', 'fatalities']]

In [None]:
# 23 people were killed in unrest in Addis Ababa between September 12-14th and over 2,500 youths arrested. Fatalities spread over 9 events.
# 6+2+1 = 9 events
# 12+6+2 = 20 deaths

In [None]:
acled_test = acled_full.dropna(subset=['notes'])
acled_test = acled_test[acled_test['notes'].str.contains("size=")]
acled_test[['event_date','event_type', 'sub_event_type','country','location','notes', 'fatalities']]

# Hashtag Trends

In [None]:
from collections import Counter

def get_popular_hashtags(datestr,country):
    """Return the 20 most frequent hashtags in one day's filtered tweets.

    Args:
        datestr: Day to inspect, formatted ``MM/DD/YYYY``.
        country: Country code used in the data file name (e.g. ``'ZA'``).

    Returns:
        A list of ``(hashtag, count)`` pairs, hashtags lower-cased, most
        frequent first.
    """
    day = datetime.strptime(datestr, "%m/%d/%Y")
    file_name = day.strftime('%Y_%m_%d')+f'_{country}.gz'
    with open(f'/export/b03/achinta3/files/filtered_tweets/{file_name}_filtered.txt', 'r') as f:
        tweets = json.load(f)

    # Tweets without hashtags contribute nothing to the tally.
    tally = Counter(
        str(tag.get('text')).lower()
        for tweet in tweets
        for tag in tweet.get('entities').get('hashtags')
    )
    return tally.most_common(20)
    
get_popular_hashtags('09/03/2019','NA')

In [5]:
def hashtag_trends1(datestr,hashtag,country,filtered):
    """Count daily occurrences of one hashtag over a 14-day window.

    The window starts 7 days before ``datestr`` and covers 14 consecutive days
    (``date - 7`` through ``date + 6``).

    Args:
        datestr: Centre date, formatted ``MM/DD/YYYY``.
        hashtag: Hashtag text without ``#``; matched case-insensitively.
        country: Country code used in the data file names (e.g. ``'ZA'``).
        filtered: If true, read the filtered-tweet JSON files; otherwise read
            the raw tweets through ``TweetReader``.

    Returns:
        Dict mapping ``'MM/DD/YYYY'`` to the number of times the hashtag was
        used that day, in chronological order. Days whose file cannot be read
        are omitted.
    """
    centre = datetime.strptime(datestr, "%m/%d/%Y")
    start_date = centre - timedelta(days=7)
    # Tweet hashtags are lower-cased before comparison, so normalise the target too;
    # otherwise a mixed-case argument could never match.
    target = hashtag.lower()
    hashtag_count = {}

    for day in (start_date + timedelta(n) for n in range(14)):
        file_name = day.strftime('%Y_%m_%d')+f'_{country}.gz'
        try:
            if filtered:
                with open(f'/export/b03/achinta3/files/filtered_tweets/{file_name}_filtered.txt', 'r') as f:
                    tweets = json.load(f)
            else:
                reader = TweetReader(f'/home/aadelucia/files/minerva/raw_tweets_deduplicated/tweets_en/{file_name}')
                tweets = list(reader.read_tweets())
        except Exception:
            # Best effort: skip days that cannot be loaded. `Exception` (not a bare
            # except) lets a kernel interrupt stop the loop.
            continue

        hashtag_count[day.strftime('%m/%d/%Y')] = sum(
            1
            for tweet in tweets
            for tag in tweet.get('entities').get('hashtags')
            if tag.get('text').lower() == target
        )

    return hashtag_count

In [None]:
#curious to see how xenophobia and pretoriacbd trend before vs after the riots
date = '09/02/2019'
hashtag = 'xenophobia'
hashtag_count = hashtag_trends1(date,hashtag,'ZA',True)
date1 = '09/02/2019'
hashtag1 = 'pretoriacbd'
hashtag_count1 = hashtag_trends1(date1,hashtag1,'ZA',True)
fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x=list(hashtag_count.keys()),y=list(hashtag_count.values()),name=hashtag))
fig.add_trace(go.Scatter(x=list(hashtag_count1.keys()),y=list(hashtag_count1.values()),name=hashtag1))

fig.update_layout(
    #title="Plot Title",
    xaxis_title=f'trend over time',
    yaxis_title='number of hashtags'
    
)
fig.show()

In [None]:
    # Collect the text of every raw ZA tweet from 09/02/2019 tagged #pretoriacbd.
    datestr = '09/02/2019'
    country = 'ZA'
    date = datetime.strptime(datestr, "%m/%d/%Y")
    file_name = date.strftime('%Y_%m_%d')+f'_{country}.gz'
    reader = TweetReader(f'/home/aadelucia/files/minerva/raw_tweets_deduplicated/tweets_en/{file_name}')
    tweets = list(reader.read_tweets())

    # Despite its name, `hashtags` ends up holding tweet texts, not hashtag strings.
    hashtags = []
    for tweet in tweets:
        hashtags_list = tweet.get('entities').get('hashtags')
        if len(hashtags_list)==0:
            continue
        # Hashtag text is lower-cased so the match is case-insensitive.
        if 'pretoriacbd' in [h.get('text').lower() for h in hashtags_list]:
            hashtags.append(tweet['text'])
            

In [None]:
hashtags

In [None]:
from collections import Counter

def get_popular_hashtags_day(datestr,country,hcount,filtered=True):
    """Return the ``hcount`` most frequent hashtags for a single day.

    Args:
        datestr: Day to inspect, formatted ``MM/DD/YYYY``.
        country: Country code used in the data file name (e.g. ``'ZA'``).
        hcount: Number of ``(hashtag, count)`` pairs to return.
        filtered: Read the filtered-tweet JSON file when true (default),
            otherwise read the raw tweets through ``TweetReader``.

    Returns:
        A list of ``(hashtag, count)`` pairs, hashtags lower-cased, most
        frequent first.
    """
    file_name = datetime.strptime(datestr, "%m/%d/%Y").strftime('%Y_%m_%d')+f'_{country}.gz'

    if not filtered:
        reader = TweetReader(f'/home/aadelucia/files/minerva/raw_tweets_deduplicated/tweets_en/{file_name}')
        tweets = list(reader.read_tweets())
    else:
        with open(f'/export/b03/achinta3/files/filtered_tweets/{file_name}_filtered.txt', 'r') as f:
            tweets = json.load(f)

    # Tally hashtags in encounter order so ties keep their first-seen ordering.
    tally = Counter()
    for tweet in tweets:
        for tag in tweet.get('entities').get('hashtags'):
            tally[str(tag.get('text')).lower()] += 1
    return tally.most_common(hcount)
    
# Top 30 hashtags among all (unfiltered) ZA tweets that day. The previous name,
# `get_popular_hashtags_unfiltered`, is not defined anywhere in this notebook.
get_popular_hashtags_day('09/02/2019','ZA',30,filtered=False)

In [2]:
def get_popular_hashtags_weekly(datestr,country,hcount,filtered):
    """Return the most frequent hashtags over a 14-day window around a date.

    The window starts 7 days before ``datestr`` and covers 14 consecutive days
    (``date - 7`` through ``date + 6``). Days whose file cannot be read are
    skipped.

    Args:
        datestr: Centre date, formatted ``MM/DD/YYYY``.
        country: Country code used in the data file names (e.g. ``'ZA'``).
        hcount: Number of ``(hashtag, count)`` pairs to return.
        filtered: If true, read the filtered-tweet JSON files; otherwise read
            the raw tweets through ``TweetReader``.

    Returns:
        A list of ``(hashtag, count)`` pairs, hashtags lower-cased, most
        frequent first.
    """
    centre = datetime.strptime(datestr, "%m/%d/%Y")
    start_date = centre - timedelta(days=7)
    tally = Counter()

    for day in (start_date + timedelta(n) for n in range(14)):
        file_name = day.strftime('%Y_%m_%d')+f'_{country}.gz'
        try:
            if filtered:
                with open(f'/export/b03/achinta3/files/filtered_tweets/{file_name}_filtered.txt', 'r') as f:
                    tweets = json.load(f)
            else:
                reader = TweetReader(f'/home/aadelucia/files/minerva/raw_tweets_deduplicated/tweets_en/{file_name}')
                tweets = list(reader.read_tweets())
        except Exception:
            # Best effort: skip unreadable days, but let a kernel interrupt
            # (KeyboardInterrupt) through instead of silently continuing.
            continue

        for tweet in tweets:
            for tag in tweet.get('entities').get('hashtags'):
                tally[str(tag.get('text')).lower()] += 1

    return tally.most_common(hcount)

In [3]:
get_popular_hashtags_weekly('09/02/2019','ZA',20,True)

[('uyinenemrwetyana', 682),
 ('enoughisenough', 583),
 ('sashutdown', 549),
 ('aminext', 465),
 ('xenophobia', 425),
 ('saynotoxenophobia', 389),
 ('joburgcbd', 326),
 ('pretoriacbd', 322),
 ('shutdownsouthafrica', 307),
 ('ripuyinene', 271),
 ('xenophobiainsouthafrica', 263),
 ('menaretrash', 232),
 ('ripuyinenemrwetyana', 218),
 ('aminextprotest', 207),
 ('stopkillingourwomen', 200),
 ('dearmrpresident', 193),
 ('notallmen', 165),
 ('southafrica', 164),
 ('ripmugabe', 156),
 ('prayforsouthafrica', 153)]

In [None]:
get_popular_hashtags_weekly('09/02/2019','ZA',20,False)

In [None]:
hashtag = 'uyinenemrwetyana'
hashtag_count = hashtag_trends1('09/03/2019',hashtag,'ZA',True)
fig = go.Figure()

# Add traces
fig.add_trace(go.Scatter(x=list(hashtag_count.keys()),y=list(hashtag_count.values())))

fig.update_layout(
    #title="Plot Title",
    xaxis_title=f'trend over time for #{hashtag}',
    yaxis_title='number of hashtags'
    
)
fig.show()

In [None]:
hashtag = 'dearmrpresident'
hashtag_count = hashtag_trends1('09/03/2019',hashtag,'ZA',True)

fig = go.Figure()
fig.add_trace(go.Scatter(x=list(hashtag_count.keys()),y=list(hashtag_count.values())))
fig.update_layout(
    xaxis_title=f'trend over time for #{hashtag}',
    yaxis_title='number of hashtags'
    )
fig.show()

In [None]:
def plot_daily_popular_hashtags(country,date,filtered=True):
    """Plot the two-week trend of the five most popular hashtags on ``date``.

    Args:
        country: Country code used in the data file names (e.g. ``'ZA'``).
        date: Day whose top hashtags are selected, formatted ``MM/DD/YYYY``;
            also the centre of the trend window.
        filtered: Use the filtered tweets (default) or the raw tweets. The flag
            now applies to the trend counts as well as to the hashtag selection;
            previously the trends were always counted on filtered tweets.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    hash_list = [tag for tag, _ in get_popular_hashtags_day(date,country,5,filtered)]

    fig = go.Figure()
    for tag in hash_list:
        hashtag_count = hashtag_trends1(date,tag,country,filtered)
        fig.add_trace(go.Scatter(x=list(hashtag_count.keys()),y=list(hashtag_count.values()),name='#'+str(tag)))
    fig.update_layout(
        title={
            'text': f"Daily Popular Hashtags in {country}",
            'y':0.9,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        yaxis_title='Number of hashtags'
    )
    fig.show()

plot_daily_popular_hashtags('ZA','09/03/2019')

In [None]:
plot_daily_popular_hashtags('ZA','06/21/2016',False)

In [41]:
def plot_weekly_popular_hashtags(country,date,filtered=True,title=None,y_range=(0,15)):
    """Plot daily counts of the ten most popular hashtags around ``date``.

    Args:
        country: Country code used in the data file names (e.g. ``'ZA'``).
        date: Centre of the two-week window, formatted ``MM/DD/YYYY``.
        filtered: Use the filtered tweets (default) or the raw tweets. The flag
            was previously ignored, so both settings produced the same figure.
        title: Optional figure title; defaults to one naming ``country``.
        y_range: ``(low, high)`` limits for the y-axis. The default keeps the
            original fixed 0-15 range; pass ``None`` to let plotly autoscale.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    hash_list = [tag for tag, _ in get_popular_hashtags_weekly(date,country,10,filtered)]

    fig = go.Figure()
    for tag in hash_list:
        hashtag_count = hashtag_trends1(date,tag,country,filtered)
        fig.add_trace(go.Scatter(x=list(hashtag_count.keys()),y=list(hashtag_count.values()),name=tag))
    fig.update_layout(
        title={
            'text': title if title else f"Weekly Popular Hashtags in {country}",
            'font_size': 24,
            'y':0.9,
            'x':0.5,
            'xref': 'paper',
            'xanchor': 'center',
            'yanchor': 'top'},
        yaxis_title='Number of hashtags',
        yaxis_title_font_size=18,
        legend={
            "xanchor": "left",
            "x": 0.01,
            "font": {
                "size": 16
            }
        }
    )
    fig.update_xaxes(
        tickfont_size=14
    )
    if y_range is not None:
        # A fixed range clips any daily count above the upper limit.
        fig.update_yaxes(
            range=list(y_range)
        )

    fig.show()


In [43]:
plot_weekly_popular_hashtags('ET','09/15/2018',True,title="Weekly Popular Hashtags in Ethiopia During Burayu Massacre")

In [32]:
plot_weekly_popular_hashtags('ZA','09/03/2019',True,title="Weekly Popular Hashtags in South Africa During Johannesburg Riots")

In [None]:
plot_weekly_popular_hashtags('ZA','08/26/2019',False)

In [None]:
plot_weekly_popular_hashtags('ZA','06/23/2016',False)

In [None]:
plot_weekly_popular_hashtags('ET','09/16/2018',True)

In [None]:
#Johannesburg riots
#Date 1–5 September 2019
#     8 September 2019

In [None]:
acled_full[(acled_full.country=='South Africa') & (acled_full.event_date=='03 September 2019') ][['event_date','event_type', 'sub_event_type', 
                                 'country','location','notes', 'fatalities']]

In [None]:
#source: https://en.wikipedia.org/wiki/2016_Tshwane_riots
show_plot('06/23/2016','ZA')

In [None]:
acled_full[(acled_full.country=='South Africa') & (acled_full.event_date=='23 June 2016') ][['event_date','event_type', 'sub_event_type', 
                                 'country','location','notes', 'fatalities']]



In [None]:
get_popular_hashtags('06/23/2016','ZA')

In [None]:
# All ACLED rows for South Africa, reduced to the columns inspected in this notebook.
acled_ZA = acled_full[(acled_full.country=='South Africa')][['event_date','event_type', 'sub_event_type', 
                                 'country','location','notes', 'fatalities']]
# Number of recorded events per distinct event_date (days with no event are absent).
acled_ZA_gb = acled_ZA.groupby("event_date")["event_date"].count()

# Histogram of events-per-day: how many days had 1, 2, 3, ... events.
fig = go.Figure()
fig.add_trace(go.Histogram(histfunc="count",x=acled_ZA_gb.values))
fig.update_layout(
        title={
           # No 'text' key is set, so only the title position is configured here.
            'y':0.9,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        xaxis_title='Number of events per day',
        yaxis_title='Number of days'   
    )
fig.show()

In [None]:
# Build {YYYY_MM_DD: {'all tweets': n, 'filtered tweets': m}} for the ZA files.
input_folder = '/home/aadelucia/files/minerva/raw_tweets_deduplicated/tweets_en'
filtered_folder = '/export/b03/achinta3/files/filtered_tweets'
counts_dict = {}
for file in (os.listdir(input_folder)):
    if 'ZA' not in str(file):
        continue
    try:
        with open(os.path.join(filtered_folder, f'{file}_filtered.txt'), 'r') as f:
            filtered_tweets = json.load(f)
    except (OSError, ValueError):
        # No (readable) filtered file for this day: skip it. Narrower than a bare
        # `except`, so interrupts and unrelated bugs are not hidden.
        continue

    # Read from `input_folder` itself rather than a second hard-coded copy of the path.
    reader = TweetReader(os.path.join(input_folder, file))
    alltweets = list(reader.read_tweets())

    # The first 10 characters of the file name are the date (YYYY_MM_DD).
    counts_dict[str(file)[:10]] = {
        'all tweets': len(alltweets),
        'filtered tweets': len(filtered_tweets),
    }

In [None]:
# Share of filtered tweets per day: once as a plain list, once as a frame column.
ratios = [
    day_counts['filtered tweets']/day_counts['all tweets']
    for day_counts in counts_dict.values()
]
# One row per day; after reset_index the date string lives in the 'index' column.
df_ratios = pd.DataFrame(counts_dict).transpose().reset_index()
df_ratios['ratio']=df_ratios['filtered tweets']/df_ratios['all tweets']

In [None]:
# Daily ACLED event counts for South Africa, with dates reformatted to the
# YYYY_MM_DD form used as the key of the tweet-ratio table.
acled_ZA_df = pd.DataFrame(list(zip(acled_ZA_gb.keys(),acled_ZA_gb.values)),columns=['date','count'])
acled_ZA_df.date = acled_ZA_df.date.apply(lambda x: datetime.strptime(x, '%d %B %Y').strftime('%Y_%m_%d'))
# Left merge keeps every tweet day; days without an ACLED event get NaN in `count`.
# (The earlier `df_ratios.join(...)` call was removed: its result was discarded.)
merged_df = pd.merge(df_ratios, acled_ZA_df, left_on='index', right_on='date',how='left')
merged_df

In [None]:
merged_df['count'] = merged_df['count'].fillna(0)

In [None]:
#Here we try to get the accuracy metrics with different combinations of ratio and count (of events per day) thresholds
from sklearn.metrics import f1_score

def get_f1(ratio,count,df=None):
    """F1 score of flagging days by tweet ratio against ACLED event counts.

    Args:
        ratio: A day is predicted positive when its ``ratio`` exceeds this value.
        count: A day is truly positive when it has ``count`` or more events.
        df: Frame with ``ratio`` and ``count`` columns. Defaults to the
            notebook-level ``merged_df``.

    Returns:
        The value returned by ``sklearn.metrics.f1_score`` for these labels.
    """
    if df is None:
        df = merged_df
    pred_labels = np.array(df['ratio']>ratio)
    true_labels = np.array(df['count']>=count)
    return f1_score(true_labels,pred_labels)

def get_accuracy(ratio,count,df=None):
    """Fraction of days where the ratio-based flag agrees with the event-based label.

    Args:
        ratio: A day is predicted positive when its ``ratio`` exceeds this value.
        count: A day is truly positive when it has ``count`` or more events.
            This now uses ``>=``, matching ``get_f1`` and the "N events or more
            per day" plot legends; it previously used a strict ``>``.
        df: Frame with ``ratio`` and ``count`` columns. Defaults to the
            notebook-level ``merged_df``.

    Returns:
        Accuracy as a float between 0 and 1.
    """
    if df is None:
        df = merged_df
    pred_labels = np.array(df['ratio']>ratio)
    true_labels = np.array(df['count']>=count)
    # Agreement covers both true positives and true negatives.
    return np.sum(pred_labels == true_labels)/len(pred_labels)

def get_dates(ratio,count,df=None):
    """Return the days flagged by both the tweet ratio and the event count.

    Args:
        ratio: Keep days whose ``ratio`` exceeds this value.
        count: Keep days with ``count`` or more events. This now uses ``>=``,
            matching ``get_f1``; it previously used a strict ``>``.
        df: Frame with ``ratio`` and ``count`` columns. Defaults to the
            notebook-level ``merged_df``.

    Returns:
        The rows of ``df`` that satisfy both conditions.
    """
    if df is None:
        df = merged_df
    high_ratio = np.array(df['ratio']>ratio)
    eventful = np.array(df['count']>=count)
    return df[high_ratio & eventful]
    
get_accuracy(0.12,4)

In [None]:
ratios=[0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2]
counts = [1,2,3,4,5,6,7,8,9,10]

# d1[c] is the accuracy at every ratio threshold for event-count threshold c.
d1 = {}
for c in counts:
    temp =[]
    for r in ratios:
        # Evaluate once per grid point and reuse the value for printing and storing.
        acc = get_accuracy(r,c)
        print(f'accuracy for {r} and {c} : {acc}')
        temp.append(acc)
    d1[c]=temp

In [None]:
fig = go.Figure()

# One accuracy curve per event-count threshold (1 through 6).
for n_events in range(1, 7):
    label = '1 event or more per day' if n_events == 1 else f'{n_events} events or more per day'
    fig.add_trace(go.Scatter(x=ratios,y=d1[n_events],name=label))

fig.update_layout(
    xaxis_title="Ratio of civil unrest tweets",
    yaxis_title="Accuracy score"
)
fig.show()

In [None]:
ratios=[0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2]
counts = [1,2,3,4,5,6,7,8,9,10]

# d[c] is the F1 score at every ratio threshold for event-count threshold c.
d = {}
for c in counts:
    temp =[]
    for r in ratios:
        # Evaluate once per grid point and reuse the value for printing and storing.
        score = get_f1(r,c)
        print(f'f1 score for {r} and {c} : {score}')
        temp.append(score)
    d[c]=temp

In [None]:
fig = go.Figure()

# One F1 curve per event-count threshold (1 through 6).
for n_events in range(1, 7):
    label = '1 event or more per day' if n_events == 1 else f'{n_events} events or more per day'
    fig.add_trace(go.Scatter(x=ratios,y=d[n_events],name=label))

fig.update_layout(
    xaxis_title="Ratio of civil unrest tweets",
    yaxis_title="F1 score"
)
fig.show()

In [None]:
df_high_risk = get_dates(0.12,4)

In [None]:
def get_events_from_date(date, country='South Africa'):
    """Return the ACLED events recorded for one country on one day.

    Args:
        date: Event date in the form stored in ``acled_full.event_date``
            (e.g. ``'17 March 2016'``).
        country: Country name as it appears in ``acled_full.country``.
            Defaults to ``'South Africa'``, the previously hard-coded value.

    Returns:
        The matching rows of ``acled_full``, restricted to the descriptive columns.
    """
    columns = ['event_date','event_type', 'sub_event_type',
               'country','location','notes', 'fatalities']
    on_day = (acled_full.country==country) & (acled_full.event_date==date)
    return acled_full.loc[on_day, columns]


In [None]:
get_popular_hashtags(datetime.strptime(df_high_risk.iloc[3]['index'], '%Y_%m_%d').strftime('%m/%d/%Y'),'ZA')

In [None]:
get_events_from_date(datetime.strptime(df_high_risk.iloc[3]['index'], '%Y_%m_%d').strftime('%d %B %Y'))

In [None]:
get_events_from_date('17 March 2016')

In [None]:
# Total number of fatalities per day and country.
# Work on a copy so `acled_full` keeps its original 'DD Month YYYY' date strings.
acled_full_copy = acled_full.copy()
# Reformat to YYYY/MM/DD so that string order is chronological order.
acled_full_copy.event_date =  acled_full_copy.event_date.apply(lambda x:datetime.strptime(x, '%d %B %Y').strftime('%Y/%m/%d') )
acled_gb_fatalities = acled_full_copy.groupby(["event_date","country"])["fatalities"].sum().reset_index()
acled_gb_fatalities

In [None]:
# Total number of events per day and country.
# Same copy-and-reformat step as in the fatalities cell above it in the notebook.
acled_full_copy = acled_full.copy()
acled_full_copy.event_date =  acled_full_copy.event_date.apply(lambda x:datetime.strptime(x, '%d %B %Y').strftime('%Y/%m/%d') )
# Row count per (event_date, country) group, stored in a column named 'events'.
acled_gb = acled_full_copy.groupby(["event_date","country"])["event_date"].count().reset_index(name='events')
acled_gb


In [None]:
# Attach the per-day fatality totals to the per-day event counts.
# NOTE(review): this aligns on the default integer index, so it relies on both
# frames coming from the same groupby keys in the same order — a merge on
# ['event_date', 'country'] would make that explicit.
acled_gb['fatalities']=acled_gb_fatalities['fatalities']
acled_gb

In [None]:
acled_gb.to_csv("acled_daily_events_by_country.csv", index=False)

In [None]:
def hashtag_trends(datestr,hashtag,country='ZA'):
    """Plot how often a hashtag appears per day in the raw tweets around a date.

    The daily counts come from ``hashtag_trends1`` with ``filtered=False``,
    which runs the same 14-day loop (starting 7 days before ``datestr``) that
    was previously duplicated here.

    Args:
        datestr: Centre date, formatted ``MM/DD/YYYY``.
        hashtag: Hashtag text without ``#``.
        country: Country code used in the data file names. Defaults to ``'ZA'``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    hashtag_count = hashtag_trends1(datestr,hashtag,country,False)

    scatter = go.Scatter(x=list(hashtag_count.keys()),y=list(hashtag_count.values()))
    layout = go.Layout(title=f'trend over time for #{hashtag}',yaxis={'title': 'number of hashtags'})
    fig = go.Figure(data=[scatter], layout=layout)
    fig.show()

#show_plot('02/03/2015','BD')
hashtag_trends('06/15/2017','taxistrike')

In [None]:
# `h` was never assigned at notebook level, so this cell raised NameError on a
# fresh kernel. Compute the daily counts for #taxistrike from the raw ZA tweets.
h = hashtag_trends1('06/15/2017','taxistrike','ZA',False)
scatter = go.Scatter(x=list(h.keys()),y=list(h.values()))
layout = go.Layout(title='trend over time for #taxistrike',yaxis={'title': 'number of hashtags'})
fig = go.Figure(data=[scatter], layout=layout)
fig.show()

In [None]:
def get_popular_users(datestr,country):
    """Sum the favourite counts of each user's filtered tweets for one day.

    Args:
        datestr: Day to inspect, formatted ``MM/DD/YYYY``.
        country: Country code used in the data file name (e.g. ``'ZA'``).

    Returns:
        Dict mapping screen name to the total ``favorite_count`` over that
        user's tweets, in order of first appearance.
    """
    stamp = datetime.strptime(datestr, "%m/%d/%Y").strftime('%Y_%m_%d')
    file_name = stamp+f'_{country}.gz'
    with open(f'/export/b03/achinta3/files/filtered_tweets/{file_name}_filtered.txt', 'r') as f:
        tweets = json.load(f)

    users_favs = {}
    for tweet in tweets:
        username = tweet.get('user').get('screen_name')
        fav_count = tweet['favorite_count']
        try:
            users_favs[username] += fav_count
        except KeyError:
            # First tweet seen from this user.
            users_favs[username] = fav_count

    return users_favs

In [None]:
d = get_popular_users('09/03/2019','ZA')
dict(sorted(d.items(), key=lambda item: item[1]))

In [None]:
    # Share of raw ZA tweets on 08/26/2019 written by verified users.
    # The printed value is a fraction between 0 and 1, not a percentage.
    datestr = '08/26/2019'
    country = 'ZA'
    date = datetime.strptime(datestr, "%m/%d/%Y")
    file_name = date.strftime('%Y_%m_%d')+f'_{country}.gz'
    reader = TweetReader(f'/home/aadelucia/files/minerva/raw_tweets_deduplicated/tweets_en/{file_name}')
    alltweets = list(reader.read_tweets())

    # NOTE(review): `users_favs` is assigned but never used in this cell.
    users_favs = {}
    # Texts of the tweets whose author has the `verified` flag set.
    verified_tweets = []
    for tweet in alltweets:
        if tweet['user']['verified']:
            verified_tweets.append(tweet['text'])
    print(len(verified_tweets)/len(alltweets))


In [None]:
    # Incorrectly classified tweets: filtered ZA tweets from 09/03/2019 posted by
    # one specific verified account, identified by its exact profile description.
    datestr = '09/03/2019'
    country = 'ZA'
    date = datetime.strptime(datestr, "%m/%d/%Y")
    file_name = date.strftime('%Y_%m_%d')+f'_{country}.gz'
    with open(f'/export/b03/achinta3/files/filtered_tweets/{file_name}_filtered.txt', 'r') as f:
                    tweets=json.load(f) 

    wrong_preds = []
    for tweet in tweets:
        if tweet['user']['verified']:
            # The match is on the full bio string, so any edit to the account's
            # description would make this filter return nothing.
            if tweet['user']['description'] == "We report traffic flows on Cape Town Freeways, N1 till the Tunnel, N2 to Sir Lowry's Pass, N7 to Potsdam, R300 and M5 between N1 and N2 https://t.co/M3tvQK9v2C":
                wrong_preds.append(tweet['text'])
            
    print(len(wrong_preds))

In [None]:
with open('/home/achinta3/files/wrong_preds.txt', 'w') as f:
    f.write("\n".join(wrong_preds))

In [None]:
from collections import Counter
Counter(verified_tweets).most_common(100)

In [None]:
    # Print every filtered ZA tweet from 07/06/2016 written by a verified user,
    # together with the tweet id and the author's bio.
    datestr = '07/06/2016'
    country = 'ZA'
    date = datetime.strptime(datestr, "%m/%d/%Y")
    file_name = date.strftime('%Y_%m_%d')+f'_{country}.gz'
    with open(f'/export/b03/achinta3/files/filtered_tweets/{file_name}_filtered.txt', 'r') as f:
                    tweets=json.load(f) 

    # Despite its name, `verified_tweets` collects author bios (one per tweet),
    # which the following cell tallies with Counter.
    verified_tweets = []
    for tweet in tweets:
        if tweet['user']['verified']:
            print('TWEET: '+tweet['text'])
            print(tweet['id'])
            print('BIO:')
            print(tweet.get('user').get('description'))
            print('=========================================')
            verified_tweets.append(tweet['user']['description'])
            
    print(len(verified_tweets))
        

In [None]:
Counter(verified_tweets).most_common(100)

In [None]:
import gzip
import json
from littlebird import TweetReader
import os
from collections import Counter

#f=gzip.open('/home/achinta3/files/test1o/test.txt_out.gz','rb')
#print(f.read())

input_folder = '/home/achinta3/files/test_out'
# d:  file date -> (total likes, average likes per tweet)
# d1: file date -> the 20 most-liked tweets of that file
d = {}
d1 ={}
for file in os.listdir(input_folder):
    reader = TweetReader(os.path.join(input_folder,file))
    tweets = list(reader.read_tweets())
    if not tweets:
        # An empty file has no average; skip it instead of dividing by zero.
        continue
    count = sum(tweet['favorite_count'] for tweet in tweets)
    # The first 10 characters of the file name are used as the key.
    d[file[:10]]=(count,count/len(tweets))
    d1[file[:10]]=sorted(tweets, key=lambda k: k['favorite_count'],reverse=True)[:20]
    #s_tweets = sorted(alltweets, key=lambda k: k['favorite_count']) 
#alltweets

In [None]:
import plotly.graph_objects as go
scatter = go.Scatter(x=list(d.keys()),y=list(i[1] for i in d.values()))
layout = go.Layout(title='likes per day (filtered dataset)',yaxis={'title': 'avg likes per tweet'})
fig = go.Figure(data=[scatter], layout=layout)
fig.show()

In [None]:
# NOTE(review): `d_user` is only built in a later cell, so this cell fails under
# Restart & Run All; the same steps are repeated after that cell.
# One row per user: transpose so that the dict keys (usernames) become the index.
df = pd.DataFrame.from_dict(d_user).T
# Move the usernames out of the index into a regular 'index' column.
df.reset_index(level=0, inplace=True)
df

In [None]:
# The 200 users with the most likes, with the username column named explicitly.
# NOTE(review): depends on `df` from the preceding cell, which needs `d_user`.
df_influencial = df.sort_values('likes',ascending=False)[:200]
df_influencial = df_influencial.rename(columns={'index': 'username'})
df_influencial

In [None]:
input_folder = '/home/aadelucia/files/minerva/raw_tweets_deduplicated/tweets_en/'
# Updating likes is slow, so only this one week of ET files is processed.
week_files = ['2018_09_13_ET.gz','2018_09_14_ET.gz','2018_09_15_ET.gz','2018_09_16_ET.gz','2018_09_17_ET.gz','2018_09_18_ET.gz','2018_09_19_ET.gz']
# screen name -> likes total, verified flag, bio and tweet count
d_user = {}
for file in os.listdir(input_folder):
    if file not in week_files:
        continue
    reader = TweetReader(os.path.join(input_folder,file))
    tweets = list(reader.read_tweets())
    for tweet in tweets:
        # Only non-deleted tweets flagged as civil-unrest related are counted.
        if tweet.get('deleted') or not tweet.get('civil_unrest_related'):
            continue
        user = tweet.get('user')
        screen_name = user.get('screen_name')
        if screen_name in d_user:
            d_user[screen_name]['likes'] += tweet.get('updated_likes')
            d_user[screen_name]['n_tweets'] += 1
        else:
            # First tweet from this user: record the profile fields once.
            d_user[screen_name] = {'likes': tweet.get('updated_likes'),
                                   'verified': user.get('verified'),
                                   'bio': user.get('description'),
                                   'n_tweets': 1}

In [None]:
# Display the most popular users who tweeted that week.
# One row per user: transpose so that the dict keys (usernames) become the index.
df = pd.DataFrame.from_dict(d_user).T
# Move the usernames out of the index into a regular 'index' column.
df.reset_index(level=0, inplace=True)
# Keep the 200 users with the highest like totals.
df_influencial = df.sort_values('likes',ascending=False)[:200]
df_influencial = df_influencial.rename(columns={'index': 'username'})
df_influencial