In [7]:
from os.path import expanduser, join as osjoin
from os import listdir
import json
import pandas as pd

def extract_engagement_data(path='~/spark_apps/unesco_tweets/'):
    """Load all tweet dumps from a directory and flatten their hashtags.

    Every entry in ``path`` is opened and parsed as a JSON array of tweet
    dicts; the arrays are concatenated into one list.

    Parameters
    ----------
    path : str, optional
        Directory containing the JSON dumps. ``~`` is expanded to the user's
        home directory. Defaults to the location the notebook was written for.

    Returns
    -------
    df_data : list of dict
        All tweet records, in sorted-filename order.
    dates : list
        The ``created_at`` value of the owning tweet, one entry per element
        of ``hashtags``.
    hashtags : list of str
        One entry per hashtag occurrence. A tweet without hashtags
        contributes a single ``''`` so it is still represented.

    Raises
    ------
    KeyError
        If a tweet record has no ``created_at`` key.
    """
    path = expanduser(path)

    df_data = []
    # listdir() returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and machines.
    for f in sorted(listdir(path)):
        # JSON is UTF-8 by specification; do not rely on the platform's
        # default text encoding.
        with open(osjoin(path, f), 'r', encoding='utf-8') as fp:
            df_data.extend(json.load(fp))

    dates = []
    hashtags = []

    for d in df_data:
        entities = d.get('entities') or {}
        to_add = [i['tag'] for i in entities.get('hashtags') or []]
        if not to_add:
            # Missing *or empty* hashtag list: keep the tweet with a blank
            # tag instead of silently dropping it.
            to_add = ['']
        hashtags.extend(to_add)
        dates.extend([d['created_at']] * len(to_add))

    return df_data, dates, hashtags

def preprocess_engagement_data(df_data, dates, hashtags):
    """Combine per-hashtag rows with tweet engagement counters.

    Parameters
    ----------
    df_data : list of dict
        Tweet records; each must provide ``created_at`` and a
        ``public_metrics`` dict with the four counters selected below.
    dates, hashtags : list
        Parallel lists: one ``created_at`` value and one tag per row.

    Returns
    -------
    analysis : pandas.DataFrame
        One row per (date, hashtag) entry with the matching counters, a
        ``'%B, %Y'`` month label in ``bucket`` and that label parsed back to
        a datetime in ``bucket_idx``.
    baseline : pandas.Series
        Mean of every numeric column of ``analysis``.

    Notes
    -----
    Rows are matched on ``created_at`` only.
    NOTE(review): tweets sharing a timestamp would each match every hashtag
    row with that timestamp — confirm timestamps are unique in the data.
    """
    metric_cols = ['retweet_count', 'reply_count', 'like_count', 'quote_count']

    # Tweet-level table: parsed timestamp plus flattened public metrics.
    tweets = pd.DataFrame(df_data)
    tweets['created_at'] = pd.to_datetime(tweets['created_at'])
    flattened = pd.json_normalize(tweets['public_metrics'])
    tweets = pd.concat((tweets, flattened), axis=1)
    engagement = tweets[['created_at'] + metric_cols].set_index('created_at')

    # Hashtag-level table: one row per hashtag occurrence.
    tag_rows = pd.DataFrame(data={'created_at': dates, 'hashtags': hashtags})
    tag_rows['created_at'] = pd.to_datetime(tag_rows['created_at'])

    analysis = tag_rows.join(engagement, on='created_at')
    analysis['hashtags'] = analysis['hashtags'].str.lower().str.strip()

    # Month label, and the same label parsed to a datetime for ordering.
    month_label = analysis['created_at'].dt.strftime('%B, %Y')
    analysis['bucket'] = month_label
    analysis['bucket_idx'] = pd.to_datetime(analysis['bucket'], format='%B, %Y')

    return analysis, analysis.mean(numeric_only=True)

In [8]:
# Run the full pipeline: load the dumps, then build the per-hashtag table.
# The second result is bound to a real name instead of `_`, because IPython
# uses `_` for the previous cell's output and stops updating it once assigned.
analysis, baseline = preprocess_engagement_data(*extract_engagement_data())

In [9]:
# Write the analysis frame to CSV without the row index. The path is relative,
# so the destination depends on the kernel's current working directory.
analysis.to_csv('../../unesco_tweets.csv',index = False)

In [1]:
import tweepy
from os import environ,listdir
import json, pandas as pd
from os.path import expanduser, join

In [2]:
# Directory whose files are loaded as JSON below; `~` is expanded to the
# current user's home directory.
path = expanduser('~/spark_apps/unesco_tweets/')

In [3]:
# Concatenate the JSON array stored in every file under `path` into one list.
df_data = []
# listdir() returns entries in arbitrary order; sorting keeps the resulting
# row order reproducible between runs.
for f in sorted(listdir(path)):
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # text encoding.
    with open(join(path,f),'r', encoding='utf-8') as fp:
        data = json.load(fp)
        df_data.extend(data)

# Engagement

In [5]:
# Tabulate the loaded records: one row per element of `df_data`
# (assumes each element is a dict, whose keys become columns).
df = pd.DataFrame(df_data)

In [6]:
# Flatten the nested `public_metrics` dicts into one column per metric.
metrics = pd.json_normalize(df['public_metrics'])
# Drop metric columns left behind by an earlier run of this cell before
# appending, so re-running it does not accumulate duplicate column names.
df = pd.concat((df.drop(columns=metrics.columns, errors='ignore'), metrics), axis = 1)

In [24]:
# Parse the raw `created_at` values into pandas datetimes so later cells can
# use the `.dt` accessor and join on real timestamps.
df['created_at'] = pd.to_datetime(df['created_at'])

In [9]:
# Keep only the timestamp, the four engagement counters and the `id` column.
engagement = df[['created_at','retweet_count', 'reply_count', 'like_count', 'quote_count','id']]

In [19]:
# Build two parallel lists with one entry per hashtag occurrence:
# `hashtags[k]` is the tag and `dates[k]` the `created_at` of its tweet.
dates = []
hashtags = []

# The loop variable is deliberately not called `d`: loop variables persist in
# the notebook namespace, and `d` is also the alias used for `datetime.date`.
for tweet in df_data:
    entities = tweet.get('entities') or {}
    to_add = [i['tag'] for i in entities.get('hashtags') or []]
    if not to_add:
        # Missing *or empty* hashtag list: keep the tweet with a blank tag
        # instead of silently dropping it.
        to_add = ['']
    hashtags.extend(to_add)
    dates.extend([tweet['created_at']]*len(to_add))

    

In [20]:
# One row per (created_at, hashtag) pair from the `dates` / `hashtags` lists.
# NOTE(review): this name shadows the builtin `hash()` for the rest of the session.
hash = pd.DataFrame(data = {'created_at': dates,'hashtags':hashtags})

In [25]:
# Parse the timestamps so they are datetimes, matching `created_at` in the
# engagement table they are joined against.
hash['created_at'] = pd.to_datetime(hash['created_at'])

In [26]:
# Attach the engagement counters to each hashtag row by matching `created_at`.
# NOTE(review): the key is the timestamp, not `id`; if two tweets share a
# timestamp, every hashtag row with that timestamp matches both — confirm
# timestamps are unique in the data.
analysis = hash.join(engagement.set_index('created_at'), on = 'created_at')

In [29]:
# Normalise tags (lower-case, surrounding whitespace removed) so variants of
# the same hashtag compare equal.
analysis['hashtags'] = analysis['hashtags'].str.lower().str.strip()

In [28]:
from datetime import datetime as dt, date as d

In [39]:
# Current calendar date, plus the same day as a naive datetime at 00:00:00.
day = d.today()
today = dt(day.year, day.month, day.day)

In [34]:
from datetime import timedelta as td

In [41]:
# Start of a window ending at `today`: exactly five days earlier.
before = today - td(days = 5)

In [None]:
# NOTE(review): this only evaluates to the bound `ceil` method; it is never
# called (no frequency argument is given), so nothing is rounded here.
analysis['created_at'].dt.ceil

In [56]:
# Rebind both window bounds to their `astimezone()` results; with no argument
# the conversion targets the system's local time zone.
before, today = before.astimezone(), today.astimezone()

In [35]:
# Month label for grouping, formatted as '<full month name>, <year>'
# (the month name follows the active locale).
analysis['bucket'] = analysis['created_at'].dt.strftime('%B, %Y')

In [38]:
# Parse the month label back into a datetime so buckets can be ordered
# chronologically rather than alphabetically by month name.
analysis['bucket_idx'] = pd.to_datetime(analysis['bucket'],format = '%B, %Y')

In [42]:
# Total of every numeric column per (month, hashtag), ordered by month.
rf = (
    analysis
    .groupby(['bucket', 'bucket_idx', 'hashtags'], as_index=False)
    .sum(numeric_only=True)
    .sort_values('bucket_idx')
)

In [43]:
import plotly.express as px

In [109]:
# Integer month key that sorts chronologically, e.g. 202203 for March 2022.
# A plain `year + month` sum is ambiguous (2021 + 12 == 2022 + 11) and does
# not order months correctly, so the year is scaled before adding the month.
analysis['bucket_idx'] = analysis['created_at'].dt.year * 100 + analysis['created_at'].dt.month