In [102]:
# Import packages
import pandas as pd
import ast 
import numpy as np
import json
import re
import os

data_dir = os.path.join('..', 'data')
rawdata_p = os.path.join(data_dir, 'tpb_tweets_news-outlets_20211208.json')
labelled_p = os.path.join(data_dir, 'tpb_tweets_simple-filter_labelled_20220308.json')
out_excel = os.path.join(data_dir, 'tpb_tweets_filtered_20220328.xlsx')
out_tagset = os.path.join(data_dir, 'prodigy', 'tpb_tagset01.json')

keep_columns = ['created_at', 'name', 'username', 'text', 'tweet_link', 'covid', 'label', 'id', 'hashtags', 'urls', 'retweet_count', 
                'reply_count', 'like_count', 'quote_count', 'referenced_type', 'referenced_id']

In [40]:
# load data

with open(rawdata_p, "r") as f:
    all_data = json.load(f)
    
labelled_df = pd.read_json(labelled_p)

In [41]:
labelled_df.head()

Unnamed: 0,created_at,name,id,label
0,2021-12-02 15:33:00+00:00,InfoMigrants,1466430204302217232,blocked mobilities
1,2021-12-01 10:29:05+00:00,The New York Times,1465991334184071169,blocked mobilities
2,2021-09-24 14:57:00+00:00,InfoMigrants,1441416381417893889,blocked mobilities
3,2021-08-24 16:12:00+00:00,InfoMigrants,1430201232065536004,blocked mobilities
4,2021-07-18 20:11:29+00:00,The New York Times,1416853150896726025,blocked mobilities


In [42]:
regex_string_covid = r'\bpandemic\b|\bcovid\b|\bcovid-19\b|\bcorona|\bvaccine|\bquarantine|\b#pandemic\b|\b#covid\b|\b#covid-19\b|\b#corona|\b#vaccine|\b#quarantine'
regex_string_migrat = r'\bmigrant|\bimmigrant|\brefugee|\btransit|\bdisplacement\b|\bdisplaced\b|\bborder|\breturn|\bpushback|\b(pushed back)\b|\bboat|\bdrown\b|\bhunger|\b#migrant|\b#immigrant|\b#refugee|\b#transit|\b#displacement\b|\b#displaced\b|\b#border|\b#return|\b#pushback|\b#boat|\b#drown\b|\b#hunger'
regex_string_geos = r'\blebanon\b|\blebanese\b|\bsyria|\bjordan|\biraq|\bgreece\b|\bgreek|\bturkey\b|\bturkish\b|\bcyprus\b|\bcypriot|\bmediterranean\b|\bEU\b|\btunisia|\bitaly\b|\bitalian\b|\beuropean\b|\b#lebanon\b|\b#lebanese\b|\b#syria|\b#jordan|\b#iraq|\b#greece\b|\b#greek|\b#turkey\b|\b#turkish\b|\b#cyprus\b|\b#cypriot|\b#mediterranean\b|\b#EU\b|\b#tunisia|\b#italy\b|\b#italian\b|\b#european\b'

regex_covid = re.compile(regex_string_covid, re.IGNORECASE)
regex_migrat = re.compile(regex_string_migrat, re.IGNORECASE)
regex_geos = re.compile(regex_string_geos, re.IGNORECASE)

In [43]:
## immigration and geos
data_filter_ig = []

for entry in all_data.get('data'):
    if regex_migrat.search(entry.get('text')) and regex_geos.search(entry.get('text')):
        data_filter_ig.append(entry)
len(data_filter_ig)

3735

In [44]:
## adding covid variable

for entry in data_filter_ig:
    if regex_covid.search(entry.get('text')):
        entry['covid'] = 1
    else:
        entry['covid'] = 0

In [45]:
# Convert data to df

df_tweets = pd.DataFrame.from_records(data_filter_ig)
df_users = pd.DataFrame.from_records(all_data.get('includes').get('users'))

In [46]:
# Functions for unnesting

def fix_dicts(string):
    if string is np.nan:
        return(string)
    if not isinstance(string, dict):
        string_as_dict = ast.literal_eval(string)
        return(string_as_dict)
    else:
        return(string)

def unnest_hashtags(entities):
    try:
        hashtags = list(entities.get('hashtags'))
    except:
        return(list())
    if isinstance(hashtags, list):
        hashtags_list = [hashtag.get('tag') for hashtag in hashtags]
        return(hashtags_list)
    else:
        return
    
def unnest_mentions(entities):
    try:
        mentions = list(entities.get('mentions'))
    except:
        return(list())
    if isinstance(mentions, list):
        mentions_list = [mention.get('username') for mention in mentions]
        return(mentions_list)
    else:
        return

def unnest_urls(entities):
    try:
        urls = list(entities.get('urls'))
    except:
        return(list())
    if isinstance(urls, list):
        urls_list = [url.get('url') for url in urls]
        return(urls_list)
    else:
        return
    
def unnest_cashtags(entities):
    try:
        cashtags = list(entities.get('cashtags'))
    except:
        return(list())
    if isinstance(cashtags, list):
        cashtags_list = [cashtag.get('tag') for cashtag in cashtags]
        return(cashtags_list)
    else:
        return

In [47]:
# Functions for wrangling data frame

def wrangle_df_tweets(df, drop_cols):
    
    # Fix dictionaries
    df['public_metrics'] = df['public_metrics'].apply(fix_dicts)
    df['entities'] = df['entities'].apply(fix_dicts)
    
    # Unnest
    df = pd.concat([df, pd.json_normalize(df['public_metrics'])], axis = 1)
    df['hashtags'] = df['entities'].apply(unnest_hashtags)
    df['mentions'] = df['entities'].apply(unnest_mentions)
    df['urls'] = df['entities'].apply(unnest_urls)
    df['cashtags'] = df['entities'].apply(unnest_cashtags)
    df = df.loc[:, ~df.columns.isin(drop_cols)]
    
    # Referenced tweets
    
    
    
    return(df)

def wrangle_df_users(df, drop_cols):
    
    # Fix dicts
    df['public_metrics'] = df['public_metrics'].apply(fix_dicts)
    
    # Unnest
    df = pd.concat([df, pd.json_normalize(df['public_metrics'])], axis = 1)
    df = df.rename(columns = {'id': 'author_id', 'created_at': 'author_created_at'})
    df = df.loc[:, ~df.columns.isin(drop_cols)]
    
    return(df)

In [95]:
# Wrangle data frame

drop_cols = ['public_metrics', 'entities']

df_tweets_unnest = wrangle_df_tweets(df_tweets, drop_cols)
df_users_unnest = wrangle_df_users(df_users, drop_cols)

In [96]:
# Add referenced tweet info
df_tweets_unnest = df_tweets_unnest.explode('referenced_tweets')
df_tweets_unnest['referenced_type'] = np.nan
df_tweets_unnest['referenced_id'] = np.nan
df_tweets_unnest.loc[df_tweets_unnest['referenced_tweets'].notna(), 'referenced_type'] = df_tweets_unnest.loc[df_tweets_unnest['referenced_tweets'].notna(), 'referenced_tweets'].apply(lambda entry: entry.get('type'))
df_tweets_unnest.loc[df_tweets_unnest['referenced_tweets'].notna(), 'referenced_id'] = df_tweets_unnest.loc[df_tweets_unnest['referenced_tweets'].notna(), 'referenced_tweets'].apply(lambda entry: entry.get('id'))

In [97]:
# Combine data with pd.merge

df_combined = pd.merge(df_tweets_unnest, df_users_unnest, how = 'left', left_on = 'author_id', right_on = 'author_id').drop_duplicates(subset = ['id'])

In [98]:
# Recoding and adding variables
df_combined['covid'] = df_combined['covid'].astype(bool)
df_combined['tweet_link'] = 'https://twitter.com/' + df_combined['username'] + '/status/' + df_combined['id']
df_combined['id'] = df_combined['id'].astype('int64')

In [99]:
# Adding existing labels

df_combined = pd.merge(df_combined, labelled_df.loc[:, ['id', 'label']], on = 'id', how = 'left').loc[:, keep_columns]

In [100]:
df_combined.head()

Unnamed: 0,created_at,name,username,text,tweet_link,label,id,hashtags,urls,retweet_count,reply_count,like_count,quote_count,referenced_type,referenced_id
0,2021-12-07T22:01:47.000Z,Al Jazeera English,AJEnglish,RT @AJEnglish: Part of one of the world’s olde...,https://twitter.com/AJEnglish/status/146833998...,,1468339986059730949,[],[],192,0,0,0,retweeted,1.4682790141208453e+18
1,2021-12-07T17:59:30.000Z,Al Jazeera English,AJEnglish,Part of one of the world’s oldest surviving wo...,https://twitter.com/AJEnglish/status/146827901...,,1468279014120845313,[],[https://t.co/zOEesS0p28],192,17,491,16,,
2,2021-12-07T15:19:58.000Z,Al Jazeera English,AJEnglish,RT @AJEnglish: Libyan authorities get support ...,https://twitter.com/AJEnglish/status/146823886...,,1468238862770216960,[],[],60,0,0,0,retweeted,1.4681394669465354e+18
3,2021-12-07T14:25:45.000Z,Al Jazeera English,AJEnglish,"""Funds from the EU and member states, sometime...",https://twitter.com/AJEnglish/status/146822522...,,1468225220125331467,[AJOpinion],[https://t.co/1lv9uCE73H],18,4,40,1,,
4,2021-12-07T12:22:00.000Z,InfoMigrants,InfoMigrants,The Council of Europe has decided to shelve a ...,https://twitter.com/InfoMigrants/status/146819...,,1468194076738600971,[],[https://t.co/UJd7S5JKGe],0,0,0,0,,


In [101]:
df_combined.shape

(3745, 15)

In [103]:
# Export prodigy set 01

df_tagset = df_combined.loc[df_combined['referenced_type'].isna(), ['id', 'username', 'text', 'tweet_link']]
df_tagset.to_json(out_tagset, orient = 'records')