In [1]:
# import packages

import pandas as pd
import ast 
import numpy as np
import json
import re

In [2]:
# load data

with open('../data/tpb_tweets_news-outlets_20211208.json', "r") as outfile:
    all_data = json.load(outfile)

In [3]:
regex_string_covid = r"\b#pandemic\b|\b#covid\b|\b#covid-19\b|\bpandemic\b|\bcovid\b|\bcovid-19\b|\bcorona|\bvaccine|\bquarantine"
regex_string_immigrations = r"\b#migrant|\b#refugee|\b#transit|\b#displacement\b|\b#border|\b#return\b|\b#pushback|\b#boat|\b#drowning\b|\b#hunger\b|\bmigrant|\brefugee|\btransit|\bdisplacement\b|\bborder|\breturn\b|\bpushback|\bboat|\bdrowning\b|\bhunger"
regex_string_geos = r"\blebanon\b|\blebanese\b|\bsyria|\bjordan|\biraq|\bgreece\b|\bgreek|\bturkey\b|\bturkish\b|\bcyprus\b|\bcypriot|\bmediterranean\b|\bEU\b|\btunisia|\bitaly\b|\bitalian\b|\beuropean\b"

regex_covid = re.compile(regex_string_covid, re.IGNORECASE)
regex_immigrations = re.compile(regex_string_immigrations, re.IGNORECASE)
regex_geos = re.compile(regex_string_geos, re.IGNORECASE)

In [4]:
# applying different filters

## all filters
data_filter_cig = []

for entry in all_data.get('data'):
    if regex_covid.search(entry.get('text')) and regex_immigrations.search(entry.get('text')) and regex_geos.search(entry.get('text')):
        data_filter_cig.append(entry)
len(data_filter_cig)

387

In [5]:
## immigration and geos
data_filter_ig = []

for entry in all_data.get('data'):
    if regex_immigrations.search(entry.get('text')) and regex_geos.search(entry.get('text')):
        data_filter_ig.append(entry)
len(data_filter_ig)

3516

In [15]:
## immigration and covid
data_filter_ci = []

for entry in all_data.get('data'):
    if regex_covid.search(entry.get('text')) and regex_immigrations.search(entry.get('text')):
        data_filter_ci.append(entry)
len(data_filter_ci)

1129

In [16]:
## immigration only
data_filter_i = []

for entry in all_data.get('data'):
    if regex_immigrations.search(entry.get('text')):
        data_filter_i.append(entry)
len(data_filter_i)

13157

In [17]:
## geos only
data_filter_g = []

for entry in all_data.get('data'):
    if regex_geos.search(entry.get('text')):
        data_filter_g.append(entry)
len(data_filter_g)

14197

In [18]:
## immigration or geo
data_filter_iorg = []

for entry in all_data.get('data'):
    if regex_immigrations.search(entry.get('text')) or regex_geos.search(entry.get('text')):
        data_filter_iorg.append(entry)
len(data_filter_iorg)

25234

In [19]:
## covid and immigration or geo
data_filter_ciorg = []

for entry in all_data.get('data'):
    if regex_covid.search(entry.get('text')) and (regex_immigrations.search(entry.get('text')) or regex_geos.search(entry.get('text'))):
        data_filter_ciorg.append(entry)
len(data_filter_ciorg)

2152

In [7]:
# Convert data to df

df_tweets = pd.DataFrame.from_records(data_filtered)
df_users = pd.DataFrame.from_records(all_data.get('includes').get('users'))

In [8]:
# Functions for unnesting

def fix_dicts(string):
    if string is np.nan:
        return(string)
    if not isinstance(string, dict):
        string_as_dict = ast.literal_eval(string)
        return(string_as_dict)
    else:
        return(string)

def unnest_hashtags(entities):
    try:
        hashtags = list(entities.get('hashtags'))
    except:
        return(list())
    if isinstance(hashtags, list):
        hashtags_list = [hashtag.get('tag') for hashtag in hashtags]
        return(hashtags_list)
    else:
        return
    
def unnest_mentions(entities):
    try:
        mentions = list(entities.get('mentions'))
    except:
        return(list())
    if isinstance(mentions, list):
        mentions_list = [mention.get('username') for mention in mentions]
        return(mentions_list)
    else:
        return

def unnest_urls(entities):
    try:
        urls = list(entities.get('urls'))
    except:
        return(list())
    if isinstance(urls, list):
        urls_list = [url.get('url') for url in urls]
        return(urls_list)
    else:
        return
    
def unnest_cashtags(entities):
    try:
        cashtags = list(entities.get('cashtags'))
    except:
        return(list())
    if isinstance(cashtags, list):
        cashtags_list = [cashtag.get('tag') for cashtag in cashtags]
        return(cashtags_list)
    else:
        return

In [9]:
# Functions for wrangling data frame

def wrangle_df_tweets(df, drop_cols):
    
    # Fix dictionaries
    df['public_metrics'] = df['public_metrics'].apply(fix_dicts)
    df['entities'] = df['entities'].apply(fix_dicts)
    
    # Unnest
    df = pd.concat([df, pd.json_normalize(df['public_metrics'])], axis = 1)
    df['hashtags'] = df['entities'].apply(unnest_hashtags)
    df['mentions'] = df['entities'].apply(unnest_mentions)
    df['urls'] = df['entities'].apply(unnest_urls)
    df['cashtags'] = df['entities'].apply(unnest_cashtags)
    df = df.loc[:, ~df.columns.isin(drop_cols)]
    
    return(df)

def wrangle_df_users(df, drop_cols):
    
    # Fix dicts
    df['public_metrics'] = df['public_metrics'].apply(fix_dicts)
    
    # Unnest
    df = pd.concat([df, pd.json_normalize(df['public_metrics'])], axis = 1)
    df = df.rename(columns = {'id': 'author_id', 'created_at': 'author_created_at'})
    df = df.loc[:, ~df.columns.isin(drop_cols)]
    
    return(df)

In [10]:
# Wrangle data frame

drop_cols = ['public_metrics', 'entities']

df_tweets_unnest = wrangle_df_tweets(df_tweets, drop_cols)
df_users_unnest = wrangle_df_users(df_users, drop_cols)

In [11]:
# Combine data with pd.merge

df_combined = pd.merge(df_tweets_unnest, df_users_unnest, how = 'left', left_on = 'author_id', right_on = 'author_id').drop_duplicates(subset = ['id'])
df_combined.head()

Unnamed: 0,created_at,text,id,author_id,referenced_tweets,retweet_count,reply_count,like_count,quote_count,hashtags,...,description,verified,name,url,username,author_created_at,followers_count,following_count,tweet_count,listed_count
0,2021-12-07T23:50:06.000Z,A U.S. judge on Tuesday blocked the Biden admi...,1468367242064760841,3108351,,41,12,111,3,[],...,Sign up for our newsletters and email alerts: ...,True,The Wall Street Journal,https://t.co/GhhR6PLfem,WSJ,2007-04-01T06:22:13.000Z,19263346,1046,357831,122904
848,2021-12-07T23:18:46.000Z,Analysis: The wide and dangerous gap between T...,1468359359080054784,2467791,,44,33,114,1,[],...,Democracy Dies in Darkness,True,The Washington Post,https://t.co/1KN78z0bbe,washingtonpost,2007-03-27T11:19:39.000Z,18516470,1715,412041,2
1632,2021-12-07T23:00:47.000Z,Why are many pregnant women hesitant to take t...,1468354833719697427,4970411,,16,5,34,1,[],...,Hear the human story and join the discussion. ...,True,Al Jazeera English,https://t.co/fDSF1GNVXl,AJEnglish,2007-04-17T08:23:08.000Z,7292176,238,293680,54442
2387,2021-12-07T22:15:08.000Z,While much is still unknown about the Omicron ...,1468343345642618884,807095,,59,94,192,7,[],...,News tips? Share them here: https://t.co/ghL9O...,True,The New York Times,http://t.co/ahvuWqicF9,nytimes,2007-03-02T20:41:42.000Z,51136117,860,455829,211884
3158,2021-12-07T21:56:04.000Z,6/ What has played out in Lower Manhattan amon...,1468338544582316037,807095,"[{'type': 'replied_to', 'id': '146833854276188...",3,1,10,0,[],...,News tips? Share them here: https://t.co/ghL9O...,True,The New York Times,http://t.co/ahvuWqicF9,nytimes,2007-03-02T20:41:42.000Z,51136117,860,455829,211884
