In [1]:
import pandas as pd
import json
import os
import numpy as np
import re #regex 
import preprocessor as p
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Load every withheld-tweet / control dump found anywhere under ``input``.
# Each matching file is read as JSON-lines (one record per line).
dfs = []
for root, _dirs, filenames in os.walk('input'):
    for filename in filenames:
        if 'withheldtweets.json' in filename or 'plus_one_control.json' in filename:  # alt: if 'control' in filename:
            dfs.append(pd.read_json(os.path.join(root, filename), lines=True))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing matched.
if not dfs:
    raise ValueError("no 'withheldtweets.json' or 'plus_one_control.json' files found under 'input'")

# ignore_index gives the combined frame a clean 0..n-1 index instead of
# repeating each file's own row numbers.
df_cen = pd.concat(dfs, ignore_index=True)

# drop_duplicates() returns a new frame, so the result has to be assigned.
# A whole-row comparison is not possible when columns hold dicts/lists
# (they are unhashable), so de-duplicate on the record id instead.
# NOTE(review): assumes the records carry an "id" column - confirm against the input files.
if "id" in df_cen.columns:
    df_cen = df_cen.drop_duplicates(subset="id").reset_index(drop=True)
#df_cen = df_cen.dropna(subset=['withheld_in_countries'])

In [None]:
# Columns retained for the analysis; every other column of the loaded frame is dropped.
# The order matters: later cells address these columns by position.
worthKeeping = [
    "text",
    "truncated",
    "user",
    "withheld_in_countries",
    "entities",
    "lang",
    "possibly_sensitive",
    "extended_tweet",
]
df_cen = df_cen.loc[:, worthKeeping]

In [None]:
# Display the frame to inspect the current state of df_cen.
df_cen

In [None]:
# Missing "possibly_sensitive" values are replaced by 0.0 so that later
# processing does not have to deal with NaN in that column.
sensitive_filled = df_cen['possibly_sensitive'].fillna(0.0)
df_cen = df_cen.assign(possibly_sensitive=sensitive_filled)
df_cen

In [None]:
# Recover the full text of truncated tweets and tidy the text column.
# From here on the data is handled as a plain numpy array (dfRaw) whose
# columns follow the order of worthKeeping.

dfRaw = df_cen.values
url_pattern = re.compile(r'http\S+')
retweet_prefix = re.compile(r'RT @\S+:')

for row in dfRaw:
    # last column is "extended_tweet"; when present it carries the "full_text"
    extended = row[-1]
    text = row[0] if pd.isna(extended) else extended["full_text"]

    # URLs are shortened links, so they carry no usable information
    text = url_pattern.sub('', text)

    # flatten retweets by removing the leading "RT @user:" marker
    row[0] = retweet_prefix.sub('', text)

# "extended_tweet" and "truncated" are no longer needed: drop each column
# from the array and its name from worthKeeping so the two stay aligned.
for obsolete in ("extended_tweet", "truncated"):
    dfRaw = np.delete(dfRaw, worthKeeping.index(obsolete), axis=1)
    worthKeeping.remove(obsolete)

In [None]:
# Extract the hashtags separately: column 3 (the "entities" dict) is
# replaced by the list of its hashtag texts, and the column is renamed.

for row in dfRaw:
    entities = row[3]
    row[3] = [tag["text"] for tag in entities["hashtags"]]
worthKeeping[3] = "hashtags"

In [None]:
# Build two per-tweet features from the "user" dict stored in column 1:
# whether the account is verified and its follower count.

print(dfRaw[0][1].keys())
verified = []
followers = []
for user in dfRaw[:, 1]:
    verified.append(user["verified"])
    followers.append(user["followers_count"])

#for the location, Rebekah suggested to only spot the country name and discard the rest
listOfCountries = ['Afghanistan', 'Aland Islands', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia, Plurinational State of', 'Bonaire, Sint Eustatius and Saba', 'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil', 'British Indian Ocean Territory', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands', 'Central African Republic', 'Chad', 'Chile', 'China', 'Christmas Island', 'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Congo', 'Congo, The Democratic Republic of the', 'Cook Islands', 'Costa Rica', "Côte d'Ivoire", 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Republic', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Falkland Islands (Malvinas)', 'Faroe Islands', 'Fiji', 'Finland', 'France', 'French Guiana', 'French Polynesia', 'French Southern Territories', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Gibraltar', 'Greece', 'Greenland', 'Grenada', 'Guadeloupe', 'Guam', 'Guatemala', 'Guernsey', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Heard Island and McDonald Islands', 'Holy See (Vatican City State)', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran, Islamic Republic of', 'Iraq', 'Ireland', 'Isle of Man', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jersey', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', "Korea, Democratic People's Republic of", 'Korea, Republic of', 'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic", 'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Liechtenstein', 'Lithuania', 'Luxembourg', 'Macao', 'Macedonia, Republic of', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives', 
'Mali', 'Malta', 'Marshall Islands', 'Martinique', 'Mauritania', 'Mauritius', 'Mayotte', 'Mexico', 'Micronesia, Federated States of', 'Moldova, Republic of', 'Monaco', 'Mongolia', 'Montenegro', 'Montserrat', 'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nauru', 'Nepal', 'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua', 'Niger', 'Nigeria', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Norway', 'Oman', 'Pakistan', 'Palau', 'Palestinian Territory, Occupied', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru', 'Philippines', 'Pitcairn', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar', 'Réunion', 'Romania', 'Russian Federation', 'Rwanda', 'Saint Barthélemy', 'Saint Helena, Ascension and Tristan da Cunha', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Martin (French part)', 'Saint Pierre and Miquelon', 'Saint Vincent and the Grenadines', 'Samoa', 'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal', 'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'Solomon Islands', 'Somalia', 'South Africa', 'South Georgia and the South Sandwich Islands', 'Spain', 'Sri Lanka', 'Sudan', 'Suriname', 'South Sudan', 'Svalbard and Jan Mayen', 'Swaziland', 'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Taiwan, Province of China', 'Tajikistan', 'Tanzania, United Republic of', 'Thailand', 'Timor-Leste', 'Togo', 'Tokelau', 'Tonga', 'Trinidad and Tobago', 'Tunisia', 'Turkey', 'Turkmenistan', 'Turks and Caicos Islands', 'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates', 'United Kingdom', 'United States', 'United States Minor Outlying Islands', 'Uruguay', 'Uzbekistan', 'Vanuatu', 'Venezuela, Bolivarian Republic of', 'Viet Nam', 'Virgin Islands, British', 'Virgin Islands, U.S.', 'Wallis and Futuna', 'Yemen', 'Zambia', 'Zimbabwe']
def findCountry(x, countries=None):
    """Return the country name mentioned in a free-text location, or None.

    Parameters
    ----------
    x : str or None
        Free-text location string. Anything that is not a non-empty string
        yields None.
    countries : iterable of str, optional
        Candidate country names. Defaults to the module-level
        ``listOfCountries``.

    Returns
    -------
    str or None
        The longest candidate that occurs in ``x`` as a whole word/phrase,
        or None when no candidate matches. Among equally long matches the
        one listed first wins.

    Notes
    -----
    Matching is case-sensitive. The longest whole-word match is used rather
    than the first substring match, because many names contain shorter ones:
    a plain ``in`` test reports "Nigeria" as "Niger", "South Sudan" as
    "Sudan" and "Indiana" as "India".
    """
    if not x or not isinstance(x, str):
        return None
    if countries is None:
        countries = listOfCountries

    best = None
    for country in countries:
        # a candidate that is not longer than the current best cannot win
        if best is not None and len(country) <= len(best):
            continue
        # lookarounds instead of \b so names ending in punctuation,
        # e.g. "Saint Martin (French part)", are still anchored correctly
        pattern = r'(?<!\w)' + re.escape(country) + r'(?!\w)'
        if re.search(pattern, x):
            best = country
    return best

# Look up the country for each tweet's user location, then append the three
# user-derived features as new columns (array and column names stay aligned).
location = []
for row in dfRaw:
    location.append(findCountry(row[1]["location"]))

dfRaw = np.c_[dfRaw, verified, followers, location]
worthKeeping.extend(["verified_account", "followers_count", "location"])

In [None]:
# Binary feature: has the tweet been withheld in at least one country?

# First pass: column 2 ("withheld_in_countries") becomes an empty list
# wherever it does not already hold a list.
for row in dfRaw:
    if not isinstance(row[2], list):
        row[2] = []

# Second pass: the flag is True exactly when that list is non-empty.
withheld = [len(row[2]) > 0 for row in dfRaw]

dfRaw = np.c_[dfRaw, withheld]
worthKeeping.append("withheld_anywhere")

In [None]:
# Sentiment analysis with VADER
# https://www.analyticsvidhya.com/blog/2022/07/sentiment-analysis-using-python/?

sentiment = SentimentIntensityAnalyzer()

# Assumption made for this notebook: the analyzer is only meaningful for
# English text, so tweets in any other language get all-zero scores.
scores = []
for row in dfRaw:
    if row[4] == "en":
        scores.append(list(sentiment.polarity_scores(row[0]).values()))
    else:
        scores.append([0.0, 0.0, 0.0, 0.0])
res = np.array(scores)

dfRaw = np.c_[dfRaw, res]
worthKeeping.extend(["neg", "neu", "pos", "compound"])

In [None]:
# Popularity feature:
# build a score from followers_count, favourites_count and statuses_count.
# Each count is scaled to [0, 1] with (x - min)/(max - min); the score is
# the average of the three scaled values.

# One array per count, read from the "user" dict in column 1.
followers_count, favourites_count, statuses_count = (
    np.array([user[key] for user in dfRaw[:, 1]])
    for key in ("followers_count", "favourites_count", "statuses_count")
)

def normalize(array):
    """Min-max scale ``array`` to the range [0, 1].

    Parameters
    ----------
    array : array-like of numbers

    Returns
    -------
    numpy.ndarray
        Float array of the same shape holding (x - min) / (max - min).
        An empty input is returned as an empty float array. When all values
        are equal the range is zero, so zeros are returned instead of
        dividing by zero (which would produce NaN).
    """
    array = np.asarray(array, dtype=float)
    if array.size == 0:
        return array
    low = np.min(array)
    span = np.max(array) - low
    if span == 0:
        return np.zeros_like(array)
    return (array - low) / span

# Sum the three normalized counts, then take their mean.
score = normalize(followers_count) + normalize(favourites_count) + normalize(statuses_count)
score = (1/3) * score

# Append the score as a new column and keep the column names aligned.
dfRaw = np.c_[dfRaw, score]
worthKeeping.append("popularity_score")

In [None]:
# Reassemble the data in a pandas dataframe.
# dfRaw is an object array (it still holds the user dicts), so every column
# would otherwise come back with dtype object and .describe() on the numeric
# features would report count/unique/top/freq instead of mean/std/quantiles.
# infer_objects() restores proper dtypes wherever a column is homogeneous.
df_cen = pd.DataFrame(dfRaw, columns = worthKeeping).infer_objects()
df_cen

In [None]:
# Keep every column except the raw "user" dict.
# cleanCols is materialised as a list: a filter() object is a one-shot
# iterator and would be empty if the name were used a second time.
cleanCols = [col for col in worthKeeping if col != "user"]
# .copy() makes df_clean independent of df_cen, so assigning columns to it
# later does not trigger SettingWithCopyWarning.
df_clean = df_cen[cleanCols].copy()

In [None]:
# Display df_clean, i.e. the frame without the "user" column.
df_clean

In [None]:
# Summary statistics of the "withheld_anywhere" flag.
df_clean["withheld_anywhere"].describe()

In [None]:
# Display df_clean again (same frame as shown two cells above).
df_clean

In [None]:
# Summary statistics of the "neg" sentiment column.
df_clean["neg"].describe()

In [None]:
# Summary statistics of the "popularity_score" column.
df_clean["popularity_score"].describe()

In [None]:
# Library: https://pypi.org/project/tweet-preprocessor/

# p.clean(text), p.tokenize(text) and p.parse(text) operate on a single tweet string; for whole .json/.txt files the library provides p.clean_file(path)

# p.set_options(p.OPT.?)

In [None]:
# Option Name, Option Short Code:

# URL, p.OPT.URL

# Mention, p.OPT.MENTION

# Hashtag, p.OPT.HASHTAG -> keeping for now

# Reserved Words, p.OPT.RESERVED

# Emoji, p.OPT.EMOJI -> keeping for now

# Smiley, p.OPT.SMILEY -> keeping for now

# Number, p.OPT.NUMBER

In [None]:
# Strip URLs, numbers, mentions and reserved words (e.g. RT) from the text;
# hashtags, emojis and smileys are kept. (p.OPT.URL was listed twice before.)
p.set_options(p.OPT.URL, p.OPT.NUMBER, p.OPT.MENTION, p.OPT.RESERVED)

# Clean each tweet string element-wise. assign() returns a new frame, so no
# column is written into a possible slice of df_cen (no SettingWithCopyWarning),
# and Series.map yields a Series rather than the one-column DataFrame that a
# dict-style DataFrame.apply produces.
df_clean = df_clean.assign(text=df_clean["text"].map(p.clean))
df_clean