## Downloads

In [1]:
%%capture
!pip install preprocessor
!pip install vader-multi
!pip install torchmetrics
!pip install sentence-transformers
!pip install gensim
!pip install requests
!pip install transvec

## Imports

In [27]:
import os
import re
import json
import gensim
import pickle
import zipfile
import requests
import numpy as np
from datetime import date

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import preprocessor as p
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transvec.transformers import TranslationWordVectorizer

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, auc


from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vincentdandenault/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vincentdandenault/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Run Flags and File Paths

In [64]:
# Date stamp used in the name of the results file written by run_models.
date_today = date.today()

# Seed passed to every estimator that accepts a random_state.
random_seed = 42
# Display names for the two target classes, in encoded order
# (0 = tweet not withheld, 1 = tweet withheld somewhere).
target_names = ['Not Censored', 'Censored']

# When True the raw JSON dumps are re-processed (slow); otherwise the cached CSVs are read.
RUN_PREPROCESSING = False
RUN_COUNTRY_DIVISION = False
# When True the text vectors are re-fitted and pickled; otherwise the pickled vectors are loaded.
FIT_CORPUS_FEATURE_SPACE = True

# Language tag used in the names of the pickled text vectors.
LANGUAGE_RUN = 'English'
FEATURE_SPACE = 'BOW' #BOW, TFIDF, Sentence2vec


# Directories and files read/written by the notebook (relative to the working directory).
data_path = 'Data'
results_path = 'Results'
vector_path = 'vectors'

clean_dataframe_path = 'Output/df_clean.csv'
english_dataframe_path = 'Output/df_english_clean.csv'

## Preprocessing

In [10]:
# Country names searched for in the free-text user location, in priority order.
listOfCountries = ['France', 'Turkey', 'Germany', 'India']
def findCountry(x):
    """Return the first name in ``listOfCountries`` contained in ``x``.

    The match is a case-sensitive substring test. ``None`` is returned when
    ``x`` is empty/``None`` or when no listed country occurs in it.
    """
    if not x:
        return None
    return next((name for name in listOfCountries if name in x), None)

def normalize(array):
    """Min-max scale ``array`` to the range [0, 1].

    Parameters
    ----------
    array : array-like of numbers

    Returns
    -------
    numpy.ndarray
        ``(array - min) / (max - min)`` as floats. An empty input is returned
        as an empty float array, and a constant input maps to all zeros
        instead of dividing by zero and producing NaN.
    """
    values = np.asarray(array, dtype=float)
    if values.size == 0:
        return values
    lowest = np.min(values)
    spread = np.max(values) - lowest
    if spread == 0:
        # Every value is identical: there is no range to scale by.
        return np.zeros_like(values)
    return (values - lowest) / spread
    
def preprocess_data(data_dir='Data/'):
    """Build the cleaned tweet table from the raw JSON-lines dumps.

    Walks ``data_dir`` for files whose name contains ``withheldtweets.json``
    or ``plus_one_control.json``, keeps a fixed subset of tweet fields,
    restores the full text of truncated tweets, strips URLs and retweet
    prefixes, and derives user, withholding, popularity and sentiment
    features.

    Parameters
    ----------
    data_dir : str, default 'Data/'
        Directory searched recursively for the JSON-lines files.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``text``, ``withheld_in_countries``,
        ``hashtags``, ``lang``, ``possibly_sensitive``, ``verified_account``,
        ``followers_count``, ``location``, ``user_id``, ``withheld_anywhere``,
        ``popularity_score``, ``neg``, ``neu``, ``pos`` and ``compound``.

    Raises
    ------
    ValueError
        If no matching file is found under ``data_dir``.
    """
    #extract the data from the json files
    frames = []
    for root, _dirs, files in os.walk(data_dir):
        for file in files:
            if 'withheldtweets.json' in file or "plus_one_control.json" in file:  # alt: if 'control' in file:
                frames.append(pd.read_json(os.path.join(root, file), lines=True))
    if not frames:
        raise ValueError("No withheld/control tweet files found under %r" % (data_dir,))
    df_cen = pd.concat(frames)

    #keep only the features that are worth keeping
    worthKeeping = ["text", "truncated", "user",
                "withheld_in_countries", "entities", "lang",
                "possibly_sensitive", "extended_tweet"]
    # copy so the fillna below does not assign into a slice of the concatenated frame
    df_cen = df_cen[worthKeeping].copy()

    #some tweets have NaN as "possibly sensitive"…
    df_cen['possibly_sensitive'] = df_cen['possibly_sensitive'].fillna(0.0)

    #recover the full text for truncated tweets
    # From here on rows are handled positionally: column 0 is "text" and the
    # last column is "extended_tweet".
    dfRaw = df_cen.values
    for line in dfRaw:
        if not pd.isna(line[-1]):
            line[0] = line[-1]["full_text"]
        #remove urls from tweets
        #they are shortened anyway so we can't make use of them
        line[0] = re.sub(r'http\S+', '', line[0])
        #flatten retweets
        line[0] = re.sub(r'RT @\S+:', '', line[0])

    #remove "extended_tweet"
    dfRaw = np.delete(dfRaw, len(worthKeeping)-1, axis=1)
    worthKeeping.remove("extended_tweet")

    #remove "truncated"
    dfRaw = np.delete(dfRaw, 1, axis=1)
    worthKeeping.remove("truncated")

    # Column layout is now:
    # 0 text, 1 user, 2 withheld_in_countries, 3 entities, 4 lang, 5 possibly_sensitive

    #extract hashtags seperately
    for line in dfRaw:
        line[3] = [x["text"] for x in line[3]["hashtags"]]
    worthKeeping[3] = "hashtags"

    #create a feature for user-verified, user-followers_count and the user id
    #(user_id is required downstream by features_encoded / encode_features)
    users = [line[1] for line in dfRaw]
    verified = [user["verified"] for user in users]
    followers = [user["followers_count"] for user in users]
    user_ids = [user["id"] for user in users]

    #for the location, we keep the country name and discard the rest
    location = [findCountry(user["location"]) for user in users]
    dfRaw = np.c_[dfRaw, verified, followers, location, user_ids]
    worthKeeping += ["verified_account", "followers_count", "location", "user_id"]

    #binary feature for whether the tweet has been withheld anywhere
    withheld = []
    for line in dfRaw:
        if not isinstance(line[2], list):
            # tweets without the field (e.g. NaN) are treated as withheld nowhere
            line[2] = []
        withheld.append(len(line[2]) != 0)

    dfRaw = np.c_[dfRaw, withheld]
    worthKeeping += ["withheld_anywhere"]

    #popularity feature:
    #build a score based on the values of followers_count, favourites_count, statuses_count
    #compute a score from 0 to 1 for each, with (x - min)/(max - min), then compute the average of these scores
    followers_count = np.array(followers)
    favourites_count = np.array([user["favourites_count"] for user in users])
    statuses_count = np.array([user["statuses_count"] for user in users])
    score = (1/3) * (normalize(followers_count) + normalize(favourites_count) + normalize(statuses_count))
    dfRaw = np.c_[dfRaw, score]
    worthKeeping += ["popularity_score"]

    #sentiment analysis
    #https://www.analyticsvidhya.com/blog/2022/07/sentiment-analysis-using-python/? with VADER
    #https://github.com/brunneis/vader-multi, same concept but multilingual
    #text gets translated into english and then sentiment analysis is applied to the english text
    #takes a LOT of time
    analyzer = SentimentIntensityAnalyzer()
    sentiment_columns = ["neg", "neu", "pos", "compound"]
    sentiment_rows = []
    failures = 0
    for i, line in enumerate(dfRaw):
        if i % 200 == 0: print(i)
        try:
            polarity = analyzer.polarity_scores(line[0])
            # read the scores by key so they cannot be mis-assigned to the wrong column
            sentiment_rows.append([polarity[key] for key in sentiment_columns])
        except Exception as e: #known error at about 42400, it's an error in the library
            # Best effort: keep the row with neutral zeros. Only the row number
            # and the error are logged; the row itself contains the full user profile.
            failures += 1
            print("sentiment analysis failed for row %d: %s" % (i, e))
            sentiment_rows.append([0, 0, 0, 0])
    if failures:
        print("sentiment analysis failed for %d of %d tweets (scores set to 0)" % (failures, len(sentiment_rows)))

    res = np.array(sentiment_rows)
    dfRaw = np.c_[dfRaw, res]
    worthKeeping += sentiment_columns

    #reassemble the data in a pandas dataframe and remove the column "user"
    df_cen = pd.DataFrame(dfRaw, columns = worthKeeping)
    df_clean = df_cen.drop(columns="user")

    return df_clean

In [11]:
# Re-run the (slow) preprocessing only when requested; otherwise load the cached CSV.
if RUN_PREPROCESSING:
    df = preprocess_data()
    # make sure the output folder exists before writing the cache
    os.makedirs(os.path.dirname(clean_dataframe_path) or '.', exist_ok=True)
    # index=False: the row index carries no information and would come back
    # as a spurious "Unnamed: 0" column when the CSV is read again
    df.to_csv(clean_dataframe_path, index=False)
else:
    df = pd.read_csv(clean_dataframe_path)

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
10000
10200
10400
10600
10800
11000
11200
11400
11600
11800
12000
12200
12400
12600
12800
13000
13200
13400
13600
13800
14000
14200
14400
14600
14800
15000
15200
15400
15600
15800
16000
16200
16400
16600
16800
17000
17200
17400
17600
17800
18000
18200
18400
18600
18800
19000
19200
19400
19600
19800
20000
20200
20400
20600
20800
21000
21200
21400
21600
21800
22000
22200
22400
22600
22800
23000
23200
23400
23600
23800
24000
24200
24400
24600
24800
25000
25200
25400
25600
25800
26000
26200
26400
26600
26800
27000
27200
27400
27600
27800
28000
28200
28400
28600
28800
29000
29200
29400
29600
29800
30000
30200
30400
30600
30800
31000
31200
31400
31600
<urlopen error [Errno 54] Connection reset by peer> [' سکندر سلطان چیف الیکشن کمیشن کو پاکستانیو کا پیغ

39400
39600
<urlopen error [Errno 54] Connection reset by peer> [' Pakistan PM #ImranKhan to seek vote of confidence from National Assembly \n@ImranKhanPTI @PTIofficial \n'
 {'id': 1236997246299901954, 'id_str': '1236997246299901954', 'name': 'Aamir', 'screen_name': 'aamir_1ak', 'location': 'Timbaktu', 'url': None, 'description': 'Grammarian.Diehard supporter of Imran khan.Retweets are not endorsements.Bleed green....Pakistan Zindabad.', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 806, 'friends_count': 1255, 'listed_count': 0, 'favourites_count': 2840, 'statuses_count': 6681, 'created_at': 'Mon Mar 09 12:48:34 +0000 2020', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar

list index out of range ['@igor_vmarcos nao'
 {'id': 1362593273324441603, 'id_str': '1362593273324441603', 'name': 'mari__trajanno', 'screen_name': 'maritrajanno', 'location': None, 'url': None, 'description': '@vivit021💛', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 215, 'friends_count': 155, 'listed_count': 1, 'favourites_count': 1789, 'statuses_count': 2288, 'created_at': 'Fri Feb 19 02:42:22 +0000 2021', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1363139438868639754/fiH5F

52200
52400
52600
52800
list index out of range ['@kkauakk nem'
 {'id': 1109156370966827010, 'id_str': '1109156370966827010', 'name': 'Gustavo Rodrigues', 'screen_name': 'lyofps', 'location': 'São Paulo, Brasil', 'url': None, 'description': '16y | Semi-Professional CS:GO Player @RedDevils | Programador Full-Stack @etecitaqua | Base Sub-17 do @Corinthians | 1x LSL Gold Faceit 🏆 | $3.950 Awards 🏆', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 35, 'friends_count': 649, 'listed_count': 0, 'favourites_count': 2077, 'statuses_count': 961, 'created_at': 'Fri Mar 22 18:14:20 +0000 2019', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fi

<urlopen error [Errno 54] Connection reset by peer> [' @aiwokudasai39 本日も企画しますね！'
 {'id': 2905755794, 'id_str': '2905755794', 'name': 'jtwg.-pd.', 'screen_name': '66826b83b76b49e', 'location': None, 'url': None, 'description': '音楽好き。BTS、ＭＡＮＷ、ONE OK ROCK…Rock、フェス、LIVE、歌、TV好き➡笑うニヤける止まらない。音楽好き➡ダンス踊れないけどノリっちゃう。人好き➡人見知りだけど仲良くなるとなつく。料理好き➡食べるのもっと好き。雑貨好き➡雑貨店みたいな部屋にしたい。ハンドメイド好き➡趣味。', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 26, 'friends_count': 45, 'listed_count': 0, 'favourites_count': 495, 'statuses_count': 325, 'created_at': 'Fri Nov 21 05:38:13 +0000 2014', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color'

<urlopen error [Errno 54] Connection reset by peer> [' They’re babysitting 🍀 '
 {'id': 1272510602024562694, 'id_str': '1272510602024562694', 'name': 'Jojo asakura', 'screen_name': 'AsakuraJojo', 'location': None, 'url': None, 'description': 'Nivel 28 en algo que se llama vida\n\nIntento de algo que se cree humano por las mañanas y un kamen rider por las noches', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 24, 'friends_count': 124, 'listed_count': 0, 'favourites_count': 3316, 'statuses_count': 4695, 'created_at': 'Mon Jun 15 12:45:55 +0000 2020', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text

<urlopen error [Errno 54] Connection reset by peer> [' メッセージたくさんください🥺'
 {'id': 934095511, 'id_str': '934095511', 'name': '通りすがりの松下', 'screen_name': 'oshite_osaretai', 'location': '味が濃くて何が悪い', 'url': 'https://wug-portal.jp/finallive/', 'description': '田中美海さん（@minazou_373）と田村ゆかりさん（@yukari_tamura）が大好きです', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 473, 'friends_count': 872, 'listed_count': 11, 'favourites_count': 18587, 'statuses_count': 38415, 'created_at': 'Thu Nov 08 09:37:15 +0000 2012', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sideb

59200
59400
59600
59800
60000
60200
60400
60600
list index out of range ['@DREAMNHC @urbanangelbibby NO'
 {'id': 796483160854953985, 'id_str': '796483160854953985', 'name': 'chlo♡ | TASH’S DAY | 🍇', 'screen_name': 'alwaysyougeorge', 'location': '💛georgedown💛3/4 nhc💛she/her', 'url': 'https://twitter.com/newhopegeorge/status/1333014241721389057?s=21', 'description': '🤍george: im in ur twt name🤍reece: love u🤍blake: see u soon🤍 ✨@dreamgsgs when u fall asleep tonight just remember✨🤍@cutebibbysmith u know all my deep dish🤍', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 950, 'friends_count': 413, 'listed_count': 82, 'favourites_count': 30050, 'statuses_count': 32213, 'created_at': 'Wed Nov 09 22:42:38 +0000 2016', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': ''

<urlopen error [Errno 54] Connection reset by peer> [' 𝑏𝑣𝑙𝑔𝑎𝑟𝑖 𝑥 𝑀.𝑆𝑢𝑝𝑝𝑎𝑠𝑖𝑡 '
 {'id': 1239686584041013248, 'id_str': '1239686584041013248', 'name': 'Wuvแปลว่าlove (mewgulf)', 'screen_name': 'ViinasNj', 'location': None, 'url': None, 'description': 'โลกอีกใบ❤#หวานใจมิวกลัฟ', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 85, 'friends_count': 926, 'listed_count': 0, 'favourites_count': 64073, 'statuses_count': 78735, 'created_at': 'Mon Mar 16 22:55:31 +0000 2020', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pb

list index out of range ['@westcrowds yok'
 {'id': 451934979, 'id_str': '451934979', 'name': 'ger', 'screen_name': 'bectbox', 'location': 'hibernasi', 'url': None, 'description': 'Not affiliated with the one. mostly on dm.', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 5953, 'friends_count': 849, 'listed_count': 20, 'favourites_count': 2119, 'statuses_count': 121671, 'created_at': 'Sun Jan 01 06:27:59 +0000 2012', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': True, 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'FFFFFF', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_backgrou

list index out of range ['@1l_amj ولا'
 {'id': 1221389127234572290, 'id_str': '1221389127234572290', 'name': 'Ahmed📊', 'screen_name': 'R21Dm', 'location': ' ', 'url': None, 'description': 'اعتبرني كتبت الي تبيه @_flxb | \u200e@Fujaky', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 496, 'friends_count': 740, 'listed_count': 0, 'favourites_count': 744, 'statuses_count': 12767, 'created_at': 'Sun Jan 26 11:07:52 +0000 2020', 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1358331005808

list index out of range ['あらやだわ'
 {'id': 137724499, 'id_str': '137724499', 'name': 'las', 'screen_name': '_las_', 'location': 'とーきょーと', 'url': 'http://www.pixiv.net/member.php?id=886129', 'description': 'エロホモ絵とか描いてます。 NowPlaying→@las_musics Mstdn→https://goo.gl/axNPwZ Patreon→https://goo.gl/TwirdR お題→https://goo.gl/Tbqqfr', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 16541, 'friends_count': 1859, 'listed_count': 103, 'favourites_count': 26424, 'statuses_count': 40892, 'created_at': 'Tue Apr 27 15:20:58 +0000 2010', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': '4D91C9', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme14/bg.gif', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme14/bg.gif', 'profile_background_tile': False, 'profile_link_color': 'FF9900', 'profile_sidebar_border_colo

66000
66200
66400
66600
list index out of range [' @taemvoir não\n'
 {'id': 794947410279694336, 'id_str': '794947410279694336', 'name': '𝖅𝖆𝖓𝖓𝖆𝖍 〄', 'screen_name': 'fxxl_z', 'location': '♡', 'url': None, 'description': '#𝖏𝖆𝖈𝖔𝖇: 𝖓𝖎𝖈𝖊, 𝖙𝖍𝖆𝖓𝖐 𝖞𝖔𝖚 𝖛𝖊𝖗𝖞 𝖐𝖆𝖒𝖘𝖆', 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 877, 'friends_count': 1266, 'listed_count': 21, 'favourites_count': 18228, 'statuses_count': 11299, 'created_at': 'Sat Nov 05 17:00:06 +0000 2016', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': '', 'profile_background_image_url_https': '', 'profile_background_tile': False, 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'profile_image_url': 'http://pbs.twimg.com/profi

## Splitting the Data by Language

In [13]:
def make_country_dataframes(df):
    """Split ``df`` into one sub-frame per tweet language.

    Each sub-frame holds the rows of ``df`` whose ``lang`` column equals the
    language code listed below. Returns a dict mapping the language name to
    its sub-frame; languages with no matching rows map to an empty frame.
    """
    # NOTE(review): the "Indian" entry selects lang == "in" — confirm which
    # language that code denotes in the source data.
    code_by_language = {
        'English': "en",
        'Turkish': "tr",
        'Urdu': "ur",
        'Japanese': "ja",
        'Spanish': "es",
        'Thai': "th",
        'Portuguese': "pt",
        'Arabic': "ar",
        'Indian': "in",
    }
    return {language: df[df['lang'] == code]
            for language, code in code_by_language.items()}

In [14]:
# Split the cleaned tweets into one DataFrame per language, keyed by language name.
dataframes_dict = make_country_dataframes(df)

In [21]:
# After a fresh preprocessing run, cache the English subset; otherwise load the cached copy.
if RUN_PREPROCESSING:
    df_english = dataframes_dict['English']
    # make sure the output folder exists before writing the cache
    os.makedirs(os.path.dirname(english_dataframe_path) or '.', exist_ok=True)
    # index=False: do not persist the (non-contiguous) row index as an extra column
    df_english.to_csv(english_dataframe_path, index=False)
else:
    df_english = pd.read_csv(english_dataframe_path)

## Feature Encoding

In [65]:
# Load the English subset from its CSV (the file written or read by the previous section).
df_english = pd.read_csv(english_dataframe_path)

In [66]:
# Numeric (non-text) columns that encode_features can append to the text vectors.
features_encoded = [
    'possibly_sensitive',
    'verified_account',
    'followers_count',
    'user_id',
    'neg',
    'neu',
    'pos',
    'compound',
    'popularity_score',
]

In [73]:
# sklearn's row normalisation is reached through the already-imported
# ``preprocessing`` module, so that the notebook's own min-max ``normalize``
# helper (used by preprocess_data) is not shadowed.
def encode_features(df, concat_all_features=False):
    """Turn a tweet frame into a dense, row-normalised feature matrix and labels.

    The text representation is chosen by the module-level ``FEATURE_SPACE``
    ('BOW', 'TFIDF', anything else = sentence embeddings). When
    ``FIT_CORPUS_FEATURE_SPACE`` is true the vectors are fitted on ``df`` and
    pickled under ``vector_path``; otherwise the pickle for the current
    ``LANGUAGE_RUN`` is loaded.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``text``, ``location`` and ``withheld_anywhere``;
        with ``concat_all_features=True`` also every column listed in
        ``features_encoded``.
    concat_all_features : bool, default False
        If true, the numeric columns in ``features_encoded`` are placed in
        front of the text vectors; otherwise only the text vectors are used.

    Returns
    -------
    X : numpy.ndarray
        Dense matrix, one L2-normalised row per tweet.
    y : pandas.Series
        ``withheld_anywhere`` cast to int.

    Raises
    ------
    KeyError
        If ``concat_all_features`` is true and a required column is missing.
    ValueError
        If cached vectors do not have one row per row of ``df``.
    """
    df_features = df.copy()

    # One cache file per feature space and language; saving and loading use the same path.
    if FEATURE_SPACE == 'BOW':
        cache_prefix = 'bow_vectors_'
    elif FEATURE_SPACE == 'TFIDF':
        cache_prefix = 'tfidf_vectors_'
    else:
        cache_prefix = 'tweets_embeddings_'
    cache_file = os.path.join(vector_path, cache_prefix + str(LANGUAGE_RUN).lower() + '.pickle')

    if FIT_CORPUS_FEATURE_SPACE:
        # missing texts (NaN after a CSV round trip) are encoded as empty strings
        corpus = [sentence if isinstance(sentence, str) else '' for sentence in list(df_features['text'].values)]

        if FEATURE_SPACE == 'BOW':
            text_vector = CountVectorizer().fit_transform(corpus)
        elif FEATURE_SPACE == 'TFIDF':
            text_vector = TfidfVectorizer().fit_transform(corpus)
        else:
            model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
            text_vector = model.encode(sentences=corpus, batch_size=32,
                                       show_progress_bar=True, convert_to_numpy=True,
                                       normalize_embeddings=True)

        os.makedirs(vector_path, exist_ok=True)
        with open(cache_file, 'wb') as pkl:
            pickle.dump(text_vector, pkl)
    else:
        # NOTE: pickle.load can execute arbitrary code; only load cache files produced by this notebook.
        with open(cache_file, 'rb') as pkl:
            text_vector = pickle.load(pkl)
        if text_vector.shape[0] != len(df_features):
            raise ValueError(
                "Cached vectors in %s have %d rows but the dataframe has %d; "
                "re-run with FIT_CORPUS_FEATURE_SPACE = True"
                % (cache_file, text_vector.shape[0], len(df_features)))

    # Integer-encode the country. Missing locations get their own label so the
    # encoder never has to compare strings with NaN.
    # (Country_encoded is not part of features_encoded, so it does not reach X.)
    country_label = preprocessing.LabelEncoder()
    df_features['Country_encoded'] = country_label.fit_transform(
        df_features['location'].fillna('Unknown').astype(str))

    y = df_features["withheld_anywhere"].astype(int)

    if concat_all_features:
        missing = [column for column in features_encoded if column not in df_features.columns]
        if missing:
            raise KeyError("Missing feature columns: %s" % missing)
        df_features = df_features.astype({"possibly_sensitive": float, "verified_account": float,
                                          'followers_count': int, 'user_id': int, 'neg': float, 'neu': float,
                                          'pos': float, 'compound': float, 'popularity_score': float})
        tabular = df_features[features_encoded].to_numpy()
        # BOW/TF-IDF vectors are scipy sparse matrices, which np.concatenate cannot join with a dense array
        dense_text = text_vector.toarray() if hasattr(text_vector, 'toarray') else np.asarray(text_vector)
        # NOTE(review): the numeric columns are unscaled before the row-wise L2
        # normalisation below, so large values may dominate each row — confirm this is intended.
        X = np.concatenate((tabular, dense_text), axis=1)
    else:
        X = text_vector

    X = preprocessing.normalize(X)
    # Sentence embeddings are already dense; only sparse matrices need converting.
    if hasattr(X, 'toarray'):
        X = X.toarray()
    return X, y

In [74]:
# Encode the English tweets and hold out 20% of them for evaluation.
X, y = encode_features(df_english)
# use the shared seed from the configuration cell instead of a second hard-coded 42
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

In [75]:
# Sanity check: shapes of the train/test features and labels, one per line.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(20664, 24402)
(5166, 24402)
(20664,)
(5166,)


## Training the Models

In [76]:
# Baseline: logistic regression trained on the training split and scored on the held-out split.
base_model = LogisticRegression(random_state=random_seed)
base_model.fit(X_train, y_train)
y_pred = base_model.predict(X_test)
# zero_division=0 matches the reports produced by run_models
res = classification_report(y_test, y_pred, target_names=target_names, zero_division=0)
print("Baseline model - Logistic Regression: ")
print(res)

Baseline model - Logisitic Regression: 
              precision    recall  f1-score   support

 Not Censord       0.78      0.70      0.74      2053
     Censord       0.81      0.87      0.84      3113

    accuracy                           0.80      5166
   macro avg       0.80      0.78      0.79      5166
weighted avg       0.80      0.80      0.80      5166



In [None]:
# Classifiers evaluated by run_models, keyed by the name shown in the reports.
models = dict([
    ('SVM', SVC(random_state=random_seed)),
    ('Random Forest', RandomForestClassifier(random_state=random_seed)),
    ('Gaussian Naive Bayes', GaussianNB()),
])

def run_models(models, X_train, X_test, y_train, y_test, save_results=True, visualize_roc_curve=True):
    """Fit each model, print its classification report and optionally plot/save results.

    Parameters
    ----------
    models : dict
        Maps a display name to an unfitted scikit-learn classifier.
    X_train, X_test, y_train, y_test
        Train/test features and binary labels.
    save_results : bool, default True
        Write the per-model report dictionaries to
        ``<results_path>/results_<date_today>.txt`` (overwriting that file).
    visualize_roc_curve : bool, default True
        Show a ROC curve for each model.
    """
    scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        scores[name] = classification_report(y_test, y_pred, target_names=target_names, output_dict=True, zero_division=0)
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names, output_dict=False, zero_division=0))
        if visualize_roc_curve:
            if len(np.unique(y_test)) < 2:
                # a ROC curve is undefined when only one class is present
                print('ROC curve skipped for %s: y_test contains a single class' % name)
                continue
            # A ROC curve needs a continuous score per sample; hard 0/1
            # predictions collapse it to a single operating point.
            if hasattr(model, 'decision_function'):
                y_score = model.decision_function(X_test)
            elif hasattr(model, 'predict_proba'):
                y_score = model.predict_proba(X_test)[:, 1]
            else:
                y_score = y_pred
            false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_score)
            fig = px.area(
                x=false_positive_rate, y=true_positive_rate,
                title=f'ROC Curve - {name} (AUC={auc(false_positive_rate, true_positive_rate):.4f})',
                labels=dict(x='False Positive Rate', y='True Positive Rate'),
                width=700, height=500
            )
            # dashed diagonal = performance of a random classifier
            fig.add_shape(
                type='line', line=dict(dash='dash'),
                x0=0, x1=1, y0=0, y1=1
            )

            fig.update_yaxes(scaleanchor="x", scaleratio=1)
            fig.update_xaxes(constrain='domain')
            fig.show()

    if save_results:
        os.makedirs(results_path, exist_ok=True)
        with open(os.path.join(results_path, ('results_' + str(date_today) + '.txt')), 'w') as f:
            for key, value in scores.items():
                f.write('%s:%s\n' % (key, value))
                f.write('\n')
                
# Show which text representation is active, then evaluate every candidate model.
print(FEATURE_SPACE)
run_models(models=models, X_train=X_train, X_test=X_test,
           y_train=y_train, y_test=y_test, save_results=True)

BOW


## Multiple Country Runner

In [None]:
def make_dfs_by_country(df, country_list):
    """Return one independent copy of ``df`` per country, in ``country_list`` order.

    Each copy holds the rows whose ``location`` column equals that country;
    a country with no matching rows yields an empty frame.
    """
    return [df[df['location'] == country].copy() for country in country_list]

In [None]:
# Train and evaluate the SVM separately on the tweets of each country.
model_to_run = {'SVM': SVC(random_state=random_seed)}
# snapshot of the full tweet table; the per-country frames are cut from it
df_countries = df.copy()
countries = ['France', 'Turkey', 'Germany', 'India']
df_list = make_dfs_by_country(df_countries, countries)
# Loop-local names are used throughout so the global `df` and the English
# X / y / train-test split from the earlier cells are not overwritten.
# Note: encode_features rewrites the pickled text vectors on every call while
# FIT_CORPUS_FEATURE_SPACE is True, and run_models rewrites the same dated
# results file, so only the last country's results remain on disk.
for country, country_df in zip(countries, df_list):
    if country_df['withheld_anywhere'].nunique() < 2:
        # a classifier cannot be fitted or scored on a single class
        print('Country: ' + str(country) + ' - skipped (needs both censored and uncensored tweets)')
        continue
    X_country, y_country = encode_features(country_df)
    X_country_train, X_country_test, y_country_train, y_country_test = train_test_split(
        X_country, y_country, test_size=0.2, random_state=random_seed)
    print('Country: ' + str(country))
    run_models(model_to_run, X_country_train, X_country_test, y_country_train, y_country_test)

## Data Visualization

In [None]:
# Preview the full tweet table, using the snapshot taken before the per-country runs.
df_countries.head(10)

In [None]:
def _withheld_counts(frame):
    """Return ``[not withheld, withheld]`` tweet counts for ``frame``.

    The order is fixed (rather than frequency-sorted like ``value_counts``)
    so the counts always line up with the ['Not Censored', 'Censored'] labels.
    """
    flags = frame['withheld_anywhere'].astype(bool)
    return [int((~flags).sum()), int(flags.sum())]


# Per-country subsets are cut from the full-table snapshot `df_countries`.
df_France = df_countries[df_countries['location'] == 'France'].copy()
France_counts = _withheld_counts(df_France)

df_Turkey = df_countries[df_countries['location'] == 'Turkey'].copy()
Turkey_counts = _withheld_counts(df_Turkey)

df_Germany = df_countries[df_countries['location'] == 'Germany'].copy()
Germany_counts = _withheld_counts(df_Germany)

df_India = df_countries[df_countries['location'] == 'India'].copy()
India_counts = _withheld_counts(df_India)

In [None]:
# One pie chart per country on a 2x2 grid. The *_counts lists must be in the
# same order as `labels`.
labels = ['Not Censored', 'Censored']
specs = [[{'type':'domain'}, {'type':'domain'}], [{'type':'domain'}, {'type':'domain'}]]
fig = make_subplots(2, 2, specs=specs,
                    subplot_titles=countries)

# (trace name, counts, grid row, grid column) — same order as the subplot titles
pie_panels = [
    ("France", France_counts, 1, 1),
    ("Turkey", Turkey_counts, 1, 2),
    ("Germany", Germany_counts, 2, 1),
    ("India", India_counts, 2, 2),
]
for panel_name, panel_counts, panel_row, panel_col in pie_panels:
    fig.add_trace(go.Pie(labels=labels, values=panel_counts, scalegroup='one',
                         name=panel_name), panel_row, panel_col)


fig.update_layout(title_text='Percentage of Censored Tweets by Country')
fig.show()

In [None]:
#source: https://plotly.com/python/roc-and-pr-curves/
# ROC curve for the baseline predictions held in `y_pred`.
# The matching true labels are rebuilt by repeating the English train/test
# split on the labels alone: the same number of rows, test_size and seed
# select the same held-out rows as the split used to train the baseline.
_, y_true = train_test_split(df_english["withheld_anywhere"].astype(int),
                             test_size=0.2, random_state=random_seed)
if len(y_true) != len(y_pred):
    raise ValueError("y_pred has %d entries but the held-out English split has %d; "
                     "re-run the baseline cell first" % (len(y_pred), len(y_true)))
# NOTE: y_pred contains hard 0/1 predictions, so the curve has a single operating point.
fpr, tpr, thresholds = roc_curve(y_true, y_pred)
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
# dashed diagonal = performance of a random classifier
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()