## Downloads

In [1]:
%%capture
!pip install preprocessor
!pip install vader-multi
!pip install torchmetrics
!pip install sentence-transformers
!pip install gensim
!pip install requests
!pip install transvec

## Imports

In [3]:
import os
import re
import json
import gensim
import pickle
import zipfile
import requests
import numpy as np
from datetime import date

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import preprocessor as p
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transvec.transformers import TranslationWordVectorizer

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_curve, auc


from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vincentdandenault/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vincentdandenault/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Run Flags and File Paths

In [7]:
date_today = date.today()

random_seed = 42
target_names = ['Not Censord', 'Censord']

RUN_PREPROCESSING = False
RUN_COUNTRY_DIVISION = False
FIT_CORPUS_FEATURE_SPACE = True

LANGUAGE_RUN = 'English'
FEATURE_SPACE = 'Sentence2Vec' #BOW, TFIDF


data_path = 'Data'
results_path = 'Results'
vector_path = 'vectors'

clean_dataframe_path = 'Output/df_clean.csv'
english_dataframe_path = 'Output/df_english.csv'

 ## Preprocessing

In [10]:
listOfCountries = ['France', 'Turkey', 'Germany', 'India']
def findCountry(x): 
    for country in listOfCountries:
        if x and country in x:
            return country
    return None

def normalize(array):
        return (array - np.min(array)) / (np.max(array) - np.min(array))
    
def preprocess_data():
    #extract the data from the json files
    dfs = []
    for r, d, f in os.walk('Data/'):
        for file in f:
            if 'withheldtweets.json' in file or "plus_one_control.json" in file:  # alt: if 'control' in file:
                dfs.append(pd.read_json('%s/%s' % (r, file), lines=True))
    df_cen = pd.concat(dfs)
    
    #keep only the features that are worth keeping
    worthKeeping = ["text", "truncated", "user",
                "withheld_in_countries", "entities", "lang",
                "possibly_sensitive", "extended_tweet"]
    df_cen = df_cen[worthKeeping]
    
    #some tweets have NaN as "possibly sensitive"…
    df_cen['possibly_sensitive'] = df_cen['possibly_sensitive'].fillna(0.0)
    
    #recover the full text for truncated tweets
    dfRaw = df_cen.values
    for line in dfRaw:
        if not pd.isna(line[-1]):
            line[0] = line[-1]["full_text"]   
        #remove urls from tweets
        #they are shortened anyway so we can't make use of them
        line[0] = re.sub(r'http\S+', '', line[0])
        #flatten retweets
        line[0] = re.sub(r'RT @\S+:', '', line[0])
    
    #remove "extended_tweet"
    dfRaw = np.delete(dfRaw, len(worthKeeping)-1, axis=1) 
    worthKeeping.remove("extended_tweet")
    
    #remove "truncated"
    dfRaw = np.delete(dfRaw, 1, axis=1) 
    worthKeeping.remove("truncated")
    
    #extract hashtags seperately
    for line in dfRaw:
        line[3] = [x["text"] for x in line[3]["hashtags"]]
    worthKeeping[3] = "hashtags"
    
    #create a feature for user-verified and user-followers_count
    verified = [line[1]["verified"] for line in dfRaw]
    followers = [line[1]["followers_count"] for line in dfRaw]
    
    #for the location, we keep the country name and discard the rest
    location = [findCountry(line[1]["location"]) for line in dfRaw]
    dfRaw = np.c_[dfRaw, verified, followers, location]
    worthKeeping += ["verified_account", "followers_count", "location"]
    
    #binary feature for whether the tweet has been withheld anywhere
    withheld = []
    for line in dfRaw:
        if not isinstance(line[2], list):
            line[2] = []
        withheld.append(len(line[2]) != 0)
            
    dfRaw = np.c_[dfRaw, withheld]
    worthKeeping += ["withheld_anywhere"]
    
    #popularity feature:
    #build a score based on the values of followers_count, favourites_count, statuses_count
    #compute a score from 0 to 1 for each, with (x - min)/(max - min), then comptute the average of these scores 
    followers_count = np.array([line[1]["followers_count"] for line in dfRaw])
    favourites_count = np.array([line[1]["favourites_count"] for line in dfRaw])
    statuses_count = np.array([line[1]["statuses_count"] for line in dfRaw])
    score = (1/3) * (normalize(followers_count) + normalize(favourites_count) + normalize(statuses_count))
    dfRaw = np.c_[dfRaw, score]
    worthKeeping += ["popularity_score"]
    #sentiment analysis
    #https://www.analyticsvidhya.com/blog/2022/07/sentiment-analysis-using-python/? with VADER
    #https://github.com/brunneis/vader-multi, same concept but multilingual
    #text gets translated into english and then sentiment analysis is applied to the english text
    #takes a LOT of time
    analyzer = SentimentIntensityAnalyzer()
    def doThingsAffi(i, line):
        if i % 200 == 0: print(i)
        try:
            return [x for x in analyzer.polarity_scores(line[0]).values()]
        except Exception as e: #known error at about 42400, it's an error in the library
            print(e, line)
            return [0, 0, 0, 0]
            
    res = np.array([doThingsAffi(i, line) for i, line in enumerate(dfRaw)])
    dfRaw = np.c_[dfRaw, res]
    worthKeeping += ["neg", "neu", "pos", "compound"]
    
    #reassemble the data in a pandas dataframe and remove the column "user"
    df_cen = pd.DataFrame(dfRaw, columns = worthKeeping)
    cleanCols = filter(lambda x: x != "user", worthKeeping)
    df_clean = df_cen[cleanCols]
    
    return df_clean

In [None]:
if RUN_PREPROCESSING:
    df = preprocess_data()
    df.to_csv(clean_dataframe_path)
else: 
    df = pd.read_csv(clean_dataframe_path)

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200


## Splitting the Data by Language

In [7]:
def make_country_dataframes(df): 
    dataframes_dict = {}
    
    df_english = df[df['lang'] == "en"] 
    dataframes_dict['English'] = df_english
    
    df_turkish = df[df['lang'] == "tr"] 
    dataframes_dict['Turkish'] = df_turkish
    
    df_urdu = df[df['lang'] == "ur"]
    dataframes_dict['Urdu'] = df_urdu
    
    df_japanese = df[df['lang'] == "ja"] 
    dataframes_dict['Japanese'] = df_japanese
    
    df_spanish = df[df['lang'] == "es"] 
    dataframes_dict['Spanish'] = df_spanish
    
    df_thai = df[df['lang'] == "th"] 
    dataframes_dict['Thai'] = df_thai
    
    df_portuguese = df[df['lang'] == "pt"] 
    dataframes_dict['Portuguese'] = df_portuguese
    
    df_arabic = df[df['lang'] == "ar"] 
    dataframes_dict['Arabic'] = df_arabic
    
    df_indian = df[df['lang'] == "in"] 
    dataframes_dict['Indian'] = df_indian
    
    return dataframes_dict

In [8]:
dataframes_dict = make_country_dataframes(df)

In [None]:
if RUN_PREPROCESSING:
    df_english = dataframes_dict['English']
    df_english.to_csv(english_dataframe_path)
else: 
    df_english = pd.read_csv(english_dataframe_path)

## Feature Encoding

In [9]:
features_encoded = ['possibly_sensitive', 'verified_account',\
                    'followers_count', 'user_id', 'neg', 'neu', \
                    'pos', 'compound', 'popularity_score']

In [14]:
def encode_features(df, concat_all_features=True):
    df_features = df.copy()
    if FIT_CORPUS_FEATURE_SPACE:
        corpus = [sentence if isinstance(sentence, str) else '' for sentence in list(df_features['text'].values)]
        
        if FEATURE_SPACE == 'BOW':
            bow_texts = CountVectorizer().fit_transform(corpus)
            with open(('bow_vectors_' + str(LANGUAGE_RUN) + '.pickle'), 'wb') as pkl:
                pickle.dump(bow_texts, pkl)
            text_vector = bow_texts
        
        elif FEATURE_SPACE == 'TFIDF':
            tfidf_vectors = TfidfVectorizer().fit_transform(corpus) 
            with open(('tfidf_vectors_' + str(LANGUAGE_RUN) + '.pickle'), 'wb') as pkl:
                pickle.dump(tfidf_vectors, pkl)
            text_vector = tfidf_vectors
        
        else: 
            model = SentenceTransformer('sentence-transformers/distiluse-base-multilingual-cased-v2')
            tweets_embeddings = model.encode(sentences=corpus, batch_size=32, 
                                             show_progress_bar=True, convert_to_numpy=True, 
                                            normalize_embeddings=True)
            with open(('tweets_embeddings_' + str(LANGUAGE_RUN) + '.pickle'), 'wb') as pkl:
                pickle.dump(tweets_embeddings, pkl)
            text_vector = tweets_embeddings
        
    else: 
        if FEATURE_SPACE == 'BOW':
            with open(os.path.join(vector_path,'bow_vectors.pickle'), 'rb') as pkl:
                text_vector = pickle.load(pkl)
        elif FEATURE_SPACE == 'TFIDF': 
            with open(os.path.join(vector_path,'tfidf_vectors.pickle'), 'rb') as pkl:
                text_vector = pickle.load(pkl)
        else: 
            with open(os.path.join(vector_path,'tweets_embeddings.pickle'), 'rb') as pkl:
                text_vector = pickle.load(pkl) 
    
    country_label = preprocessing.LabelEncoder()
    countries_encoded = country_label.fit_transform(list(df.location.values))
    df_features['Country_encoded'] = countries_encoded
    
    df_features = df_features.astype({"possibly_sensitive": float, "verified_account": float,\
                                'followers_count':int, 'user_id': int, 'neg': float, 'neu': float, 
                               'pos': float, 'compound': float, 'popularity_score': float})
    
    y = df_features["withheld_anywhere"].astype(int)
    
    X = df_features[features_encoded].copy().to_numpy()
    
    if concat_all_features: 
        X = np.concatenate((X, text_vector.toarray()), axis=1)
        
    else:
        X = text_vector.toarray()
        
    X = noramlize(X)
    return X, y

In [15]:
X, y = encode_features(df_english)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(20664, 24411)
(5166, 24411)
(20664,)
(5166,)


## Training the Models

In [17]:
base_model = LogisticRegression(random_state=random_seed)
y_pred = base_model.fit(X_train, y_train).predict(X_test)
res = classification_report(y_test, y_pred, target_names=target_names) 
print("Baseline model - Logisitic Regression: ")
print(res)

Baseline model - Logisitic Regression: 
{'Not Censord': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 2053}, 'Censord': {'precision': 0.6025938830816879, 'recall': 1.0, 'f1-score': 0.7520231912066675, 'support': 3113}, 'accuracy': 0.6025938830816879, 'macro avg': {'precision': 0.30129694154084397, 'recall': 0.5, 'f1-score': 0.37601159560333375, 'support': 5166}, 'weighted avg': {'precision': 0.363119387927467, 'recall': 0.6025938830816879, 'f1-score': 0.4531645749567085, 'support': 5166}}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
models = {'SVM': SVC(random_state=random_seed), 
          'Random Forest': RandomForestClassifier(random_state=random_seed)}
          'Gaussian Naive Bayes': GaussianNB()}

def run_models(models, X_train, X_test, y_train, y_test, save_results=True, visualize_roc_curve=True): 
    scores = {}
    for name in models.keys(): 
        y_pred = models[name].fit(X_train, y_train).predict(X_test)
        res = classification_report(y_test, y_pred, target_names=target_names, output_dict=True, zero_division=0) 
        scores[name] = res
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names, output_dict=False, zero_division=0))
        if visualize_roc_curve: 
            fpr, tpr, thresholds = roc_curve(y, y_score)
            fig = px.area(
                x=fpr, y=tpr,
                title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
                labels=dict(x='False Positive Rate', y='True Positive Rate'),
                width=700, height=500
            )
            fig.add_shape(
                type='line', line=dict(dash='dash'),
                x0=0, x1=1, y0=0, y1=1
            )
            
            fig.update_yaxes(scaleanchor="x", scaleratio=1)
            fig.update_xaxes(constrain='domain')
            fig.show()
    if save_results: 
        with open(os.path.join(results_path, ('results_' + str(date_today) + '.txt')), 'w') as f:
            for key, value in scores.items(): 
                f.write('%s:%s\n' % (key, value))
                f.write('\n')
                
                
print(FEATURE_SPACE)
run_models(models, X_train, X_test, y_train, y_test, save_results=True)

TFIDF


## Multiple Country Runner

In [None]:
def make_dfs_by_country(df, country_list):
    df_list = []
    for country in country_list: 
        df_tmp = df[df['location'] == country].copy()
        df_list.append(df_tmp)
    return df_list

In [None]:
model_to_run = {'SVM': SVC(random_state=random_seed)}
df_countries = df.copy()
countries = ['France', 'Turkey', 'Germany', 'India']
df_list = make_dfs_by_country(df, countries)  
for idx, df in enumerate(df_list): 
    X, y = encode_features(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print('Country: ' + str(countries[idx]))
    run_models(model_to_run, X_train, X_test, y_train, y_test)

## Data Vizualisation

In [None]:
df.head(10)

In [None]:
df_France = df[df['location'] == 'France'].copy()
France_counts = list(df_France.withheld_anywhere.value_counts().values)

df_Turkey = df[df['location'] == 'India'].copy()
Turkey_counts = list(df_Turkey.withheld_anywhere.value_counts().values)

df_Germany = df[df['location'] == 'Germany'].copy()
Germany_counts = list(df_Germany.withheld_anywhere.value_counts().values)

df_India = df[df['location'] == 'India'].copy()
India_counts = list(df_India.withheld_anywhere.value_counts().values)

In [None]:
labels = ['Not Censored', 'Censored']
specs = [[{'type':'domain'}, {'type':'domain'}], [{'type':'domain'}, {'type':'domain'}]]
fig = make_subplots(2, 2, specs=specs,
                    subplot_titles=countries)

fig.add_trace(go.Pie(labels=labels, values=France_counts, scalegroup='one',
                     name="France"), 1, 1)
fig.add_trace(go.Pie(labels=labels, values=Turkey_counts, scalegroup='one',
                     name="Turkey"), 1, 2)
fig.add_trace(go.Pie(labels=labels, values=Germany_counts, scalegroup='one',
                     name="Germany"), 2, 1)
fig.add_trace(go.Pie(labels=labels, values=India_counts, scalegroup='one',
                     name="India"), 2, 2)


fig.update_layout(title_text='Percentage of Censorded Tweets by Countries')
fig.show()