# 0. Set-up

In [None]:
# Mount Google Drive at /content/drive; the data location configured in PATH
# (section 1.1) lives under this mount point.
# force_remount=True is passed so the call requests a fresh mount on every run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Clone the LeIA repository (shell command, run from the current working directory).
# It provides the SentimentIntensityAnalyzer used in section 3.
!git clone https://github.com/rafjaa/LeIA.git 
import sys
# Put the cloned checkout at the front of the import path before importing from it.
sys.path.insert(0,'/content/LeIA')
from LeIA.leia import SentimentIntensityAnalyzer 

Cloning into 'LeIA'...
remote: Enumerating objects: 88, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 88 (delta 11), reused 10 (delta 3), pack-reused 64[K
Unpacking objects: 100% (88/88), 304.96 KiB | 657.00 KiB/s, done.


In [None]:
!pip install --upgrade joblib==1.1.0
!pip install bertopic
!pip install flair
!pip install detoxify

In [None]:
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
import pandas as pd
#import plotly.express as px
import re
from detoxify import Detoxify
import spacy
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk import word_tokenize, sent_tokenize
nltk.download('stopwords')
nltk.download('punkt')

### ---- FUNCTIONS ----

In [None]:
def create_date_columns(data):
    """Add ``Month``, ``Week`` and ``Day`` columns derived from ``data[DATE_COLUMN]``.

    The date column is converted with ``pd.to_datetime(..., utc=True)`` and three
    columns are written onto ``data`` itself (the input frame is modified):

    - ``Month``: the date formatted as ``'%Y-%m'``
    - ``Week``:  start time of the weekly period (``to_period("W")``) containing the date
    - ``Day``:   the date formatted as ``'%Y-%m-%d'``

    Returns:
        The same DataFrame object that was passed in.
    """
    # A disabled draft for textual time-zone names used to sit here:
    #   if to_utc == True:
    #       date_utc = data[DATE_COLUMN].str.replace({'CEST':'+0200','CET':'+0100'})
    timestamps = pd.to_datetime(
        data[DATE_COLUMN],
        format='%Y-%m-%d %H:%M:%S %z',
        utc=True,
    )
    # Column creation order (Month, Week, Day) determines the column order of the frame.
    data['Month'] = timestamps.dt.strftime('%Y-%m')
    weekly_period = timestamps.dt.to_period("W")
    data['Week'] = weekly_period.dt.start_time
    data['Day'] = timestamps.dt.strftime('%Y-%m-%d')
    return data

def create_table_counts(data,metric_column,date_format = "Day"):
    """Sum ``metric_column`` per author and period and return it as a wide table.

    Args:
        data: DataFrame containing ``AUTHOR_COLUMN``, the ``date_format`` column
            and ``metric_column``. Pass an already filtered frame to get the
            totals of a subset only.
        metric_column: name of the numeric column to add up.
        date_format: name of the period column to group by (default ``"Day"``).

    Returns:
        DataFrame indexed by period with one column per author; combinations
        without data are filled with 0 and all values are cast to ``int``.
    """
    group_keys = [AUTHOR_COLUMN, date_format]
    totals = (
        data.loc[:, group_keys + [metric_column]]
        .groupby(group_keys)
        .sum()
        .reset_index()
    )
    wide = totals.pivot(index=date_format, columns=AUTHOR_COLUMN, values=metric_column)
    return wide.fillna(0).astype(int)

def create_table_share(data,metric_column,list_of_keywords,date_format = "Day"):
    """Percentage of ``metric_column`` that belongs to keyword-matching posts.

    A post counts as "on topic" when its lower-cased ``TEXT_COLUMN`` matches the
    regular expression built by joining ``list_of_keywords`` with ``|``. For
    every author and period, the on-topic sum of ``metric_column`` is expressed
    as a percentage of that author's total for the period.

    Unlike the previous version, the caller's DataFrame is left untouched (no
    ``topic`` column is written onto ``data``), and rows whose text is missing
    are treated as not matching instead of breaking the boolean mask.

    Args:
        data: DataFrame with ``AUTHOR_COLUMN``, ``TEXT_COLUMN``, the
            ``date_format`` column and ``metric_column``.
        metric_column: numeric column to aggregate (e.g. posts or likes).
        list_of_keywords: strings combined into one regex alternation; they are
            interpreted as regular expressions and matched against lower-cased text.
        date_format: name of the period column to group by (default ``"Day"``).

    Returns:
        DataFrame indexed by period with one column per author holding the
        on-topic percentage as ``int``; missing combinations are 0.
    """
    topic = "topic"
    pattern = "|".join(list_of_keywords)
    # na=False: a missing text can never match a keyword.
    on_topic = data[TEXT_COLUMN].str.lower().str.contains(pattern, na=False)

    # Work on a private frame so the flag column never leaks into `data`.
    frame = data.loc[:, [AUTHOR_COLUMN, date_format, metric_column]].copy()
    frame[topic] = on_topic.astype(int)

    dynamics = (frame
            .groupby([AUTHOR_COLUMN,date_format,topic])
            .sum()
            .reset_index()
            )
    totals = dynamics.groupby([AUTHOR_COLUMN,date_format])[metric_column].transform('sum')
    dynamics["Percentage"] = round(dynamics[metric_column] / totals * 100, 2)

    dynamics_subset = dynamics.loc[dynamics[topic] == 1]
    dynamics_subset_pivot = dynamics_subset.pivot(index = date_format, columns = AUTHOR_COLUMN,values="Percentage")
    # NOTE(review): astype(int) truncates (11.96 -> 11) instead of rounding;
    # kept as-is so published figures do not change - confirm this is intended.
    dynamics_subset_pivot = dynamics_subset_pivot.fillna(0).astype(int)

    return dynamics_subset_pivot

def create_table_total_and_mean_length(data):
    """Per-author number of posts and mean ``len()`` of the post text.

    Args:
        data: DataFrame with ``AUTHOR_COLUMN`` and ``TEXT_COLUMN``; every text
            value must support ``len()``.

    Returns:
        DataFrame indexed by author with the columns ``"Number of Posts"`` and
        ``"Average Length of Posts"`` (outer join of both statistics).
    """
    authors = data[AUTHOR_COLUMN]

    post_counts = pd.DataFrame(authors.value_counts())
    post_counts.columns = ["Number of Posts"]

    mean_lengths = (
        pd.DataFrame({
            AUTHOR_COLUMN: authors,
            "Average Length of Posts": data[TEXT_COLUMN].apply(len),
        })
        .groupby(AUTHOR_COLUMN)
        .mean()
    )
    return post_counts.join(mean_lengths, how="outer")

def words_only(text):
    """Return the parts of ``text`` matched by the global ``REGEX``, joined by single spaces."""
    matches = re.findall(REGEX, text)
    return " ".join(matches)

def remove_emoji(text):
    """Delete every run of characters that falls into the code-point ranges below.

    The ranges are concatenated, in this order, into a single character class
    ``[...]+``; each match is replaced by the empty string.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002500-\U00002BEF",
        u"\U00002702-\U000027B0",
        u"\U00002702-\U000027B0",  # same range listed twice; kept so the pattern text is unchanged
        u"\U000024C2-\U0001F251",  # very wide: everything from U+24C2 up to U+1F251
        u"\U0001f926-\U0001f937",
        u"\U00010000-\U0010ffff",  # all code points from U+10000 upwards
        u"\u2640-\u2642",
        u"\u2600-\u2B55",
        u"\u200d",
        u"\u23cf",
        u"\u23e9",
        u"\u231a",
        u"\ufe0f",
        u"\u3030",
    )
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    cleaned = matcher.sub("", text)
    return cleaned

def replace_mentions(text):
    """Blank out ``@mention`` tokens in ``text``.

    The text is split on single spaces; every token that starts with ``@`` and
    is longer than one character becomes an empty string, and the tokens are
    joined with single spaces again (so the surrounding spaces remain). Falsy
    input (``""``, ``None``) is returned unchanged.
    """
    if not text:
        return text
    kept = [
        "" if (len(piece) > 1 and piece.startswith('@')) else piece
        for piece in text.split(" ")
    ]
    return " ".join(kept)

def replace_email_phone_links(text):
    """Replace links, e-mail addresses and phone-like numbers with placeholders.

    - ``http://`` / ``https://`` URLs become ``<link>``. The whole URL up to the
      next whitespace is replaced; previously only scheme and host were matched,
      which left the path and query string behind in the text.
    - e-mail addresses become ``<email>``.
    - digit groups shaped like phone numbers (3-3-4, (3) 3-4 or 3-4 digits with
      optional ``-``, ``.`` or whitespace separators) become ``<tel>``.

    Links are handled first so that digits or ``@`` inside a URL are not turned
    into ``<tel>`` / ``<email>`` fragments. The patterns are raw strings, which
    avoids invalid-escape-sequence warnings for ``\\d``, ``\\s`` and ``\\.``.

    Args:
        text: input string.

    Returns:
        The string with the replacements applied.
    """
    text = re.sub(r'https?://\S+', "<link>", text)
    text = re.sub(r'([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})', "<email>", text)
    text = re.sub(
        r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})',
        "<tel>",
        text,
    )
    return text

def basic_preprocessing(texts,keyword="",del_stopwords=True):
    """Clean raw post texts for topic modelling.

    Steps, in order:
      1. replace links, e-mails and phone-like numbers with placeholders and
         delete those placeholders;
      2. delete every literal occurrence of ``keyword`` (skipped when empty);
      3. optionally remove stopwords via ``remove_stopwords``;
      4. collapse whitespace runs (including newlines) into single spaces;
      5. delete single quotes;
      6. lower-case, remove emoji via ``remove_emoji`` and strip the ends.

    All ``str.replace`` calls pass ``regex=False`` explicitly, so the result no
    longer depends on the pandas version's default; ``keyword`` is therefore
    always treated as plain text, never as a regular expression.

    Args:
        texts: pandas Series of strings.
        keyword: literal substring to delete from every text (default ``""``).
        del_stopwords: whether to remove stopwords (default ``True``).

    Returns:
        list of cleaned strings, in the order of ``texts``.
    """
    texts = texts.apply(replace_email_phone_links)
    # Drop the placeholders inserted by replace_email_phone_links.
    for placeholder in ("<link>", "<email>", "<tel>"):
        texts = texts.str.replace(placeholder, "", regex=False)
    if keyword:
        texts = texts.str.replace(keyword, "", regex=False)
    if del_stopwords:
        texts = [remove_stopwords(text) for text in texts]
    # Collapse newlines and other whitespace runs (raw string: '\s' is not a valid str escape).
    texts = [re.sub(r'\s+', ' ', t) for t in texts]
    # Remove single quotes
    texts = [t.replace("'", "") for t in texts]
    # Lower-case, remove emoji, trim
    texts = [remove_emoji(t.lower()).strip() for t in texts]

    return texts

# Stopword sets already loaded, keyed by lower-cased language name.
_STOPWORD_SETS = {}


def remove_stopwords(text):
    """Remove NLTK stopwords of the configured ``LANGUAGE`` from ``text``.

    The text is tokenised with ``word_tokenize``; tokens whose lower-cased form
    is in the stopword list are dropped and the rest are joined with single
    spaces.

    The stopword list is loaded once per language and kept as a set. Before,
    it was re-read from the corpus on every call and searched linearly for
    every token. The returned string is unchanged.
    """
    language = LANGUAGE.lower()
    stopwords = _STOPWORD_SETS.get(language)
    if stopwords is None:
        stopwords = set(nltk.corpus.stopwords.words(language))
        _STOPWORD_SETS[language] = stopwords
    return " ".join(token for token in word_tokenize(text) if token.lower() not in stopwords)


def lemmatize(text,pos):
    """Run the global spaCy pipeline ``nlp`` on ``text`` and return lower-cased tokens.

    Args:
        text: string to process.
        pos: if truthy, keep only tokens whose ``pos_`` equals this value
            (e.g. ``"NOUN"``); if falsy, keep every token.

    Returns:
        list of lower-cased token texts.
    """
    # NOTE(review): despite the function name, this returns token.text (the
    # surface form), not token.lemma_ - confirm whether lemmas were intended.
    return [
        token.text.lower()
        for token in nlp(text)
        if not pos or token.pos_ == pos
    ]

def most_frequent_words(texts,top = 20, pos = False):
    """Count the most frequent tokens across ``texts``.

    Each text goes through ``remove_stopwords`` and ``words_only`` and is then
    tokenised by ``lemmatize`` (optionally filtered by part of speech).

    Args:
        texts: iterable of strings.
        top: number of rows to return (default 20).
        pos: part-of-speech tag to keep; the list is not exhaustive:
            - VERB
            - NOUN
            - ADJ
            A falsy value (the default) keeps every token.

    Returns:
        DataFrame indexed by token with a single ``"Count"`` column, sorted by
        descending count (ties keep first-seen order). If no tokens are found,
        an empty DataFrame with the ``"Count"`` column is returned; the previous
        version raised a ``ValueError`` in that case.
    """
    counts = Counter()
    for text in texts:
        cleaned = words_only(remove_stopwords(text))
        counts.update(lemmatize(cleaned, pos))

    # most_common() already yields (token, count) pairs in descending count order.
    ranked = dict(counts.most_common())
    freqs = pd.DataFrame.from_dict(ranked, orient="index", columns=["Count"])

    return freqs.head(top)


# Shared analyzer instance, created on first use by sentiment_scores_categorical.
_SENTIMENT_ANALYZER = None


def sentiment_scores_categorical(sentence):
    """Classify ``sentence`` as ``"Positive"``, ``"Negative"`` or ``"Neutral"``.

    Uses the ``compound`` score returned by
    ``SentimentIntensityAnalyzer.polarity_scores``:

    - ``compound >= 0.05``  -> ``"Positive"``
    - ``compound <= -0.05`` -> ``"Negative"``
    - otherwise             -> ``"Neutral"``

    The analyzer is created once and reused across calls; before, a new
    analyzer was built for every sentence.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()

    compound = _SENTIMENT_ANALYZER.polarity_scores(sentence)['compound']
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

def create_sentiment_table(data):
    """Percentage of each author's posts per sentiment class.

    Args:
        data: DataFrame with ``AUTHOR_COLUMN``, ``"Sentiment"`` and
            ``"Number of Posts"`` columns.

    Returns:
        DataFrame indexed by author with one column per sentiment value; each
        cell is that sentiment's share of the author's posts in percent,
        rounded to two decimals.
    """
    counts = (
        data.loc[:, [AUTHOR_COLUMN, "Sentiment", "Number of Posts"]]
        .groupby([AUTHOR_COLUMN, "Sentiment"])
        .sum()
        .reset_index()
    )
    # Total posts per author, aligned row-by-row with `counts`.
    author_totals = counts.groupby([AUTHOR_COLUMN])['Number of Posts'].transform('sum')
    counts["Percentage of Posts"] = round(counts["Number of Posts"] / author_totals * 100, 2)

    return counts.pivot(index=AUTHOR_COLUMN, columns="Sentiment", values="Percentage of Posts")

def topic_modelling(text,min_topic_size=50):
    """Fit a BERTopic model on ``text`` and return the topic assignments.

    The texts are cleaned with ``basic_preprocessing`` (stopwords removed), a
    BERTopic model is fitted, ``reduce_outliers`` is applied to the fitted
    topics and the model's topic representations are updated with the result.

    Args:
        text: pandas Series of raw texts.
        min_topic_size: passed through to ``BERTopic`` (default 50).

    Returns:
        Tuple ``(model.topics_, topic_info, model)`` where ``topic_info`` is the
        result of ``model.get_topic_info()``.
    """
    docs = basic_preprocessing(text, del_stopwords=True)

    # Optional alternative using a custom embedding model (disabled):
    #alternative_model = TransformerDocumentEmbeddings(EMBEDDING_MODEL)
    #model=BERTopic(embedding_model=alternative_model,n_gram_range = (1,2),verbose=True,low_memory=True, min_topic_size=min_topic_size)

    # Other settings that were tried: min_topic_size=500, nr_topics=30
    model = BERTopic(
        n_gram_range=(1, 2),
        verbose=True,
        language=LANGUAGE,
        low_memory=True,
        min_topic_size=min_topic_size,
    )
    fit_output = model.fit_transform(docs)
    fitted_topics = fit_output[0]
    # to reduce the number of outliers
    reassigned_topics = model.reduce_outliers(docs, fitted_topics)
    model.update_topics(docs, topics=reassigned_topics)
    topic_info = model.get_topic_info()

    return model.topics_, topic_info, model



# 1. Data preparation

### 1.1. Parameters and Data Columns

In [None]:
# Notebook-wide configuration. The helper functions defined above read these
# names as globals, so this cell must be run before any of them is called.
PATH='/content/drive/MyDrive/Brazil/'# path to data
# Column holding the post timestamp (read by create_date_columns).
DATE_COLUMN = "Date"
# Column holding the post text.
TEXT_COLUMN = "Message"
# Column identifying the author/profile; used as the grouping key in all tables.
AUTHOR_COLUMN = "Profile"
# Column with the number of likes; passed as metric_column to the engagement tables.
LIKES_COLUMN = "Likes per post"
# Only referenced by the commented-out alternative inside topic_modelling.
EMBEDDING_MODEL = "neuralmind/bert-large-portuguese-cased" # for topic modelling, optional
# Lower-cased for the NLTK stopword corpus; passed as-is to BERTopic(language=...).
LANGUAGE = "Portuguese"
REGEX = u'[A-Za-zÀ-ú]+' # words only

### 1.2. Load Data

In [None]:
# Read the export (the first four rows are skipped), drop columns that are
# entirely empty and exact duplicate rows, replace missing texts with "" and
# add a constant helper column so posts can be counted with a plain sum.
data = (
    pd.read_excel(PATH+"FB_DRI.xlsx", skiprows=[0,1,2,3])
    .dropna(axis=1, how="all")
    .drop_duplicates()
    .assign(**{
        TEXT_COLUMN: lambda frame: frame[TEXT_COLUMN].fillna(""),
        "Number of Posts": 1,
    })
)
data.head()

Unnamed: 0,Date,Message,Profile,Network,Comments per post,Likes per post,Reactions per post,Shares per post,"Total Reactions, Comments, Shares",Message-ID,Profile-ID,Link,External Links,Number of Posts
0,2022-11-30 21:36:13,"Hoje não é dia de TBT, mas queria te lembrar q...",Marina Silva,FACEBOOK,281.0,407.0,676.0,37.0,994.0,126351747376464_682297636585757,126351747376464,https://www.facebook.com/516673643148158/posts...,,1
1,2022-11-30 21:07:26,Educação deve ser prioridade nacional. Fico mu...,Simone Tebet,FACEBOOK,453.0,2119.0,2572.0,156.0,3181.0,250792278286894_693804438781323,250792278286894,https://www.facebook.com/527346392093796/posts...,,1
2,2022-11-30 13:36:43,Não perca nossa live hoje!,Vera,FACEBOOK,1.0,70.0,74.0,31.0,106.0,214179238712305_714164310073272,214179238712305,https://www.facebook.com/538551947634510/posts...,,1
3,2022-11-29 23:08:15,,Vera,FACEBOOK,14.0,25.0,31.0,9.0,54.0,214179238712305_713716870118016,214179238712305,https://www.facebook.com/538551947634510/posts...,,1
4,2022-11-29 19:45:38,Eu faço política há 20 anos. Desde o meu prime...,Simone Tebet,FACEBOOK,1378.0,4433.0,5404.0,333.0,7115.0,250792278286894_693071222187978,250792278286894,https://www.facebook.com/527346392093796/posts...,,1


### 1.3. Convert dates to UTC and create Day, Week and Month columns

In [None]:
# create_date_columns writes the new columns onto `data` and returns the same frame.
data = create_date_columns(data) # generates 3 columns: Month, Week, Day
# Show the column index to confirm Month, Week and Day are present.
data.columns

Index(['Date', 'Message', 'Profile', 'Network', 'Comments per post',
       'Likes per post', 'Reactions per post', 'Shares per post',
       'Total Reactions, Comments, Shares', 'Message-ID', 'Profile-ID', 'Link',
       'External Links', 'Number of Posts', 'Month', 'Week', 'Day'],
      dtype='object')

# 2. Engagement

Generates Excel tables for Datawrapper.

### 2.1. Statistics for all posts (number of posts, number of likes)

In [None]:
# Total number of posts and average post length (len() of the text) per profile
create_table_total_and_mean_length(data)

Unnamed: 0,Number of Posts,Average Length of Posts
Marina Silva,900,679.043333
Simone Tebet,588,423.719388
Sofia Manzano,182,321.56044
Vera,518,280.22973


In [None]:
# Number of posts per profile and month
# ("Number of Posts" is 1 for every row, so the sum is a post count)

tab_engagement_posts = create_table_counts(data,metric_column = "Number of Posts",date_format = "Month")
# Uncomment to export the table to Excel:
#tab_engagement_posts.to_excel(PATH+"Engagement_Posts_Month.xlsx")
tab_engagement_posts

Profile,Marina Silva,Simone Tebet,Sofia Manzano,Vera
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03,92,55,14,58
2022-04,99,58,12,59
2022-05,105,75,2,80
2022-06,99,57,7,81
2022-07,102,76,34,48
2022-08,85,97,41,56
2022-09,130,81,46,51
2022-10,120,57,25,45
2022-11,68,32,1,40


In [None]:
# Number of likes per profile and month (sum of LIKES_COLUMN)

tab_engagement_likes = create_table_counts(data,metric_column = LIKES_COLUMN,date_format = "Month")
# Uncomment to export the table to Excel:
#tab_engagement_likes.to_excel(PATH+"Engagement_Likes_Month.xlsx")
tab_engagement_likes

Profile,Marina Silva,Simone Tebet,Sofia Manzano,Vera
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03,27119,21213,333,2588
2022-04,71532,25257,849,5083
2022-05,36602,43065,150,5078
2022-06,46355,38889,487,6220
2022-07,40445,62463,2444,3099
2022-08,33642,86324,2266,3518
2022-09,53780,92722,2554,2512
2022-10,132008,164413,2351,2371
2022-11,51214,153626,96,1763


### 2.2. Statistics for a subset (gender topic)

In [None]:
# Topic keywords are read from the first column of the "Tópicos" sheet of a
# shared Google Sheet (requires network access).
list_of_keywords = pd.read_excel("https://docs.google.com/spreadsheets/d/1fUze3H2QKvPYTPV0lIud-GJ_NALlZtY8gaUQME3QTso/export",sheet_name = "Tópicos")
# The keywords are later joined with "|" into one regex. Blank cells are dropped
# and values coerced to str: a NaN would make the join fail, and an empty
# string would make the pattern match every post.
list_of_keywords = [
    keyword
    for keyword in list_of_keywords.iloc[:,0].dropna().astype(str)
    if keyword.strip()
]

In [None]:
# Keep the posts whose lower-cased text matches at least one topic keyword
# (the keywords are combined into a single regex alternation).
is_on_topic = data[TEXT_COLUMN].str.lower().str.contains("|".join(list_of_keywords))
subset = data[is_on_topic]

In [None]:
# Number of posts for a subset (topic), per profile and month
tab_engagement_subset = create_table_counts(subset,metric_column = "Number of Posts",date_format = "Month")
tab_engagement_subset

Profile,Marina Silva,Simone Tebet,Sofia Manzano,Vera
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03,11,17,1,9
2022-04,18,7,2,3
2022-05,21,11,0,2
2022-06,8,8,0,3
2022-07,15,10,3,1
2022-08,14,17,0,1
2022-09,20,9,0,3
2022-10,10,0,0,1
2022-11,11,5,0,3


In [None]:
# Number of likes for a subset (topic), per profile and month
# Note: this rebinds tab_engagement_subset, replacing the post-count table from the previous cell.
tab_engagement_subset = create_table_counts(subset,metric_column = LIKES_COLUMN,date_format = "Month")
tab_engagement_subset

Profile,Marina Silva,Simone Tebet,Sofia Manzano,Vera
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03,1400,6357,45,256
2022-04,32014,2202,158,321
2022-05,7914,5181,0,94
2022-06,9175,5940,0,234
2022-07,6509,7194,212,86
2022-08,4887,15773,0,61
2022-09,4539,10366,0,222
2022-10,8635,0,0,46
2022-11,8661,17954,0,130


In [None]:
# Percentage of each profile's posts per month that are on the topic
# (for example, 11% of Marina Silva's posts in March 2022 were related to gender)

tab_engagement_subset = create_table_share(data,metric_column = "Number of Posts",list_of_keywords=list_of_keywords,date_format = "Month")
# Uncomment to export the table to Excel:
#tab_engagement_subset.to_excel(PATH+"Engagement_Subset_Posts_Month.xlsx")
tab_engagement_subset

Profile,Marina Silva,Simone Tebet,Sofia Manzano,Vera
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03,11,30,7,15
2022-04,18,12,16,5
2022-05,20,14,0,2
2022-06,8,14,0,3
2022-07,14,13,8,2
2022-08,16,17,0,1
2022-09,15,11,0,5
2022-10,8,0,0,2
2022-11,16,15,0,7


In [None]:
# Percentage of each profile's likes per month that went to posts on the topic

tab_engagement_subset_likes = create_table_share(data,metric_column = LIKES_COLUMN,list_of_keywords=list_of_keywords,date_format = "Month")
# Uncomment to export the table to Excel:
#tab_engagement_subset_likes.to_excel(PATH+"Engagement_Subset_Likes_Month.xlsx")
tab_engagement_subset_likes

Profile,Marina Silva,Simone Tebet,Sofia Manzano,Vera
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-03,5,29,13,9
2022-04,44,8,18,6
2022-05,21,12,0,1
2022-06,19,15,0,3
2022-07,16,11,8,2
2022-08,14,18,0,1
2022-09,8,11,0,8
2022-10,6,0,0,1
2022-11,16,11,0,7


### 2.3. Most frequent words

In [None]:
# load a model
# models for other languages:  https://spacy.io/usage/models

# Download the spaCy pipeline that is loaded as `nlp` in the next cell.
import spacy.cli
# Name of the spaCy pipeline package to download and load.
model_lemma = "pt_core_news_sm"
spacy.cli.download(model_lemma)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [None]:
# `nlp` is read as a global by lemmatize(), so it must exist before
# most_frequent_words is called.
nlp = spacy.load(model_lemma)
# 20 most frequent tokens tagged as NOUN across all posts.
most_frequent_words(data[TEXT_COLUMN],top = 20, pos = "NOUN")

Unnamed: 0,Count
país,456
governo,379
voz,305
dia,301
porta,288
anos,283
pessoas,254
presidente,244
democracia,202
deputada,199


# 3. Sentiment Analysis

In [None]:
# Replace links, e-mails and phone-like numbers with placeholders before scoring.
# This overwrites the text column of `data` in place.
data[TEXT_COLUMN] = data[TEXT_COLUMN].map(replace_email_phone_links)
#data[TEXT_COLUMN] = [replace_mentions(text) for text in data[TEXT_COLUMN]] # if Twitter

In [None]:
# Label every post as Positive / Negative / Neutral.
data["Sentiment"] = data[TEXT_COLUMN].map(sentiment_scores_categorical)

In [None]:
# Share (%) of each profile's posts per sentiment class, over all posts.
create_sentiment_table(data)

Sentiment,Negative,Neutral,Positive
Profile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Marina Silva,38.67,7.89,53.44
Simone Tebet,27.04,5.1,67.86
Sofia Manzano,23.63,43.96,32.42
Vera,48.46,22.39,29.15


In [None]:
# Rebuild the topic subset so that it carries the "Sentiment" column added above,
# then show the sentiment shares for the on-topic posts only.
is_on_topic = data[TEXT_COLUMN].str.lower().str.contains("|".join(list_of_keywords))
subset = data[is_on_topic]
create_sentiment_table(subset)

Sentiment,Negative,Neutral,Positive
Profile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Marina Silva,41.41,3.12,55.47
Simone Tebet,32.14,3.57,64.29
Sofia Manzano,33.33,16.67,50.0
Vera,76.92,7.69,15.38


# 4. Topic modelling

In [None]:
# Fit the topic model on the on-topic posts only.
topics, topic_info,model = topic_modelling(subset[TEXT_COLUMN],min_topic_size=5)
# `subset` is a slice of `data`; assigning a column to it directly triggers
# pandas' SettingWithCopyWarning. assign() returns a new frame with the topic
# id of every post added, and leaves `data` untouched.
subset = subset.assign(Topics=topics)

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

2023-03-09 15:40:56,967 - BERTopic - Transformed documents to Embeddings
2023-03-09 15:41:01,462 - BERTopic - Reduced dimensionality
2023-03-09 15:41:01,487 - BERTopic - Clustered reduced embeddings
100%|██████████| 1/1 [00:00<00:00,  8.54it/s]


In [None]:
# Overview table returned by model.get_topic_info() inside topic_modelling.
topic_info

Unnamed: 0,Topic,Count,Name
0,0,84,0_mulheres_política_violência_marina
1,1,44,1_mdb_mulheres_nacional_mdb nacional
2,2,37,2_brasil_mdb_mdb nacional_nacional
3,3,27,3_brasil_vote_paulo_fernando
4,4,16,4_indígenas_povos_mulheres_povos indígenas
5,5,11,5_negros_racismo_capitalismo_trabalhadores
6,6,11,6_racismo_escravidão_racial_brasil
7,7,9,7_brasil_comunidade_garimpeiros_lei áurea
8,8,5,8_senadora_bolsonaro_carta_lula


# 5. Toxicity

In [None]:
model_tox = Detoxify('multilingual')

# Score every sentence of every on-topic post separately: one list of
# per-sentence prediction dicts per post. Sentences are split with the Punkt
# model of the configured LANGUAGE (the same lower-cased name remove_stopwords
# uses), not the English default.
toxicity_results = [
    [
        model_tox.predict(sentence)
        for sentence in sent_tokenize(text, language=LANGUAGE.lower())
    ]
    for text in subset[TEXT_COLUMN]
] # ,device="cuda"

In [None]:
# Score labels in a fixed order, so the resulting dict (and any DataFrame built
# from it) always has the same column order. The previous set literal gave an
# arbitrary order that could change between runs.
_TOXICITY_LABELS = (
    "toxicity",
    "severe_toxicity",
    "obscene",
    "identity_attack",
    "insult",
    "threat",
    "sexual_explicit",
)


def get_max_score(list_of_dicts):
    """Return, for every toxicity label, the highest score across the given dicts.

    Args:
        list_of_dicts: list of dicts (one per sentence) mapping each label in
            ``_TOXICITY_LABELS`` to a numeric score.

    Returns:
        dict with one entry per label, in the order of ``_TOXICITY_LABELS``.
        For an empty ``list_of_dicts`` (a text with no sentences) every value is
        NaN; the previous version raised ``ValueError`` from ``max()``.
    """
    return {
        label: max((scores.get(label) for scores in list_of_dicts), default=float("nan"))
        for label in _TOXICITY_LABELS
    }

# Collapse the per-sentence scores of each post to the maximum per label,
# then show one row per post and one column per label.
toxicity_results_max = list(map(get_max_score, toxicity_results))
pd.DataFrame(toxicity_results_max)

Unnamed: 0,identity_attack,obscene,insult,severe_toxicity,sexual_explicit,toxicity,threat
0,0.00148,0.002189,0.001218,0.00012,0.000478,0.004175,0.000264
1,0.374706,0.001536,0.024315,0.000861,0.000988,0.383348,0.001908
2,0.009779,0.017307,0.012624,0.001695,0.002324,0.022977,0.000828
3,0.012142,0.002833,0.004841,0.000364,0.001702,0.025374,0.001073
4,0.003108,0.000882,0.005285,0.000356,0.000698,0.00842,0.000451
5,0.010032,0.001341,0.002552,0.000252,0.002471,0.034284,0.000931
6,0.086343,0.002704,0.009704,0.001182,0.002626,0.084366,0.0015
7,0.006331,0.002486,0.020816,0.000935,0.000594,0.047081,0.000795
8,0.075526,0.001432,0.008079,0.000509,0.000534,0.11653,0.001267
9,0.037287,0.001876,0.003239,0.000649,0.000711,0.036505,0.001052
