In [68]:
#| include: false

# Imports
import pandas as pd
import numpy as np
import re
import spacy
import preprocessor as p
from bertopic import BERTopic
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
# !apt install python3.11-dev

In [89]:
#| include: false

# read data
df = pd.read_csv('../data/processed/elecciones_argentina.csv')

### Datos

In [90]:
print(f'Número de tweets analizados: {len(df)}')

Número de tweets analizados: 52476


### Eventos monitoreados

Cantidad de tweets para cada uno de los eventos monitoreados:

In [91]:
df['event'].value_counts()

event
elecciones    27448
1er debate    12588
2do debate    12440
Name: count, dtype: int64

### Fechas de eventos

Periodo cubierto para cada evento

In [92]:
debate1 = df.loc[df['event'].isin(['1er debate'])]
debate1_min = debate1['dt_date'].min()
debate1_max = debate1['dt_date'].max()
print(f'El primer debate contempla tweets desde {debate1_min} hasta {debate1_max}')

debate2 = df.loc[df['event'].isin(['2do debate'])]
debate2_min = debate2['dt_date'].min()
debate2_max = debate2['dt_date'].max()
print(f'El segundo debate contempla tweets desde {debate2_min} hasta {debate2_max}')

elecciones = df.loc[df['event'].isin(['elecciones'])]
elecciones_min = elecciones['dt_date'].min()
elecciones_max = elecciones['dt_date'].max()
print(f'Elecciones contempla tweets desde {elecciones_min} hasta {elecciones_max}')

El primer debate contempla tweets desde 2023-09-30 hasta 2023-10-02
El segundo debate contempla tweets desde 2023-11-11 hasta 2023-11-13
Elecciones contempla tweets desde 2023-11-16 hasta 2023-11-20


### Ataques identificados

> NOTA: Un tweet puede tener diversas etiquetas

In [100]:
attacks = df.dropna(subset=['label', 'to_journalist'])

print(f'En los datos se identificaron {len(attacks)} publicaciones etiquetadas como ataques.')

En los datos se identificaron 2950 publicaciones etiquetadas como ataques.


### Ranking the periodistas más atacados

In [101]:
attacks['to_journalist'].value_counts()

to_journalist
@diegobranca       418
@JonatanViale      373
@Angelalerena      326
@Cris_noticias     266
@odonnellmaria     256
@edufeiok          244
@rialjorge         175
@robdnavarro       159
@guadavazquez      136
@luisnovaresio      99
@vivicanosaok       81
@majulluis          78
@mjolivan           62
@rominamanguel      57
@NANCYPAZOS         55
@nbg__              44
@Gatosylvestre      34
@ischargro          26
@anaecorrea         14
@cyngarciaradio     11
@hindelita           7
@aleberco            7
@juliamengo          6
@ertenembaum         5
@Sietecase           2
@soyingridbeck       2
@wwnicolas           2
@MercedesFunes       2
@Marcelitaojeda      1
@negropolisok        1
@maclorena           1
Name: count, dtype: int64

### Ranking de periodistas atacados por género
> NOTA: Clasificación binaria

In [102]:
attacks['journalist_genre'].value_counts()

journalist_genre
H    1622
M    1328
Name: count, dtype: int64

### Hombres periodistas más atacados

In [103]:
attacks_men = attacks.loc[attacks['journalist_genre'].isin(['H'])]
attacks_men['to_journalist'].value_counts()

to_journalist
@diegobranca      418
@JonatanViale     373
@edufeiok         244
@rialjorge        175
@robdnavarro      159
@luisnovaresio     99
@majulluis         78
@Gatosylvestre     34
@ischargro         26
@aleberco           7
@ertenembaum        5
@wwnicolas          2
@Sietecase          2
Name: count, dtype: int64

### Mujeres periodistas más atacadas

In [104]:
attacks_women = attacks.loc[attacks['journalist_genre'].isin(['M'])]
attacks_women['to_journalist'].value_counts()

to_journalist
@Angelalerena      326
@Cris_noticias     266
@odonnellmaria     256
@guadavazquez      136
@vivicanosaok       81
@mjolivan           62
@rominamanguel      57
@NANCYPAZOS         55
@nbg__              44
@anaecorrea         14
@cyngarciaradio     11
@hindelita           7
@juliamengo          6
@soyingridbeck       2
@MercedesFunes       2
@Marcelitaojeda      1
@negropolisok        1
@maclorena           1
Name: count, dtype: int64

### Ranking the tipos de ataques para hombres

In [105]:
conditions = ['women', 'politics', 'appearance', 'racism', 'class', 'lgbti', 'criminal', 'calls']
attacks_men_count = attacks_men[conditions].sum()
attacks_men_count

women         370
politics      450
appearance    646
racism         83
class          57
lgbti          57
criminal       31
calls          21
dtype: int64

### Ranking the tipos de ataques para mujeres

In [106]:
conditions = ['women', 'politics', 'appearance', 'racism', 'class', 'lgbti', 'criminal', 'calls']
attacks_women_count = attacks_women[conditions].sum()
attacks_women_count

women         607
politics      424
appearance    226
racism         63
class          69
lgbti          19
criminal        8
calls          10
dtype: int64

### Número de ataques por tipo de evento

In [107]:
attacks['event'].value_counts()

event
elecciones    1584
1er debate     698
2do debate     668
Name: count, dtype: int64

### Actividad de los periodistas en Twitter por género

In [109]:
journalist_posts = df.dropna(subset=['from_journalist'])
men_journalist_posts = journalist_posts.loc[journalist_posts['journalist_genre'].isin(['M'])]
women_journalist_posts = journalist_posts.loc[journalist_posts['journalist_genre'].isin(['H'])]

print(f"""Tweets publicados por periodistas hombres: {len(men_journalist_posts)}\nTweets publicados por periodistas mujeres: {len(women_journalist_posts)}""")


Tweets publicados por periodistas hombres: 640
Tweets publicados por periodistas mujeres: 292


### Ranking de periodistas más activos

In [122]:
journalist_activity = df['from_journalist'].value_counts()
journalist_activity

from_journalist
@anaecorrea         105
@rominamanguel       61
@rialjorge           61
@diegobranca         60
@guadavazquez        55
@odonnellmaria       45
@Cris_noticias       42
@SilvinaMolina       36
@NANCYPAZOS          35
@hindelita           35
@majulluis           32
@soyingridbeck       31
@Marcelitaojeda      27
@nbg__               25
@Gatosylvestre       25
@edufeiok            24
@luisnovaresio       24
@monigps             23
@robdnavarro         23
@Angelalerena        22
@mjolivan            20
@MercedesFunes       18
@maclorena           13
@vivicanosaok        12
@aleberco            11
@silviafbarrio        9
@juliamengo           9
@cyngarciaradio       9
@JonatanViale         8
@Sietecase            7
@SANTIAGODELMORO      7
@ischargro            7
@mafito11             5
@wwnicolas            2
@FlorHalfon           2
@ertenembaum          1
@deboraplager         1
Name: count, dtype: int64

### Publicaciones de periodistas por evento

In [110]:
men_debate1 = men_journalist_posts.loc[men_journalist_posts['event'].isin(['1er debate'])]
men_count = men_debate1.groupby('dt_date').size().reset_index(name='count')

women_debate1 = women_journalist_posts.loc[women_journalist_posts['event'].isin(['1er debate'])]
women_count = women_debate1.groupby('dt_date').size().reset_index(name='count')

fig = px.line()
fig.add_scatter(x=men_count['dt_date'], y=men_count['count'], name='Hombres', line=dict(color='orange'), hovertemplate='posts: %{y}')
fig.add_scatter(x=women_count['dt_date'], y=women_count['count'], name='Mujeres', line=dict(color='purple'), hovertemplate='posts: %{y}')
fig.update_layout(title='Publicaciones de periodistas durante el 1er debate', width=600)
fig.update_xaxes(type='category')
fig.show()

In [111]:
men_debate2 = men_journalist_posts.loc[men_journalist_posts['event'].isin(['2do debate'])]
men_count = men_debate2.groupby('dt_date').size().reset_index(name='count')

women_debate2 = women_journalist_posts.loc[women_journalist_posts['event'].isin(['2do debate'])]
women_count = women_debate2.groupby('dt_date').size().reset_index(name='count')

fig = px.line()
fig.add_scatter(x=men_count['dt_date'], y=men_count['count'], name='Hombres', line=dict(color='orange'), hovertemplate='posts: %{y}')
fig.add_scatter(x=women_count['dt_date'], y=women_count['count'], name='Mujeres', line=dict(color='purple'), hovertemplate='posts: %{y}')
fig.update_layout(title='Publicaciones de periodistas durante el 2do debate', width=600)
fig.update_xaxes(type='category')
fig.show()

In [112]:
men_elecciones = men_journalist_posts.loc[men_journalist_posts['event'].isin(['elecciones'])]
men_count = men_elecciones.groupby('dt_date').size().reset_index(name='count')

women_elecciones = women_journalist_posts.loc[women_journalist_posts['event'].isin(['elecciones'])]
women_count = women_elecciones.groupby('dt_date').size().reset_index(name='count')

fig = px.line()
fig.add_scatter(x=men_count['dt_date'], y=men_count['count'], name='Hombres', line=dict(color='orange'), hovertemplate='posts: %{y}')
fig.add_scatter(x=women_count['dt_date'], y=women_count['count'], name='Mujeres', line=dict(color='purple'), hovertemplate='posts: %{y}')
fig.update_layout(title='Publicaciones de periodistas durante las eleciones', width=600)
fig.update_xaxes(type='category')
fig.show()

### Ranking de eventos con más ataques para hombres

In [113]:
attacks_men['event'].value_counts()

event
elecciones    806
2do debate    435
1er debate    381
Name: count, dtype: int64

### Ranking de eventos con más ataques para mujeres

In [114]:
attacks_women['event'].value_counts()

event
elecciones    778
1er debate    317
2do debate    233
Name: count, dtype: int64

### Hashtags 

20 hashtags más utilizados en los ataques:

In [115]:
attacks['hashtags'] = attacks['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'#\w+', x)) == 0 else re.findall(r'#\w+', x))

attacks['hashtags'] = attacks['hashtags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# convert dataframe column to list
hashtags = attacks['hashtags'].unique()

# remove nan items from list
hashtags = [x for x in hashtags if not pd.isna(x)]

# split items into a list based on a delimiter
hashtags = [x.split(',') for x in hashtags]

# flatten list of lists
hashtags = [item for sublist in hashtags for item in sublist]

# remove whitespaces
hashtags = list(map(lambda x: x.replace(' ', ''), hashtags))

# count items on list
hashtags_count = pd.Series(hashtags).value_counts()

# return first n rows in descending order
top_hashtags = hashtags_count.nlargest(20)

top_hashtags

#SeVanParaSiempre                3
#KirchnerismoNuncaMas            2
#MassaPresidente2023             2
#Milei2023EnPrimeraVuelta        1
#Tenemos                         1
#GORDITOLECHOSO                  1
#NoAl5toGobiernoK                1
#NoVasASerPresidente             1
#nuncamas                        1
#PalestinaLibre                  1
#YoVotoAMassa                    1
#NuncaMilei                      1
#NuncaMas                        1
#MileiNo                         1
#LameTujesK                      1
#MileiBasuraVosSosLaDictadura    1
#ElPuebloEnDefensaPropia         1
#viv                             1
#BrancatelliPelotudo             1
#MileiPresidente                 1
Name: count, dtype: int64

### Menciones

20 usuarios más mencionados en los ataques:

In [116]:
attacks['mentions'] = attacks['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'@(\w+)', x)) == 0 else re.findall(r'@(\w+)', x))

attacks['mentions'] = attacks['mentions'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# convert dataframe column to list
mentions = attacks['mentions'].unique()

# remove nan items from list
mentions = [x for x in mentions if not pd.isna(x)]

# split items into a list based on a delimiter
mentions = [x.split(',') for x in mentions]

# flatten list of lists
mentions = [item for sublist in mentions for item in sublist]

# remove whitespaces
mentions = list(map(lambda x: x.replace(' ', ''), mentions))

# count items on list
mentions_count = pd.Series(mentions).value_counts()

# return first n rows in descending order
top_mentions = mentions_count.nlargest(20)

top_mentions

PatoBullrich       4
SergioMassa        4
JMilei             3
minsaurralde       3
vivicanosaok       2
Cris_noticias      2
rialjorge          2
Kicillofok         2
guadavazquez       1
herlombardi        1
diegobranca        1
luispetri          1
JorgeTelerman      1
horaciorlarreta    1
Angelalerena       1
EsmeraldaMitre     1
ertenembaum        1
majulluis          1
LuisNovaresio1     1
QuintelaRicardo    1
Name: count, dtype: int64

### Tokens

Lista del top 20 de palabras más comunes y su frecuencia:

In [117]:
# load the spacy model for Spanish
nlp = spacy.load("es_core_news_sm")

# load stop words for Spanish
STOP_WORDS = nlp.Defaults.stop_words

# Function to filter stop words
def filter_stopwords(text):
    # lower text
    doc = nlp(text.lower())
    # filter tokens
    tokens = [token.text for token in doc if not token.is_stop and token.text not in STOP_WORDS and token.is_alpha]
    return ' '.join(tokens)

# apply function to dataframe column
attacks['text_pre'] = attacks['text'].apply(filter_stopwords)

# count items on column
token_counts = attacks["text_pre"].str.split(expand=True).stack().value_counts()[:20]

token_counts

vos        374
sos        311
q          244
gordo      195
vas        135
lechoso    126
gordito    114
mierda     104
milei       99
zurda       84
asco        81
massa       78
gente       77
tenes       74
viejo       71
anda        66
cara        66
zurdos      64
gato        62
orto        60
Name: count, dtype: int64

### Tópicos

Técnica de modelado de tópicos con `transformers` y `TF-IDF`:

In [118]:
# remove urls, mentions, hashtags and numbers
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
attacks['text_pre'] = attacks['text_pre'].apply(lambda x: p.clean(x))


# filter column
docs = attacks['text_pre']

# calculate topics and probabilities
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)

# training
topics, probs = topic_model.fit_transform(docs)

# visualize topics
topic_model.visualize_topics()

2024-01-16 21:37:46,415 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 93/93 [00:35<00:00,  2.61it/s]
2024-01-16 21:38:24,113 - BERTopic - Embedding - Completed ✓
2024-01-16 21:38:24,114 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-16 21:38:34,217 - BERTopic - Dimensionality - Completed ✓
2024-01-16 21:38:34,218 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-16 21:38:34,657 - BERTopic - Cluster - Completed ✓
2024-01-16 21:38:34,660 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-16 21:38:34,761 - BERTopic - Representation - Completed ✓


### Reducción de tópicos

Mapa con 20 tópicos del contenido de los tweets:

In [119]:
# reduce the number of topics
topic_model.reduce_topics(docs, nr_topics=20)

# visualize topics
topic_model.visualize_topics()

2024-01-16 21:38:35,689 - BERTopic - Topic reduction - Reducing number of topics
2024-01-16 21:38:35,788 - BERTopic - Topic reduction - Reduced number of topics from 55 to 20


### Términos por tópico

In [120]:
topic_model.visualize_barchart(top_n_topics=20)

### Tópicos en el tiempo

In [121]:
# convert column to list
tweets = attacks['text_pre'].to_list()
timestamps = attacks['dt_date'].to_list()

topics_over_time = topic_model.topics_over_time(docs=tweets, 
                                                timestamps=timestamps, 
                                                global_tuning=True, 
                                                evolution_tuning=True, 
                                                nr_bins=20)

topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

5it [00:00, 20.40it/s]
