In [102]:
#| include: false

# Imports
import pandas as pd
import numpy as np
import re
import spacy
import preprocessor as p
from bertopic import BERTopic
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"
# !sudo apt install python3.11-dev

In [79]:
#| include: false

# read data
df = pd.read_csv('../data/processed/elecciones_argentina.csv')

### Datos

In [80]:
print(f'Número de tweets analizados: {len(df)}')

Número de tweets analizados: 52476


### Eventos monitoreados

Cantidad de tweets para cada uno de los eventos monitoreados:

In [81]:
df['event'].value_counts()

event
elecciones    27448
1er debate    12588
2do debate    12440
Name: count, dtype: int64

### Fechas de eventos

Periodo cubierto para cada evento

In [82]:
debate1 = df.loc[df['event'].isin(['1er debate'])]
debate1_min = debate1['dt_date'].min()
debate1_max = debate1['dt_date'].max()
print(f'El primer debate contempla tweets desde {debate1_min} hasta {debate1_max}')

debate2 = df.loc[df['event'].isin(['2do debate'])]
debate2_min = debate2['dt_date'].min()
debate2_max = debate2['dt_date'].max()
print(f'El segundo debate contempla tweets desde {debate2_min} hasta {debate2_max}')

elecciones = df.loc[df['event'].isin(['elecciones'])]
elecciones_min = elecciones['dt_date'].min()
elecciones_max = elecciones['dt_date'].max()
print(f'Elecciones contempla tweets desde {elecciones_min} hasta {elecciones_max}')

El primer debate contempla tweets desde 2023-09-30 hasta 2023-10-02
El segundo debate contempla tweets desde 2023-11-11 hasta 2023-11-13
Elecciones contempla tweets desde 2023-11-16 hasta 2023-11-20


### Ataques identificados

> NOTA: Un tweet puede tener diversas etiquetas

In [83]:
#| include: true
conditions = ['women', 'politics', 'appearance', 'racism', 'class', 'lgbti', 'criminal', 'calls']
attacks = df.loc[df[conditions].isin([1]).any(axis=1)]
print(f'En los datos se identificaron {len(attacks)} publicaciones etiquetadas como ataques.')

En los datos se identificaron 3613 publicaciones etiquetadas como ataques.


### Ranking the periodistas más atacados

In [84]:
attacks['journalist_username'].value_counts()

journalist_username
@diegobranca        431
@JonatanViale       408
@Angelalerena       330
@edufeiok           299
@Cris_noticias      260
@odonnellmaria      244
@rialjorge          220
@robdnavarro        175
@vivicanosaok       135
@luisnovaresio      132
@guadavazquez       106
@Gatosylvestre       91
@majulluis           83
@NANCYPAZOS          73
@cyngarciaradio      66
@mjolivan            62
@fantinofantino      60
@rominamanguel       58
@nbg__               51
@lucianageuna        48
@ischargro           39
@marialauratv        36
@juliamengo          29
@aleberco            26
@anaecorrea          25
@ertenembaum         19
@VHMok               15
@barilirodolfo       14
@negropolisok        14
@diegoleuco           9
@hindelita            8
@Sietecase            8
@alfleuco             6
@andykusnetzoff       6
@wwnicolas            4
@MercedesFunes        4
@SANTIAGODELMORO      4
@deboraplager         4
@maclorena            3
@soyingridbeck        3
@Marcelitaojeda     

### Ranking de periodistas atacados por género
> NOTA: Clasificación binaria

In [85]:
attacks['journalist_genre'].value_counts()

journalist_genre
H    2049
M    1564
Name: count, dtype: int64

### Hombres periodistas más atacados

In [86]:
attacks_men = attacks.loc[attacks['journalist_genre'].isin(['H'])]
attacks_men['journalist_username'].value_counts()

journalist_username
@diegobranca        431
@JonatanViale       408
@edufeiok           299
@rialjorge          220
@robdnavarro        175
@luisnovaresio      132
@Gatosylvestre       91
@majulluis           83
@fantinofantino      60
@ischargro           39
@aleberco            26
@ertenembaum         19
@VHMok               15
@barilirodolfo       14
@diegoleuco           9
@Sietecase            8
@alfleuco             6
@andykusnetzoff       6
@SANTIAGODELMORO      4
@wwnicolas            4
Name: count, dtype: int64

### Mujeres periodistas más atacadas

In [87]:
attacks_women = attacks.loc[attacks['journalist_genre'].isin(['M'])]
attacks_women['journalist_username'].value_counts()

journalist_username
@Angelalerena      330
@Cris_noticias     260
@odonnellmaria     244
@vivicanosaok      135
@guadavazquez      106
@NANCYPAZOS         73
@cyngarciaradio     66
@mjolivan           62
@rominamanguel      58
@nbg__              51
@lucianageuna       48
@marialauratv       36
@juliamengo         29
@anaecorrea         25
@negropolisok       14
@hindelita           8
@deboraplager        4
@MercedesFunes       4
@maclorena           3
@soyingridbeck       3
@Marcelitaojeda      2
@SilvinaMolina       1
@monigps             1
@FlorHalfon          1
Name: count, dtype: int64

### Ranking the tipos de ataques para hombres

In [88]:
conditions = ['women', 'politics', 'appearance', 'racism', 'class', 'lgbti', 'criminal', 'calls']
attacks_men_count = attacks_men[conditions].sum()
attacks_men_count

women         518
politics      626
appearance    800
racism        124
class          73
lgbti          98
criminal       37
calls          30
dtype: int64

### Ranking the tipos de ataques para mujeres

In [89]:
conditions = ['women', 'politics', 'appearance', 'racism', 'class', 'lgbti', 'criminal', 'calls']
attacks_women_count = attacks_women[conditions].sum()
attacks_women_count

women         751
politics      529
appearance    312
racism         90
class          76
lgbti          30
criminal       12
calls          15
dtype: int64

### Número de ataques por tipo de evento

In [90]:
attacks['event'].value_counts()

event
elecciones    2027
2do debate     816
1er debate     770
Name: count, dtype: int64

### Ranking de eventos con más ataques para hombres

In [91]:
attacks_men['event'].value_counts()

event
elecciones    1061
2do debate     547
1er debate     441
Name: count, dtype: int64

### Ranking de eventos con más ataques para mujeres

In [92]:
attacks_women['event'].value_counts()

event
elecciones    966
1er debate    329
2do debate    269
Name: count, dtype: int64

### Hashtags 

20 hashtags más utilizados en los ataques:

In [95]:
attacks['hashtags'] = attacks['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'#\w+', x)) == 0 else re.findall(r'#\w+', x))

attacks['hashtags'] = attacks['hashtags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# convert dataframe column to list
hashtags = attacks['hashtags'].unique()

# remove nan items from list
hashtags = [x for x in hashtags if not pd.isna(x)]

# split items into a list based on a delimiter
hashtags = [x.split(',') for x in hashtags]

# flatten list of lists
hashtags = [item for sublist in hashtags for item in sublist]

# remove whitespaces
hashtags = list(map(lambda x: x.replace(' ', ''), hashtags))

# count items on list
hashtags_count = pd.Series(hashtags).value_counts()

# return first n rows in descending order
top_hashtags = hashtags_count.nlargest(20)

top_hashtags

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks['hashtags'] = attacks['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'#\w+', x)) == 0 else re.findall(r'#\w+', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks['hashtags'] = attacks['hashtags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)


#SeVanParaSiempre                4
#DebatePresidencial2023          3
#KirchnerismoNuncaMas            3
#Debate2023                      3
#lasmargaritas                   2
#VotoContraMassa                 2
#MileiPresidente                 2
#ElClubDelMoro                   2
#iPhone15ProMax                  1
#ElPuebloEnDefensaPropia         1
#MileiBasuraVosSosLaDictadura    1
#LameTujesK                      1
#fraude                          1
#Tenemos                         1
#NoAMilei                        1
#Hora17                          1
#NoAl5toGobiernoK                1
#NoVasASerPresidente             1
#partidodelacosta                1
#GORDITOLECHOSO                  1
Name: count, dtype: int64

### Menciones

20 usuarios más mencionados en los ataques:

In [None]:
attacks['mentions'] = attacks['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'@(\w+)', x)) == 0 else re.findall(r'@(\w+)', x))

attacks['mentions'] = attacks['mentions'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# convert dataframe column to list
mentions = attacks['mentions'].unique()

# remove nan items from list
mentions = [x for x in mentions if not pd.isna(x)]

# split items into a list based on a delimiter
mentions = [x.split(',') for x in mentions]

# flatten list of lists
mentions = [item for sublist in mentions for item in sublist]

# remove whitespaces
mentions = list(map(lambda x: x.replace(' ', ''), mentions))

# count items on list
mentions_count = pd.Series(mentions).value_counts()

# return first n rows in descending order
top_mentions = mentions_count.nlargest(20)

top_mentions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks['mentions'] = attacks['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'(@\w+)', x)) == 0 else re.findall(r'@(\w+)', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks['mentions'] = attacks['mentions'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)


JonatanViale      15
edufeiok          12
JMilei            12
SergioMassa       12
C5N               11
vivicanosaok      11
PatoBullrich      10
lanacionmas        9
Gatosylvestre      9
rialjorge          8
NANCYPAZOS         8
PRossiOficial      7
marialauratv       7
majulluis          6
diegobranca        6
myriambregman      6
luisnovaresio      6
aleberco           4
fantinofantino     4
robdnavarro        4
Name: count, dtype: int64

### Tokens

Lista del top 20 de palabras más comunes y su frecuencia:

In [98]:
# load the spacy model for Spanish
nlp = spacy.load("es_core_news_sm")

# load stop words for Spanish
STOP_WORDS = nlp.Defaults.stop_words

# Function to filter stop words
def filter_stopwords(text):
    # lower text
    doc = nlp(text.lower())
    # filter tokens
    tokens = [token.text for token in doc if not token.is_stop and token.text not in STOP_WORDS and token.is_alpha]
    return ' '.join(tokens)

# apply function to dataframe column
attacks['text_pre'] = attacks['text'].apply(filter_stopwords)

# count items on column
token_counts = attacks["text_pre"].str.split(expand=True).stack().value_counts()[:20]

token_counts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks['text_pre'] = attacks['text'].apply(filter_stopwords)


vos        421
sos        353
q          301
gordo      212
vas        156
lechoso    136
mierda     130
milei      123
gordito    121
viejo      111
zurdos     103
cara       101
zurda       98
tenes       97
asco        91
gato        90
massa       88
anda        84
gente       83
orto        76
Name: count, dtype: int64

### Tópicos

Técnica de modelado de tópicos con `transformers` y `TF-IDF`:

In [103]:
# remove urls, mentions, hashtags and numbers
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
attacks['text_pre'] = attacks['text_pre'].apply(lambda x: p.clean(x))


# filter column
docs = attacks['text_pre']

# calculate topics and probabilities
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)

# training
topics, probs = topic_model.fit_transform(docs)

# visualize topics
topic_model.visualize_topics()

2024-01-16 00:53:06,756 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 113/113 [00:35<00:00,  3.17it/s]
2024-01-16 00:53:45,419 - BERTopic - Embedding - Completed ✓
2024-01-16 00:53:45,420 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-01-16 00:54:06,087 - BERTopic - Dimensionality - Completed ✓
2024-01-16 00:54:06,088 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-01-16 00:54:06,682 - BERTopic - Cluster - Completed ✓
2024-01-16 00:54:06,696 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-01-16 00:54:06,816 - BERTopic - Representation - Completed ✓


### Reducción de tópicos

Mapa con 20 tópicos del contenido de los tweets:

In [104]:
# reduce the number of topics
topic_model.reduce_topics(docs, nr_topics=20)

# visualize topics
topic_model.visualize_topics()

2024-01-16 00:54:26,024 - BERTopic - Topic reduction - Reducing number of topics
2024-01-16 00:54:26,128 - BERTopic - Topic reduction - Reduced number of topics from 56 to 20


### Términos por tópico

In [105]:
topic_model.visualize_barchart(top_n_topics=20)

### Tópicos en el tiempo

In [106]:
# convert column to list
tweets = attacks['text_pre'].to_list()
timestamps = attacks['dt_date'].to_list()

topics_over_time = topic_model.topics_over_time(docs=tweets, 
                                                timestamps=timestamps, 
                                                global_tuning=True, 
                                                evolution_tuning=True, 
                                                nr_bins=20)

topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

5it [00:00, 22.62it/s]
