In [1]:
# | include: false

USERNAME = "@odonnellmaria"

# Imports
import pandas as pd
import numpy as np
import re
import spacy
import plotly.express as px
import plotly.io as pio

pio.renderers.default = "notebook"
pio.renderers.default = "plotly_mimetype+notebook_connected"

In [2]:
# | include: false

# read data
df = pd.read_csv("../data/processed/elecciones_argentina.csv")

attacks = df.dropna(subset=["label", "to_journalist"])

### Datos

In [3]:
df_attacks = attacks.loc[attacks["to_journalist"].isin([USERNAME])]
print(f"Número de ataques: {len(df_attacks)}")

Número de ataques: 256


In [4]:
journalist_mentions = len(df.loc[df["to_journalist"].isin([USERNAME])])
journalist_attacks = len(df_attacks)

percentage_attacks = (journalist_attacks / journalist_mentions) * 100

proportion = (percentage_attacks / 100) * 10
proportion_rounded = round(proportion, 1)


print(
    f"Aproximadamente {proportion_rounded} de cada 10 publicaciones que mencionan a {USERNAME} son ataques"
)

Aproximadamente 0.8 de cada 10 publicaciones que mencionan a @odonnellmaria son ataques


### Ranking the tipos de ataques

In [5]:
conditions = ['women', 'politics', 'appearance', 'racism', 'class', 'lgbti', 'criminal', 'calls']
attacks_count = df_attacks[conditions].sum()
attacks_count = attacks_count.sort_values(ascending=False)
attacks_count

politics      114
women         107
appearance     51
class          13
racism          8
lgbti           0
criminal        0
calls           0
dtype: int64

### Número de ataques por tipo de evento

In [6]:
df_attacks['event'].value_counts()

event
1er debate    113
2do debate     90
elecciones     53
Name: count, dtype: int64

### Publicaciones por evento

In [7]:
journalist_posts = df.loc[df['from_journalist'].isin([USERNAME])]
journalist_posts = journalist_posts.dropna(subset=['from_journalist'])

eventos = ['1er debate', '2do debate', 'elecciones']
colors = ['green', 'purple', 'orange']
eventos_count = {}

fig = px.line()

for i, evento in enumerate(eventos):
    evento_data = journalist_posts.loc[journalist_posts['event'].isin([evento])]
    evento_count = evento_data.groupby('dt_date').size().reset_index(name='count')
    eventos_count[evento] = evento_count
    fig.add_scatter(x=evento_count['dt_date'], y=evento_count['count'], name=evento, line=dict(color=colors[i]), hovertemplate='posts: %{y}')

fig.update_layout(title=f'Publicaciones de {USERNAME}', width=600)
fig.update_xaxes(type='category')
fig.update_yaxes(range=[0, 100]) 
fig.show()


### Hashtags

In [8]:
df_attacks['hashtags'] = df_attacks['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'#\w+', x)) == 0 else re.findall(r'#\w+', x))

df_attacks['hashtags'] = df_attacks['hashtags'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# convert dataframe column to list
hashtags = df_attacks['hashtags'].unique()

# remove nan items from list
hashtags = [x for x in hashtags if not pd.isna(x)]

# split items into a list based on a delimiter
hashtags = [x.split(',') for x in hashtags]

# flatten list of lists
hashtags = [item for sublist in hashtags for item in sublist]

# remove whitespaces
hashtags = list(map(lambda x: x.replace(' ', ''), hashtags))

# count items on list
hashtags_count = pd.Series(hashtags).value_counts()

hashtags_count



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



#Milei2023EnPrimeraVuelta    1
#coreadelcentro              1
#LameTujesK                  1
Name: count, dtype: int64

### Menciones

In [9]:
df_attacks['mentions'] = df_attacks['text'].apply(lambda x: np.nan if pd.isnull(x) or not isinstance(x, str) or len(re.findall(r'@(\w+)', x)) == 0 else re.findall(r'@(\w+)', x))

df_attacks['mentions'] = df_attacks['mentions'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# convert dataframe column to list
mentions = df_attacks['mentions'].unique()

# remove nan items from list
mentions = [x for x in mentions if not pd.isna(x)]

# split items into a list based on a delimiter
mentions = [x.split(',') for x in mentions]

# flatten list of lists
mentions = [item for sublist in mentions for item in sublist]

# remove whitespaces
mentions = list(map(lambda x: x.replace(' ', ''), mentions))

# count items on list
mentions_count = pd.Series(mentions).value_counts()

mentions_count



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



odonellmaria     1
odonnellmaria    1
SergioMassa      1
ertenembaum      1
Name: count, dtype: int64

### Tokens

In [10]:
# load the spacy model for Spanish
nlp = spacy.load("es_core_news_sm")

# load stop words for Spanish
STOP_WORDS = nlp.Defaults.stop_words

# Function to filter stop words
def filter_stopwords(text):
    # lower text
    doc = nlp(text.lower())
    # filter tokens
    tokens = [token.text for token in doc if not token.is_stop and token.text not in STOP_WORDS and token.is_alpha]
    return ' '.join(tokens)

# apply function to dataframe column
df_attacks['text_pre'] = df_attacks['text'].apply(filter_stopwords)

# count items on column
token_counts = df_attacks["text_pre"].str.split(expand=True).stack().value_counts()[:20]

token_counts



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



vos          44
sos          41
zurda        39
q            20
maría        13
ensobrada    11
anda          9
derecho       8
pauta         8
milei         8
k             8
vagos         7
mujer         7
gente         7
kuka          7
zurdos        7
vieja         7
puta          7
trabajo       6
tenes         6
Name: count, dtype: int64