In [1]:
#| include: false

# Imports
import pandas as pd
import pytz
import spacy
import preprocessor as p
from emoji import demojize
from bertopic import BERTopic
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
#| include: false

!python -m spacy download pt_core_news_sm

Collecting pt-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.5.0/pt_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m20.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [3]:
#| include: false

# read data
data = pd.read_csv('../../data/processed/tweets.csv')


Columns (6,10,18,19,20,26,28,38,39,40,45,46,47,48,54) have mixed types. Specify dtype option on import or set low_memory=False.



In [4]:
#| include: false

# replace 'T' from column
data['date'] = data['local_time'].str.replace('T', ' ')

# Convert column to datetime and localize to UTC
data['date'] = pd.to_datetime(data['date'], errors='coerce').dt.tz_localize('UTC')

data.head(1)

Unnamed: 0,query,id,timestamp_utc,local_time,user_screen_name,text,possibly_sensitive,retweet_count,like_count,reply_count,...,media_types,media_alt_texts,mentioned_names,mentioned_ids,hashtags,intervention_type,intervention_text,intervention_url,country,date
0,from:TommyZambranoM,1.638175e+18,1679406309,2023-03-21T13:45:09,TommyZambranoM,Los Nacionalistas para lograr la renovación de...,0.0,30.0,117.0,58.0,...,video,,pnh_oficial,201589327,librenuncamas,,,,Honduras,2023-03-21 13:45:09+00:00


In [5]:
#| include: false

# filter data
df = data[data['user_screen_name'] == 'brasilsemaborto']

# convert time column to Brasilia, Brazil timezone
df['date'] = df['date'].dt.tz_convert(pytz.timezone('America/Sao_Paulo'))

print(len(df))

1411


### Información

Los datos de este usuario cubren desde la creación de la cuenta `2009-07-31` hasta `2023-01-01`

In [6]:
#| include: false

df['date'].min()

Timestamp('2009-07-31 18:19:18-0300', tz='America/Sao_Paulo')

In [7]:
#| include: false

df['date'].max()

Timestamp('2023-01-01 10:45:02-0300', tz='America/Sao_Paulo')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1411 entries, 199416 to 200826
Data columns (total 63 columns):
 #   Column                   Non-Null Count  Dtype                            
---  ------                   --------------  -----                            
 0   query                    1411 non-null   object                           
 1   id                       1411 non-null   float64                          
 2   timestamp_utc            1411 non-null   int64                            
 3   local_time               1411 non-null   object                           
 4   user_screen_name         1411 non-null   object                           
 5   text                     1411 non-null   object                           
 6   possibly_sensitive       623 non-null    object                           
 7   retweet_count            1411 non-null   float64                          
 8   like_count               1411 non-null   float64                          
 9   reply_

### Datos

In [9]:
# count items on column
domains_list = df['domains'].value_counts()

# return first n rows in descending order
top_domains = domains_list.nlargest(20)

top_domains

domains
bit.ly                                                                    94
brasilsemaborto.org                                                       87
youtube.com                                                               30
youtu.be                                                                  29
wp.me                                                                     28
instagram.com                                                             24
facebook.com                                                              19
twitpic.com                                                               19
brasilsemaborto.com.br                                                    11
gazetadopovo.com.br                                                        8
camara.leg.br                                                              8
www12.senado.gov.br                                                        8
twitpic.com|twitpic.com                                             

### Hashtags

Lista del top 20 de hashtags más usados y su frecuencia

In [10]:
# convert dataframe column to list
hashtags = df['hashtags'].to_list()

# remove nan items from list
hashtags = [x for x in hashtags if not pd.isna(x)]

# split items into a list based on a delimiter
hashtags = [x.split('|') for x in hashtags]

# flatten list of lists
hashtags = [item for sublist in hashtags for item in sublist]

# count items on list
hashtags_count = pd.Series(hashtags).value_counts()

# return first n rows in descending order
top_hashtags = hashtags_count.nlargest(20)

top_hashtags

brasilsemaborto          271
marchavirtualpelavida     87
codigopenal               73
stfabortonao              71
pelas2vidas               61
mulhersimabortonao        35
marchapelavida            33
abortoépreconceito        33
asduasvidasimportam       33
10anos                    30
estatutodonascituro       28
afavordavida              24
brasilpelasduasvidas      16
verdadepelavida           16
avidadependedoseuvoto     15
simàvida                  12
diadonascituro            10
anencefalo                 9
avidaporumfio              9
stfdiganaoaoaborto         7
Name: count, dtype: int64

### Usuarios

Top 20 de usuarios más mencionados en los tweets

In [11]:
# filter column from dataframe
users = df['mentioned_names'].to_list()

# remove nan items from list
users = [x for x in users if not pd.isna(x)]

# split items into a list based on a delimiter
users = [x.split('|') for x in users]

# flatten list of lists
users = [item for sublist in users for item in sublist]

# count items on list
users_count = pd.Series(users).value_counts()

# return first n rows in descending order
top_users = users_count.nlargest(20)

top_users

lenisegarcia       41
brasilsemaborto    29
addthis             8
rebeccakiesslin     6
stf_oficial         6
anabeatrizries      5
mpf_pgr             4
cnnoticias          4
luh_lena            4
anadep_brasil       4
gazetadopovo        4
jorgeferraz         4
veja                3
alosenado           3
wagnermoura         3
angela_gandra       3
jornaldacbn         3
addtoany            2
agenciacamara       2
eunicio             2
Name: count, dtype: int64

### Likes en el tiempo

In [12]:
# plot the data using plotly
fig = px.line(df, 
              x='date', 
              y='like_count', 
              title='Likes over Time',
              template='plotly_white', 
              hover_data=['text'])

# show the plot
fig.show()

### Tokens

Lista del top 20 de los tokens más comunes y su frecuencia 

In [13]:
# load the spacy model for Portuguese
nlp = spacy.load("pt_core_news_sm")

# load stop words for Spanish
STOP_WORDS = nlp.Defaults.stop_words

# Function to filter stop words
def filter_stopwords(text):
    # lower text
    doc = nlp(text.lower())
    # filter tokens
    tokens = [token.text for token in doc if not token.is_stop and token.text not in STOP_WORDS and token.is_alpha]
    return ' '.join(tokens)

# apply function to dataframe column
df['text_pre'] = df['text'].apply(filter_stopwords)

# count items on column
token_counts = df["text_pre"].str.split(expand=True).stack().value_counts()[:20]

token_counts

vida                     494
aborto                   306
brasilsemaborto          274
marcha                   165
brasil                   131
nacional                 110
dia                      109
marchavirtualpelavida     88
movimento                 85
nascituro                 77
participe                 74
codigopenal               73
mãe                       71
defesa                    71
stfabortonao              70
hoje                      66
estatuto                  65
código                    64
saiba                     63
acompanhe                 60
Name: count, dtype: int64

### Hora

Lista de las 10 horas con más cantidad de tweets publicados

In [14]:
# extract hour from datetime column
df['hour'] = df['date'].dt.strftime('%H')

# count items on column
hours_count = df['hour'].value_counts()

# return first n rows in descending order
top_hours = hours_count.nlargest(10)

top_hours

hour
11    193
10    161
16    130
15    128
19    111
12    106
09     94
17     91
14     72
18     57
Name: count, dtype: int64

### Pataformas

Plataformas desde las que se publicaron contenidos y su frecuencia

In [15]:
df['source_name'].value_counts()

source_name
Twitter for Android            479
Twitter Web Client             453
Plume for Android              161
Twitter Web App                 89
Twitter for iPhone              60
TweetDeck                       42
Jetpack.com                     41
Twitter for Websites            26
TweetCaster for Android         18
Posterous                       16
Gravity                         15
Twitter for Android Tablets      7
Gravity!                         2
Twibbon                          1
Twitpic                          1
Name: count, dtype: int64

### Tópicos

Técnica de modelado de tópicos con `transformers` y `TF-IDF` 

In [16]:
#| include: false

# Remove urls, mentions, hashtags and numbers
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
df['text_pre'] = df['text_pre'].apply(lambda x: p.clean(x))

# Replace emojis with descriptions
df['text_pre'] = df['text_pre'].apply(lambda x: demojize(x))

In [17]:
#| include: false

# filter column
docs = df['text_pre']

# calculate topics and probabilities
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)

# training
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/45 [00:00<?, ?it/s]

2023-07-06 01:46:45,513 - BERTopic - Transformed documents to Embeddings
2023-07-06 01:46:56,483 - BERTopic - Reduced dimensionality
2023-07-06 01:46:56,612 - BERTopic - Clustered reduced embeddings


In [18]:
# visualize topics
topic_model.visualize_topics()

### Reducción de tópicos

Mapa con 10 tópicos del contenido de los tweets

In [19]:
#| include: false

# reduce the number of topics
topic_model.reduce_topics(docs, nr_topics=11)

2023-07-06 01:46:58,105 - BERTopic - Reduced number of topics from 31 to 11


<bertopic._bertopic.BERTopic at 0x7f0ba429e490>

In [20]:
# visualize topics
topic_model.visualize_topics()

### Términos por tópico

In [21]:
topic_model.visualize_barchart(top_n_topics=11)

In [22]:
#| include: false

# list of words
word_list = ['brasilsemaborto', 
             'marchavirtualpelavida', 
             'stfabortonao ', 
             'pelas2vidas', 
             'mulhersimabortonao', 
             'marchapelavida', 
             'abortoépreconceito', 
             'asduasvidasimportam', 
             'afavordavida',
             'brasilpelasduasvidas',
             'verdadepelavida',
             'avidadependedoseuvoto',
             'simàvida',
             'diadonascituro'
             'anencefalo',
             'avidaporumfio',
             'stfdiganaoaoaborto',
             'comunicandoavida',
             '40daysforlife',
             'vidapraviver',
             'dilmaborto',
             'stfabortonão',
             'contraoaborto',
             'codigopenalsemaborto',
             'afavaordavida',
             'pelasduasvidas',
             'pelavida',
             'aborto'
             ]

# use apply method with lambda function to filter rows
filtered_df = df[df['text_pre'].apply(lambda x: any(word in x for word in word_list))]

percentage = round(100 * len(filtered_df) / len(df), 2)
print(f"Del total de {len(df)} tweets de @brasilsemaborto, alrededor de {len(filtered_df)} hablan sobre temas de género, es decir, cerca del {percentage}%")

Del total de 1411 tweets de @brasilsemaborto, alrededor de 698 hablan sobre temas de género, es decir, cerca del 49.47%


### Análisis de tópicos
Selección de tópicos que tocan temas de género

In [23]:
# selection of topics
topics = [0, 2]

keywords_list = []
for topic_ in topics:
    topic = topic_model.get_topic(topic_)
    keywords = [x[0] for x in topic]
    keywords_list.append(keywords)

# flatten list of lists
word_list = [item for sublist in keywords_list for item in sublist]

# use apply method with lambda function to filter rows
filtered_df = df[df['text_pre'].apply(lambda x: any(word in x for word in word_list))]

percentage = round(100 * len(filtered_df) / len(df), 2)
print(f"Del total de {len(df)} tweets de @brasilsemaborto, alrededor de {len(filtered_df)} hablan sobre temas de género, es decir, cerca del {percentage}%")

Del total de 1411 tweets de @brasilsemaborto, alrededor de 1020 hablan sobre temas de género, es decir, cerca del 72.29%


In [24]:
# drop rows with 0 values in two columns
filtered_df = filtered_df[(filtered_df.like_count != 0) & (filtered_df.retweet_count != 0)]

# add a new column with the sum of two columns
filtered_df['impressions'] = (filtered_df['like_count'] + filtered_df['retweet_count'])/2

# extract year from datetime column
filtered_df['year'] = filtered_df['date'].dt.year

# remove urls, mentions, hashtags and numbers
p.set_options(p.OPT.URL)
filtered_df['tweet_text'] = filtered_df['text'].apply(lambda x: p.clean(x))

# Create scatter plot
fig = px.scatter(filtered_df, x='like_count', 
                 y='retweet_count',
                 size='impressions', 
                 color='year',
                 hover_name='tweet_text')

# Update title and axis labels
fig.update_layout(
    title='Tweets talking about gender with most Likes and Retweets',
    xaxis_title='Number of Likes',
    yaxis_title='Number of Retweets'
)

fig.show()

### Tópicos en el tiempo

In [25]:
# convert column to list
tweets = df['text_pre'].to_list()
timestamps = df['local_time'].to_list()

topics_over_time = topic_model.topics_over_time(docs=tweets, 
                                                timestamps=timestamps, 
                                                global_tuning=True, 
                                                evolution_tuning=True, 
                                                nr_bins=20)

topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

20it [00:00, 47.54it/s]
