In [1]:
#| include: false

# Imports
import pandas as pd
import pytz
import spacy
import preprocessor as p
from emoji import demojize
from bertopic import BERTopic
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

!python -m spacy download pt_core_news_sm

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


Collecting pt-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.5.0/pt_core_news_sm-3.5.0-py3-none-any.whl (13.0 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')


In [2]:
#| include: false

# read data
data = pd.read_csv('../data/processed/tweets.csv')

# replace 'T' from column
data['date'] = data['local_time'].str.replace('T', ' ')

# convert column to datetime and localize to UTC
data['date'] = pd.to_datetime(data['date'], errors='coerce').dt.tz_localize('UTC')

# filter data
df = data[data['user_screen_name'] == 'nikolas_dm']

# convert time column to Brasilia, Brazil timezone
df['date'] = df['date'].dt.tz_convert(pytz.timezone('America/Sao_Paulo'))

print(len(df))


Columns (6,10,18,19,20,26,28,38,39,40,45,46,47,48,54) have mixed types. Specify dtype option on import or set low_memory=False.



7035


### Datos

Información general sobre la base de datos

In [3]:
min_date = df['date'].min()

max_date = df['date'].max()

print(f"\nPeriodo de tweets recolectados: {min_date} / {max_date}\n")


Periodo de tweets recolectados: 2012-08-14 23:32:27-03:00 / 2023-03-21 12:03:54-03:00



In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7035 entries, 3582 to 10616
Data columns (total 63 columns):
 #   Column                   Non-Null Count  Dtype                            
---  ------                   --------------  -----                            
 0   query                    7035 non-null   object                           
 1   id                       7035 non-null   float64                          
 2   timestamp_utc            7035 non-null   int64                            
 3   local_time               7035 non-null   object                           
 4   user_screen_name         7035 non-null   object                           
 5   text                     7035 non-null   object                           
 6   possibly_sensitive       1841 non-null   object                           
 7   retweet_count            7035 non-null   float64                          
 8   like_count               7035 non-null   float64                          
 9   reply_cou

### Dominios

Lista del top 20 de otros sitios web mencionados en los tweets y su frecuencia

In [5]:
# count items on column
domains_list = df['domains'].value_counts()

# return first n rows in descending order
top_domains = domains_list.nlargest(20)

top_domains

domains
youtu.be                       96
twcm.me                        56
ask.fm                         24
youtube.com                    18
otempo.com.br                  16
google.com.br                  13
moi.st                         11
t.me                           10
instagram.com                   6
jornaldacidadeonline.com.br     6
bit.ly                          6
24.media.tumblr.com             5
em.com.br                       5
itatiaia.com.br                 5
g1.globo.com                    5
veja.abril.com.br               4
brasilsemmedo.com               4
25.media.tumblr.com             4
phelipe.com.br                  4
livrariadonikolas.com           4
Name: count, dtype: int64

### Hashtags

Lista del top 20 de hashtags más usados y su frecuencia

In [6]:
# convert dataframe column to list
hashtags = df['hashtags'].to_list()

# remove nan items from list
hashtags = [x for x in hashtags if not pd.isna(x)]

# split items into a list based on a delimiter
hashtags = [x.split('|') for x in hashtags]

# flatten list of lists
hashtags = [item for sublist in hashtags for item in sublist]

# count items on list
hashtags_count = pd.Series(hashtags).value_counts()

# return first n rows in descending order
top_hashtags = hashtags_count.nlargest(20)

top_hashtags

28000                       17
virabrasil                  12
ptnuncamais                  7
forakalil                    6
b28                          5
reagebh                      5
foramaia                     4
devolveodinheirojanones      4
gobolsonaromundial           3
mpnofelipeneto               3
bh                           3
belohorizonte                3
derretefelipeneto            3
familiascontrafelipeneto     3
paz                          3
bolsonaro2022                3
g1                           3
fechadocombolsonaro          3
deixaopovotrabalhar          2
nikolasnopânico              2
Name: count, dtype: int64

### Usuarios

Top 20 de usuarios más mencionados en los tweets

In [7]:
# filter column from dataframe
users = df['mentioned_names'].to_list()

# remove nan items from list
users = [x for x in users if not pd.isna(x)]

# split items into a list based on a delimiter
users = [x.split('|') for x in users]

# flatten list of lists
users = [item for sublist in users for item in sublist]

# count items on list
users_count = pd.Series(users).value_counts()

# return first n rows in descending order
top_users = users_count.nlargest(20)

top_users

jairbolsonaro      208
felipeneto         143
_portiinho         104
amanddok            99
brunoenglerdm       77
bolsonarosp         61
lorena_rcp          59
anamarciaac         56
lulaoficial         54
claramurta          52
taoquei1            51
alexandrekalil      47
danilogentili       43
andrejanonesadv     40
anaclara_ah         39
dededumontt         37
brendiinhasc        36
nikolas_dm          35
buenoosophia        34
fernandarian        33
Name: count, dtype: int64

### Likes en el tiempo

In [8]:
# plot the data using plotly
fig = px.line(df, 
              x='date', 
              y='like_count', 
              title='Likes over Time',
              template='plotly_white', 
              hover_data=['text'])

# show the plot
fig.show()

### Tokens

Lista del top 20 de los tokens más comunes y su frecuencia 

In [9]:
# load the spacy model for Portuguese
nlp = spacy.load("pt_core_news_sm")

# load stop words for Spanish
STOP_WORDS = nlp.Defaults.stop_words

# Function to filter stop words
def filter_stopwords(text):
    # lower text
    doc = nlp(text.lower())
    # filter tokens
    tokens = [token.text for token in doc if not token.is_stop and token.text not in STOP_WORDS and token.is_alpha]
    return ' '.join(tokens)

# apply function to dataframe column
df['text_pre'] = df['text'].apply(filter_stopwords)

# count items on column
token_counts = df["text_pre"].str.split(expand=True).stack().value_counts()[:20]

token_counts

pra           1013
bh             338
brasil         236
bolsonaro      234
lula           220
dia            213
esquerda       206
hoje           196
tá             194
gente          181
nao            177
pessoas        163
presidente     150
cara           149
kalil          145
deus           138
vei            126
pro            125
mundo          122
verdade        116
Name: count, dtype: int64

### Horas

Lista de las 10 horas con más cantidad de tweets publicados

In [10]:
# extract hour from datetime column
df['hour'] = df['date'].dt.strftime('%H')

# count items on column
hours_count = df['hour'].value_counts()

# return first n rows in descending order
top_hours = hours_count.nlargest(10)

top_hours

hour
21    623
19    592
20    533
18    524
14    504
13    488
22    484
12    469
17    463
16    406
Name: count, dtype: int64

### Pataformas

Plataformas desde las que se publicaron contenidos y su frecuencia

In [11]:
df['source_name'].value_counts()

source_name
Twitter for iPhone        5013
Twitter Web Client        1487
Twitter Web App            283
Twitter for Android        183
Twitcom - Comunidades       58
TwitCasting                 11
Name: count, dtype: int64

### Tópicos

Técnica de modelado de tópicos con `transformers` y `TF-IDF` 

In [12]:
# remove urls, mentions, hashtags and numbers
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
df['text_pre'] = df['text_pre'].apply(lambda x: p.clean(x))

# replace emojis with descriptions
df['text_pre'] = df['text_pre'].apply(lambda x: demojize(x))

# filter column
docs = df['text_pre']

# calculate topics and probabilities
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)

# training
topics, probs = topic_model.fit_transform(docs)

# visualize topics
topic_model.visualize_topics()

Batches:   0%|          | 0/220 [00:00<?, ?it/s]

2023-07-26 21:20:35,254 - BERTopic - Transformed documents to Embeddings
2023-07-26 21:21:14,451 - BERTopic - Reduced dimensionality
2023-07-26 21:21:19,603 - BERTopic - Clustered reduced embeddings


### Reducción de tópicos

Mapa con el 20% del total de tópicos generados

In [13]:
# calculate the 20% from the total of topics
num_topics = len(topic_model.get_topic_info())
per_topics = int(num_topics * 20 / 100)

# reduce the number of topics
topic_model.reduce_topics(docs, nr_topics=per_topics)

# visualize topics
topic_model.visualize_topics()

2023-07-26 21:24:30,885 - BERTopic - Reduced number of topics from 139 to 27


### Términos por tópico

In [14]:
topic_model.visualize_barchart(top_n_topics=per_topics)

### Análisis de tópicos
Selección de tópicos que tocan temas de género

In [18]:
# selection of topics
topics = [14]

keywords_list = []
for topic_ in topics:
    topic = topic_model.get_topic(topic_)
    keywords = [x[0] for x in topic]
    keywords_list.append(keywords)

# flatten list of lists
words_list = [item for sublist in keywords_list for item in sublist]

# use apply method with lambda function to filter rows
filtered_df = df[df['text_pre'].apply(lambda x: any(word in x for word in words_list))]

percentage = round(100 * len(filtered_df) / len(df), 2)
print(f"Del total de {len(df)} tweets de @nikolas_dm, alrededor de {len(filtered_df)} hablan sobre temas de género, es decir, cerca del {percentage}%")

print(f"Lista de palabras en tópicos {topics}:\n{words_list}")

Del total de 7035 tweets de @nikolas_dm, alrededor de 222 hablan sobre temas de género, es decir, cerca del 3.16%
Lista de palabras en tópicos [14]:
['mulher', 'aborto', 'feminista', 'feminismo', 'feministas', 'mulheres', 'movimento', 'chega', 'homem', 'chifre']


In [19]:
# drop rows with 0 values in two columns
filtered_df = filtered_df[(filtered_df.like_count != 0) & (filtered_df.retweet_count != 0)]

# add a new column with the sum of two columns
filtered_df['impressions'] = (filtered_df['like_count'] + filtered_df['retweet_count'])/2

# extract year from datetime column
filtered_df['year'] = filtered_df['date'].dt.year

# remove urls, mentions, hashtags and numbers
p.set_options(p.OPT.URL)
filtered_df['tweet_text'] = filtered_df['text'].apply(lambda x: p.clean(x))

# Create scatter plot
fig = px.scatter(filtered_df, x='like_count', 
                 y='retweet_count',
                 size='impressions', 
                 color='year',
                 hover_name='tweet_text')

# Update title and axis labels
fig.update_layout(
    title='Tweets talking about gender with most Likes and Retweets',
    xaxis_title='Number of Likes',
    yaxis_title='Number of Retweets'
)

fig.show()

### Tópicos en el tiempo

In [20]:
# convert column to list
tweets = df['text_pre'].to_list()
timestamps = df['local_time'].to_list()

topics_over_time = topic_model.topics_over_time(docs=tweets, 
                                                timestamps=timestamps, 
                                                global_tuning=True, 
                                                evolution_tuning=True, 
                                                nr_bins=20)

topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

10it [00:00, 13.12it/s]
