In [1]:
#| include: false

# Imports
import pandas as pd
import pytz
import spacy
import preprocessor as p
from emoji import demojize
from bertopic import BERTopic
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook_connected"

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
#| include: false

!python -m spacy download es_core_news_sm

Collecting es-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.5.0/es_core_news_sm-3.5.0-py3-none-any.whl (12.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')


In [3]:
#| include: false

# read data
data = pd.read_csv('../../data/processed/tweets.csv')


Columns (6,10,18,19,20,26,28,38,39,40,45,46,47,48,54) have mixed types. Specify dtype option on import or set low_memory=False.



In [4]:
#| include: false

# replace 'T' from column
data['date'] = data['local_time'].str.replace('T', ' ')

# Convert column to datetime and localize to UTC
data['date'] = pd.to_datetime(data['date'], errors='coerce').dt.tz_localize('UTC')

data.head(1)

Unnamed: 0,query,id,timestamp_utc,local_time,user_screen_name,text,possibly_sensitive,retweet_count,like_count,reply_count,...,media_types,media_alt_texts,mentioned_names,mentioned_ids,hashtags,intervention_type,intervention_text,intervention_url,country,date
0,from:TommyZambranoM,1.638175e+18,1679406309,2023-03-21T13:45:09,TommyZambranoM,Los Nacionalistas para lograr la renovación de...,0.0,30.0,117.0,58.0,...,video,,pnh_oficial,201589327,librenuncamas,,,,Honduras,2023-03-21 13:45:09+00:00


In [5]:
#| include: false

# filter data
df = data[data['user_screen_name'] == 'UnidosxlaVidaCo']

# convert time column to Brasilia, Brazil timezone
df['date'] = df['date'].dt.tz_convert(pytz.timezone('America/Bogota'))

print(len(df))

7830


### Información

Los datos de este usuario cubren desde la creación de la cuenta `2011-06-14` hasta `2023-03-01`

In [6]:
#| include: false

df['date'].min()

Timestamp('2011-06-14 20:50:03-0500', tz='America/Bogota')

In [7]:
#| include: false

df['date'].max()

Timestamp('2023-03-01 12:22:56-0500', tz='America/Bogota')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7830 entries, 171420 to 179249
Data columns (total 63 columns):
 #   Column                   Non-Null Count  Dtype                         
---  ------                   --------------  -----                         
 0   query                    7830 non-null   object                        
 1   id                       7830 non-null   float64                       
 2   timestamp_utc            7830 non-null   int64                         
 3   local_time               7830 non-null   object                        
 4   user_screen_name         7830 non-null   object                        
 5   text                     7830 non-null   object                        
 6   possibly_sensitive       4135 non-null   object                        
 7   retweet_count            7830 non-null   float64                       
 8   like_count               7830 non-null   float64                       
 9   reply_count              7830 non-null 

### Datos

In [9]:
df.head(3)

Unnamed: 0,query,id,timestamp_utc,local_time,user_screen_name,text,possibly_sensitive,retweet_count,like_count,reply_count,...,media_types,media_alt_texts,mentioned_names,mentioned_ids,hashtags,intervention_type,intervention_text,intervention_url,country,date
171420,from:UnidosxlaVidaCo,1.630982e+18,1677691376,2023-03-01T17:22:56,UnidosxlaVidaCo,#QuienEsBeatriz y porque la @CorteIDH quiere c...,,8.0,7.0,0.0,...,,,corteidh,190706828.0,quienesbeatriz,,,,Colombia,2023-03-01 12:22:56-05:00
171421,from:UnidosxlaVidaCo,1.630982e+18,1677691337,2023-03-01T17:22:17,UnidosxlaVidaCo,Una nueva intervención de la @CorteIDH para im...,,6.0,4.0,0.0,...,,,corteidh,190706828.0,quienesbeatriz,,,,Colombia,2023-03-01 12:22:17-05:00
171422,from:UnidosxlaVidaCo,1.630981e+18,1677691259,2023-03-01T17:20:59,UnidosxlaVidaCo,#QuienEsBeatriz « LideresXlaVida: Beatriz Vs. ...,,6.0,6.0,0.0,...,photo,,,,quienesbeatriz,,,,Colombia,2023-03-01 12:20:59-05:00


### Dominios
Lista del top 20 de dominios mencionados en los tweets y su frecuencia:

In [10]:
# count items on column
domains = df['domains'].value_counts()

# return first n rows in descending order
top_domains = domains.nlargest(20)

top_domains

domains
fb.me                     1231
bit.ly                     242
unidosporlavida.com        193
facebook.com               171
instagram.com              125
sumall.com                  98
youtube.com                 68
lifenews.com                40
citizengo.org               36
youtu.be                    33
20ft.net                    33
aciprensa.com               19
votocatolico.co             18
actuall.com                 15
shar.es                     15
liveactionnews.org          15
twitter.com                 12
es.gaudiumpress.org         12
religionenlibertad.com      10
razonmasfe.com               8
Name: count, dtype: int64

### Hashtags

Lista del top 20 de hashtags más usados y su frecuencia

In [11]:
# convert dataframe column to list
hashtags = df['hashtags'].to_list()

# remove nan items from list
hashtags = [x for x in hashtags if not pd.isna(x)]

# split items into a list based on a delimiter
hashtags = [x.split('|') for x in hashtags]

# flatten list of lists
hashtags = [item for sublist in hashtags for item in sublist]

# count items on list
hashtags_count = pd.Series(hashtags).value_counts()

# return first n rows in descending order
top_hashtags = hashtags_count.nlargest(20)

top_hashtags

sialavida                647
aborto                   416
9marchaxlavida           373
noalaborto               325
colombiaesprovida        295
eutanasia                157
procuradorordóñez        139
sialprocurador           138
yosoyprovida             135
soyprovida               108
negocio                  106
repost                   100
todavidaimporta          100
elijolas2vidas            98
colombia                  93
eutanasiano               91
abortocero                91
fiestaxlavida             91
4mayo7marchaporlavida     89
caravanaporlavida         88
Name: count, dtype: int64

### Usuarios

Top 20 de usuarios más mencionados en los tweets

In [12]:
# filter column from dataframe
users = df['mentioned_names'].to_list()

# remove nan items from list
users = [x for x in users if not pd.isna(x)]

# split items into a list based on a delimiter
users = [x.split('|') for x in users]

# flatten list of lists
users = [item for sublist in users for item in sublist]

# count items on list
users_count = pd.Series(users).value_counts()

# return first n rows in descending order
top_users = users_count.nlargest(20)

top_users

marceposada        196
colombiaprovida    194
cconstitucional    176
monicaroa          173
sialprocurador     106
unidosxlavidaco    105
noticiasrcn         83
7marcofidelr        62
amadarosa           59
referendoxvida      51
colombiaderecha     49
profamiliacol       48
oea_oficial         47
comisionprimera     42
camaracolombia      40
lam_vero            36
wradiocolombia      35
unidosxlavida       35
yosoyprovida        34
aciprensa           32
Name: count, dtype: int64

### Likes en el tiempo

In [13]:
# plot the data using plotly
fig = px.line(df, 
              x='date', 
              y='like_count', 
              title='Número de likes en el tiempo',
              template='plotly_white', 
              hover_data=['text'])

# show the plot
fig.show()

### Tokens

Lista del top 20 de los tokens más comunes y su frecuencia 

In [14]:
# load the spacy model for Spanish
nlp = spacy.load("es_core_news_sm")

# load stop words for Spanish
STOP_WORDS = nlp.Defaults.stop_words

# Function to filter stop words
def filter_stopwords(text):
    # lower text
    doc = nlp(text.lower())
    # filter tokens
    tokens = [token.text for token in doc if not token.is_stop and token.text not in STOP_WORDS and token.is_alpha]
    return ' '.join(tokens)

# apply function to dataframe column
df['text_pre'] = df['text'].apply(filter_stopwords)

# count items on column
token_counts = df["text_pre"].str.split(expand=True).stack().value_counts()[:20]

token_counts

vida                 2070
aborto               1097
colombia              719
sialavida             661
colombiaesprovida     437
mayo                  390
q                     388
noalaborto            370
eutanasia             323
derecho               323
gracias               309
provida               308
muerte                268
feliz                 268
d                     263
voz                   250
mujer                 222
familia               210
mujeres               204
concepción            191
Name: count, dtype: int64

### Hora

Lista de las 10 horas con más cantidad de tweets publicados

In [15]:
# extract hour from datetime column
df['hour'] = df['date'].dt.strftime('%H')

# count items on column
hours_count = df['hour'].value_counts()

# return first n rows in descending order
top_hours = hours_count.nlargest(10)

top_hours

hour
11    786
10    737
12    677
09    622
14    525
13    519
08    519
07    448
19    426
15    403
Name: count, dtype: int64

### Pataformas

Plataformas desde las que se publicaron contenidos y su frecuencia

In [16]:
df['source_name'].value_counts()

source_name
Twitter for iPhone             2031
Twitter Web App                1706
Twitter Web Client             1487
Facebook                       1468
Twitter for Android             412
Mobile Web                      163
TweetDeck                       133
erased88075                     131
Twitter for Websites            124
Instagram                        99
UberSocial for iPhone            22
Mobile Web (M2)                  12
iOS                              11
Twitter for Android Tablets      10
Twitter for Mac                   7
Tweeet! on iOS                    4
Hootsuite Inc.                    3
Buffer                            3
Hootsuite                         2
Twibbon                           1
Periscope                         1
Name: count, dtype: int64

### Tópicos

Técnica de modelado de tópicos con `transformers` y `TF-IDF` 

In [17]:
#| include: false

# Remove urls, mentions, hashtags and numbers
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.NUMBER)
df['text_pre'] = df['text_pre'].apply(lambda x: p.clean(x))

# Replace emojis with descriptions
df['text_pre'] = df['text_pre'].apply(lambda x: demojize(x))

In [18]:
#| include: false

# filter column
docs = df['text_pre']

# calculate topics and probabilities
topic_model = BERTopic(language="multilingual", calculate_probabilities=True, verbose=True)

# training
topics, probs = topic_model.fit_transform(docs)

Batches:   0%|          | 0/245 [00:00<?, ?it/s]

2023-07-05 19:07:30,745 - BERTopic - Transformed documents to Embeddings
2023-07-05 19:07:57,445 - BERTopic - Reduced dimensionality
2023-07-05 19:08:02,223 - BERTopic - Clustered reduced embeddings


In [19]:
# visualize topics
topic_model.visualize_topics()

### Reducción de tópicos

Mapa con 10 tópicos del contenido de los tweets

In [20]:
#| include: false

# reduce the number of topics
topic_model.reduce_topics(docs, nr_topics=11)

2023-07-05 19:08:07,230 - BERTopic - Reduced number of topics from 144 to 11


<bertopic._bertopic.BERTopic at 0x7f154a4d7880>

In [21]:
# visualize topics
topic_model.visualize_topics()

### Términos por tópico

In [22]:
topic_model.visualize_barchart(top_n_topics=11)

### Análisis de tópicos

Selección de tópicos que tocan temas de género

In [38]:
# selection of topics
topics = [0, 1, 2]

keywords_list = []
for topic_ in topics:
    topic = topic_model.get_topic(topic_)
    keywords = [x[0] for x in topic]
    keywords_list.append(keywords)

# flatten list of lists
word_list = [item for sublist in keywords_list for item in sublist]

# use apply method with lambda function to filter rows
filtered_df = df[df['text_pre'].apply(lambda x: any(word in x for word in word_list))]

percentage = round(100 * len(filtered_df) / len(df), 2)
print(f"Del total de {len(df)} tweets de @UnidosxlaVidaCo, alrededor de {len(filtered_df)} hablan sobre temas de género, es decir, cerca del {percentage}%")

Del total de 7830 tweets de @UnidosxlaVidaCo, alrededor de 5573 hablan sobre temas de género, es decir, cerca del 71.17%


In [27]:
# drop rows with 0 values in two columns
filtered_df = filtered_df[(filtered_df.like_count != 0) & (filtered_df.retweet_count != 0)]

# add a new column with the sum of two columns
filtered_df['impressions'] = (filtered_df['like_count'] + filtered_df['retweet_count'])/2

# extract year from datetime column
filtered_df['year'] = filtered_df['date'].dt.year

# remove urls, mentions, hashtags and numbers
p.set_options(p.OPT.URL)
filtered_df['tweet_text'] = filtered_df['text'].apply(lambda x: p.clean(x))

# Create scatter plot
fig = px.scatter(filtered_df, x='like_count', 
                 y='retweet_count',
                 size='impressions', 
                 color='year',
                 hover_name='tweet_text')

# Update title and axis labels
fig.update_layout(
    title='Tweets talking about gender with most Likes and Retweets',
    xaxis_title='Number of Likes',
    yaxis_title='Number of Retweets'
)

fig.show()

### Tópicos en el tiempo

In [25]:
# convert column to list
tweets = df['text_pre'].to_list()
timestamps = df['local_time'].to_list()

topics_over_time = topic_model.topics_over_time(docs=tweets, 
                                                timestamps=timestamps, 
                                                global_tuning=True, 
                                                evolution_tuning=True, 
                                                nr_bins=20)

topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

20it [00:00, 32.51it/s]
