In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv
import os
import sys
from pathlib import Path
from aves.config import setup_style

# Load environment variables before any os.environ lookup below
# (presumably from a local .env file — see python-dotenv; confirm location).
load_dotenv()
# Project-provided plotting setup from aves.config.
setup_style()

# Base directory for project data (data/external/ is read from it later).
# Raises KeyError if AVES_ROOT is not defined in the environment.
AVES_ROOT = Path(os.environ['AVES_ROOT'])

In [3]:
TWEET_PATH = Path(os.environ['TWEET_PATH'])

In [None]:
TWEET_PATH

In [5]:
from glob import glob

In [None]:
# Every path matching *.gz directly under TWEET_PATH, sorted lexicographically.
tweet_files = sorted(glob(str(TWEET_PATH / '*.gz')))
# Show the first three paths as a sanity check.
tweet_files[0:3]

In [None]:
import pandas as pd

# Load only the first file to inspect its structure.
# lines=True: the file is read as one JSON record per line.
tweets = pd.read_json(tweet_files[0], orient='records', lines=True, dtype={'created_at': 'datetime'})
tweets.head()

In [None]:
tweets['user.location'].value_counts().head(25)

In [None]:
tweets['text']

In [None]:
tweets['created_at'].min(), tweets['created_at'].min().round('10min')

In [None]:
from aves.features.twokenize import tokenize

tweets.head()['text'].map(tokenize)

In [12]:
from collections import Counter
from functools import lru_cache

def clean_df(df):
    """Drop rows whose ``user.location`` matches the location exclusion list.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"user.location"`` column holding strings; missing
        values are allowed.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose location does not match the exclusion
        pattern (case-insensitive regex search). Rows with a missing
        location are kept, since nothing places them in an excluded location.
    """
    excluded = df["user.location"].str.contains(
        "Argent|Colom|Perú|Ecuador|Bolivia|México|Mexico|España|Cuba|Lima|Dominicana|Costa Rica|Uruguay|Paraguay|El Salvador|Venezuela",
        regex=True,
        case=False,
        # Without na=False a missing location yields NaN, the mask becomes
        # object dtype and the `~` below raises TypeError.
        na=False,
    )
    return df[~excluded]




In [None]:
import dask
import dask.dataframe as dd

# Read all files through dask, keep only the five columns used in this
# notebook, and call .compute() so the result is materialized in memory
# rather than left as a lazy dask collection.
all_tweets = dd.read_json(
    tweet_files, orient="records", lines=True, dtype={"created_at": "datetime"}
)[["created_at", "text", "user.location", "user.id", "user.description"]].compute()
all_tweets

In [None]:
def count_tokens(series):
    """Count how often each token appears across a Series of texts.

    Every text is lower-cased and passed to ``tokenize``; the tokens of all
    texts are accumulated into one ``Counter``.

    Parameters
    ----------
    series : pandas.Series
        Texts to tokenize (accessed through the ``.str`` accessor).

    Returns
    -------
    collections.Counter
        Token -> number of occurrences over the whole Series.
    """
    # A new cache per call: identical lower-cased texts are tokenized once,
    # with at most 512 distinct texts remembered.
    cached_tokenize = lru_cache(512)(tokenize)

    totals = Counter()
    token_lists = series.str.lower().map(cached_tokenize)
    for token_list in token_lists:
        totals.update(token_list)
    return totals

# Smoke test of the cleaning + counting steps on the first 1000 tweets only.
count_test = count_tokens(clean_df(all_tweets.head(1000))['text'])
count_test

In [None]:
len(count_test)

In [None]:
count_test.most_common(50)

In [None]:
# Apply the location filter, then keep tweets created on or after 2022-05-01.
cleaned_tweets = clean_df(all_tweets).loc[
    lambda frame: frame["created_at"] >= "2022-05-01"
]
# Row counts before and after cleaning.
len(all_tweets), len(cleaned_tweets)

In [None]:
cleaned_tweets.resample('10min', on='created_at').size().plot()

In [None]:
# "Total" counts: only 33% of the tweets are tokenized. This simplifies the
# computation while keeping the large-scale patterns.
# The sample is seeded (same seed as the LDA model) so that these counts, and
# the top-word vocabulary derived from them, are identical on every run.
total_count = count_tokens(cleaned_tweets.sample(frac=0.33, random_state=666)['text'])
total_count.most_common(25)

In [None]:
from aves.visualization.text import draw_wordcloud
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
draw_wordcloud(ax, total_count)

In [None]:
# Spanish stopwords contain accented characters, so the encoding is explicit:
# without it open() falls back to the platform's locale encoding, which is not
# UTF-8 everywhere. Assumes the file is stored as UTF-8 — confirm.
with open(AVES_ROOT / 'data' / 'external' / 'stopwords-es.txt', encoding='utf-8') as f:
    # Whitespace-separated words, de-duplicated into a set for fast lookups.
    stopwords = set(f.read().split())

len(stopwords)

In [None]:
import string
# ASCII punctuation plus typographic quotes, guillemets, inverted marks and
# the ellipsis character. NOTE: this is a single str, so `token in punctuation`
# is a substring test — multi-character tokens such as "./" also match.
punctuation = string.punctuation + "“”‘’«»¡¿…"
punctuation


In [None]:
from cytoolz import keyfilter

def filter_tokens(counts):
    """Drop stopword and punctuation keys from a token-count mapping.

    Parameters
    ----------
    counts : mapping
        Token -> count, e.g. the ``Counter`` returned by ``count_tokens``.

    Returns
    -------
    The mapping produced by ``keyfilter`` holding only the entries whose key
    is not in ``stopwords``, not contained in ``punctuation`` and not one of
    ``'..'`` / ``'...'``.
    """
    def is_content_token(token):
        if token in stopwords:
            return False
        # `punctuation` is a str: this is a substring check, not a set lookup.
        if token in punctuation:
            return False
        return token not in ['..', '...']

    return keyfilter(is_content_token, counts)

# Same counts as before, minus stopwords and punctuation tokens.
total_count_filtered = filter_tokens(total_count)

# Word cloud of the filtered counts; the axes frame is hidden.
fig, ax = plt.subplots()
draw_wordcloud(ax, total_count_filtered)
ax.set_axis_off()

In [None]:
len(total_count_filtered)

In [None]:
# Word/frequency table built from the filtered counts, most frequent first.
df_words = pd.DataFrame(
    total_count_filtered.items(),
    columns=['word', 'frequency'],
).sort_values(by='frequency', ascending=False)
df_words

In [None]:
df_words['frequency'].plot(kind='hist', bins=100, logy=True)

In [None]:
# Vocabulary for the rest of the analysis: the first 1000 rows of df_words,
# i.e. the 1000 most frequent words given its descending sort.
top_words = set(df_words['word'].head(1000))
top_words

In [None]:
def keep_top_only(counts):
    """Restrict a token-count mapping to the ``top_words`` vocabulary.

    ``counts`` is first passed through ``filter_tokens``; of what remains,
    only the entries whose key is in ``top_words`` are returned.
    """
    def is_top_word(token):
        return token in top_words

    content_counts = filter_tokens(counts)
    return keyfilter(is_top_word, content_counts)


# Top-word counts per 10-minute bin, computed on a 5% sample of the tweets.
# The sample is seeded (same seed as the LDA model) so the plots that follow
# show the same data on every run.
words_x_time = (
    cleaned_tweets.sample(frac=0.05, random_state=666)
    .resample("10min", on="created_at")
    .aggregate(lambda x: keep_top_only(count_tokens(x["text"])))
    # Expand each bin's {word: count} mapping into one column per word.
    .apply(pd.Series)
    # A word missing from a bin has count 0 there.
    .fillna(0)
)
words_x_time

In [None]:
import seaborn as sns
from aves.features.utils import normalize_rows

sns.clustermap(words_x_time.T.pipe(normalize_rows), method='ward', col_cluster=False)

In [None]:
words_x_time['chile'].plot()

In [None]:
words_x_time['😡'].plot()

In [None]:
# Same pipeline as the earlier 5%-sample cell, now run on every cleaned tweet.
# This overwrites `words_x_time`; the topic model below is fitted on this
# full-data version.
# NOTE(review): the pipeline is duplicated from the sampled cell — consider a
# shared helper taking the input frame.
words_x_time = (
    cleaned_tweets
    .resample("10min", on="created_at")
    .aggregate(lambda x: keep_top_only(count_tokens(x["text"])))
    .apply(pd.Series)
    .fillna(0)
)

words_x_time

In [None]:
words_x_time.plot(kind='area', legend=False)

In [34]:
from sklearn.decomposition import LatentDirichletAllocation

# Topic model with 8 topics and a fixed seed. Each row of words_x_time
# (one 10-minute bin) is treated as a document, each column as a word.
lda = LatentDirichletAllocation(n_components=8, random_state=666)
# One row per time bin, one column per topic.
doc_topics = lda.fit_transform(words_x_time.values)

In [None]:
from aves.features.utils import tfidf

word_topics = pd.DataFrame(lda.components_.T, index=words_x_time.columns).pipe(tfidf)
word_topics

In [None]:
# One word cloud per topic on a 2x4 grid of panels.
fig, axes = plt.subplots(2, 4)

for i, ax in enumerate(axes.flatten()):
    ax.set_axis_off()

    # Panels beyond the number of topics stay blank.
    if i >= lda.n_components:
        continue

    draw_wordcloud(ax, word_topics[i])
    ax.set_title(f'topic {i}')

In [None]:
pd.DataFrame(doc_topics, index=words_x_time.index).plot(kind='area')

In [None]:
palette = sns.color_palette('cubehelix_r', n_colors=lda.n_components)
palette

In [None]:
# Label each topic with its three highest-weighted words, one word per line.
topic_labels = {
    topic: '\n'.join(weights.sort_values(ascending=False).head(3).index)
    for topic, weights in word_topics.items()
}
topic_labels

In [None]:
from aves.visualization.tables.areas import streamgraph

fig, ax = plt.subplots()

# Topic weights over time as a streamgraph: one area per topic, named by the
# topic's top-three-word label and coloured from `palette`.
# NOTE(review): if two topics share the same label, the DataFrame gets
# duplicate column names and the colour dict keeps a single entry for them —
# verify that the labels are unique.
streamgraph(
    ax,
    pd.DataFrame(doc_topics, index=words_x_time.index, columns=topic_labels.values()),
    fig=fig,
    area_colors=dict(zip(topic_labels.values(), palette)),
    baseline="wiggle",
    labels=True,
    #label_threshold=0.75,
    avoid_label_collisions=False,
    area_args=dict(linewidth=0.01, alpha=0.75),
    label_rolling_window=6
)


# Remove the top and bottom spines.
sns.despine(ax=ax, bottom=True, top=True)

In [None]:
len(cleaned_tweets['user.id'].unique())

In [None]:
# One row per account (the last occurrence of each user.id in the frame),
# restricted to accounts with a non-null profile description of at least
# 50 characters.
users = (
    cleaned_tweets
    .drop_duplicates(subset='user.id', keep='last')
    .loc[
        lambda frame: frame['user.description'].notna()
        & (frame['user.description'].str.len() >= 50)
    ]
)
len(users)

In [44]:
from sentence_transformers import SentenceTransformer

sentence_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

In [85]:
user_embeddings = sentence_model.encode(users['user.description'].values, show_progress_bar=False, batch_size=256)

In [None]:
user_embeddings.shape

In [None]:
from umap import UMAP

# Project the description embeddings down to 3 dimensions.
# UMAP is stochastic: the seed (same value as the LDA model) makes the
# projection, and the clusters computed from it, reproducible. With a fixed
# seed umap-learn runs single-threaded, so the fit is slower.
umap = UMAP(n_components=3, random_state=666)
projected_users = umap.fit_transform(user_embeddings)
projected_users

In [97]:
projected_users = pd.DataFrame(projected_users, index=users['user.id'], columns=['x', 'y', 'z'])

In [None]:
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(projected_users['x'], projected_users['y'], projected_users['z'], marker='.')

In [None]:
from sklearn.cluster import HDBSCAN

# Density-based clustering of the 3-D projection; a cluster needs at least
# 100 users. Labels are indexed by user.id so they can be joined back to
# `projected_users` and `users`.
clustering_model = HDBSCAN(min_cluster_size=100)
clusters = pd.Series(clustering_model.fit_predict(projected_users.values), index=projected_users.index, name='cluster')
# Cluster sizes (scikit-learn's HDBSCAN labels noise points as -1).
clusters.value_counts()

In [106]:
%matplotlib ipympl

In [None]:
# 3-D scatter of the projected users, one colour per cluster.
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

for idx, group in projected_users.join(clusters).groupby('cluster'):
    # Real clusters get a legend entry; label -1 is drawn in grey without one.
    point_style = {'label': f'cluster {idx}'} if idx > -1 else {'color': 'grey'}
    ax.scatter(group['x'], group['y'], group['z'], marker='.', **point_style)

ax.legend()

In [None]:
# Print up to three randomly chosen profile descriptions per cluster label.
for idx, group in users.join(clusters, on='user.id').groupby('cluster'):
    print(idx)
    # A group (typically the noise label) may hold fewer than three users, and
    # Series.sample raises ValueError when asked for more rows than exist.
    n_examples = min(3, len(group))
    print('\n'.join(group['user.description'].sample(n_examples).values))