In [1]:
import os
import pickle as pkl

import numpy as np
import pandas as pd
import plotly.express as px
from PIL import Image
from wordcloud import WordCloud

from src.data.utils import get_frequencies

In [2]:
# Restore the previously fitted LDA topic model from disk.
# NOTE(review): the file name ends in ".gz" but it is read without decompression;
# confirm the pickle is really stored uncompressed. Only unpickle trusted files.
lda_path = os.path.join("datasets", "lda.pkl.gz")
with open(lda_path, "rb") as lda_file:
    lda = pkl.load(lda_file)

In [3]:
# Restore the fitted vectorizer that is used below to turn tweets into model input.
# NOTE(review): ".gz" in the name, yet the file is read as a plain pickle; confirm.
vectorizer_path = os.path.join("datasets", "vectorizer.pkl.gz")
with open(vectorizer_path, "rb") as vectorizer_file:
    vectorizer = pkl.load(vectorizer_file)

In [4]:
# Load the pre-processed tweets DataFrame; its `tweet` column is read in the next cell.
tweets_path = os.path.join("datasets", "tweets_cleaned_lemma_stopwords.pkl.gz")
df = pd.read_pickle(tweets_path)

In [5]:
# Pull the tweet texts out as a plain list and vectorise them with the loaded vectorizer.
tweets = df["tweet"].tolist()
counts = vectorizer.transform(tweets)

In [6]:
# Topic weights for every tweet as returned by the loaded LDA model.
# Rows are tweets and columns are topics (the axis-0 sum below has one entry per topic).
x = lda.transform(counts)

In [7]:
# Total weight of each topic, summed over all tweets (axis 0).
x_sum = x.sum(axis=0)

In [8]:
# Normalise the per-topic totals so that the shares add up to 1.
proba_per_topic = x_sum / x_sum.sum()

In [9]:
# Show each topic's share of the corpus.
proba_per_topic

array([0.09151091, 0.11187487, 0.11126361, 0.09082755, 0.11243592,
       0.09487073, 0.10997889, 0.09962241, 0.08382289, 0.09379223])

In [10]:
# Tidy frame for plotting: one row per topic, with the topic number kept as a
# string column ("index") so that the chart treats topics as categories.
df_proba = (
    pd.DataFrame({"proba": proba_per_topic})
    .reset_index()
    .astype({"index": str})
)

In [11]:
# Bar chart of the topic shares, one bar per topic, each in its own shade of blue.
fig = px.bar(
    df_proba,
    x="index",
    y="proba",
    color="index",
    color_discrete_sequence=[
        "#1da1f2", "#1b9eee", "#199bea", "#1797e6", "#1594e2",
        "#1491de", "#128eda", "#108ad6", "#0e87d2", "#0c84ce",
    ],
)
fig.update_layout(
    showlegend=False,
    xaxis_title="Topic",
    yaxis_title="Topic's share",
    yaxis_tickformat=",.0%",  # show the shares as whole percentages
    xaxis_tickmode="linear",
    # Fully transparent paper and plot backgrounds.
    paper_bgcolor="rgba(0,0,0,0)",
    plot_bgcolor="rgba(0,0,0,0)",
)
fig.show(renderer="browser")


In [12]:
# Switch to a large Lato font for the poster and export the chart as a PNG.
# Note that this modifies `fig` in place, so the figure keeps the poster font afterwards.
fig.update_layout(font=dict(family="Lato", size=60))
poster_path = os.path.join("assets", "poster-lda.png")
fig.write_image(poster_path, width=3242, height=2000)

In [11]:
# Boolean mask over tweets: True where topic 1 carries the largest weight in that
# tweet's row of `x`. (numpy is imported in the notebook's first cell.)
x_max_topic_1 = x.argmax(axis=1) == 1

In [24]:
# Inspect the boolean mask for topic 1.
x_max_topic_1

array([False, False, False, ..., False, False, False])

In [30]:
# Keep only the tweets whose dominant topic is topic 1; the list is converted to an
# array so that it can be indexed with the boolean mask.
tweets_for_topic_1 = np.array(tweets)[x_max_topic_1]

In [36]:
# Preview the tweets assigned to topic 1.
tweets_for_topic_1

array(['fakt fakt', ' zaufanie zaufanie', ' zaufanie 😊😊😊', ...,
       ' człowiek robota kandydat parlament platforma',
       'minister sprawa grzegorz schetyna państwo wesprzeć akcja',
       ' rada platforma kształt lista jarosław urbaniak'], dtype='<U258')

In [37]:
# Frequencies for the topic-1 tweets, computed by the project helper
# `src.data.utils.get_frequencies`; `WordCloud` and `get_frequencies` are imported in
# the notebook's first cell.
# The result is handed to WordCloud.generate_from_frequencies below; presumably a
# word -> frequency mapping (TODO confirm against the helper).
frequencies = get_frequencies(pd.Series(tweets_for_topic_1))


In [13]:
# Load the word-cloud mask image as an array. The `with` block closes the underlying
# file once the pixels have been copied (`Image` is imported in the first cell).
with Image.open(os.path.join("assets", "wc_topic_mask.jpg")) as image_file:
    mask = np.array(image_file)
# Binarise: every value that is not exactly 255 becomes 0. WordCloud treats pure-white
# mask entries as "masked out", so only the non-white shape is drawn on.
mask[mask < 255] = 0

In [42]:
# Lay out the topic-1 word cloud inside the mask on a transparent RGBA canvas.
# Per the wordcloud documentation, width/height are ignored when a mask is supplied.
wordcloud = WordCloud(
    mask=mask,
    mode="RGBA",
    background_color="rgba(255, 255, 255, 0)",
    width=4096,
    height=2048,
).generate_from_frequencies(frequencies)

In [14]:
# Shades of blue used to tint the words (the same ten colours as the topic bar chart).
WORDCLOUD_PALETTE = (
    "#1da1f2", "#1b9eee", "#199bea", "#1797e6", "#1594e2",
    "#1491de", "#128eda", "#108ad6", "#0e87d2", "#0c84ce",
)


def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Pick a random palette colour for one word of the word cloud.

    The signature follows the callback protocol of ``WordCloud.recolor``; the
    word, font size, position and orientation are accepted but not used.

    Args:
        word, font_size, position, orientation: Ignored.
        random_state: Optional random generator exposing ``choice`` (WordCloud
            passes a ``random.Random`` when it is given a seed). When provided,
            the colour is drawn from it, so a seeded recolor is reproducible.
            When ``None``, NumPy's global generator is used, as before.
        **kwargs: Extra keyword arguments passed by WordCloud (e.g. ``font_path``); ignored.

    Returns:
        str: One hex colour string from ``WORDCLOUD_PALETTE``.
    """
    if random_state is not None:
        return str(random_state.choice(WORDCLOUD_PALETTE))
    # No generator supplied: fall back to the global NumPy RNG (original behaviour).
    return str(np.random.choice(WORDCLOUD_PALETTE))


In [None]:

# Re-tint every word with a random shade from the blue palette.
wordcloud.recolor(color_func=color_func)

In [46]:
# Preview the recoloured word cloud in the browser.
# NOTE: this rebinds `fig`, which previously held the topic bar chart.
fig = px.imshow(wordcloud)
fig.show(renderer="browser")

In [47]:
# Write the topic-1 word cloud to a PNG file in the assets folder.
# NOTE(review): despite its name, `wc_svg` does not hold SVG data; it is whatever
# `to_file` returns, and the file written is a PNG.
wc_svg = wordcloud.to_file(os.path.join("assets", "wc_topic_1_test.png"))

In [17]:
# Word cloud for topic 2: the same steps as the topic-1 cells, gathered in one cell.
# `WordCloud` and `get_frequencies` come from the notebook's first (import) cell.
# NOTE: this rebinds `frequencies`, `wordcloud` and `wc_svg`; to rebuild the topic-1
# cloud afterwards, re-run the cell that computes its frequencies first.

# Tweets whose dominant topic is topic 2.
x_max_topic_2 = x.argmax(axis=1) == 2
tweets_for_topic_2 = np.array(tweets)[x_max_topic_2]

# Frequencies -> masked, transparent word cloud -> blue palette -> PNG on disk.
frequencies = get_frequencies(pd.Series(tweets_for_topic_2))
wordcloud = WordCloud(
    width=4096, height=2048, background_color="rgba(255, 255, 255, 0)", mode="RGBA", mask=mask
).generate_from_frequencies(frequencies=frequencies)
wordcloud.recolor(color_func=color_func)
wc_svg = wordcloud.to_file(os.path.join("assets", "wc_topic_2_test.png"))