In [1]:
import pickle as pkl

import fasttext
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm
from wordcloud import WordCloud

In [2]:
# Show full cell contents and every column when a DataFrame is displayed.
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)

# Data preparation based on pretrained models

Each section can be run on its own,
as long as all the files it reads are already available.

## Tweets

### Step 1 - drop unnecessary columns

In [2]:
# Load the full tweets table (pandas infers the compression from the '.gz' suffix).
tweets = pd.read_pickle('../datasets/tweets.pkl.gz')

In [8]:
# Keep only the four columns that are written to the presentation dataset below.
filtered = tweets[['username', 'id', 'link', 'tweet']]

In [9]:
# Persist the trimmed table; Step 2 starts by reading this file back.
filtered.to_pickle('../datasets/for_presentation/tweets_raw.pkl.gz')

### Step 2 - join with users/parties/coalitions

In [2]:
# Reload the output of Step 1 so this section can run without re-executing it.
filtered = pd.read_pickle('../datasets/for_presentation/tweets_raw.pkl.gz')

In [3]:
# Load the accounts table; the first CSV column is used as the DataFrame index.
users = pd.read_csv('../datasets/accounts_processed.csv', index_col=0)

In [4]:
# Select the account attributes needed for the join, expose 'pozycja' under the
# English name 'role', and lower-case usernames before they are used as the merge key.
users = (
    users[['username', 'party', 'coalition', 'pozycja']]
    .rename(columns={'pozycja': 'role'})
    .assign(username=lambda frame: frame['username'].map(str.lower))
)


In [9]:
# Inner join (the pandas default): tweets whose username has no row in `users` are dropped.
tweets_users = filtered.merge(users, on='username')

In [10]:
# Persist the joined table; Step 3 reads this file back.
tweets_users.to_pickle('../datasets/for_presentation/tweets_with_party_coalition.pkl.gz')

### Step 3 - calculate sentiment


In [3]:
# Load the pretrained fastText sentiment classifier from disk.
sentiment_model = fasttext.load_model('../trained_models/sentiment_model.bin')



In [4]:
# clean_tweets: preprocessed tweet texts that are fed to the sentiment model.
# tweets_users: output of Step 2, to which the predicted sentiment is attached.
clean_tweets = pd.read_pickle('../datasets/tweets_cleaned_emojied2text.pkl.gz')
tweets_users = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition.pkl.gz')

In [5]:
# Lower-case the tweet text and keep only the id (join key) and the text itself.
clean_tweets = (
    clean_tweets
    .assign(tweet=clean_tweets['tweet'].map(str.lower))
    .loc[:, ['id', 'tweet']]
)

In [6]:
# Plain Python list of tweet texts, in the same row order as clean_tweets.
just_tweets = clean_tweets['tweet'].tolist()

In [7]:
%%time

# Classify every tweet in one call; only element [0] of predict()'s return value
# (the predicted labels) is kept, the remaining element is discarded.
predictions = sentiment_model.predict(just_tweets)[0]

CPU times: user 1min 32s, sys: 1.52 s, total: 1min 34s
Wall time: 1min 34s


In [13]:
# Flatten the per-tweet label sequences into one flat list of labels.
# NOTE: not idempotent - running this cell a second time would iterate over the
# label strings themselves and split them into single characters.
predictions = [
    label
    for labels_for_tweet in predictions
    for label in labels_for_tweet
]

In [14]:
# Pair every tweet id with its predicted label; the tweet text is no longer needed.
clean_tweets = clean_tweets[['id']].assign(sentiment=predictions)

In [15]:
# Right join: every row of clean_tweets (i.e. every predicted sentiment) is kept, so
# tweets with no match in tweets_users end up with NaN in the user/party/coalition columns.
# NOTE(review): confirm that keeping those unmatched tweets is intended.
tweets_users_sentiment = tweets_users.merge(clean_tweets, on='id', how='right')

In [20]:
# Map the raw fastText labels to human-readable names.
# The mapping is applied to the 'sentiment' column only: a frame-wide replace would
# also scan (and could rewrite) every other column, including the tweet texts.
sentiment_label_names = {
    '__label__positive': 'positive',
    '__label__negative': 'negative',
    '__label__ambiguous': 'ambiguous',
    '__label__neutral': 'neutral'
}
tweets_users_sentiment['sentiment'] = (
    tweets_users_sentiment['sentiment'].replace(sentiment_label_names)
)

In [21]:
# Sanity check: number of tweets per sentiment class.
tweets_users_sentiment['sentiment'].value_counts()

negative     551675
neutral      440461
positive     361306
ambiguous    137464
Name: sentiment, dtype: int64

In [22]:
# Persist the table with sentiment attached; the topic step reads this file back.
tweets_users_sentiment.to_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment.pkl.gz')

### Step 4 - calculate topics

In [3]:
# Reload the output of the sentiment step so this section can run on its own.
tweets_users_sentiment = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment.pkl.gz')

In [4]:
# Preprocessed tweet texts used as input to the topic model's vectorizer.
clean_tweets = pd.read_pickle('../datasets/tweets_cleaned_lemma_stopwords.pkl.gz')

In [2]:
# Load the fitted CountVectorizer and LDA model.
# NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source.
# Despite the '.gz' suffix, both files are read as plain (uncompressed) pickles.
with open('../trained_models/vectorizer_10.pkl.gz', 'rb') as vec_file:
    vectorizer: CountVectorizer = pkl.load(vec_file)

with open('../trained_models/lda_10.pkl.gz', 'rb') as lda_file:
    lda: LatentDirichletAllocation = pkl.load(lda_file)

In [11]:
# Turn the tweet texts into a document-term count matrix, one row per row of clean_tweets.
tweets_texts = clean_tweets['tweet'].tolist()
counts = vectorizer.transform(raw_documents=tweets_texts)

In [12]:
# Per-tweet topic distribution: one row per row of `counts`, one column per topic.
probas = lda.transform(counts)

In [16]:
# For every tweet: the index of its most probable topic, and that topic's probability.
labels = probas.argmax(axis=1)
prob_values = probas.max(axis=1)

In [18]:
# Attach the dominant topic and its probability to each tweet (same row order as probas).
clean_tweets = clean_tweets.assign(topic=labels, topic_proba=prob_values)

In [19]:
# Keep only the join key and the two topic columns.
clean_tweets = clean_tweets[['id', 'topic', 'topic_proba']]

In [20]:
# Inner join (the pandas default): only tweets present in both tables are kept.
tweets_users_sentiment_topic = tweets_users_sentiment.merge(clean_tweets, on='id')

In [21]:
# Persist the final per-tweet table; the Topics, Words and Sentiment sections read it back.
tweets_users_sentiment_topic.to_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')

## Topics

### Words per topic

In [4]:
# Load the fitted CountVectorizer and LDA model.
# NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source.
# Despite the '.gz' suffix, both files are read as plain (uncompressed) pickles.
with open('../trained_models/vectorizer_10.pkl.gz', 'rb') as vec_file:
    vectorizer: CountVectorizer = pkl.load(vec_file)

with open('../trained_models/lda_10.pkl.gz', 'rb') as lda_file:
    lda: LatentDirichletAllocation = pkl.load(lda_file)

In [6]:
# Build, for every topic, a list of {'text': word, 'value': weight} records.
# The vocabulary does not change between topics, so it is fetched once instead of
# being rebuilt on every loop iteration.
# NOTE: get_feature_names() was removed in scikit-learn 1.2 (replaced by
# get_feature_names_out()); it is kept here to match the version the models were pickled with.
# Pairing words with components_ columns assumes `lda` was fitted on this vectorizer's output.
feature_names = vectorizer.get_feature_names()

words_in_topics = {}

for topic_num, topic in enumerate(lda.components_):
    words_in_topics[topic_num] = [
        {'text': name, 'value': freq}
        for name, freq in zip(feature_names, topic)
    ]

In [11]:
# Serialize the per-topic word weights.
# NOTE: written as a plain pickle - the '.gz' suffix does not mean the file is compressed.
with open('../datasets/for_presentation/words_per_topic.pkl.gz', 'wb') as f:
    pkl.dump(words_in_topics, f)


#### Extra - visualisation of topics

In [None]:
# Render one word cloud per topic, sizing each word by its weight in that topic.
# The vocabulary is identical for every topic, so it is fetched once up front
# rather than on each iteration.
feature_names = vectorizer.get_feature_names()

for i, topic in enumerate(lda.components_):
    frequencies = dict(zip(feature_names, topic))
    wordcloud = WordCloud(
        width=1920, height=1080, background_color="white"
    ).generate_from_frequencies(frequencies=frequencies)
    fig = px.imshow(wordcloud, title=f"Topic {i}")
    fig.show()

### Topics per user/party/coalition

In [14]:
# Preprocessed tweet texts plus the fitted CountVectorizer and LDA model.
# NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source.
# Despite the '.gz' suffix, the two model files are read as plain (uncompressed) pickles.
clean_tweets = pd.read_pickle('../datasets/tweets_cleaned_lemma_stopwords.pkl.gz')

with open('../trained_models/vectorizer_10.pkl.gz', 'rb') as vec_file:
    vectorizer: CountVectorizer = pkl.load(vec_file)

with open('../trained_models/lda_10.pkl.gz', 'rb') as lda_file:
    lda: LatentDirichletAllocation = pkl.load(lda_file)

In [None]:
# Number of topics = number of rows of the fitted LDA model's components_ matrix.
topics_count = len(lda.components_)

In [15]:
# Turn the tweet texts into a document-term count matrix, one row per row of clean_tweets.
tweets_texts = clean_tweets['tweet'].tolist()
counts = vectorizer.transform(raw_documents=tweets_texts)

In [16]:
# Per-tweet topic distribution: one row per row of `counts`, one column per topic.
probas = lda.transform(counts)

In [34]:
tweets_users_sentiment_topic = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')

# `probas` has exactly one row per row of clean_tweets (it was computed from
# clean_tweets.tweet). Record that row position BEFORE merging: the inner merge can
# drop or duplicate rows, after which row labels of `a` no longer identify the
# matching row of `probas`.
a = (
    clean_tweets
    .assign(probas_row=np.arange(len(clean_tweets)))
    .merge(tweets_users_sentiment_topic, on='id')
    .rename(columns={'username_x': 'username'})
    .reset_index()
)


def get_topic_distribution_for_column(column_value, column_name):
    """Return the normalized topic distribution of all tweets matching a column value.

    Parameters
    ----------
    column_value
        Value to select, e.g. a username, a party or a coalition.
    column_name
        Name of the column of `a` that is compared against `column_value`.

    Returns
    -------
    numpy.ndarray
        One entry per topic: the summed topic probabilities of the selected tweets,
        divided by their total so that the entries sum to 1. An all-zero vector is
        returned when no tweet matches.
    """
    # Rows of `probas` that belong to the selected tweets (always an integer array,
    # even when the selection is empty).
    rows = a.loc[a[column_name] == column_value, 'probas_row'].to_numpy()
    values = probas[rows].sum(axis=0)
    total = values.sum()
    if total == 0:
        # Nothing selected: avoid a 0/0 division.
        return np.zeros_like(values)
    return values / total

In [35]:
# One (initially empty) result dict per aggregation level.
topics_distributions = {
    level: {} for level in ('per_user', 'per_party', 'per_coalition')
}

# Distinct values to aggregate over.
unique_usernames = a['username'].unique()
unique_parties = a['party'].unique()
unique_coalitions = a['coalition'].unique()

In [43]:
# Topic distribution of every user, stored as a list of {'topic', 'part'} records.
# enumerate() numbers the topics directly from the returned distribution, so this
# cell no longer depends on `topics_count` having been defined by another cell.
per_user = topics_distributions['per_user']

for username in tqdm(unique_usernames):
    distribution = get_topic_distribution_for_column(
        column_name='username',
        column_value=username
    )
    per_user[username] = [
        {'topic': t, 'part': p}
        for t, p in enumerate(distribution)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=538.0), HTML(value='')))




In [44]:
# Topic distribution of every party, stored as a list of {'topic', 'part'} records.
# enumerate() numbers the topics directly from the returned distribution, so this
# cell no longer depends on `topics_count` having been defined by another cell.
per_party = topics_distributions['per_party']

for party in tqdm(unique_parties):
    distribution = get_topic_distribution_for_column(
        column_name='party',
        column_value=party
    )
    per_party[party] = [
        {'topic': t, 'part': p}
        for t, p in enumerate(distribution)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [45]:
# Topic distribution of every coalition, stored as a list of {'topic', 'part'} records.
# enumerate() numbers the topics directly from the returned distribution, so this
# cell no longer depends on `topics_count` having been defined by another cell.
per_coalition = topics_distributions['per_coalition']

for coalition in tqdm(unique_coalitions):
    distribution = get_topic_distribution_for_column(
        column_name='coalition',
        column_value=coalition
    )
    per_coalition[coalition] = [
        {'topic': t, 'part': p}
        for t, p in enumerate(distribution)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [46]:
# Serialize the per-user/party/coalition topic distributions.
# NOTE: written as a plain pickle - the '.gz' suffix does not mean the file is compressed.
with open('../datasets/for_presentation/topics_distributions.pkl.gz', 'wb') as f:
    pkl.dump(topics_distributions, f)

## Words

### Words per user/party/coalition

In [2]:
clean_tweets = pd.read_pickle('../datasets/tweets_cleaned_lemma_stopwords.pkl.gz')
tweets_users_sentiment_topic = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')

# Join the cleaned texts with the per-tweet metadata. Columns present in both frames
# keep their plain name for the clean_tweets side and get a '_y' suffix for the other.
a = (
    clean_tweets
    .merge(tweets_users_sentiment_topic, on='id', suffixes=('', '_y'))
    .rename(columns={'username_x': 'username'})
    .reset_index()
)

# The two source frames are no longer needed; drop the references to free memory.
del clean_tweets, tweets_users_sentiment_topic

In [3]:
# Load the fitted CountVectorizer.
# NOTE: pickle.load can execute arbitrary code - only load model files from a trusted source.
# Despite the '.gz' suffix, the file is read as a plain (uncompressed) pickle.
with open('../trained_models/vectorizer_10.pkl.gz', 'rb') as vec_file:
    vectorizer: CountVectorizer = pkl.load(vec_file)

In [4]:
# Document-term count matrix built from the merged frame, so row i of `counts`
# corresponds to row i of `a`.
counts = vectorizer.transform(a.tweet.tolist())

In [27]:
def get_word_counts_for_column(column_name, column_value):
    """Return the total count of every vocabulary word over the matching tweets.

    Parameters
    ----------
    column_name
        Name of the column of `a` that is compared against `column_value`.
    column_value
        Value to select, e.g. a username, a party or a coalition.

    Returns
    -------
    numpy.ndarray
        1-D array with one summed count per column of `counts`; all zeros when
        no row of `a` matches.
    """
    # Row POSITIONS of the matching tweets. `counts` is indexed by position, and
    # flatnonzero always yields an integer array, even for an empty selection
    # (an empty list converted with np.array() would be a float array, which is
    # not a valid index).
    rows = np.flatnonzero((a[column_name] == column_value).to_numpy())
    summed = counts[rows].sum(axis=0)
    # The sparse sum is a 1 x n_words matrix; flatten it to a plain 1-D array.
    return np.asarray(summed).ravel()

In [38]:
# One (initially empty) result dict per aggregation level.
words_counts = {
    level: {} for level in ('per_user', 'per_party', 'per_coalition')
}

# Distinct values to aggregate over.
unique_usernames = a['username'].unique()
unique_parties = a['party'].unique()
unique_coalitions = a['coalition'].unique()

In [39]:
# Word counts of every user, stored as a list of {'text', 'value'} records.
# The vocabulary is the same for every user, so it is fetched once here instead of
# being rebuilt on each of the loop's iterations.
feature_names = vectorizer.get_feature_names()
per_user = words_counts['per_user']

for username in tqdm(unique_usernames):
    user_word_counts = get_word_counts_for_column(
        column_name='username',
        column_value=username
    )
    per_user[username] = [
        {'text': name, 'value': freq}
        for name, freq in zip(feature_names, user_word_counts)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=538.0), HTML(value='')))




In [40]:
# Word counts of every party, stored as a list of {'text', 'value'} records.
# The vocabulary is the same for every party, so it is fetched once here instead of
# being rebuilt on each of the loop's iterations.
feature_names = vectorizer.get_feature_names()
per_party = words_counts['per_party']

for party in tqdm(unique_parties):
    party_word_counts = get_word_counts_for_column(
        column_name='party',
        column_value=party
    )
    per_party[party] = [
        {'text': name, 'value': freq}
        for name, freq in zip(feature_names, party_word_counts)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [41]:
# Word counts of every coalition, stored as a list of {'text', 'value'} records.
# The vocabulary is the same for every coalition, so it is fetched once here instead
# of being rebuilt on each of the loop's iterations.
feature_names = vectorizer.get_feature_names()
per_coalition = words_counts['per_coalition']

for coalition in tqdm(unique_coalitions):
    coalition_word_counts = get_word_counts_for_column(
        column_name='coalition',
        column_value=coalition
    )
    per_coalition[coalition] = [
        {'text': name, 'value': freq}
        for name, freq in zip(feature_names, coalition_word_counts)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [42]:
# Serialize the per-user/party/coalition word counts.
# NOTE: written as a plain pickle - the '.gz' suffix does not mean the file is compressed.
with open('../datasets/for_presentation/words_counts.pkl.gz', 'wb') as f:
    pkl.dump(words_counts, f)

## Sentiment

### Sentiment per user/party/coalition/topic

In [24]:
# Per-tweet table with user, party, coalition, sentiment and topic columns.
a = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')

In [29]:
# Every sentiment class that occurs in the data; fixes the order of the result tuples.
sent_values = a['sentiment'].unique()


def get_sentiment_distribution_by_column(column_name, column_value):
    """Return the share of each sentiment class among the matching tweets.

    Rows of `a` whose `column_name` equals `column_value` are selected and their
    sentiment labels counted. The result is a list of ``(sentiment, fraction)``
    tuples in the order of `sent_values`; a class that does not occur in the
    selection is reported with 0.
    """
    selected_sentiments = a.loc[a[column_name] == column_value, 'sentiment']
    counts_by_label = selected_sentiments.value_counts()
    total = counts_by_label.sum()
    return [
        (
            label,
            (counts_by_label[label] / total) if label in counts_by_label.index else 0
        )
        for label in sent_values
    ]

In [31]:
# One (initially empty) result dict per aggregation level.
sentiment_distributions = {
    level: {} for level in ('per_user', 'per_party', 'per_coalition', 'per_topic')
}

# Distinct values to aggregate over.
unique_usernames = a['username'].unique()
unique_parties = a['party'].unique()
unique_coalitions = a['coalition'].unique()
unique_topics = a['topic'].unique()

In [32]:
# Sentiment distribution of every user.
per_user = sentiment_distributions['per_user']

for username in tqdm(unique_usernames):
    per_user[username] = get_sentiment_distribution_by_column(column_name='username', column_value=username)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=538.0), HTML(value='')))




In [33]:
# Sentiment distribution of every topic.
per_topic = sentiment_distributions['per_topic']

for topic in tqdm(unique_topics):
    per_topic[topic] = get_sentiment_distribution_by_column(column_name='topic', column_value=topic)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [34]:
# Sentiment distribution of every party.
per_party = sentiment_distributions['per_party']

for party in tqdm(unique_parties):
    per_party[party] = get_sentiment_distribution_by_column(column_name='party', column_value=party)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [35]:
# Sentiment distribution of every coalition.
per_coalition = sentiment_distributions['per_coalition']

for coalition in tqdm(unique_coalitions):
    per_coalition[coalition] = get_sentiment_distribution_by_column(column_name='coalition', column_value=coalition)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [36]:
# Serialize the sentiment distributions.
# NOTE: written as a plain pickle - the '.gz' suffix does not mean the file is compressed.
with open('../datasets/for_presentation/sentiment_distributions.pkl.gz', 'wb') as sentiment_file:
    pkl.dump(sentiment_distributions, sentiment_file)