In [10]:
import pickle as pkl

import fasttext
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm
from wordcloud import WordCloud

from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, Normalizer

In [3]:
# Lift pandas' display limits on cell width and on the number of columns shown.
for display_option in ('display.max_colwidth', 'display.max_columns'):
    pd.set_option(display_option, None)

# Data preparation based on pretrained models

Each section should work individually,
as long as all files used in it are already available

## Tweets

### Step 1 - drop unnecessary columns

In [3]:
tweets = pd.read_pickle('../datasets/tweets.pkl.gz')

In [4]:
filtered = tweets[['username', 'id', 'link', 'tweet']]

In [5]:
filtered.to_pickle('../datasets/for_presentation/tweets_raw.pkl.gz')

### Step 2 - join with users/parties/coalitions

In [6]:
filtered = pd.read_pickle('../datasets/for_presentation/tweets_raw.pkl.gz')

In [7]:
users = pd.read_csv('../datasets/accounts_processed.csv', index_col=0)

In [8]:
# Keep only the columns used downstream and expose 'pozycja' under the name 'role'.
users = (
    users[['username', 'party', 'coalition', 'pozycja']]
    .rename(columns={'pozycja': 'role'})
)
# Lower-case with the vectorised .str accessor: unlike apply(str.lower) it does not
# raise on missing values, and it is the form this notebook uses for other usernames.
users['username'] = users['username'].str.lower()


In [9]:
tweets_users = filtered.merge(users, on='username')

In [10]:
tweets_users.to_pickle('../datasets/for_presentation/tweets_with_party_coalition.pkl.gz')

### Step 3 - calculate sentiment


In [11]:
sentiment_model = fasttext.load_model('../trained_models/sentiment_model.bin')



In [12]:
clean_tweets = pd.read_pickle('../datasets/tweets_cleaned_emojied2text.pkl.gz')
tweets_users = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition.pkl.gz')

In [13]:
# Only the id and the text are needed for prediction. .str.lower() is the vectorised
# form of apply(str.lower) and does not raise on missing values; assign() returns a
# new frame, so later column writes do not target a slice of the loaded data.
clean_tweets = clean_tweets[['id', 'tweet']].assign(
    tweet=lambda frame: frame['tweet'].str.lower()
)

In [14]:
just_tweets = clean_tweets['tweet'].tolist()

In [15]:
%%time

predictions = sentiment_model.predict(just_tweets)[0]

CPU times: user 1min 33s, sys: 1.64 s, total: 1min 34s
Wall time: 1min 34s


In [16]:
# `predictions` holds one sequence of labels per tweet; flatten it into a single list.
flat_labels = []
for labels_for_tweet in predictions:
    flat_labels.extend(labels_for_tweet)
predictions = flat_labels

In [17]:
# Pair every tweet id with its predicted label. Building the result with assign()
# avoids writing a column into what may be a slice of the loaded frame
# (SettingWithCopyWarning) and leaves exactly the two columns needed for the merge.
clean_tweets = clean_tweets[['id']].assign(sentiment=predictions)

In [18]:
# Right join: every scored tweet is kept, whether or not tweets_users has a row for its id.
tweets_users_sentiment = pd.merge(
    tweets_users,
    clean_tweets,
    how='right',
    on='id',
)

In [19]:
# Strip the '__label__' prefix from the predicted labels. The labels were written to
# the 'sentiment' column, so map that column alone instead of scanning (and possibly
# rewriting) every other column of the frame in place.
sentiment_names = {
    '__label__positive': 'positive',
    '__label__negative': 'negative',
    '__label__ambiguous': 'ambiguous',
    '__label__neutral': 'neutral'
}
tweets_users_sentiment['sentiment'] = (
    tweets_users_sentiment['sentiment'].replace(sentiment_names)
)

In [20]:
tweets_users_sentiment['sentiment'].value_counts()

negative     551675
neutral      440461
positive     361306
ambiguous    137464
Name: sentiment, dtype: int64

In [21]:
tweets_users_sentiment.to_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment.pkl.gz')

### Step 4 - calculate topics

In [22]:
tweets_users_sentiment = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment.pkl.gz')

In [23]:
clean_tweets = pd.read_pickle('../datasets/tweets_cleaned_lemma_stopwords.pkl.gz')

In [24]:
# Load the fitted vectorizer and the LDA model from disk.
# NOTE: both files are read as plain pickles (no decompression) despite the '.gz'
# suffix. Only unpickle files you trust - pickle can execute arbitrary code.
vectorizer: CountVectorizer
lda: LatentDirichletAllocation

with open('../trained_models/vectorizer_10.pkl.gz', 'rb') as vectorizer_file:
    vectorizer = pkl.load(vectorizer_file)

with open('../trained_models/lda_10.pkl.gz', 'rb') as lda_file:
    lda = pkl.load(lda_file)

In [25]:
tweets_texts = clean_tweets.tweet.tolist()
counts = vectorizer.transform(tweets_texts)

In [26]:
probas = lda.transform(counts)

In [27]:
# Per row of `probas`: the index of the largest value and that largest value itself.
labels = probas.argmax(axis=1)
prob_values = probas.max(axis=1)

In [28]:
# Attach the chosen topic and its probability to every cleaned tweet.
clean_tweets = clean_tweets.assign(topic=labels, topic_proba=prob_values)

In [29]:
clean_tweets = clean_tweets[['id', 'topic', 'topic_proba']]

In [30]:
tweets_users_sentiment_topic = tweets_users_sentiment.merge(clean_tweets, on='id')

In [31]:
tweets_users_sentiment_topic.to_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')

## Topics

### Words per topic

In [4]:
with open('../trained_models/vectorizer_10.pkl.gz', 'rb') as vec_file:
    vectorizer: CountVectorizer = pkl.load(vec_file)

with open('../trained_models/lda_10.pkl.gz', 'rb') as lda_file:
    lda: LatentDirichletAllocation = pkl.load(lda_file)

In [32]:
# Resolve the vocabulary once instead of once per topic. get_feature_names() was
# removed in scikit-learn 1.2, so prefer get_feature_names_out() when it is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# topic index -> list of {'text': word, 'value': weight}, one record per vocabulary word
words_in_topics = {
    topic_num: [
        {'text': name, 'value': freq}
        for name, freq in zip(feature_names, topic)
    ]
    for topic_num, topic in enumerate(lda.components_)
}

In [33]:
# NOTE: this writes a plain pickle; nothing here gzip-compresses it despite the '.gz' suffix.
with open('../datasets/for_presentation/words_per_topic.pkl.gz', 'wb') as out_file:
    pkl.dump(words_in_topics, out_file)


#### Extra - visualisation of topics

In [None]:
# Resolve the vocabulary once for all topics. get_feature_names() was removed in
# scikit-learn 1.2, so prefer get_feature_names_out() when it is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Draw one word cloud per topic, sized by the topic's weight for each word.
for i, topic in enumerate(lda.components_):
    frequencies = dict(zip(feature_names, topic))
    wordcloud = WordCloud(
        width=1920, height=1080, background_color="white"
    ).generate_from_frequencies(frequencies=frequencies)
    fig = px.imshow(wordcloud, title=f"Topic {i}")
    fig.show()

### Topics per user/party/coalition

In [34]:
clean_tweets = pd.read_pickle('../datasets/tweets_cleaned_lemma_stopwords.pkl.gz')

with open('../trained_models/vectorizer_10.pkl.gz', 'rb') as vec_file:
    vectorizer: CountVectorizer = pkl.load(vec_file)

with open('../trained_models/lda_10.pkl.gz', 'rb') as lda_file:
    lda: LatentDirichletAllocation = pkl.load(lda_file)

In [35]:
topics_count = len(lda.components_)

In [15]:
tweets_texts = clean_tweets.tweet.tolist()
counts = vectorizer.transform(tweets_texts)

In [16]:
probas = lda.transform(counts)

In [36]:
tweets_users_sentiment_topic = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')
# `probas` has one row per row of clean_tweets. The inner merge can drop, duplicate or
# reorder rows, so the merged frame's own index is not a safe key into `probas`;
# remember each tweet's original row position in 'probas_row' before merging.
a = (
    clean_tweets
    .assign(probas_row=np.arange(len(clean_tweets)))
    .merge(tweets_users_sentiment_topic, on='id')
    .rename(columns={'username_x': 'username'})
    .reset_index()
)

def get_topic_distribution_for_column(column_value, column_name):
    """Return the normalised topic distribution of the tweets selected by one column value.

    Sums the rows of `probas` belonging to the rows of `a` where
    `a[column_name] == column_value` and divides by the total, so the result sums to 1.

    Parameters:
        column_value: value to match in `a[column_name]`.
        column_name: name of the column of `a` to filter on.

    Returns:
        Array with one entry per column of `probas`; all zeros when no row matches.
    """
    rows = a.loc[a[column_name] == column_value, 'probas_row'].to_numpy()
    values = probas[rows].sum(axis=0)
    total = values.sum()
    if total == 0:
        # No matching tweets (or no probability mass): avoid dividing by zero.
        return values
    return values / total

In [37]:
# One (initially empty) mapping per aggregation level.
topics_distributions = {
    level: {} for level in ('per_user', 'per_party', 'per_coalition')
}

# Distinct values to aggregate over at each level.
unique_usernames = a['username'].unique()
unique_parties = a['party'].unique()
unique_coalitions = a['coalition'].unique()

In [38]:
# Topic distribution of every user, stored as a list of {'topic', 'part'} records.
for username in tqdm(unique_usernames):
    user_distribution = get_topic_distribution_for_column(
        column_name='username',
        column_value=username,
    )
    topics_distributions['per_user'][username] = [
        {'topic': t, 'part': p}
        for t, p in zip(range(topics_count), user_distribution)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=538.0), HTML(value='')))




In [39]:
# Topic distribution of every party, stored as a list of {'topic', 'part'} records.
for party in tqdm(unique_parties):
    party_distribution = get_topic_distribution_for_column(
        column_name='party',
        column_value=party,
    )
    topics_distributions['per_party'][party] = [
        {'topic': t, 'part': p}
        for t, p in zip(range(topics_count), party_distribution)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [40]:
# Topic distribution of every coalition, stored as a list of {'topic', 'part'} records.
for coalition in tqdm(unique_coalitions):
    coalition_distribution = get_topic_distribution_for_column(
        column_name='coalition',
        column_value=coalition,
    )
    topics_distributions['per_coalition'][coalition] = [
        {'topic': t, 'part': p}
        for t, p in zip(range(topics_count), coalition_distribution)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [41]:
with open('../datasets/for_presentation/topics_distributions.pkl.gz', 'wb') as f:
    pkl.dump(topics_distributions, f)

## Words

### Words per user/party/coalition

In [42]:
clean_tweets = pd.read_pickle('../datasets/tweets_cleaned_lemma_stopwords.pkl.gz')
tweets_users_sentiment_topic = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')

# Join the cleaned text with the per-tweet metadata. With suffixes=('', '_y') the
# columns coming from clean_tweets keep their names; reset_index() gives the merged
# frame a fresh positional index (the old one is kept as an 'index' column).
a = (
    clean_tweets
    .merge(tweets_users_sentiment_topic, on='id', suffixes=('', '_y'))
    .rename(columns={'username_x': 'username'})
    .reset_index()
)

# The inputs are no longer needed; drop the references to free memory.
del clean_tweets, tweets_users_sentiment_topic

In [43]:
with open('../trained_models/vectorizer_10.pkl.gz', 'rb') as vec_file:
    vectorizer: CountVectorizer = pkl.load(vec_file)

In [44]:
counts = vectorizer.transform(a.tweet.tolist())

In [45]:
def get_word_counts_for_column(column_name, column_value):
    """Sum the rows of `counts` for all rows of `a` where `column_name` equals `column_value`.

    Returns the column-wise totals as a squeezed array (one total per column of `counts`).
    """
    matching_rows = a[a[column_name] == column_value]
    row_ids = np.array(matching_rows.index.tolist())
    totals = counts[row_ids].sum(axis=0)
    return np.array(totals).squeeze()

In [46]:
# One (initially empty) mapping per aggregation level.
words_counts = {
    level: {} for level in ('per_user', 'per_party', 'per_coalition')
}

# Distinct values to aggregate over at each level.
unique_usernames = a['username'].unique()
unique_parties = a['party'].unique()
unique_coalitions = a['coalition'].unique()

In [47]:
# Resolve the vocabulary once instead of once per user. get_feature_names() was
# removed in scikit-learn 1.2, so prefer get_feature_names_out() when it is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Word totals of every user, stored as a list of {'text', 'value'} records.
for username in tqdm(unique_usernames):
    user_counts = get_word_counts_for_column(
        column_name='username',
        column_value=username
    )
    words_counts['per_user'][username] = [
        {'text': name, 'value': freq}
        for name, freq in zip(feature_names, user_counts)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=538.0), HTML(value='')))




In [48]:
# Resolve the vocabulary once instead of once per party. get_feature_names() was
# removed in scikit-learn 1.2, so prefer get_feature_names_out() when it is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Word totals of every party, stored as a list of {'text', 'value'} records.
for party in tqdm(unique_parties):
    party_counts = get_word_counts_for_column(
        column_name='party',
        column_value=party
    )
    words_counts['per_party'][party] = [
        {'text': name, 'value': freq}
        for name, freq in zip(feature_names, party_counts)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [49]:
# Resolve the vocabulary once instead of once per coalition. get_feature_names() was
# removed in scikit-learn 1.2, so prefer get_feature_names_out() when it is available.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Word totals of every coalition, stored as a list of {'text', 'value'} records.
for coalition in tqdm(unique_coalitions):
    coalition_counts = get_word_counts_for_column(
        column_name='coalition',
        column_value=coalition
    )
    words_counts['per_coalition'][coalition] = [
        {'text': name, 'value': freq}
        for name, freq in zip(feature_names, coalition_counts)
    ]

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [50]:
with open('../datasets/for_presentation/words_counts.pkl.gz', 'wb') as f:
    pkl.dump(words_counts, f)

## Sentiment

### Sentiment per user/party/coalition/topic

In [51]:
a = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')

In [52]:
# Every sentiment label that occurs in the data; fixes the order of each result list.
sent_values = a.sentiment.unique()

def get_sentiment_distribution_by_column(column_name, column_value):
    """Return the share of each sentiment among rows of `a` matching one column value.

    The result is a list of (sentiment, share) tuples in the order of `sent_values`;
    a sentiment that does not occur in the selected rows gets a share of 0.
    """
    sent_counts = a.loc[a[column_name] == column_value, 'sentiment'].value_counts()
    tweets_count = sent_counts.sum()
    return [
        (sent, sent_counts[sent] / tweets_count if sent in sent_counts.index else 0)
        for sent in sent_values
    ]

In [53]:
# One (initially empty) mapping per aggregation level.
sentiment_distributions = {
    level: {} for level in ('per_user', 'per_party', 'per_coalition', 'per_topic')
}

# Distinct values to aggregate over at each level.
unique_usernames = a['username'].unique()
unique_parties = a['party'].unique()
unique_coalitions = a['coalition'].unique()
unique_topics = a['topic'].unique()

In [54]:
# Sentiment shares of every user.
for username in tqdm(unique_usernames):
    user_sentiment = get_sentiment_distribution_by_column(
        column_name='username',
        column_value=username,
    )
    sentiment_distributions['per_user'][username] = user_sentiment

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=538.0), HTML(value='')))




In [55]:
# Sentiment shares of every topic.
for topic in tqdm(unique_topics):
    topic_sentiment = get_sentiment_distribution_by_column(
        column_name='topic',
        column_value=topic,
    )
    sentiment_distributions['per_topic'][topic] = topic_sentiment

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [56]:
# Sentiment shares of every party.
for party in tqdm(unique_parties):
    party_sentiment = get_sentiment_distribution_by_column(
        column_name='party',
        column_value=party,
    )
    sentiment_distributions['per_party'][party] = party_sentiment

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




In [57]:
# Sentiment shares of every coalition.
for coalition in tqdm(unique_coalitions):
    coalition_sentiment = get_sentiment_distribution_by_column(
        column_name='coalition',
        column_value=coalition,
    )
    sentiment_distributions['per_coalition'][coalition] = coalition_sentiment

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [58]:
# NOTE: this writes a plain pickle; nothing here gzip-compresses it despite the '.gz' suffix.
with open('../datasets/for_presentation/sentiment_distributions.pkl.gz', 'wb') as out_file:
    pkl.dump(sentiment_distributions, out_file)

## Coalitions and parties

### Extract info about each party and coalition for quicker access

In [59]:
accounts = pd.read_csv('../datasets/accounts_processed.csv')

In [60]:
# Aggregate only the column that is kept afterwards. A frame-wide max() also tries to
# aggregate every other column of `accounts`, which is wasted work and, depending on
# the pandas version, can fail or silently drop columns with mixed or missing values.
parties = accounts.groupby('party')[['coalition']].max()

In [61]:
# Turn the 'party' group key back into a column and keep just the party/coalition pair.
parties = parties.reset_index()[['party', 'coalition']]

In [62]:
parties.to_csv('../datasets/for_presentation/parties.csv', index=False)

## Graph positions

### t-SNE

In [4]:
tweets = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')
usernames = tweets.username.unique()

In [5]:
embedding_data = pd.read_csv('../datasets/embeddings.csv')
embedding_data['username'] = embedding_data['username'].str.lower()

In [7]:
embedding_data = embedding_data[embedding_data['username'].isin(usernames)]

In [8]:
# Each 'embedding' cell is parsed as text: square brackets are dropped and the
# whitespace-separated numbers are converted. The built-in float replaces the np.float
# alias, which was deprecated in NumPy 1.20 and removed in NumPy 1.24.
embeddings = np.array([
    np.array([float(value) for value in text.replace("]", "").replace("[", "").split()])
    for text in embedding_data['embedding'].tolist()
])
embeddings.shape

(538, 768)

In [12]:
%%time

# t-SNE is stochastic; a fixed seed makes the saved 3-D layout reproducible.
tsne3d = TSNE(n_components=3, random_state=42).fit_transform(embeddings)

CPU times: user 47.6 s, sys: 37.4 ms, total: 47.6 s
Wall time: 17.6 s


In [13]:
%%time

# t-SNE is stochastic; a fixed seed makes the saved 2-D layout reproducible.
tsne2d = TSNE(n_components=2, random_state=42).fit_transform(embeddings)

CPU times: user 33.8 s, sys: 67.5 ms, total: 33.9 s
Wall time: 14 s


In [None]:
# Alternative t-SNE layouts computed on rescaled embeddings: Normalizer rescales each
# row to unit norm, StandardScaler standardises each column.
# NOTE(review): none of these results is used by the cells visible below, which save
# only tsne3d and tsne2d. No random_state is passed, so each run gives a different layout.
embeddings_normalized = Normalizer().fit_transform(embeddings)
embeddings_standardized = StandardScaler().fit_transform(embeddings)

# 3-D variants
tsne3d_standardized = TSNE(n_components=3).fit_transform(embeddings_standardized)
tsne3d_normalized = TSNE(n_components=3).fit_transform(embeddings_normalized)

# 2-D variants
tsne2d_standardized = TSNE(n_components=2).fit_transform(embeddings_standardized)
tsne2d_normalized = TSNE(n_components=2).fit_transform(embeddings_normalized)

In [26]:
graph_positions = pd.DataFrame(tsne3d, columns=['3D_x', '3D_y', '3D_z'])

In [27]:
graph_positions['2D_x'] = tsne2d[:, 0]
graph_positions['2D_y'] = tsne2d[:, 1]
# Row i of the t-SNE output comes from row i of embedding_data, so label the positions
# with embedding_data's usernames. tweets.username.unique() lists users in tweet order,
# which need not match. to_numpy() assigns by position, ignoring the filtered index.
graph_positions['username'] = embedding_data['username'].to_numpy()

In [30]:
graph_positions.to_csv('../datasets/for_presentation/graph_tsne.csv', index=False)

## Clusters

### KMeans

In [31]:
tweets = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment_topic.pkl.gz')
usernames = tweets.username.unique()

embedding_data = pd.read_csv('../datasets/embeddings.csv')
embedding_data['username'] = embedding_data['username'].str.lower()

# Keep only the users that also appear in the tweets data.
embedding_data = embedding_data[embedding_data['username'].isin(usernames)]

# Each 'embedding' cell is parsed as text: square brackets are dropped and the
# whitespace-separated numbers are converted. The built-in float replaces the np.float
# alias, which was deprecated in NumPy 1.20 and removed in NumPy 1.24.
embeddings = np.array([
    np.array([float(value) for value in text.replace("]", "").replace("[", "").split()])
    for text in embedding_data['embedding'].tolist()
])
embeddings.shape

(538, 768)

In [32]:
# KMeans starts from random centroids; a fixed seed makes the saved cluster ids reproducible.
clusters = KMeans(n_clusters=6, random_state=42).fit(embeddings)

In [33]:
# clusters.labels_[i] belongs to row i of embedding_data (the rows `embeddings` was
# built from), so pair the labels with embedding_data's usernames. `usernames` is in
# tweet order, which need not match.
df = pd.DataFrame({
    'username': embedding_data['username'].to_numpy(),
    'kmeans_cluster': clusters.labels_,
})

In [36]:
df.to_csv('../datasets/for_presentation/clusters.csv', index=False)