In [1]:
from json_utils import read_json, read_jsonl
import math
import numpy as np
import pandas as pd
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from nltk.tokenize import TweetTokenizer

import matplotlib.pyplot as plt
import plotly.express as px
plt.rcParams['figure.figsize'] = [10, 8]
plt.style.use('fivethirtyeight')

import re

# Stop words: the spaCy and NLTK lists for English and Danish are combined
# with the '#dkpol' hashtag and every single ASCII punctuation character.
from nltk.corpus import stopwords
from spacy.lang.en.stop_words import STOP_WORDS as en_stop_words
from spacy.lang.da.stop_words import STOP_WORDS as da_stop_words
nltk_da_stop_words = stopwords.words("danish")
nltk_en_stop_words = stopwords.words("english")
from string import punctuation
# NOTE(review): this is a plain list (the sources overlap, so it contains
# duplicates); membership tests against it are linear in its length.
stop_words = list(en_stop_words) + list(da_stop_words) + nltk_da_stop_words + nltk_en_stop_words + ['#dkpol'] + list(punctuation)

In [2]:
# Sizes of the four stop-word sources: NLTK Danish, NLTK English, spaCy English, spaCy Danish.
len(nltk_da_stop_words), len(nltk_en_stop_words), len(en_stop_words), len(da_stop_words)

(94, 179, 326, 219)

In [3]:
# Load the tweet records; each record is indexed by key below, so it is
# presumably a dict per tweet (verify against read_jsonl).
data = read_jsonl("../data/dkpol_tweets.jsonl")
# Keep only the raw text of every tweet for the token-frequency analysis.
text = [tweet["text"] for tweet in data]

Reading ../data/dkpol_tweets.jsonl


In [4]:
def remove_stopwords(tokens):
    """Lower-case and strip each token, then drop stop words and empty tokens.

    Args:
        tokens: iterable of string tokens from a single document.

    Returns:
        A list of normalised tokens in their original order. The list is
        empty when every token was filtered out.
    """
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan over the (duplicate-laden) stop-word list.
    blocked = set(stop_words)
    normalized = (token.lower().strip() for token in tokens)
    # Filtering the list directly (instead of joining to a string and
    # splitting on " ") avoids returning [''] for fully filtered input,
    # leaking '' tokens, and re-splitting tokens that contain a space.
    return [token for token in normalized if token and token not in blocked]
        

def clean_doc(doc):
    """Strip links and out-of-range characters from a raw tweet.

    Args:
        doc: the raw tweet text.

    Returns:
        The text without URLs and without any character whose code point is
        250 or higher, with surrounding whitespace removed.
    """
    doc = re.sub(r'http\S+', "", doc.strip())  # remove links
    # The previous extra step, re.sub(r'^[A-Za-z]', "", ...), did not remove
    # "special chars": it deleted the first letter of every tweet that starts
    # with an ASCII letter, corrupting the first token. Special characters are
    # handled by the code-point filter below, so that step is dropped.
    # Keep characters with a (decimal) code point below 250; this preserves
    # the Danish letters (æ: 230, ø: 248, å: 229) while dropping emoji etc.
    doc = ''.join(ch for ch in doc if ord(ch) < 250)
    return doc.strip()


def tokenize_docs(docs):
    """Clean, tokenize and stop-word-filter every document.

    A document is kept only if it is non-empty after cleaning, yields at
    least one token, and still has more than two tokens after stop-word
    removal.

    Args:
        docs: iterable of raw document strings.

    Returns:
        A list of token lists, one per retained document.
    """
    tweet_tokenizer = TweetTokenizer()
    tokenized = []

    for raw_doc in docs:
        cleaned = clean_doc(raw_doc)
        if not cleaned:
            continue

        words = tweet_tokenizer.tokenize(cleaned)
        if not words:
            continue

        kept = remove_stopwords(words)
        if len(kept) > 2:
            tokenized.append(kept)

    return tokenized

In [5]:
def get_counts(tokenized_docs):
    """Count how often each token occurs across all tokenized documents.

    Args:
        tokenized_docs: iterable of token lists.

    Returns:
        A Counter mapping token -> total number of occurrences.
    """
    return Counter(token for doc in tokenized_docs for token in doc)

In [6]:
def sort_counts(counts):
    """Return the (key, value) pairs of a mapping ordered by value, largest first.

    Pairs with equal values keep the order in which the mapping yields them.
    """
    pairs = list(counts.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

In [7]:
# Tokenize every tweet; tweets that end up with two or fewer tokens are dropped,
# so `tokens` can be shorter than `text`.
tokens = tokenize_docs(text)
# Corpus-wide token frequencies.
counts = get_counts(tokens)
# List of (token, frequency) pairs, most frequent first.
sorted_counts = sort_counts(counts)

# Word Frequency

In [8]:
def plot_top_n_words(counts, n):
    """Draw a bar chart of the first ``n`` (token, frequency) pairs.

    Args:
        counts: sequence of (token, frequency) pairs, already sorted with the
            most frequent token first (as produced by ``sort_counts``).
        n: maximum number of tokens to show.

    Returns:
        None. A new matplotlib figure is created, unless ``counts`` is empty,
        in which case nothing is drawn.
    """
    top_n = counts[:n]
    if not top_n:
        # zip(*[]) cannot be unpacked into two names; there is nothing to plot.
        return

    words, frequencies = zip(*top_n)

    # Size the x positions from the pairs actually available: with fewer than
    # n tokens, np.arange(n) would not match the number of bar heights.
    positions = np.arange(len(top_n))

    fig, ax = plt.subplots(figsize=(20, 8))
    ax.bar(positions, frequencies, align='center', alpha=0.5, width=0.9)
    ax.set_xticks(positions)
    ax.set_xticklabels(words, rotation=75)
    ax.set_ylabel('Frequency')
    ax.set_title(f'Top {len(top_n)} tokens in tweets')

In [None]:
# Bar chart of the 50 most frequent tokens across all tokenized tweets.
plot_top_n_words(sorted_counts, 50)

# Word Frequency Per Month

In [None]:
# Tabular view of the loaded tweet records (columns come from the record keys).
df = pd.DataFrame(data)

In [None]:
# Parse created_at into pandas datetimes so the .dt accessor can be used below.
# NOTE(review): this overwrites the column in place.
df.created_at = pd.to_datetime(df.created_at)

In [None]:
# Bucket every tweet by calendar month (pandas Period with monthly frequency).
df["month"] = df.created_at.dt.to_period("M")

In [None]:
def top_10_tokens_per_month(df):
    """Plot the ten most frequent tokens for every month present in ``df``.

    Args:
        df: DataFrame with a ``text`` column (raw tweet text) and a ``month``
            column to group by.

    Returns:
        None. One figure is drawn per month, in ascending month order, and
        each figure's title names its month. Months whose tweets yield no
        tokens after cleaning are skipped.
    """
    # groupby iterates the months in sorted order and leaves out missing months.
    for month, month_df in df.groupby("month"):
        month_tokens = tokenize_docs(month_df["text"].tolist())
        month_counts = sort_counts(get_counts(month_tokens))
        if not month_counts:
            # No tokens survived cleaning for this month; nothing to plot.
            continue

        plot_top_n_words(month_counts, 10)
        # Override the generic title on the figure that was just drawn so the
        # monthly charts can be told apart.
        shown = min(10, len(month_counts))
        plt.title(f'Top {shown} tokens in tweets ({month})')

top_10_tokens_per_month(df)

# TF-IDF 

    TODO: Create a better tokenizer.

In [9]:
def dummy_fun(doc):
    """Identity function used to bypass TfidfVectorizer's own text processing.

    The documents passed to the vectorizer are already lists of cleaned
    tokens, so both the preprocessor and the tokenizer just hand them through.
    """
    return doc

# token_pattern=None: the default regex is never used once a custom tokenizer
# is supplied, and leaving it set makes scikit-learn emit a warning about the
# unused parameter.
vectorizer = TfidfVectorizer(analyzer="word", tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None)
# Sparse TF-IDF matrix: one row per entry of `tokens`, one column per vocabulary term.
X = vectorizer.fit_transform(tokens)

In [None]:
# Map every vocabulary column index to its largest TF-IDF weight in any row:
# the pairs are zipped in ascending weight order, so when a column index
# repeats, the last (largest) weight is the one the dict keeps.
weight_order = np.argsort(X.data)  # computed once instead of twice
dic = dict(zip(X.indices[weight_order], X.data[weight_order]))

In [None]:
# Re-key `dic` from vocabulary column index to the term string itself.
# The membership guard makes the cell safe to re-run: once an index key has
# been moved, a second pass simply skips it instead of raising KeyError.
for key, val in vectorizer.vocabulary_.items():
    if val in dic:
        dic[key] = dic.pop(val)

In [None]:
# (key, value) pairs of `dic`, largest value first.
# NOTE(review): despite the name, the values in `dic` are taken from X.data
# (TF-IDF weights), not raw counts.
sorted_word_counts = sort_counts(dic)

#### Clustering of the TF-IDF embeddings

In [10]:
import hdbscan
import umap

In [11]:
# Vocabulary terms of the fitted vectorizer; position i names column i of X.
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Project the TF-IDF rows (one per tokenized tweet) down to 2-D.
# random_state pins the otherwise stochastic layout, so the clusters found on
# it (and any cluster label hard-coded in later cells) are reproducible after
# a kernel restart.
umap_embeddings = umap.UMAP(n_neighbors=20,
                            min_dist=0.1,
                            n_components=2,
                            metric='cosine',
                            random_state=20).fit_transform(X)

In [None]:
# Fit HDBSCAN on the 2-D UMAP coordinates; the fitted object's labels_ are
# read in the next cell.
cluster = hdbscan.HDBSCAN(min_cluster_size=20,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(umap_embeddings)

In [None]:
# One row per embedded point: its 2-D coordinates plus the HDBSCAN label.
result = pd.DataFrame(umap_embeddings, columns=['x', 'y'])
result['labels'] = cluster.labels_

In [None]:
# Visualize clusters: scatter the labelled points, coloured by cluster label.
# Points labelled -1 are split off into `outliers` and are not drawn.
fig, ax = plt.subplots(figsize=(20, 10))
is_outlier = result.labels == -1
outliers = result.loc[is_outlier, :]
clustered = result.loc[~is_outlier, :]
points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
fig.colorbar(points, ax=ax)
fig.tight_layout()

In [None]:
# Interactive version of the cluster scatter plot.
# The labels are numeric, so plotly colours them on a continuous scale;
# color_discrete_map expects a dict of value -> colour and cannot take a
# colormap name, hence color_continuous_scale is used for 'hsv_r'.
px.scatter(x=clustered.x, y=clustered.y, color=clustered.labels, color_continuous_scale='hsv_r')

In [None]:
# Row positions in `result` (and therefore rows of the embedding) that were
# assigned cluster label 178.
# NOTE(review): 178 is a hard-coded label; confirm it exists for the current run.
indexes = result.loc[result.loc[:, "labels"]==178].index.values

In [None]:
# `indexes` are row positions of the embedding, i.e. rows of X (tokenized
# tweets), not vocabulary positions, so they must not index `feature_names`
# directly. Summarise the cluster by its 20 terms with the highest mean
# TF-IDF weight over those rows instead.
if len(indexes):
    cluster_term_weights = np.asarray(X[indexes].mean(axis=0)).ravel()
    top_term_ids = np.argsort(cluster_term_weights)[::-1][:20]
    print(list(feature_names[top_term_ids]))
else:
    # The selected label has no members in this run.
    print([])

# Clustering with k-means

In [None]:
from sklearn.cluster import MiniBatchKMeans

In [None]:
def find_optimal_clusters(data, max_k):
    """Fit MiniBatchKMeans for k = 2, 4, ..., max_k and plot inertia against k.

    Args:
        data: samples to cluster, passed straight to ``MiniBatchKMeans.fit``.
        max_k: largest number of clusters to try (only even k are fitted).

    Returns:
        None. Prints a progress line per fit and draws the elbow plot.
    """
    candidate_ks = range(2, max_k+1, 2)

    inertias = []
    for n_clusters in candidate_ks:
        model = MiniBatchKMeans(n_clusters=n_clusters, init_size=1024, batch_size=2048, random_state=20)
        inertias.append(model.fit(data).inertia_)
        print('Fit {} clusters'.format(n_clusters))

    _, axis = plt.subplots(1, 1)
    axis.plot(candidate_ks, inertias, marker='o')
    axis.set_xlabel('Cluster Centers')
    axis.set_xticks(candidate_ks)
    axis.set_xticklabels(candidate_ks)
    axis.set_ylabel('SSE')
    axis.set_title('SSE by Cluster Center Plot')
    
# Elbow plot on the 2-D UMAP embedding for k = 2, 4, ..., 50.
find_optimal_clusters(umap_embeddings, 50)

In [None]:
# Assign every embedded point to one of 40 k-means clusters; random_state
# fixes the k-means initialisation.
# NOTE(review): 40 is presumably read off the elbow plot above; confirm.
clusters = MiniBatchKMeans(n_clusters=40, init_size=1024, batch_size=2048, random_state=20).fit_predict(umap_embeddings)

In [None]:
# Distinct cluster labels produced by the k-means fit.
np.unique(clusters)

In [None]:
# Rebuild `result` with the k-means labels.
# NOTE(review): this reuses the name of the HDBSCAN-labelled frame created
# earlier, so that frame is no longer available after this cell runs.
result = pd.DataFrame(umap_embeddings, columns=['x', 'y'])
result['labels'] = clusters

In [None]:
# Scatter the embedded points coloured by the labels currently stored in
# `result`. Rows labelled -1 are separated into `outliers` and not drawn.
fig, ax = plt.subplots(figsize=(20, 10))
is_outlier = result.labels == -1
outliers = result.loc[is_outlier, :]
clustered = result.loc[~is_outlier, :]
points = ax.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
fig.colorbar(points, ax=ax)
fig.tight_layout()

In [None]:
# Row positions in `result` (rows of the embedding, hence rows of X) that
# were assigned cluster label 1.
indexes = result.loc[result.loc[:, "labels"]==1].index.values
# These are tweet-row positions, not vocabulary positions, so they must not
# index `feature_names` directly. Show the 20 terms with the highest mean
# TF-IDF weight over the cluster's rows instead.
if len(indexes):
    cluster_term_weights = np.asarray(X[indexes].mean(axis=0)).ravel()
    top_term_ids = np.argsort(cluster_term_weights)[::-1][:20]
    print(list(feature_names[top_term_ids]))
else:
    # The selected label has no members.
    print([])

In [None]:
def get_top_keywords(data, clusters, labels, n_terms):
    """Print the highest-weighted terms of every cluster.

    Args:
        data: 2-D weight matrix (rows = samples, columns = terms); either a
            SciPy sparse matrix or anything ``np.asarray`` accepts.
        clusters: cluster label for each row of ``data``.
        labels: term names, indexable by column position.
        n_terms: number of terms to list per cluster.

    Returns:
        None. For each cluster a header line and a comma-separated list of
        its ``n_terms`` terms with the largest mean weight is printed,
        strongest term first.
    """
    # toarray() yields a plain ndarray (todense() returns the discouraged
    # np.matrix); dense input is accepted unchanged.
    dense = data.toarray() if hasattr(data, "toarray") else np.asarray(data)
    cluster_means = pd.DataFrame(dense).groupby(clusters).mean()

    for cluster_id, mean_weights in cluster_means.iterrows():
        # Sort on the raw values so the result is a positional index array,
        # then reverse it to put the heaviest terms first.
        top_ids = np.argsort(mean_weights.to_numpy())[::-1][:n_terms]
        print('\nCluster {}'.format(cluster_id))
        print(','.join(labels[t] for t in top_ids))
            
# Top 10 terms per k-means cluster, computed on the first 100 rows only.
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out(), as the rest of this notebook does.
get_top_keywords(X[:100], clusters[:100], vectorizer.get_feature_names_out(), 10)

In [None]:
# Inspect the fitted vocabulary (array of term strings).
vectorizer.get_feature_names_out()

In [None]:
# Distinct k-means cluster labels (same check as after the fit above).
np.unique(clusters)

In [None]:
# Mean TF-IDF weight per term for each cluster, over the first 1000 rows.
# Slice the sparse matrix *before* densifying: X.todense()[:1000] would
# materialise the entire matrix in memory just to keep 1000 rows of it.
pd.DataFrame(X[:1000].toarray()).groupby(clusters[:1000]).mean()