### import modules

In [None]:
import csv
import os

# set this if you want to use an Apple Silicon GPU and run into problems
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

import faiss
import numpy as np
import pandas as pd
from scc.scc import SCC
from scipy.sparse import coo_array, csr_array
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from tqdm import trange

from _ctfidf import ClassTfidfTransformer

In [None]:
# this will enable logging for scc
# (absl's verbosity is set to DEBUG, the most detailed of its standard levels)
from absl import logging
logging.set_verbosity(logging.DEBUG)

### import data

In [None]:
# Location of the 10kGNAD articles CSV.
url = "https://raw.githubusercontent.com/tblock/10kGNAD/master/articles.csv"

# The file is parsed as semicolon-separated without a header row; the two columns are
# named "category" and "text", and lines that pandas cannot parse are skipped.
data = pd.read_csv(url, sep=";", header=None, names=["category", "text"], on_bad_lines="skip")
# Preview the first rows.
data.head()

### create embeddings

#### load model

In [None]:
# Instantiate the sentence-embedding model by its model name.
gbert_cosine = SentenceTransformer("deutsche-telekom/gbert-large-paraphrase-cosine")

#### embed

In [None]:
# Encode every article text (column "text") in batches of 32, showing a progress bar.
embeddings = gbert_cosine.encode(list(data.text), show_progress_bar=True, batch_size=32)

### create k-nearest neighbors graph (input for SCC)

#### setup

In [None]:
def cosine_top_k(
    X: np.ndarray,
    top_k: int,
    normalize: bool = True,
    matrix_format: str = "csr",
    batch_size: int = 1_000,
    show_progress_bar: bool = False,
):
    """Create a k-nearest-neighbors graph from an inner-product search with Faiss.

    Every row of ``X`` is queried against a ``faiss.IndexFlatIP`` index built
    from ``X`` itself. With ``normalize=True`` the rows are L2-normalized
    first, so the inner product equals the cosine similarity. The stored edge
    weights are the square roots of the positive similarities; non-positive
    similarities are stored as explicit zeros.

    Args:
        X: 2-D array with one sample per row. It is converted to a
            C-contiguous float32 array, as required by Faiss.
        top_k: Number of neighbors per sample; capped at the number of samples.
        normalize: If True, L2-normalize a copy of ``X`` before indexing (the
            caller's array is left untouched).
        matrix_format: Sparse format of the result, one of ``"csr"``,
            ``"coo"``, ``"dok"``, ``"lil"`` or ``"csc"``.
        batch_size: Number of rows whose neighbor lists are sorted at once.
        show_progress_bar: If True, show a progress bar while sorting.

    Returns:
        A square sparse float32 array of shape ``(n_samples, n_samples)`` with
        ``top_k`` stored entries per row, column indices sorted within rows.

    Raises:
        ValueError: If ``matrix_format`` is not supported, ``batch_size`` is
            smaller than 1, or the effective ``top_k`` is smaller than 1.
    """
    supported_formats = ("csr", "coo", "dok", "lil", "csc")
    # Validate up front so that a typo does not cost a full (expensive) search.
    if matrix_format not in supported_formats:
        raise ValueError(
            f"Unsupported matrix_format {matrix_format!r}; expected one of {supported_formats}."
        )
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1.")

    n_samples = X.shape[0]
    top_k = min(top_k, n_samples)
    if top_k < 1:
        raise ValueError("top_k must be at least 1 and X must contain at least one sample.")

    if normalize:
        # np.array copies, so the in-place normalization never touches the caller's data.
        X = np.array(X, dtype=np.float32, order="C")
        faiss.normalize_L2(X)
    else:
        X = np.ascontiguousarray(X, dtype=np.float32)

    index = faiss.IndexFlatIP(X.shape[1])
    index.add(X)

    top_k_data, top_k_col = index.search(X, top_k)

    print("Search finished.")

    # create canonical form (column indices ascending within every row)
    # do a batched sort over rows to prevent out-of-memory errors from sorting everything at once
    for row_start in trange(0, n_samples, batch_size, disable=not show_progress_bar):
        rows = slice(row_start, min(row_start + batch_size, n_samples))
        order = np.argsort(top_k_col[rows], axis=1)

        top_k_col[rows] = np.take_along_axis(top_k_col[rows], order, axis=1)
        top_k_data[rows] = np.take_along_axis(top_k_data[rows], order, axis=1)

    print("Ordering finished.")

    similarities = top_k_data.ravel()
    # `where` without `out` would leave the skipped (non-positive) entries uninitialized,
    # so an explicit zero-filled output array is supplied.
    data = np.sqrt(similarities, out=np.zeros_like(similarities), where=similarities > 0)
    indices = top_k_col.ravel()
    matrix_shape = (n_samples, n_samples)

    if matrix_format == "csr":
        # every row holds exactly top_k entries
        indptr = np.arange(n_samples + 1) * top_k
        return csr_array((data, indices, indptr), dtype=np.float32, shape=matrix_shape)

    row = np.repeat(np.arange(n_samples), top_k)
    coo_matrix = coo_array((data, (row, indices)), dtype=np.float32, shape=matrix_shape)

    if matrix_format == "dok":
        return coo_matrix.todok()
    if matrix_format == "lil":
        return coo_matrix.tolil()
    if matrix_format == "csc":
        return coo_matrix.tocsc()
    return coo_matrix

#### run

In [None]:
# Number of neighbors requested per sample.
top_k = 25

# Build the k-nearest-neighbors graph from the embeddings, returned in COO format.
knn_graph = cosine_top_k(embeddings, top_k=top_k, matrix_format="coo")

### run SCC

#### setup

In [None]:
# `taus` holds `num_rounds` values spaced evenly on a log scale, from `upper` down to `lower`.
upper = 1.0
lower = 0.001
num_rounds = 200
taus = np.geomspace(start=upper, stop=lower, num=num_rounds)

# NOTE(review): presumably one value of `taus` is used as threshold per round — confirm against the scc package.
scc = SCC(knn_graph, num_rounds, taus)

#### run

In [None]:
# Run SCC on the graph that was passed to the constructor.
scc.fit()

#### inspect levels

In [None]:
# Print every round's index next to its `num_uniq_parents`
# (presumably the number of clusters in that round — used below to pick the levels).
for i, level in enumerate(scc.rounds):
    print(i, level.num_uniq_parents)

#### map levels to each other
SCC does not output relationships between rounds but only gives cluster assignments for each data sample per round.
<br> So, we need to extract these relationships. For demonstration purposes, and similar to the analysis in the paper,
<br> we select three `levels` with roughly 20, 100, and 1000 clusters.

In [None]:
# Indices into `scc.rounds`; each level is related to the next one in this list
# (parents first, their children second).
levels = [167, 109, 38]

# level_map[i]: cluster of levels[i] -> set of its child clusters in levels[i+1]
#               (None for every cluster of the last level)
# inverted_level_map[i]: cluster of levels[i+1] -> its parent cluster in levels[i]
level_map, inverted_level_map = [], []
mapping = None

for i, level in enumerate(levels):
    if i == 0:
        # NOTE(review): assumes cluster ids of a round are 0..num_uniq_parents-1 — confirm.
        clusters = np.arange(scc.rounds[level].num_uniq_parents)
    else:
        # the clusters of this level are exactly the children found for the previous level
        clusters = np.array([child for children in mapping.values() for child in children])

    if i + 1 == len(levels):
        # last level: nothing below it to map to
        level_map.append({cluster: None for cluster in clusters})
        break

    # loop-invariant lookups, hoisted out of the per-cluster loop
    parent_assignments = scc.rounds[level].cluster_assignments
    child_assignments = scc.rounds[levels[i + 1]].cluster_assignments

    mapping, inverted_mapping = {}, {}
    for cluster in clusters:
        cluster_samples = np.where(parent_assignments == cluster)[0]

        # children = all clusters of the next level that the samples of this cluster fall into
        children = set(child_assignments[cluster_samples])
        mapping[cluster] = children

        # update in place instead of rebuilding the whole dict for every cluster
        inverted_mapping.update(dict.fromkeys(children, cluster))

    level_map.append(mapping)
    inverted_level_map.append(inverted_mapping)

#### extract top key words
Here, we extract the top 10 keywords for each cluster (per level). The paper uses a different tokenization strategy
<br> specifically designed for German-language social media text. For simplicity, we only remove stop words in this process.

In [None]:
# German stop-word list (plain text, presumably one word per line).
url = "https://raw.githubusercontent.com/stopwords-iso/stopwords-de/master/raw/stop-words-german.txt"
# Read every entry verbatim as a string: with pandas' default NA handling, tokens such as
# "null" or "nan" would be parsed as missing values (NaN) instead of being kept as stop words.
stop_words = pd.read_csv(url, header=None, dtype=str, keep_default_na=False).iloc[:, 0].tolist()

In [None]:
n_top_words = 10
# top_words[i]: cluster of levels[i] -> list of its keywords, highest c-TF-IDF score first
top_words = []

for i, level in enumerate(level_map):
    # initialize c-tf-idf vectorizer
    # could modify tokenizer here
    vectorizer_model = CountVectorizer(lowercase=True, stop_words=stop_words)
    ctfidf_model = ClassTfidfTransformer()

    # join texts per cluster to prepare for class-based tokenization (similar to BERTopic)
    clusters = np.array(list(level.keys()))
    assignments = scc.rounds[levels[i]].cluster_assignments
    topic_texts = [
        " ".join(data.iloc[np.where(assignments == cluster)[0]].text)
        for cluster in clusters
    ]

    bow = vectorizer_model.fit_transform(topic_texts)
    ctfidf_reprs = ctfidf_model.fit_transform(bow)

    # the vocabulary is the same for every row, so look it up only once per level
    feature_names = vectorizer_model.get_feature_names_out()

    # extract the top words from the sparse matrix (per row = cluster)
    # NOTE(review): relies on `ctfidf_reprs` being in CSR layout (indptr/indices/data) — confirm in _ctfidf.
    words = {}
    for row in range(len(ctfidf_reprs.indptr) - 1):
        ind_start, ind_end = ctfidf_reprs.indptr[row], ctfidf_reprs.indptr[row + 1]
        row_scores = ctfidf_reprs.data[ind_start:ind_end]
        row_features = ctfidf_reprs.indices[ind_start:ind_end]

        # sort descending by score; also works for rows without any stored entry
        # (e.g. a cluster whose texts consist of stop words only), which yield []
        top_word_indices = np.argsort(-row_scores, kind="stable")[:n_top_words]
        words[clusters[row]] = feature_names[row_features[top_word_indices]].tolist()

    top_words.append(words)

In [None]:
# Display the extracted keywords (one dict per level, keyed by cluster).
top_words

### get sentiments

#### setup

In [None]:
# will add own model after deanonymization
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
txlm = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)

#### run

In [None]:
# The pipeline takes a list of strings (not a pandas Series) and returns one
# {"label": ..., "score": ...} dict per text. Articles longer than the model's input
# limit are truncated instead of raising an error.
sentiments = pd.DataFrame(txlm(data.text.tolist(), truncation=True, max_length=512))
# The export step below reads the predicted class from a `sentiment` column and looks
# up the lowercase class names "negative", "neutral" and "positive".
sentiments["sentiment"] = sentiments["label"].str.lower()

#### export nodes
This code combines and transforms the results into a JSONL format (in Python, simply a list of dicts). We used this format
<br> for creating the visualization in the paper (see the code in the other subfolder of this repo, `treemap_visualization`).

In [None]:
# One dict per cluster and level; a node is identified by "<level>/<cluster>".
nodelist = []

for i, level in enumerate(levels):
    for cluster, children in level_map[i].items():
        # positions of all samples assigned to this cluster in this level
        cluster_samples = np.where(scc.rounds[level].cluster_assignments == cluster)[0]

        # number of samples per sentiment class; classes that do not occur count as 0
        sentiment_counts = sentiments.iloc[cluster_samples].sentiment.value_counts()
        sentiment_dict = {
            polarity: int(sentiment_counts.get(polarity, 0))
            for polarity in ("negative", "neutral", "positive")
        }

        # clusters of the first level have no parent
        if i == 0:
            parent = None
        else:
            parent = int(inverted_level_map[i - 1][cluster])

        # clusters of the last level have no children
        if children is None:
            child_names = None
        else:
            child_names = [f"{levels[i + 1]}/{child}" for child in children]

        node = {
            "name": f"{level}/{cluster}",
            "label": None,
            "description": None,
            "level": int(level),
            "level_id": int(cluster),
            "weight": int(len(cluster_samples)),
            "keywords": top_words[i][cluster],
            "sentiment_dict": sentiment_dict,
            "parent": parent,
            "children": child_names,
        }
        nodelist.append(node)

In [None]:
# Display the assembled list of node dicts.
nodelist

In [None]:
# and, optionally, save somewhere
import json  # required by json.dump below

save_path = os.path.join("path_to_file.jsonl")

# JSON Lines: every node is written as one JSON document on its own line
with open(save_path, "w", encoding="utf-8") as f:
    for entry in nodelist:
        json.dump(entry, f)
        f.write("\n")