# Topic representation comparison

In [1]:
%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
from dependencies import *
import warnings
import pickle
import random
from tqdm import tqdm_notebook
from hdbscan import HDBSCAN
from hdbscan.flat import HDBSCAN_flat

warnings.filterwarnings('ignore')

In [2]:
# Show up to 200 characters per cell so long article texts remain readable.
# The fully qualified option name is used because pandas only guarantees
# partial-name matching while the abbreviation stays unambiguous.
pd.set_option('display.max_colwidth', 200)

## Fixing various parameters

In [3]:
# UMAP default parameters
# NOTE(review): the UMAP cell further down passes the literals 15 and 5
# instead of reading these two names.
umap_n_neighbors = 15
umap_dim_size = 5

# HDBSCAN parameters
# `partioned_clusttering_size` is used as min_cluster_size by the HDBSCAN cell.
partioned_clusttering_size = 20
# NOTE(review): `epsilon` is not referenced by any cell visible in this notebook.
epsilon=0.0

# Number of words used to represent a topic
# NOTE(review): later cells pass the literal 10 rather than this name.
num_words = 10

## Loading data

In [5]:
## Redefining a few functions from dependencies.py
from tqdm import tqdm_notebook
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity


def preprocessing_documents(document_list):
    """Normalise raw documents.

    Each document is lowercased, newlines and tabs are turned into single
    spaces, and every character that is not an ASCII letter, digit or space
    is removed.

    Returns a new list with one cleaned string per input document.
    """
    cleaned_documents = []
    for raw_text in document_list:
        flattened = raw_text.lower().replace("\n", " ").replace("\t", " ")
        cleaned_documents.append(re.sub(r'[^A-Za-z0-9 ]+', '', flattened))
    return cleaned_documents


def doucments_lemmatizer(documents_tokens):
    """Lemmatize every token of every document.

    Parameters
    ----------
    documents_tokens : iterable of iterables of str
        One token sequence per document.

    Returns
    -------
    list of list
        Same nesting as the input, each token replaced by the result of
        ``WordNetLemmatizer().lemmatize(token)``.
    """
    # Build the lemmatizer once instead of once per token.
    lemmatize = WordNetLemmatizer().lemmatize
    return [[lemmatize(token) for token in doc_tokens] for doc_tokens in documents_tokens]

def document_tokenize(cleaned_documents):
    """Tokenize documents and drop English stop words.

    Parameters
    ----------
    cleaned_documents : iterable of str
        Documents to split with ``word_tokenize``.

    Returns
    -------
    list of list of str
        For each document, its tokens in original order without any token
        listed in ``stopwords.words('english')``.
    """
    # A set gives constant-time membership tests; the previous list was
    # scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    return [
        [token for token in word_tokenize(document) if token not in stop_words]
        for document in cleaned_documents
    ]

def calculate_cluster_embeddings(df):
    """Average the document embeddings of each cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'C' column (cluster id) and an 'embedding' column
        whose values can be converted with ``np.array``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of 'C', with columns 'C' and 'embedding';
        'embedding' holds the element-wise mean of that cluster's vectors.

    The input frame is left untouched: the array conversion is done on a
    separate Series rather than written back into ``df['embedding']``.
    """
    embeddings = df['embedding'].apply(np.array)  # Ensure the embeddings are numpy arrays
    # Grouping by the 'C' Series keeps 'C' as the index name, so reset_index()
    # yields the same two columns as grouping the frame itself.
    df_grouped = embeddings.groupby(df['C']).apply(np.stack).apply(np.mean, axis=0)
    return df_grouped.reset_index()


def text_processing(all_documents, frequency_threshold=10) :
    """Run the full text pipeline and build the gensim-style dictionary and corpus.

    Steps, in order: ``preprocessing_documents`` -> ``document_tokenize`` ->
    ``doucments_lemmatizer`` -> ``token_frequency_filter`` -> ``Dictionary`` ->
    bag-of-words via ``dictionary.doc2bow``.

    Parameters
    ----------
    all_documents : iterable of str
        Raw documents.
    frequency_threshold : int, default 10
        Second argument passed to ``token_frequency_filter``. The default
        keeps the previous hard-coded value.

    Returns
    -------
    tuple
        ``(tokens, dictionary, corpus)``.
    """
    preprocessed_documents=preprocessing_documents(all_documents)
    documents_tokens=document_tokenize(preprocessed_documents)
    tokens=doucments_lemmatizer(documents_tokens)
    tokens=token_frequency_filter(tokens,frequency_threshold)
    dictionary = Dictionary(tokens)
    corpus = [dictionary.doc2bow(text) for text in tokens]
    return tokens,dictionary,corpus


# Less aggressive preprocessing, allows less frequent tokens to stay around
def text_processing_2(all_documents) :
    """Same pipeline as ``text_processing`` with the filter threshold set to 5.

    Returns ``(tokens, dictionary, corpus)``.
    """
    return text_processing(all_documents, frequency_threshold=5)

# returns whole word embeddings after merging the subtokens
def dict_extractor(tokens, embeddings):
    """Merge '##' continuation pieces into whole words and group vectors by word.

    ``embeddings.numpy()`` is paired position by position with ``tokens``.
    A piece followed by one or more pieces starting with '##' is joined with
    them (the '##' prefix removed) and its vector becomes the average of the
    joined pieces' vectors. A piece with no continuation keeps its own vector.

    Returns a dict mapping each resulting word to the list of its vectors, in
    order of appearance.
    """
    pieces = list(zip(tokens, embeddings.numpy()))
    n_pieces = len(pieces)

    grouped = {}
    start = 0
    while start < n_pieces:
        word, vector = pieces[start]
        stop = start + 1
        # Absorb every directly following continuation piece.
        while stop < n_pieces and pieces[stop][0].startswith('##'):
            word += pieces[stop][0][2:]
            vector = np.sum([vector, pieces[stop][1]], axis=0)
            stop += 1
        span = stop - start
        if span > 1:
            # Turn the running sum into the mean over the merged pieces.
            vector = vector / span
        grouped.setdefault(word, []).append(vector)
        start = stop
    return grouped
    

In [6]:

# Return dataframe of topics and their associated word representations

def find_nearest_words(topic_df, word_df, documents_per_topic_per_time, k, diversity = 0.0,verbose = False) :
    """Pick the words whose embeddings best represent each topic embedding.

    Parameters
    ----------
    topic_df : pandas.DataFrame
        One row per topic, with an 'embedding' column.
    word_df : pandas.DataFrame
        One row per candidate word, with 'content' (the word) and 'embedding'.
    documents_per_topic_per_time : pandas.DataFrame
        Row ``i`` must expose a ``content`` text; it is run through
        ``text_processing_2`` to obtain the topic's own vocabulary.
        NOTE(review): row ``i`` is assumed to describe the same topic as row
        ``i`` of ``topic_df`` -- confirm against the caller.
    k : int
        Number of words to select per topic.
    diversity : float, default 0.0
        Weight of the redundancy penalty in the MMR score; 0 ranks purely by
        similarity to the topic embedding.
    verbose : bool, default False
        Print the selected words of every topic.

    Returns
    -------
    pandas.DataFrame
        Columns 'embedding' (the topic embedding) and 'topic_representation'
        (list of selected words), one row per topic.

    Changes from the previous version: the topic vocabulary is tested through
    a set instead of a list, the candidate embedding matrix is built once per
    topic, unused locals are removed, and the selection loop stops when the
    candidates are exhausted (it used to raise on an empty ``argmax`` when
    ``k`` was larger than the number of candidates).
    """
    result = []
    doc_embeddings = topic_df['embedding'].tolist()
    words_all = word_df['content'].tolist()
    for i,doc_embedding in enumerate(doc_embeddings) :
        topic_vocabulary = set(text_processing_2([documents_per_topic_per_time.iloc[i].content])[0][0])
        # reduce the candidate space by considering only the words that constitute the docs of the topic
        # if number of candidates is lower than k, consider entire dictionary as candidate space
        words = [w for w in words_all if w in topic_vocabulary]
        if len(words) < k :
            words = words_all
        candidates_df = word_df[word_df['content'].isin(words)]
        # Read the words back from the filtered frame so that position idx in
        # `words` always matches row idx of the similarity matrices.
        words = candidates_df['content'].tolist()
        candidate_embeddings = np.array(candidates_df['embedding'].tolist())

        word_doc_similarity = cosine_similarity(candidate_embeddings,[doc_embedding])
        word_similarity = cosine_similarity(candidate_embeddings)

        # Initialize candidates and already choose best keyword/keyphrase
        keywords_idx = [int(np.argmax(word_doc_similarity))]
        candidates_idx = [idx for idx in range(len(words)) if idx != keywords_idx[0]]
        n_keywords = min(k, len(words))
        for _ in range(n_keywords - 1):
            # Extract similarities within candidates and between candidates and selected keywords/phrases
            candidate_similarities = word_doc_similarity[candidates_idx, :]
            target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)
            # Calculate MMR : diversity is set to 0 for default settings
            # increasing it will make the selected words for a topic more diversified
            mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
            # mmr has a single column, so the flat argmax is the row position
            mmr_idx = candidates_idx[np.argmax(mmr)]
            # Update keywords & candidates
            keywords_idx.append(mmr_idx)
            candidates_idx.remove(mmr_idx)
        topic_words = [words[idx] for idx in keywords_idx]

        if verbose :
            print(topic_words)

        result.append((doc_embedding,topic_words))

    # Create a DataFrame
    df = pd.DataFrame(result, columns=['embedding', 'topic_representation'])
    return df









In [526]:
# Load sampled data : 25k documents per year, 2012 to 2017 included.
# NOTE(review): read_pickle deserialises a pickle file -- only load files
# that come from a trusted source.
df_sampled = pd.read_pickle('nyt_bert_25k.pkl')
# Both names below are plain aliases of df_sampled (no copy is made here).
df_embedded = df_sampled
df = df_sampled

In [527]:
# For computational reasons, select one year at a time.
# Filter from the full sample (df_sampled) rather than from df_embedded itself:
# the cell can then be re-run with a different year without first reloading
# the data (filtering an already-filtered frame would return no rows).
df_embedded = df_sampled[df_sampled['time'] == 2017]

In [528]:
# Tokens, dictionary and bag-of-words corpus for the selected year's documents
# (text_processing applies the frequency filter with threshold 10).
tokens, dictionary, corpus = text_processing(df_embedded.content.values)

In [None]:
# Only do this once, next time, load umap embedding from cell below

# n_neighbors / n_components come from the parameter cell (15 and 5) so the
# configuration lives in one place; random_state keeps the projection repeatable.
fit = umap.UMAP(
    metric="cosine",
    n_neighbors=umap_n_neighbors,
    n_components=umap_dim_size,
    n_epochs=200,
    random_state=42)

umap_embeddings_clustering = fit.fit_transform(df_embedded['embedding'].tolist())

# Write through a context manager so the file is flushed and closed even if
# pickling fails part-way.
with open('2017.pkl', 'wb') as handle:
    pickle.dump(umap_embeddings_clustering, handle)


In [529]:
# Load umap embedding if it has been already generated
# (context manager closes the file handle; only unpickle files you produced yourself)
with open('2017.pkl', 'rb') as handle:
    umap_embeddings_clustering = pickle.load(handle)

### HDBSCAN clustering initiated

In [530]:
import os

# The HuggingFace tokenizers library reads this setting from the process
# environment; assigning a Python variable alone has no effect on it.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Python-level name kept so any cell that reads it still finds it.
TOKENIZERS_PARALLELISM=False

In [531]:
# HDBSCAN
# min_cluster_size is read from `partioned_clusttering_size` defined in the
# parameter cell; it is no longer reassigned here, so changing the parameter
# cell actually takes effect. The class is the one imported at the top of the
# notebook (`from hdbscan import HDBSCAN`).
c = HDBSCAN(min_cluster_size=partioned_clusttering_size, metric = "euclidean",cluster_selection_method = "eom")
c.fit(umap_embeddings_clustering)
# Fitted attributes of the clusterer, one entry per row of the UMAP embedding;
# the next cell drops rows labelled -1.
labels = c.labels_
probabilities = c.probabilities_

### Link documents and the clusters they belong to

In [532]:
# Attach the cluster label to every document, order by cluster id, discard
# the rows labelled -1 and renumber the index from 0.
df_cluster = (
    df_embedded.assign(C = labels)
    .sort_values(by=['C'])
    .loc[lambda frame: frame['C'] > -1]
    .reset_index(drop=True)
)

In [533]:
# NOTE(review): `documents` and `topic_ids` are not read again in this cell.
documents = df_embedded.content.values
topic_ids = np.unique(df_cluster['C'])
# Adds a constant 'slice_num' column (value 1) to df_cluster in place.
df_cluster['slice_num'] = 1
# rep_prep receives a one-element list holding df_cluster -- presumably one
# frame per time slice; TODO confirm against dependencies.py.
cluster_df = [df_cluster]
documents_per_topic_per_time = rep_prep(cluster_df)
# num_doc is the number of documents in the selected year's subset.
# NOTE(review): `output` is not used by any later cell visible here.
output = ctfidf_rp2(dictionary, documents_per_topic_per_time, num_doc=len(df_embedded), num_words=10)

### Calculate topic vector for each topic

In [534]:
# One mean embedding per cluster id ('C'); columns of the result are 'C' and 'embedding'.
topic_vectors = calculate_cluster_embeddings(df_cluster)

# Method 1 - Calculate word vector for each word (Consider a word as one document) - (non-contextual NEAREST WORDS)

In [535]:
# Find the n-closest words that represent each topic
# We first need to calculate the embedding of each word

# One row per dictionary token, stored in a 'content' column.
df_words = pd.DataFrame(list(dictionary.token2id.keys()), columns=['content'])
# NOTE(review): contextual_embedding is defined outside this notebook; the
# result is later used as a frame with 'content' and 'embedding' columns
# (see find_nearest_words) -- confirm that is what it returns.
word_vectors = contextual_embedding(df_words,mode='mpnet')

In [536]:
# Ten nearest words per topic from the single-word embeddings; keep only the
# word lists and expose them under the column name 'nearest_words'.
res_top2vec = (
    find_nearest_words(topic_vectors, word_vectors, documents_per_topic_per_time, k=10, diversity=0.0, verbose=False)
    .drop(columns='embedding')
    .rename(columns={'topic_representation' : 'nearest_words'})
)


# Method 2 - Calculate word vector for each word (Consider a word as a max pooling of its various mean pooled embeddings) - (contextual NEAREST WORDS)

In [537]:
import torch
from transformers import AutoTokenizer #, AutoModel, pipeline, BertTokenizer, BertModel
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer 
from collections import Counter

# Load model from HuggingFace Hub
# The tokenizer is only used to turn input ids back into token strings
# (see get_embedding below).
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')


# NOTE(review): `stop_words` is not read by any cell visible in this notebook.
stop_words = list(set(stopwords.words('english')))
# Sentence-transformers model used by get_embedding to encode documents.
model = SentenceTransformer('all-mpnet-base-v2')

def get_embedding(sentence):
    """Return {word: [vector, ...]} for the words found in ``sentence``.

    The sentence is encoded with ``model`` asking for all outputs
    (``output_value=None``). The first and last positions of both the token
    ids and the token embeddings are dropped before the pieces are merged
    into whole words by ``dict_extractor``.

    NOTE(review): ``dict_extractor`` calls ``.numpy()`` on the token
    embeddings; this presumably requires them to live on the CPU -- confirm
    when running the model on a GPU.
    """
    # Pass the input through the model
    features = model.encode(sentence,output_value=None)
    wordpieces = tokenizer.convert_ids_to_tokens(features['input_ids'])[1:-1]
    piece_vectors = features['token_embeddings'][1:-1]
    return dict_extractor(wordpieces, piece_vectors)



In [275]:
# WARNING : 'we' is a big dictionary! Saving it afterwards avoids having to rebuild it every time.

documents = df_embedded.content.values
we = {}
for document in tqdm_notebook(documents) :
    # get_embedding gives {word: [one vector per occurrence in this document]};
    # each document contributes one such list to the word's entry in `we`.
    for word, occurrences in get_embedding(document).items() :
        we.setdefault(word, []).append(occurrences)

with open('we_2017.pickle', 'wb') as handle:
    pickle.dump(we, handle, protocol=pickle.HIGHEST_PROTOCOL)

  0%|          | 0/25000 [00:00<?, ?it/s]

In [538]:
# Loads `we` if it has already been calculated by the cell above.
# NOTE(review): unpickling runs arbitrary code from the file -- only load a
# 'we_2017.pickle' that was produced locally.
with open('we_2017.pickle', 'rb') as handle:
    we = pickle.load(handle)

In [539]:
# Initial filtering to keep only words in the dictionary used by the other two methods for a more accurate comparison
# `dicto` is a set so each membership test is constant-time; the previous
# list was scanned in full for every key of `we`.
dicto = set(dictionary.token2id.keys())
we_filtered = {key: value for key, value in we.items() if key in dicto}

In [540]:
# Helps remove some problematic words that sometimes appear everywhere
for noisy_word in ('succinct', 'darndest', 'healthlink'):
    if noisy_word in we_filtered:
        del we_filtered[noisy_word]


In [541]:
# Step 1 : Apply mean pooling on a document level
# For every word, each inner list (the word's vectors within one document)
# is averaged into a single vector and stored as a plain Python list.
new_dict = {}
for key, value in tqdm_notebook(we_filtered.items()):
    new_dict[key] = [np.mean(arr_list, axis=0).tolist() for arr_list in value]

In [543]:
# Step 2 : Apply a max pooling on the mean poolings
# Element-wise maximum across a word's per-document mean vectors.
we_maxpooled = {word: np.max(new_dict[word],axis=0) for word in tqdm_notebook(new_dict)}

In [544]:
# One (word, vector) pair per entry of we_maxpooled, laid out in the
# 'content' / 'embedding' column format that find_nearest_words reads.
data = list(we_maxpooled.items())
df_we_maxpool = pd.DataFrame(data, columns=['content', 'embedding'])

In [547]:
# Mean embedding per cluster, recomputed from df_cluster with the same call
# that produced topic_vectors for Method 1.
topic_vectors = calculate_cluster_embeddings(df_cluster)

In [548]:
# Ten nearest words per topic from the max-pooled contextual word vectors;
# keep only the word lists and expose them as 'nearest_words'.
res_proposed = (
    find_nearest_words(topic_vectors, df_we_maxpool, documents_per_topic_per_time, k=10, diversity=0.0, verbose=False)
    .drop(columns='embedding')
    .rename(columns={'topic_representation' : 'nearest_words'})
)

# Method 3 - CTFIDF Representation (used in BERTopic and ANTM)

In [550]:
# c-TF-IDF keywords for the same clusters.
# num_doc is the size of the selected year's subset (df_embedded): the
# dictionary and the clusters were built from it, and the earlier ctfidf_rp2
# call uses the same value. The previous len(df) counted the documents of
# every year in the sample.
res_ctfidf = ctfidf_rp2(dictionary, documents_per_topic_per_time, num_doc=len(df_embedded), num_words=10)
# Keep only the word lists, under the same column name as the other two methods.
res_ctfidf = res_ctfidf.drop(['content','cluster','slice_num','num_doc'], axis=1)
res_ctfidf = res_ctfidf.rename(columns={'topic_representation' : 'nearest_words'})

# 4 - Comparing topic coherence

In [553]:
# c_v coherence model for Method 1 (res_top2vec), using the top 10 words per topic.
cm1 = CoherenceModel(topics=res_top2vec.nearest_words.to_list(), texts=tokens, dictionary=dictionary, coherence="c_v", topn=10)

In [554]:
# c_v coherence model for Method 2 (res_proposed), using the top 10 words per topic.
cm2 = CoherenceModel(topics=res_proposed.nearest_words.to_list(), texts=tokens, dictionary=dictionary, coherence="c_v", topn=10)

In [555]:
# c_v coherence model for Method 3 (res_ctfidf), using the top 10 words per topic.
cm3 = CoherenceModel(topics=res_ctfidf.nearest_words.to_list(), texts=tokens, dictionary=dictionary, coherence="c_v", topn=10)

In [556]:
import os

# The tokenizers warning asks for the *environment variable*
# TOKENIZERS_PARALLELISM to be set to true or false. The previous line bound a
# Python variable to the expression (True | False), i.e. True, which the
# library never sees. "false" is chosen because the work that follows forks
# worker processes.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Python-level name kept for any cell that reads it, now matching the environment.
TOKENIZERS_PARALLELISM=False

In [557]:
import plotly.io as pio
# Send plotly figures to the "browser" renderer instead of the notebook output.
pio.renderers.default = "browser"
import plotly.graph_objects as go

# Per-topic coherence scores, one list per representation method:
# list1 <- cm1 (res_top2vec), list2 <- cm2 (res_proposed), list3 <- cm3 (res_ctfidf)
list1 = cm1.get_coherence_per_topic()
list2 = cm2.get_coherence_per_topic()
list3 = cm3.get_coherence_per_topic()

In [70]:
################ SCATTER PLOT ###################

# Topic positions on the x axis, counted from 1
x = list(range(1, len(list1) + 1))

# Marker-only trace for the non-contextual nearest words (blue)
trace1 = go.Scatter(x=x, y=list1, mode='markers', name='Nearest Words (non-contextual)', marker={'color': 'blue'})

# Marker-only trace for the contextual nearest words (red)
trace2 = go.Scatter(x=x, y=list2, mode='markers', name='Nearest Words (contextual)', marker={'color': 'red'})

# Marker-only trace for c-TF-IDF (green)
trace3 = go.Scatter(x=x, y=list3, mode='markers', name='c-TF-IDF', marker={'color': 'green'})

# Title and axis labels
layout = go.Layout(title='Topic coherence', xaxis={'title': 'Topics'}, yaxis={'title': 'Coherence'})

# Assemble the three traces into one figure and display it
fig = go.Figure(data=[trace1, trace2, trace3], layout=layout)
fig.show()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [558]:
################ BOX PLOT ###################

# One box per representation method, in the same order as the scatter plot
data = [
    go.Box(y=list1, name='Nearest Words (non-contextual)'),
    go.Box(y=list2, name='Nearest Words (contextual)'),
    go.Box(y=list3, name='c-TF-IDF'),
]
trace1, trace2, trace3 = data

# Only the y axis carries a label
layout = go.Layout(yaxis={'title': 'Coherence value'})

# Build and display the figure
fig = go.Figure(data=data, layout=layout)
fig.show()



# 5 - Comparing topic representations using Venn Diagrams

In [596]:
from matplotlib_venn import venn2, venn3
import matplotlib.pyplot as plt

def showcase_topic_difference_plus(df1,df2, i) :
    """Draw a two-set Venn diagram of the words two methods chose for topic ``i``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Each must have a 'nearest_words' column holding a list of words per
        row; row ``i`` (positional) is compared.
    i : int
        Positional index of the topic to display.

    The diagram regions are labelled with the words themselves (sorted, one
    per line) instead of counts, and the figure is shown with ``plt.show()``.
    Nothing is returned.

    ``venn2`` is now imported alongside ``venn3``: the function has always
    called ``venn2``, which the cell did not import.
    """
    # Convert the word lists to sets
    set1 = set(df1.iloc[i]['nearest_words'])
    set2 = set(df2.iloc[i]['nearest_words'])

    # Create the Venn diagram (blank set captions)
    venn = venn2([set1, set2], (' ', ' '))

    # Region ids: '10' = only in set1, '01' = only in set2, '11' = in both
    region_words = {
        '10': set1 - set2,
        '01': set2 - set1,
        '11': set1 & set2,
    }

    # Label the subsets with the actual elements instead of the counts
    for subset, words in region_words.items():
        label = venn.get_label_by_id(subset)
        # The lookup can return nothing for a region, so guard before writing.
        if label is not None:
            label.set_text('\n'.join(sorted(words)))

    plt.show()
