In [7]:
%load_ext autoreload
%autoreload

import pandas as pd
import numpy as np
from dependencies import *
import warnings
import pickle
import random

import warnings
warnings.filterwarnings('ignore')

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [8]:
import numpy as np
import plotly.graph_objects as go
import plotly.io as pio


def find_similar_clusters(topic_vectors, slice_nums, title):
    num_topics = len(topic_vectors)

    grouped_topics = {}
    for i in range(num_topics):
        slice_num = slice_nums[i]
        if slice_num not in grouped_topics:
            grouped_topics[slice_num] = set(topic_vectors[i])
        else:
            grouped_topics[slice_num].update(topic_vectors[i])

    fig = go.Figure()

    slices = list(grouped_topics.keys())

    for i, s in enumerate(slices):
        if s == min(slices):
            previous_words = set()
        else:
            previous_words = set(grouped_topics[slices[i - 1]])

        current_words = set(grouped_topics[s])

        words_appear = current_words - previous_words
        words_disappear = previous_words - current_words
        words_remain = current_words.intersection(previous_words)

        color_dict = {
            'appear': 'green',
            'disappear': 'red',
            'remain': 'grey'
        }

        for word_set, marker_color in zip([words_appear, words_disappear, words_remain],
                                          ['appear', 'disappear', 'remain']):
            fig.add_trace(go.Scatter(
                x=list(word_set),
                y=[s] * len(word_set),  # Corrected line
                mode='markers',
                name=f'Slice {s} - Words {marker_color.capitalize()}',  # Corrected line
                marker=dict(
                    color=color_dict[marker_color],
                    size=15,
                ),
                showlegend=False  # Remove legend
            ))

    
    tickvalues = [k for k in grouped_topics.keys()]
    
    fig.update_layout(
        title='',
        xaxis_title='',
        yaxis_title='Time period',
        showlegend=False,  # Remove default legend
        width=2000,  # Set figure width (adjust as needed)
        xaxis=dict(
            tickangle=45,  # Rotate x-axis labels for better readability
            tickfont=dict(size=12),  # Adjust x-axis label font size
            showticklabels=True,  # Show x-axis tick labels
            showgrid=False  # Hide x-axis gridlines
        ),
        yaxis=dict(
            showticklabels=True,  # Show y-axis tick labels
            tickvals=tickvalues,  # Set tick values to unique slice_nums
            tickfont=dict(size=12),
            title_font=dict(size=14),
            tickmode='array'        ),
        

    )

    fig.show()

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def create_evolving_topics_dataframe(df_list, df_list_output, threshold):
    # Join df_list with df_list_output on slice_num and C
    merged_df = df_list.merge(df_list_output[['slice_num', 'C', 'topic_representation']], on=['slice_num', 'C'])

    # Convert embeddings to numeric arrays
    merged_df['embedding'] = merged_df['embedding'].apply(np.array)

    num_slices = merged_df['slice_num'].nunique()
    associations = []

    for slice_num in range(num_slices):
        current_slice_topics = merged_df[merged_df['slice_num'] == slice_num]

        for _, current_topic in current_slice_topics.iterrows():
            current_embedding = current_topic['embedding']
            associated_topics = []

            # Find associated topics in the same slice_num
            same_slice_topics = current_slice_topics[current_slice_topics.index != current_topic.name]
            for _, topic in same_slice_topics.iterrows():
                topic_embedding = topic['embedding']
                similarity = cosine_similarity([current_embedding], [topic_embedding])[0][0]
                if similarity >= threshold:
                    associated_topics.append(topic['topic_representation'])

            # Find associated topics in future slice_num
            future_slices = merged_df[merged_df['slice_num'] > slice_num]
            for _, topic in future_slices.iterrows():
                topic_embedding = topic['embedding']
                similarity = cosine_similarity([current_embedding], [topic_embedding])[0][0]
                if similarity >= threshold:
                    associated_topics.append(topic['topic_representation'])

            if associated_topics:
                associations.append(associated_topics)

    # Create DataFrame with evolving_topics column
    df = pd.DataFrame({'evolving_topics': associations})
    
    # Get corresponding slice_nums
    slice_nums = []
    for topics in associations:
        topic_slice_nums = df_list_output[df_list_output['topic_representation'].isin(topics)]['slice_num'].tolist()
        slice_nums.append(topic_slice_nums)
    
    # Add slice_nums column to the DataFrame
    df['slice_nums'] = slice_nums
    
    return df

def calculate_cluster_embeddings(df_all):
    topic_vectors = []
    for i,df in enumerate(df_all) : 
        # Assuming the 'C' column contains the cluster ids and 'embedding' contains the embeddings
        df['embedding'] = df['embedding'].apply(np.array)  # Ensure the embeddings are numpy arrays
        temp = pd.DataFrame(df.groupby('C')['embedding'].apply(np.stack).apply(np.mean, axis=0))
        temp['slice_num'] = i+1
        topic_vectors.append(temp)
    
    topic_vectors = pd.concat(topic_vectors)
    return topic_vectors

In [10]:
# Load sampled data
df_sampled = pd.read_pickle('nyt_bert_25k.pkl')

In [11]:
df = df_sampled
df_embedded = df_sampled

# Slicing

In [12]:
overlap = 2
window_length = 3

slices,arg1_umap,arg2_umap = sws(df_embedded,overlap,window_length)

# UMAP Reduction

In [None]:
umap_n_neighbors = 15
umap_dim_size = 5

umap_embeddings_clustering, umap_embeddings_visulization = aligned_umap(
            arg1_umap, arg2_umap, n_neighbors=umap_n_neighbors,
            umap_dimension_size=umap_dim_size)

In [191]:
# Exporting umap
# save the model to disk
pickle.dump(umap_embeddings_clustering, open('nyt_bert_25k_clustering', 'wb'))
pickle.dump(umap_embeddings_visulization, open('nyt_bert_25k_viz', 'wb'))

In [13]:
# Loading embedding
umap_embeddings_clustering = pickle.load(open('nyt_bert_25k_clustering', 'rb'))
umap_embeddings_visulization = pickle.load(open('nyt_bert_25k_viz', 'rb'))

# Clusters

In [14]:
def hdbscan_cluster(embedding, size, epsilon) :
    clusters_labels = []
    cluster_proba = []
    c= hdbscan.HDBSCAN(min_cluster_size=size, cluster_selection_epsilon=epsilon, metric = "euclidean",cluster_selection_method = "eom")
    for e in range(len(embedding)) :
        c.fit(embedding[e])
        clusters_labels.append(c.labels_)
        cluster_proba.append(c.probabilities_)
    return c,clusters_labels,cluster_proba


In [15]:
partioned_clusttering_size = 20
epsilon = 0

c,clusters,cluster_proba = hdbscan_cluster(umap_embeddings_clustering, partioned_clusttering_size, epsilon)

cluster_df = clustered_df(slices, clusters)

clustered_df_cent, clustered_np_cent = clustered_cent_df(cluster_df)
dt, concat_cent = dt_creator(clustered_df_cent)
df_tm = alignment_procedure(dt, concat_cent)
list_tm = plot_alignment_no_show(df_tm, umap_embeddings_visulization, clusters, os.getcwd())
documents_per_topic_per_time = rep_prep(cluster_df)
tokens, dictionary, corpus = text_processing(df.content.values)
output = ctfidf_rp2(dictionary, documents_per_topic_per_time, num_doc=len(df), num_words=10)
slice_num = set(output["slice_num"])
topics = [output[output["slice_num"] == i].topic_representation.to_list() for i in slice_num]
topics = list(filter(None, topics))

In [16]:
def topic_evolution(list_tm,output):
    evolving_topics = []
    slice_nums = []
    for et in list_tm:
        evolving_topic = []
        slice_num = []
        for topic in et:
            cl = int(float(topic.split("-")[1]))
            win = int(float(topic.split("-")[0]))
            t = output[output["slice_num"] == win]
            t = t[t["C"] == cl]
            evolving_topic.append(t.topic_representation.to_list()[0])
            slice_num.append(t.slice_num.values[0])
        evolving_topics.append(evolving_topic)
        slice_nums.append(slice_num)
    evolving_topics_df = pd.DataFrame({'evolving_topics': evolving_topics, 'slice_nums':slice_nums})
    return evolving_topics_df

# HDBSCAN ALIGNEMENT

In [17]:
#HDBSCAN alignement
topics = topic_evolution(list_tm,output)

#Most similar clusters alignement
topics_per_slice = calculate_cluster_embeddings(cluster_df)

topics

Unnamed: 0,evolving_topics,slice_nums
0,"[[tyga, vacationer, idol, zendaya, rodarte, lo...","[2, 3, 3, 3, 4]"
1,"[[gaza, hamas, palestinian, israel, modi, iraq...","[1, 2, 3, 4]"
2,"[[gaza, hamas, palestinian, egypt, israel, isr...","[2, 3, 3, 4, 4]"
3,"[[batman, halo, ahs, affleck, spiderman, godzi...","[1, 3, 3, 3, 4, 4, 4, 4, 4]"
4,"[[benedict, vatican, catholic, priest, doctrin...","[1, 2, 3, 4]"
...,...,...
205,"[[ramadan, fasting, quran, salvador, margarita...","[2, 3, 4]"
206,"[[memorial, va, ptsd, vietnam, veteran, vet, r...","[1, 2, 3, 4]"
207,"[[newsbrief, wrapup, mandela, october, homelan...","[2, 3, 4, 4]"
208,"[[genocide, kidnapped, nigerian, nigeria, hara...","[1, 2, 3, 4, 4]"


In [18]:
topics.to_csv('HDBSCAN.csv')
#topics=pd.read_csv('sampled_data/HDBSCAN.csv')

In [29]:
random_topic = topics.iloc[158]
find_similar_clusters(random_topic.evolving_topics,random_topic.slice_nums, title = 'hdbscan t158.png')

# KNN ALIGNEMENT

In order to compare the alignements, we save the results of aligned topics into a csv file, then we manually select a topic to compare. This has to be done manually because the alignement results are not the same and don't have the same indexes.

## KNN ALIGNEMENT : THRESHOLD = 0.6

In [20]:
threshold = 0.6
topics2 = create_evolving_topics_dataframe(topics_per_slice,output,threshold)
title = 'Evolution of topics | Most Similar Clusters method | threshold = ' + str(threshold)

In [21]:
topics2

Unnamed: 0,evolving_topics,slice_nums
0,"[[ebay, clothing, sneak, peek, vintage, weekly...","[1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 4]"
1,"[[pyongyang, jong, korea, rodman, korean, seou...","[2, 3, 4]"
2,"[[playlist, workout, optimal, motivating, ipod...","[2, 3, 4]"
3,"[[greece, bailout, austerity, greek, cyprus, a...","[2, 3, 4]"
4,"[[ukraine, ukrainian, putin, crimea, russian, ...","[2, 3, 4, 4, 4]"
...,...,...
719,"[[cheeky, hill, dose, evening, sign, manafort,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
720,"[[cheeky, hill, dose, evening, sign, manafort,...","[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, ..."
721,"[[nra, amendment, murphy, probation, gun, tigh...","[3, 3, 4, 4, 4, 4, 4, 4]"
722,"[[broadband, verizon, fcc, att, chelsea, neutr...","[3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, ..."


In [23]:
topics2.to_csv('KNN06.csv')

In [24]:
random_topic = topics2.iloc[32]
find_similar_clusters(random_topic.evolving_topics,random_topic.slice_nums, title = 'knn 0,6 t32.png')

## KNN ALIGNEMENT : THRESHOLD = 0.8

In [25]:
threshold = 0.8
topics3 = create_evolving_topics_dataframe(topics_per_slice,output,threshold)
title = 'Evolution of topics | Most Similar Clusters method | threshold = ' + str(threshold)

In [26]:
random_topic = topics3.iloc[4]
find_similar_clusters(random_topic.evolving_topics,random_topic.slice_nums, title = 'knn 0,6 t4.png')

In [27]:
random_topic = topics3.iloc[32]
find_similar_clusters(random_topic.evolving_topics,random_topic.slice_nums, title = 'knn 0,6 t32.png')

In [28]:
random_topic = topics3.iloc[59]
find_similar_clusters(random_topic.evolving_topics,random_topic.slice_nums, title = 'knn 0,6 t59.png')