In [15]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [16]:
# Load the generated topics table; later cells read its 'Topics' and 'Definition' columns.
df = pd.read_csv(filepath_or_buffer="generated_topics.csv")

#### Combine *Topics* and *Definition* columns into the new *text* column

In [17]:
# Build the string that gets embedded: "<Topics>: <Definition>" for every row.
df['text'] = df['Topics'].str.cat(df['Definition'], sep=": ")

#### Generate embeddings

In [18]:
# Load the sentence-embedding model, then encode every combined topic text in one call.
model = SentenceTransformer('all-MiniLM-L6-v2')
texts = df['text'].tolist()
embeddings = model.encode(texts, show_progress_bar=True)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

#### Clustering based on cosine distance. Clusters are merged only while their average-linkage cosine distance is below 0.5 (i.e. their average cosine similarity is above 0.5).

In [19]:
# No fixed cluster count: with n_clusters=None the distance threshold decides
# where merging stops. Distances are cosine, compared with average linkage.
clustering = AgglomerativeClustering(
    n_clusters=None, metric='cosine', linkage='average', distance_threshold=0.5
)
clustering.fit(embeddings)
# One integer label per embedding, in the same order as the rows passed to fit().
cluster_labels = clustering.labels_

#### Add clustering results back to the original DataFrame

In [20]:
# Attach each row's cluster label as a new 'cluster' column.
df = df.assign(cluster=cluster_labels)

In [21]:
# One row per cluster: the 'text' column holds the list of member texts.
grouped = (
    df.groupby('cluster')['text']
    .agg(list)
    .reset_index()
)

#### Visualize 

In [22]:
# Print every cluster with its size, followed by its member texts, for manual review.
# Iterating the two columns directly avoids iterrows(), which yields an unused index
# and builds a Series per row. The bullet is U+1F539; it had been stored as
# mis-decoded UTF-8 bytes.
for cluster_id, texts in zip(grouped['cluster'], grouped['text']):
    print(f"\n🔹 Cluster {cluster_id} ({len(texts)} items):")
    for item in texts:
        print(" -", item)


🔹 Cluster 0 (5 items):
 - Fear of Failure: Discusses the user's fear of failure as a barrier to implementing behavioral changes.
 - Fear of Change: Discusses the user's fear or uncertainty about making necessary changes to their behavior or lifestyle.
 - Fear of Missing Out: Discusses the anxiety that an exciting or interesting event may currently be happening elsewhere, often aroused by posts seen on social media.
 - Fear of Future Health Problems: Discusses the anxiety about potential health issues in the future due to current unhealthy behaviors.
 - Fear of Change: Discusses the anxiety or apprehension individuals may feel when considering or facing changes in their lives.

🔹 Cluster 1 (28 items):
 - Healthy Eating: Refers to the desire and efforts to improve dietary habits for better health and wellbeing.
 - Overcoming Temptations: Discusses the struggle to resist unhealthy food choices and the strategies to overcome them.
 - Meal Planning: Discusses strategies for planning 

#### Select one representative topic per cluster (first item in each cluster)

In [23]:
# Pick the first *row* of each cluster as its representative topic.
# GroupBy.first() is deliberately avoided: it returns the first non-null value of
# each column independently, so a row with a missing Topics or Definition could
# produce a representative stitched together from two different rows.
# drop_duplicates keeps whole rows intact.
representatives = (
    df.sort_values('cluster', kind='mergesort')  # stable sort: original row order is kept within a cluster
    .drop_duplicates(subset='cluster', keep='first')
    .loc[:, ['cluster', 'Topics', 'Definition']]
    .reset_index(drop=True)
)

#### We got 51 representative topics 

In [27]:
# Number of representative topics (one row per cluster).
representatives.shape[0]

51

#### Save the representative topics

In [24]:
# Write the deduplicated topics to disk; the DataFrame index is not written.
representatives.to_csv(path_or_buf="topics_after_deduplication.csv", index=False)