In [1]:
import sys

# Put the parent directory on the import path (presumably so the local checkout
# of the package is importable -- confirm against the repo layout).
# Guarded so that re-running this cell does not keep appending duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

In [2]:
import os
import random

from FlagEmbedding import FlagModel
import pandas as pd
from datasets import load_dataset
from dotenv import load_dotenv
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
import ast
import umap

load_dotenv()
from langchain_community.llms import HuggingFaceHub
from sentence_transformers import SentenceTransformer

from langchain_community.embeddings import HuggingFaceEmbeddings
from bunkatopics import Bunka

# Seed Python's built-in random module.
random.seed(42)

# Flag kept at False; none of the visible cells read it.
figure = False

# Hosted LLM handed to the topic-labelling calls further down.
# The token comes from the HF_TOKEN environment variable (load_dotenv() has
# already run above, so a local .env file can supply it).
repo_id = "mistralai/Mistral-7B-Instruct-v0.1"
llm = HuggingFaceHub(repo_id=repo_id, huggingfacehub_api_token=os.getenv("HF_TOKEN"))

# Preprocess a dataset
dataset = load_dataset("bunkalab/medium-sample-technology")

# Keep title + tags, parse the stringified tag lists into Python objects,
# remember the original row index as `doc_id`, then emit one row per (doc, tag).
df_test = (
    pd.DataFrame(dataset["train"])[["title", "tags"]]
    .assign(
        tags=lambda frame: frame["tags"].apply(ast.literal_eval),
        doc_id=lambda frame: frame.index,
    )
    .explode("tags")
)

# Tags ranked 2nd to 10th by frequency (the single most frequent tag is skipped).
top_tags = df_test["tags"].value_counts().iloc[1:10].index.tolist()

# Restrict to those tags, keep one row per document (its first matching tag),
# drop rows without a tag, and draw a fixed-seed sample of 1000 documents.
df_test = (
    df_test.loc[lambda frame: frame["tags"].isin(top_tags)]
    .drop_duplicates("doc_id", keep="first")
    .loc[lambda frame: frame["tags"].notna()]
    .sample(1000, random_state=42)
)

# Parallel lists consumed by bunka.fit below.
docs, ids, tags = (df_test[column].tolist() for column in ("title", "doc_id", "tags"))
metadata = {"tags": tags}

df_test

Unnamed: 0,title,tags,doc_id
682,How to Initiate Innovative Business Collaboration,Startup,682
2343,Beginners Guide to Firebase,Programming,2343
2070,Is AI going to take my job?,Artificial Intelligence,2070
1116,A walk through blockchain — 24 hours using blo...,Blockchain,1116
1347,What Are The 8 Steps You Should Take To Have A...,Startup,1347
...,...,...,...
1568,How To Decouple Data from UI in React,Programming,1568
2456,"Robert Gherghe, Head of Communication Modex: C...",Blockchain,2456
2780,Sports and event tech on the fast track,Startup,2780
1290,The New Startup Visa in Australia— a Guide for...,Startup,1290


In [3]:
# Sentence-embedding model used for the documents.
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

# Two-component t-SNE projection with a fixed seed.
projection_model = TSNE(
    n_components=2, perplexity=3, init="random", learning_rate="auto", random_state=42
)

bunka = Bunka(embedding_model=embedding_model, projection_model=projection_model)

In [4]:
# Fit on the sampled titles; the tag of each document travels along as metadata.
# No pre-computed embeddings are supplied here.
bunka.fit(
    ids=ids,
    docs=docs,
    metadata=metadata,
    pre_computed_embeddings=None,
    sampling_size_for_terms=1000,
)


2024-04-09 15:12:08 - Bunka - INFO - Embedding documents... (can take varying amounts of time depending on their size)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

2024-04-09 15:12:10 - Bunka - INFO - Reducing the dimensions of embeddings...
2024-04-09 15:12:12 - Bunka - INFO - Extracting meaningful terms from documents...
2024-04-09 15:12:13 - Bunka - INFO - Sampling 1000 documents for term extraction
100%|██████████| 1000/1000 [00:04<00:00, 227.47it/s]


In [4]:

# Cluster the projected documents into topics with a custom KMeans model.
# - random_state is fixed so the topics are reproducible across kernel restarts
#   (the projection and the sampling above already use seed 42).
# - The cluster count is defined once, on the estimator. The original cell passed
#   15 to KMeans but 10 to get_topics; whether get_topics still reads `n_clusters`
#   when a custom model is supplied is not visible here, so both are kept in sync.
custom_clustering_model = KMeans(n_clusters=15, random_state=42)

df_topics = bunka.get_topics(
    custom_clustering_model=custom_clustering_model,
    n_clusters=custom_clustering_model.n_clusters,
    min_count_terms=2,
    min_docs_per_cluster=30,
)

2024-04-09 15:06:57 - Bunka - INFO - Computing the topics


In [5]:
# Topic map of the current topics: density and convex hulls on, text and colouring off.
topic_fig = bunka.visualize_topics(
    width=800,
    height=800,
    colorscale="Portland",
    density=True,
    convex_hull=True,
    show_text=False,
    color=None,
)
topic_fig.show()

2024-04-09 15:06:57 - Bunka - INFO - Creating the Bunka Map


In [24]:
# Have the LLM configured earlier rewrite the topic names into cleaner labels.
# NOTE(review): the exact type bound to `df_topics_clean` is not visible here -- confirm before relying on it.
df_topics_clean = bunka.get_clean_topic_name(llm=llm)

2024-04-09 15:10:24 - Bunka - INFO - Using LLM to make topic names cleaner
Creating new labels for clusters: 100%|██████████| 10/10 [00:07<00:00,  1.25it/s]


[Topic(topic_id='bt-0', name='Slack, Companies, Gadgets, Tips', lemma_name=None, x_centroid=12.962833776474, y_centroid=-11.113171941377223, size=100, top_doc_id=None, top_term_id=None),
 Topic(topic_id='bt-1', name='Women in Tech World', lemma_name=None, x_centroid=-52.711148842997936, y_centroid=22.08945714947821, size=87, top_doc_id=None, top_term_id=None),
 Topic(topic_id='bt-2', name='Crypto Market Platforms and Mining', lemma_name=None, x_centroid=-22.70493334209343, y_centroid=-48.929523083342225, size=119, top_doc_id=None, top_term_id=None),
 Topic(topic_id='bt-3', name='Tech Teamwork Across Countries', lemma_name=None, x_centroid=-15.313818479483983, y_centroid=12.970227102485469, size=106, top_doc_id=None, top_term_id=None),
 Topic(topic_id='bt-4', name='Multidisciplinary Design', lemma_name=None, x_centroid=22.08502174168825, y_centroid=42.80567560593287, size=96, top_doc_id=None, top_term_id=None),
 Topic(topic_id='bt-5', name='Waves Collection Release Menu', lemma_name=Non

In [26]:
# Redraw the topic map with the same settings as before, now that
# get_clean_topic_name has been called.
topic_fig = bunka.visualize_topics(
    width=800,
    height=800,
    colorscale="Portland",
    density=True,
    convex_hull=True,
    show_text=False,
    color=None,
)
topic_fig.show()

2024-04-09 15:10:48 - Bunka - INFO - Creating the Bunka Map


In [27]:
# Bourdieu map: x-axis runs from "past" to "future", y-axis from "women" (bottom)
# to "men" (top). No LLM is passed to this call.
bourdieu_fig = bunka.visualize_bourdieu(
    llm=None,
    x_left_words=["past"],
    x_right_words=["future"],
    y_top_words=["men"],
    y_bottom_words=["women"],
    width=800,
    height=800,
    clustering=True,
    topic_n_clusters=30,
    min_docs_per_cluster=10,
    density=False,
    colorscale="Portland",
)
bourdieu_fig.show()

2024-04-09 15:10:52 - Bunka - INFO - Creating the Bourdieu Map


In [11]:
# Save the current Bunka state under "bunka_dump" (a path relative to the working directory).
bunka.save_bunka("bunka_dump")

In [16]:
# `bunka_new` was never defined anywhere in the notebook, so this cell failed on a
# fresh kernel. Restore it from the dump written by bunka.save_bunka("bunka_dump").
bunka_new = Bunka().load_bunka("bunka_dump")

# Seeded KMeans so the topics are reproducible; the cluster count is defined once
# on the estimator and reused for `n_clusters` to keep the two in sync.
custom_clustering_model = KMeans(n_clusters=10, random_state=42)

df_topics = bunka_new.get_topics(
    custom_clustering_model=custom_clustering_model,
    n_clusters=custom_clustering_model.n_clusters,
    min_count_terms=2,
    min_docs_per_cluster=30,
)

2024-04-09 15:08:25 - Bunka - INFO - Computing the topics


In [19]:
# Same Bourdieu axes as before ("past" -> "future", "women" -> "men"), drawn from
# `bunka_new`, this time passing the LLM.
bourdieu_fig = bunka_new.visualize_bourdieu(
    llm=llm,
    x_left_words=["past"],
    x_right_words=["future"],
    y_top_words=["men"],
    y_bottom_words=["women"],
    width=800,
    height=800,
    clustering=True,
    topic_n_clusters=30,
    min_docs_per_cluster=10,
    density=False,
    colorscale="Portland",
)
bourdieu_fig.show()

2024-04-09 15:08:44 - Bunka - INFO - Creating the Bourdieu Map
Creating new labels for clusters: 100%|██████████| 25/25 [00:14<00:00,  1.72it/s]


In [21]:
# Topic map coloured by the "tags" metadata, with topic text shown.
# Left as the cell's last expression so the returned figure is displayed.
bunka.visualize_topics(
    color='tags',
    colorscale='Blues',
    width=800,
    height=800,
    density=True,
    convex_hull=True,
    show_text=True,
    label_size_ratio=60,
)

2024-04-09 15:09:19 - Bunka - INFO - Creating the Bunka Map


In [22]:
# Renders an interactive form (text inputs plus an "Apply Changes" button) for
# renaming topics by hand. Whatever is typed there is manual state that a
# Restart & Run All will not reproduce.
bunka.manually_clean_topics()

VBox(children=(HTML(value='Manually input the new topic names: '), Text(value='Slack | Companies | gadgets | c…

Button(description='Apply Changes', style=ButtonStyle(button_color='#2596be'))