In [1]:
import random
import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset
import jsonlines
from textacy import text_stats, make_spacy_doc
from bunkatopics import Bunka
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from langchain.llms import HuggingFaceEndpoint
from langchain_openai import OpenAI

# Using OpenAI to Summarize the Topics
# NOTE(review): the real LLM construction has been redacted and replaced by this
# placeholder list. `llm` is later passed to bunka.get_clean_topic_name(llm);
# presumably an `OpenAI(...)` client (imported above) belongs here — TODO confirm,
# and load the API key from the environment rather than hardcoding it.
llm = ["no peeking!"]

In [32]:
# Load the train split of the cleaned UltraFeedback preference pairs and keep a
# fixed-seed random sample of 15,000 rows (random_state pins the selection).
ultrafeedback = load_dataset("argilla/ultrafeedback-binarized-preferences-cleaned")
train_frame = pd.DataFrame(ultrafeedback["train"])
df_sample = train_frame.sample(n=15000, random_state=42)

### Language detection doesn't work here; we'll see if we can fix it.

Get Metadata

In [33]:
# Per-document metadata columns as plain Python lists, in df_sample's row order.
source, chosen_rating = (
    df_sample[column].tolist() for column in ("source", "chosen-rating")
)

In [34]:
# Metadata handed to Bunka: one list per field, keyed by the field's name.
metadata = dict(source=source, rating=chosen_rating)

Back to Dataset

In [35]:
# Convert the sampled frame back into a Hugging Face Dataset.
# preserve_index=False: df_sample still carries the shuffled row labels left
# over from .sample(); without this flag they would be stored as an extra
# "__index_level_0__" column in the dataset.
docs_sample = Dataset.from_pandas(df_sample, preserve_index=False)

In [37]:
# Build the sentence-embedding model, wrap it in Bunka, then fit Bunka on the
# sampled prompts, passing the source/rating metadata alongside them.
embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
bunka = Bunka(embedding_model=embedding_model)

prompts = docs_sample["prompt"]
bunka.fit(prompts, metadata=metadata)

[32m2024-05-09 15:20:43 - [94mBunka[0m - INFO - [1mProcessing 2406118 tokens[0m
[32m2024-05-09 15:20:44 - [94mBunka[0m - INFO - [1mDetected language: English[0m
[32m2024-05-09 15:20:44 - [94mBunka[0m - INFO - [1mEmbedding documents... (can take varying amounts of time depending on their size)[0m


Batches:   0%|          | 0/469 [00:00<?, ?it/s]

[32m2024-05-09 15:26:29 - [94mBunka[0m - INFO - [1mReducing the dimensions of embeddings...[0m
[32m2024-05-09 15:26:44 - [94mBunka[0m - INFO - [1mExtracting meaningful terms from documents...[0m
[32m2024-05-09 15:26:44 - [94mBunka[0m - INFO - [1mSampling 2000 documents for term extraction[0m
100%|██████████| 2000/2000 [00:43<00:00, 46.47it/s]


In [41]:
# Cluster the documents into 25 topics with K-Means.
# random_state is pinned because K-Means initialisation is random: without a
# seed the topic ids/contents can change on every run, so the topic-based
# cleaning done later in this notebook would not refer to the same clusters.
clustering_model = KMeans(n_clusters=25, random_state=42)

# name_length: number of terms used to describe each topic
bunka.get_topics(name_length=10, custom_clustering_model=clustering_model, min_count_terms=20)

[32m2024-05-09 15:32:42 - [94mBunka[0m - INFO - [1mComputing the topics[0m


Unnamed: 0,topic_id,topic_name,size,percent
0,bt-21,marketing | business | company | clients | pla...,1109,7.39
1,bt-1,country | film | music | members | state | alb...,1041,6.94
2,bt-7,bacteria | animals | space | air | energy | an...,889,5.93
3,bt-5,question | movie | passage | wife | answer | q...,880,5.87
4,bt-8,benefits | evidence | exercise | health | indi...,803,5.35
5,bt-12,server | app | file | code | user | web | com ...,801,5.34
6,bt-10,story | poem | novel | character | baby | conv...,757,5.05
7,bt-4,file | string | code | column | function | com...,741,4.94
8,bt-2,user | users | app | website | features | cust...,725,4.83
9,bt-22,stress | children | office | report | years | ...,717,4.78


## Remove topics that skew the metrics (i.e., coding- and translation-related topics)

In [45]:
# Ask the LLM to replace each topic's term-list name with a short, readable label.
bunka.get_clean_topic_name(llm)

[32m2024-05-09 15:41:37 - [94mBunka[0m - INFO - [1mUsing LLM to make topic names cleaner[0m
Creating new labels for clusters: 100%|██████████| 25/25 [00:14<00:00,  1.78it/s]


Unnamed: 0,topic_id,topic_name,size,percent
0,bt-21,Marketing Strategy Platform,1109,7.39
1,bt-1,Cultural Expression,1041,6.94
2,bt-7,Interconnected Systems,889,5.93
3,bt-5,Movie Analysis and Interpretation,880,5.87
4,bt-8,Healthy Lifestyle Exploration,803,5.35
5,bt-12,Digital Infrastructure,801,5.34
6,bt-10,Creative Writing,757,5.05
7,bt-4,Programming Concepts,741,4.94
8,bt-2,System Security Design,725,4.83
9,bt-22,Childhood Stress and Health,717,4.78


In [None]:
# Display the topic map with default settings (no metadata colouring requested).
bunka.visualize_topics()

In [49]:
# Shows an interactive checkbox widget for choosing which topics to remove.
# NOTE(review): the selection is made by hand, so the resulting
# bunka.df_cleaned_ (68% of the data kept in the recorded run) cannot be
# reproduced from code alone — consider recording the removed topic ids.
bunka.clean_data_by_topics()

VBox(children=(Label(value='Click on the topics you want to remove 🧹✨🧼🧽'), Checkbox(value=True, description='M…

Button(description='Clean Data', style=ButtonStyle(button_color='#2596be'))

[32m2024-05-09 15:46:11 - [94mBunka[0m - INFO - [1mAfter cleaning, you've kept 68.0% of your data[0m


In [50]:
# Inspect the documents that remain after topic cleaning.
bunka.df_cleaned_

Unnamed: 0,doc_id,content,topic_id,topic_name
0,d8ec5e4d-a426-4830-b,Topics: Wound management for general practitio...,bt-22,Childhood Stress and Health
1,01309769-4e3b-4553-9,"Part 1. Definition\nIn this task, you are give...",bt-14,Analyzing Product Sentiment
2,ac36abb3-f6ee-4cd7-b,You will act as an voice changer. You will cha...,bt-15,Language Analysis
3,6149d606-f9f9-4862-a,Write a well-researched paper on the physiolog...,bt-8,Healthy Lifestyle Exploration
4,90aab3ab-f5db-4158-b,Create a step-by-step recipe for making fluffy...,bt-6,Culinary Creations
...,...,...,...,...
10203,9bdb85aa-899d-48ac-8,Transform is an emerging brand that delivers m...,bt-21,Marketing Strategy Platform
10204,f75b1fa4-b628-4b45-a,"Using Python and the scikit-learn library, how...",bt-14,Analyzing Product Sentiment
10205,429c5c00-e5d2-4413-b,"[QUESTION] If ""Two kids stepping stones over t...",bt-18,Gender and Clothing Perceptions
10206,758cd65f-2bf8-40e6-b,Write me a letter asking my friend Dan what he...,bt-1,Cultural Expression


In [None]:
# Render the topic map once per metadata field and save each rendering as a PNG.
figure_targets = {"rating": "rating_map.png", "source": "source_map.png"}
figures = {}
for color_field, png_name in figure_targets.items():
    figures[color_field] = bunka.visualize_topics(color=color_field)
    figures[color_field].write_image(png_name)

# Keep the per-figure names available for any later cells.
rating_fig = figures["rating"]
source_fig = figures["source"]

In [None]:
# Save the default topic map (no metadata colouring requested) as a PNG file.
map_fig = bunka.visualize_topics()
map_fig.write_image("full_map.png")

## Exporting

Getting all the topics in the sample

In [56]:
# Build a (topic_id, topic_name) table from the fitted topics.
# pd.DataFrame(bunka.topics) yields integer-labelled columns whose cells are
# (field_name, value) pairs, so every kept column is unwrapped with .str[1].
# Previously only topic_name was unwrapped, leaving topic_id as a pair.
# Columns 0 and 1 are selected directly instead of dropping 2..10 by number,
# so this keeps working if the topic objects gain or lose fields.
topics_raw = pd.DataFrame(bunka.topics)
df_topics = pd.DataFrame({
    'topic_id': topics_raw[0].str[1],
    'topic_name': topics_raw[1].str[1],
})

What we actually want to export, though, is only the topics we chose to keep, which are stored in the `bunka.df_cleaned_` attribute.

In [52]:
# Export the documents kept after topic cleaning, without the row index.
bunka.df_cleaned_.to_csv("UF10k_mixedbread_topics.csv", index=False)

Getting Docs (but really what we need is the embeddings)

In [53]:
# Tabulate the fitted documents (presumably one row per document — confirm).
# The resulting columns are integer-labelled at this point.
df_docs = pd.DataFrame(bunka.docs)

In [54]:
# Keep the useful document fields, give them readable names and unwrap values.
# Each cell of pd.DataFrame(bunka.docs) is a (field_name, value) pair, hence
# .str[1]. Every kept column is unwrapped — x_cord and y_cord were previously
# renamed but left as pairs.
# The frame is rebuilt from bunka.docs so that re-running this cell is safe:
# applying .str[1] a second time to already-unwrapped strings would keep only
# their second character.
doc_columns = {0: 'doc_id', 1: 'content', 3: 'x_cord', 4: 'y_cord', 5: 'topic_id', 7: 'term_id', 8: 'embeddings'}
docs_raw = pd.DataFrame(bunka.docs)
df_docs = pd.DataFrame({
    new_name: docs_raw[position].str[1]
    for position, new_name in doc_columns.items()
})

In [55]:
# Write the per-document table to disk without the row index.
# NOTE(review): if the embeddings column holds list-like vectors, CSV will
# store them as text; a typed format (e.g. parquet) may suit downstream use
# better — confirm what consumers expect.
df_docs.to_csv('UF_mixedbread_docs.csv', index=False)