In [None]:
import random
import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset
import jsonlines
from textacy import text_stats, make_spacy_doc
from bunkatopics import Bunka
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [None]:
# Load the train split of the cleaned UltraFeedback preference dataset into
# pandas, then draw a fixed-seed sample of 1,000 rows.
ultrafeedback = load_dataset("argilla/ultrafeedback-binarized-preferences-cleaned")
train_frame = pd.DataFrame(ultrafeedback["train"])
df_sample = train_frame.sample(n=1000, random_state=42)

In [None]:
# Preview only the first rows; the sampled frame has 1,000 rows and does not
# need to be dumped in full to check its shape and columns.
df_sample.head()

In [None]:
# Count missing values in each column of the sampled frame.
missing_per_column = df_sample.isna().sum()
missing_per_column

Get Metadata

In [None]:
# Pull the two metadata columns out of the sample as plain Python lists.
source = df_sample.loc[:, 'source'].to_list()
chosen_rating = df_sample.loc[:, 'chosen-rating'].to_list()

In [None]:
# Metadata passed to Bunka alongside the documents: one list per field.
metadata = dict(source=source, rating=chosen_rating)

Convert Back to a Hugging Face Dataset

In [None]:
# Wrap the sampled pandas frame in a `datasets.Dataset`.
docs_sample = Dataset.from_pandas(df=df_sample)

In [None]:
# Build the sentence-embedding model, hand it to Bunka, and fit Bunka on the
# sampled prompts together with their source/rating metadata.
embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
bunka = Bunka(embedding_model=embedding_model)
prompts = docs_sample['prompt']
bunka.fit(prompts, metadata=metadata)

In [None]:
# Cluster the fitted documents into 15 topics with a custom KMeans model.
# KMeans initialises its centroids randomly, so the seed is pinned (same value
# as the sampling seed) to keep topic assignments, and the exported CSVs and
# maps that depend on them, reproducible across runs.
clustering_model = KMeans(n_clusters=15, random_state=42)

# The original note on this call read: "Specify the number of terms to describe
# each topic" (presumably referring to `name_length` - confirm against Bunka docs).
bunka.get_topics(
    name_length=10,
    custom_clustering_model=clustering_model,
    min_count_terms=20,
)

In [None]:
def _export_topic_map(color, path):
    """Draw the topic map with the given `color` setting, write it to `path`, and return the figure."""
    fig = bunka.visualize_topics(color=color)
    fig.write_image(path)
    return fig


# One map per metadata field attached during fitting.
rating_fig = _export_topic_map('rating', "rating_map.png")
source_fig = _export_topic_map('source', 'source_map.png')

In [None]:
# Topic map with Bunka's default coloring, saved as an image file.
full_map_path = "full_map.png"
map_fig = bunka.visualize_topics()
map_fig.write_image(full_map_path)

Getting Topics

In [None]:
# Build a two-column table (topic_id, topic_name) from the fitted topics.
# Columns of the raw frame are positional (0, 1, 2, ...), and each cell is
# indexed with [1] to get its value (cells look like (field_name, value) pairs).
topics_raw = pd.DataFrame(bunka.topics)

# Cleaning and formatting: keep only the first two positional columns rather
# than dropping a hard-coded list of every other column.
df_topics = topics_raw[[0, 1]].rename(columns={0: 'topic_id', 1: 'topic_name'})

# Unwrap both kept columns. Previously only topic_name was unwrapped, which left
# topic_id as a wrapped pair in the exported CSV, inconsistent with the ids in
# the docs table.
for column in ('topic_id', 'topic_name'):
    df_topics[column] = df_topics[column].apply(lambda pair: pair[1])

In [None]:
# Write the topic table to disk, omitting the pandas index.
topics_csv_path = "UF_mixedbread_topics.csv"
df_topics.to_csv(topics_csv_path, index=False)

Getting Docs

In [None]:
# Raw per-document table built from Bunka's fitted documents; columns are
# positional at this point and get named in the next cell.
df_docs = pd.DataFrame(data=bunka.docs)

In [None]:
# Name the positional columns that are kept and drop the unused ones (2, 6, 9).
df_docs = (
    df_docs
    .rename(columns={0: 'doc_id', 1: 'content', 3: 'x_cord', 4: 'y_cord',
                     5: 'topic_id', 7: 'term_id', 8: 'embeddings'})
    .drop(columns=[2, 6, 9])
)

# Each cell is indexed with [1] to extract its value. x_cord and y_cord were
# previously renamed but never unwrapped, so they were exported in wrapped form
# while every other kept column held plain values.
for column in ('doc_id', 'content', 'x_cord', 'y_cord', 'topic_id', 'term_id', 'embeddings'):
    df_docs[column] = df_docs[column].str[1]

In [None]:
# Write the document table to disk, omitting the pandas index.
docs_csv_path = 'UF_mixedbread_docs.csv'
df_docs.to_csv(docs_csv_path, index=False)