In [None]:
!pip install datasets sentence_transformers umap-learn hdbscan openai

In [70]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
from hdbscan import HDBSCAN
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import openai
import os
openai.api_key = ''

In [3]:
ag_news_dataset = load_dataset("ag_news", split='test')
df_ag = pd.DataFrame(ag_news_dataset)
df_ag.sample(5)

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Unnamed: 0,text,label
6038,"MPAA seeks research, P2P cop role on Internet2...",3
2969,Legal expert joins open-source screening firm ...,3
2476,Pakistan and India agree to cooperate on easin...,0
6938,Automakers sue California over emissions FRESN...,2
7572,Wenger vows no repeat of frailties against Bay...,1


In [7]:
# embeddings #
embed_model = SentenceTransformer("BAAI/bge-small-en")
text_embeddings = embed_model.encode(df_ag['text'].tolist(), show_progress_bar=True)
df_ag['embeddings'] = list(text_embeddings)

# dimension reduction #
umap_model = UMAP(n_neighbors=10, n_components=5, min_dist=0.0, metric='cosine')
reduced_text_embeddings = umap_model.fit_transform(text_embeddings)

# clustering
hdbscan_model = HDBSCAN(
    min_cluster_size=40,
    metric='euclidean',
    min_samples=5,
    prediction_data=False)

text_cluster = hdbscan_model.fit(reduced_text_embeddings)
df_ag['cluster'] = text_cluster.labels_
print(set(text_cluster.labels_))

Batches:   0%|          | 0/238 [00:00<?, ?it/s]

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, -1}


In [95]:
top_n = 10
diversity = 0.5
cluster_dict = {}

for cluster, df in df_ag.groupby('cluster'):
    if cluster == -1:
        continue

    # find the most representative documents
    candidate_d = cosine_similarity(df['embeddings'].tolist(), df['embeddings'].tolist())
    candidate_d_sum = candidate_d.sum(axis=1)
    doc_list = [np.argmax(candidate_d.sum(axis=1))]
    candidates_idx = [i for i in range(len(df)) if i != doc_list[0]]

    # filter based on maximal marginal relevance
    for _ in range(top_n - 1):
        candidate_similarities = candidate_d.sum(axis=1)[candidates_idx]
        target_similarities = np.max(candidate_d[candidates_idx][:, doc_list], axis=1)

        # Calculate MMR
        mmr = (1 - diversity) * candidate_similarities - diversity * target_similarities
        # Update keywords & candidates
        mmr_idx = candidates_idx[np.argmax(mmr)]
        doc_list.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    cluster_dict[cluster] = {'doc': [df['text'].tolist()[idx] for idx in doc_list]}

In [96]:
def get_reponse(prompt, temperature=0):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0,
        max_tokens=100)
    return response.choices[0].message["content"]

for i in cluster_dict.keys():

    # prompt
    documents = "\n".join([f"{c + 1}.{text}" for c, text in enumerate(cluster_dict[i]['doc'])])
    delimiter = "####"
    prompt = f"""\
    You will be provided with multiple documents that form the same cluster. \
    The documents will be delimited with {delimiter} characters. \
    Your task is to define a topic title that is well representing all the listed documents.\
    Output starts with 'Title: '

    Documents:
    {delimiter}{documents}{delimiter}
    """

    # response from openai
    llm_output = get_reponse(prompt)
    cluster_dict[i]['topic'] = llm_output

In [106]:
idx = 9
print(cluster_dict[idx]['topic'])
print("\n")
for i, d in enumerate(cluster_dict[idx]['doc']):
    print(f'{i}. {d} \n')

Title: Recent Developments in Space Exploration


0. Cassini flies past Titan; pictures expected tonight NASA #39;s Cassini spacecraft streaked by Saturn #39;s smoggy moon Titan today, targeted to pass within just 750 miles of the planet-sized satellite to give scientists their first  

1. LIVE: Launch of Expedition Ten Crew to the ISS / ESA TV Live / 14 &lt;b&gt;...&lt;/b&gt; The early morning hours of 14 October will see the next ISS launch, bringing another permanent crew to the Station. Expedition 10 crew is made of Commander Leroy Chiao and Flight Engineer Salizhan Sharipov. 

2. (B)old new frontiers Northrop Grumman Corp. and Boeing Co. yesterday announced plans to team up to design a vehicle to take astronauts back to the moon and even beyond, but they #39;ve got to make one stop first  

3. Lawmakers give the space agency greater flexibility in its new &lt;b&gt;...&lt;/b&gt; After a year of wrangling over NASA #39;s \$16.2 billion budget, lawmakers have delivered in a big way, 