Imports

In [29]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import hdbscan
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
import torch

Load data

In [30]:
# Loading only titles
df = pd.read_csv('../data/raw_bbc.csv', usecols=['Text'])

# take first 10 words as title
df["Title"] = df["Text"].apply(lambda x: " ".join(x.split()[:100]))
titles = df["Title"].tolist()
df.head()

Unnamed: 0,Text,Title
0,Ad sales boost Time Warner profit\n\nQuarterly...,Ad sales boost Time Warner profit Quarterly pr...
1,Dollar gains on Greenspan speech\n\nThe dollar...,Dollar gains on Greenspan speech The dollar ha...
2,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos unit buyer faces loan claim The owners o...
3,High fuel prices hit BA's profits\n\nBritish A...,High fuel prices hit BA's profits British Airw...
4,Pernod takeover talk lifts Domecq\n\nShares in...,Pernod takeover talk lifts Domecq Shares in UK...


In [31]:
device = "cuda" if torch.cuda.is_available() else "cpu"
# load embedding model onto gpu
embedder = SentenceTransformer("intfloat/multilingual-e5-large", device=device)

In [32]:
# batch encode
embeddings = embedder.encode(df["Title"].tolist(), batch_size=32, show_progress_bar=True)

Batches: 100%|██████████| 70/70 [00:46<00:00,  1.51it/s]


Clustering with HDBSCAN

In [33]:
clusterer = hdbscan.HDBSCAN(min_cluster_size=10, min_samples=5,metric='euclidean')
cluster_labels = clusterer.fit_predict(embeddings)

df["Cluster"] = cluster_labels



In [34]:
# representative titles
def representative_title(cluster_id, n=5):
    cluster_titles = df[df["Cluster"] == cluster_id]["Title"].tolist()
    return cluster_titles[:n]

clusters = sorted(df["Cluster"].unique())

In [35]:
# LLM for Cluster labelling
llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    device=0  # ensures it uses CUDA:0
)

# Use the same model
classifier = pipeline("text2text-generation", model="google/flan-t5-base", device=0)

def label_cluster(titles):
    prompt = (
        "These are headlines from the same cluster:\n\n"
        + "\n".join(f"- {t}" for t in titles)
        + "\n\nGive a short, general subcategory label (like 'Economy', 'Corporate Profits', "
          "'Entertainment – Movies', 'Sports – Football'). "
          "Do not repeat the headlines. Only return the label."
    )
    result = classifier(prompt, max_new_tokens=20, clean_up_tokenization_spaces=True)[0]["generated_text"]
    return result.strip()

Device set to use cuda:0
Device set to use cuda:0


In [36]:
# representative titles
def get_representative_titles(cluster_id, n=5):
    cluster_titles = df[df["Cluster"] == cluster_id]["Title"].tolist()
    return cluster_titles[:n]

# Run labeling once per cluster
cluster_labels_dict = {}
for cid in clusters:
    if cid == -1:  # outliers
        cluster_labels_dict[cid] = "Miscellaneous"
    else:
        reps = get_representative_titles(cid, n=5)
        cluster_labels_dict[cid] = label_cluster(reps)

df["Subcategory"] = df["Cluster"].map(cluster_labels_dict)

Token indices sequence length is longer than the specified maximum sequence length for this model (772 > 512). Running this sequence through the model will result in indexing errors
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [39]:
df.to_csv("bbc_with_subcategories_embedding.csv", index=False)