## 1. Preparation


In [3]:
from google.colab import drive

# Switch for the optional GPU install cell further down; it stays off here.
use_gpu = False

# Mount Google Drive so files under /content/drive can be read by later cells.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%%capture
# Core third-party packages for the topic-modelling pipeline.
# NOTE(review): the versions are unpinned, so a fresh runtime may resolve
# different releases than the ones that produced the recorded outputs —
# consider pinning them.
!pip install bertopic
!pip install datasets
# Optional: only relevant if the commented-out OpenAI representation model
# further down is enabled.
# !pip install openai

# GPU **acceleration**

In [5]:
%%capture
# Optional GPU stack: everything below runs only when `use_gpu` (set in the
# first cell) is True.
if use_gpu:
  # Install BERTopic from the master branch on GitHub, on top of the PyPI
  # release installed in the previous cell.
  !pip install git+https://github.com/MaartenGr/BERTopic.git@master

  # cuDF / cuML / cuGraph `-cu12` wheels, resolved with NVIDIA's package index
  # as an extra index.
  !pip install cudf-cu12 dask-cudf-cu12 --extra-index-url=https://pypi.nvidia.com
  !pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
  !pip install cugraph-cu12 --extra-index-url=https://pypi.nvidia.com
  # NOTE(review): the find-links URL below points at CuPy's aarch64 wheel
  # index — confirm that this matches the CPU architecture of the runtime.
  !pip install cupy-cuda12x -f https://pip.cupy.dev/aarch64

  # Supporting packages.
  # NOTE(review): `datasets` is also installed unconditionally in the previous
  # cell, so this line looks redundant — confirm before removing.
  !pip install safetensors
  !pip install datasets
  !pip install datashader

  # Replace locale.getpreferredencoding with a function that always returns
  # "UTF-8".
  # NOTE(review): presumably a workaround for a locale/encoding error seen
  # after the installs above — confirm. It is only applied on the GPU path.
  import locale
  locale.getpreferredencoding = lambda: "UTF-8"

## 2. Data Understanding and Preprocessing

In [6]:
import pandas as pd
import numpy as np


# Read the pre-processed headline table from the mounted Drive folder.
news_df = pd.read_csv("/content/drive/MyDrive/ML_project/new_text_withstem.csv")

# Only the first `num_samples` rows are used in the rest of the notebook.
num_samples = 1000
original_text = news_df['headline_text'][:num_samples]
cleaned_text = news_df['headline_cleaned_text'][:num_samples]

# Keep the stemmed headline when it is a string; otherwise fall back to the
# raw headline from the same row.
cleaned_text = [
    stemmed if isinstance(stemmed, str) else raw
    for stemmed, raw in zip(cleaned_text, original_text)
]
news_df.head()

Unnamed: 0.1,Unnamed: 0,publish_date,headline_text,headline_cleaned_text
0,0,20030219,aba decides against community broadcasting lic...,decid against communiti broadcast licenc
1,1,20030219,act fire witnesses must be aware of defamation,fire must awar defam
2,2,20030219,a g calls for infrastructure protection summit,call infrastructur protect summit
3,3,20030219,air nz staff in aust strike for pay rise,staff aust strike rise
4,4,20030219,air nz strike to affect australian travellers,strike affect australian travel


In [7]:

# count = 0
# index = 0
# for h in cleaned_text:
#   if not isinstance(h,str):
#     print(index,h, type(h))
#     count += 1
#   index += 1
# print(count)

# BERTopic

In [8]:
from bertopic import BERTopic

# Pre-calculate Embeddings

In [9]:
from sentence_transformers import SentenceTransformer

# Encode every headline once up front; the resulting vectors are handed to
# BERTopic's fit_transform in the training cell instead of being recomputed.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(
    cleaned_text,
    show_progress_bar=True,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

# Preventing Stochastic Behavior

In [10]:
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Preventing stochastic behaviour: a fixed `random_state` keeps the UMAP
# dimensionality reduction repeatable between runs.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

# Controlling the number of topics: `min_cluster_size` sets how many documents
# HDBSCAN needs before it forms a cluster.
hdbscan_model = HDBSCAN(
    min_cluster_size=20,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Improving the default representation: drop English stop words, ignore terms
# seen in fewer than 2 documents, and allow both unigrams and bigrams.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=2,
    ngram_range=(1, 2),
)

In [11]:
# !pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
# !pip install --upgrade cuml dask
# !pip uninstall cuml dask
# !pip install dask
# !pip install cuml-cu12 --extra-index-url=https://pypi.nvidia.com
# import dask
# # dask.config.config['parquet']["query-planning"] = False
# dask.config.config["dataframe"]["query-planning"] = False
# print(dask)
# import cuml
# from cuml.manifold import UMAP
# from cuml.cluster import HDBSCAN
# from bertopic import BERTopic

# # Prepare sub-models
# umap_model = UMAP(n_components=5, n_neighbors=50, random_state=42, metric="cosine", verbose=True)
# # hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, min_cluster_size=20,prediction_data=False, verbose=True)
# hdbscan_model = HDBSCAN(min_cluster_size=20, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# vectorizer_model = CountVectorizer(stop_words="english",min_df=2, ngram_range=(1, 2))

# Additional Representations

In [12]:
#import openai
# from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from bertopic.representation import KeyBERTInspired

# KeyBERT-inspired representation: the only additional representation that is
# active in this notebook.
keybert_model = KeyBERTInspired()

# Disabled alternatives (they also need the wider, commented-out import above):
# pos_model = PartOfSpeech("en_core_web_sm")
# mmr_model = MaximalMarginalRelevance(diversity=0.3)

# Named representation models handed to BERTopic. To enable more, add entries
# such as "MMR": mmr_model, "POS": pos_model or "OpenAI": openai_model.
representation_model = dict(KeyBERT=keybert_model)

# train

In [13]:
from bertopic import BERTopic

# Sub-models configured in the earlier cells.
pipeline_components = dict(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

# Hyperparameters. In the recorded run `nr_topics=8` reduced 11 topics to 8,
# a count that includes the -1 outlier topic.
# (`min_topic_size = 8` was tried here and is left disabled.)
hyperparameters = dict(
    top_n_words=10,
    nr_topics=8,
    verbose=True,
)

topic_model = BERTopic(**pipeline_components, **hyperparameters)

# Fit on the headlines, reusing the pre-computed embeddings.
topics, probs = topic_model.fit_transform(cleaned_text, embeddings)

# Summary table of the resulting topics.
topic_model.get_topic_info()

2024-05-09 09:15:01,521 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-09 09:15:10,420 - BERTopic - Dimensionality - Completed ✓
2024-05-09 09:15:10,421 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-09 09:15:10,461 - BERTopic - Cluster - Completed ✓
2024-05-09 09:15:10,462 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-09 09:15:10,712 - BERTopic - Representation - Completed ✓
2024-05-09 09:15:10,713 - BERTopic - Topic reduction - Reducing number of topics
2024-05-09 09:15:10,932 - BERTopic - Topic reduction - Reduced number of topics from 11 to 8


Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,Representative_Docs
0,-1,331,-1_govt_warn_race_health,"[govt, warn, race, health, claim, meet, anti, ...","[protest, critic, warn, anti, defend, announc,...","[protest condemn howard critic anti, state ter..."
1,0,384,0_polic_murder_face_charg,"[polic, murder, face, charg, stab, appeal, zim...","[stab, polic, murder, polic investig, charg, k...","[polic crack down driver safeti, charg with at..."
2,1,85,1_rain_plan_flood_boost,"[rain, plan, flood, boost, break, good, restri...","[rain, rainfal, flood, weather, despit, plant,...","[warn water contamin after recent rain, rain e..."
3,2,54,2_council_fund_park_reject,"[council, fund, park, reject, land, plan, prot...","[council, secur council, road fund, fund, summ...","[council fund groundwat studi, council host fa..."
4,3,48,3_price_qanta_petrol_aust,"[price, qanta, petrol, aust, rise, record, rat...","[petrol, price, cost, rate, rise, fuel, wage, ...","[beef price rise, petrol price hike compar int..."
5,4,44,4_iraq_iraqi_downer_british,"[iraq, iraqi, downer, british, weapon, missil,...","[stay iraq, iraqi, iraq, iran, downer, troop, ...","[saudi arabia tell arab iraq inevit, downer ex..."
6,5,33,5_club_nightclub_toll_dead,"[club, nightclub, toll, dead, blaze, smoke, co...","[death toll, nightclub, club, blaze, toll, fla...","[least dead club fire, toll rise club fire, de..."
7,6,21,6_talk_south_powel_north,"[talk, south, powel, north, death toll, nuclea...","[japan, nuclear, japanes, talk, powel, summit,...","[downer push urgent talk with north korea, dea..."


# diagrams

In [14]:
# # test model save and load
# import os
# os.makedirs("/content/output", exist_ok = True)
# topic_model.save("/content/output/test_model")

In [15]:
# my_model = BERTopic.load("/content/output/test_model")

In [16]:
# my_model.get_topic_info()

In [17]:
# topic_model.get_topic(5, full=True)

In [18]:
# # `topic_distr` contains the distribution of topics in each document
# topics, probs = topic_model.approximate_distribution(cleaned_text, window=8, stride=4)

# import numpy as np



In [19]:
# abstract_id = 10
# print(cleaned_text[abstract_id])

In [20]:
# Visualize the topic-document distribution for a single document
# topic_model.visualize_distribution(topic_distr[abstract_id], min_probability=0.01)

In [21]:
# # Calculate the topic distributions on a token-level
# topic_distr, topic_token_distr = topic_model.approximate_distribution(cleaned_text, calculate_tokens=True)

# # Visualize the token-level distributions
# df = topic_model.visualize_approximate_distribution(cleaned_text[abstract_id], topic_token_distr[abstract_id])
# df

In [22]:
# # !pip install -U kaleido
# import plotly
# # import kaleido

# diagram = topic_model.visualize_hierarchy(top_n_topics=8)

# image = diagram.to_image(format="png")

# diagram.write_image("/content/output/diagram.png")


In [23]:
# topic_model.visualize_barchart(top_n_topics=8)
# topic_model.visualize_hierarchical_documents(docs, hierarchical_topics, reduced_embeddings=reduced_embeddings)

In [24]:
# topic_model.visualize_topics(custom_labels=True, top_n_topics=8)

In [25]:
#  topic_model.visualize_topics_over_time(topics_over_time)

In [26]:
# topic_model.visualize_heatmap( width=1000, height=1000)

In [27]:
# # 2-D document distribution (the projection below uses UMAP, not t-SNE)
# visualize_samples = 100
# sample_percentage = visualize_samples / len(cleaned_text)
# print(sample_percentage)

# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# # print(reduced_embeddings.shape)
# # print(len(cleaned_text[:visualize_samples]))
# topic_model.visualize_documents(cleaned_text, reduced_embeddings=reduced_embeddings,sample = sample_percentage,  custom_labels=True,hide_annotations=True)

In [28]:
# topic_model.visualize_term_rank()
# data_map = topic_model.visualize_document_datamap(docs, reduced_embeddings=reduced_embeddings)
# topic_model.visualize_barchart()

# Metrics

In [29]:
%%capture
# OCTIS provides the coherence and diversity metrics imported in the next cell.
# NOTE(review): unpinned install — consider pinning the version.
!pip install octis

In [30]:
from octis.evaluation_metrics.diversity_metrics import TopicDiversity
from octis.evaluation_metrics.coherence_metrics import Coherence

from octis.dataset.dataset import Dataset

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Evaluate a default BERTopic model on OCTIS' 20NewsGroup corpus and report
# NPMI topic coherence and topic diversity.
coherence_scores = []
diversity_scores = []

dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")

# The corpus is a list of token lists (one per document). Keep that form for
# the metrics and join the tokens back into plain strings for the models.
tokenized_corpus = dataset.get_corpus()
data = [" ".join(words) for words in tokenized_corpus]


# data = cleaned_text

# NOTE(review): `embeddings` is rebound here to the 20NewsGroup vectors,
# replacing the headline embeddings computed earlier. Re-running the headline
# training cell after this one would therefore pick up the wrong vectors.
# The name is kept for compatibility with any later cells — consider renaming.
sentence_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = sentence_model.encode(data, show_progress_bar=True)

nr_topics = 8

params = {
    "embedding_model": "all-mpnet-base-v2",
    "nr_topics": nr_topics,
    "min_topic_size": 15,
    "verbose": False,
    "calculate_probabilities" : False
}

# Fit exactly once. The previous version repeated this configuration and fit
# verbatim, doubling the runtime and discarding the first result.
# NOTE(review): no UMAP `random_state` is passed for this model, so the scores
# are likely to vary between runs — confirm whether a seed is wanted.
model = BERTopic(**params)
topics, _ = model.fit_transform(data, embeddings)

# topics, probs = topic_model.fit_transform(cleaned_text, embeddings)

# Flat word list (kept under its original name), plus a set for fast
# membership tests and the fallback word used for out-of-corpus terms.
all_words = [word for words in tokenized_corpus for word in words]
vocabulary = set(all_words)
fallback_word = all_words[0]

# Score every real topic and skip only the -1 outlier topic. Deriving the ids
# from the labels avoids dropping the last topic when no outliers were found.
topic_ids = sorted(topic_id for topic_id in set(topics) if topic_id != -1)

# Top-10 words per topic; any word missing from the corpus vocabulary is
# replaced by the fallback word.
bertopic_topics_test = [
    [
        entry[0] if entry[0] in vocabulary else fallback_word
        for entry in model.get_topic(topic_id)[:10]
    ]
    for topic_id in topic_ids
]

diversity = TopicDiversity(topk=10)
coherence = Coherence(texts=tokenized_corpus,
                      topk=10, measure="c_npmi")

coherence_scores.append(coherence.score({"topics": bertopic_topics_test}))
diversity_scores.append(diversity.score({"topics": bertopic_topics_test}))

print(f"topic coherence npmi : {sum(coherence_scores)/len(coherence_scores)}")
print(f"topic diversity : {sum(diversity_scores)/len(diversity_scores)}")



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/510 [00:00<?, ?it/s]

topic coherence npmi : 0.061441453287797945
topic diversity : 0.9


In [31]:
# Show the per-topic word lists that were scored in the previous cell.
print(bertopic_topics_test)

[['people', 'make', 'time', 'year', 'good', 'give', 'game', 'thing', 'find', 'government'], ['drive', 'card', 'scsi', 'disk', 'image', 'work', 'problem', 'driver', 'color', 'monitor'], ['motif', 'run', 'problem', 'error', 'window', 'file', 'compile', 'server', 'program', 'machine'], ['battery', 'ground', 'heat', 'fire', 'fan', 'gas', 'water', 'cool', 'temperature', 'flame'], ['mouse', 'keyboard', 'driver', 'button', 'problem', 'work', 'window', 'load', 'serial', 'ball'], ['window', 'manager', 'widget', 'program', 'position', 'title', 'application', 'region', 'event', 'set'], ['plastic', 'blue', 'board', 'paint', 'pair', 'green', 'light', 'lead', 'sell', 'material']]


In [32]:
# How many headline topics (ids 0 .. nr_topics-1) to evaluate.
# NOTE(review): the fitted headline model reports topics 0-6 in its summary
# table, so 5 covers only the first five — confirm this is intended.
nr_topics = 5 ###

topk = 10  # words

# For each selected topic keep the word from each of its first `topk` entries.
bertopic_topics = [
    [entry[0] for entry in topic_model.get_topic(topic_id)[:topk]]
    for topic_id in range(nr_topics)
]

print(bertopic_topics)


[['polic', 'murder', 'face', 'charg', 'stab', 'appeal', 'zimbabw', 'final', 'probe', 'jail'], ['rain', 'plan', 'flood', 'boost', 'break', 'good', 'restrict', 'predict', 'suppli', 'bring'], ['council', 'fund', 'park', 'reject', 'land', 'plan', 'protect', 'welcom', 'claim', 'ambul'], ['price', 'qanta', 'petrol', 'aust', 'rise', 'record', 'rate', 'million', 'bank', 'emerg land'], ['iraq', 'iraqi', 'downer', 'british', 'weapon', 'missil', 'resolut', 'inspect', 'ass', 'expert']]


In [33]:
# dataset.get_corpus()

In [34]:

# Tokenise the headlines on single spaces to get the reference corpus.
corpus = [headline.split(" ") for headline in cleaned_text]

# Metric objects, each looking at the first `topk` words of a topic.
npmi_metric = Coherence(texts=corpus,
                        topk=topk, measure="c_npmi")
diversity_metric = TopicDiversity(topk=topk)

# Both metrics take the topics wrapped in a dict under the "topics" key.
model_output = {"topics": bertopic_topics}
coherence = npmi_metric.score(model_output)
diversity = diversity_metric.score(model_output)

print(f"topic coherence npmi : {coherence}")
print(f"topic diversity : {diversity}")

topic coherence npmi : -0.30428084745535855
topic diversity : 0.98


In [35]:
# Show the first ten tokenised headlines that were given to the coherence metric.
print(corpus[:10])

[['decid', 'against', 'communiti', 'broadcast', 'licenc'], ['fire', 'must', 'awar', 'defam'], ['call', 'infrastructur', 'protect', 'summit'], ['staff', 'aust', 'strike', 'rise'], ['strike', 'affect', 'australian', 'travel'], ['ambiti', 'olsson', 'tripl', 'jump'], ['antic', 'delight', 'with', 'record', 'break', 'barca'], ['aussi', 'qualifi', 'stosur', 'wast', 'four', 'memphi', 'match'], ['aust', 'address', 'secur', 'council', 'over', 'iraq'], ['australia', 'lock', 'into', 'timet']]


In [36]:

# Load OCTIS' 20NewsGroup dataset again and print its first ten tokenised
# documents (the previous cell prints the same slice of the headline corpus).
dataset = Dataset()
dataset.fetch_dataset("20NewsGroup")
newsgroup_preview = dataset.get_corpus()[:10]
print(newsgroup_preview)

[['fax', 'modem', 'card', 'sell', 'mail'], ['run', 'server', 'server', 'install', 'run', 'add'], ['live', 'part', 'lead', 'wait', 'important', 'remember', 'judge', 'judge', 'guess', 'close', 'situation', 'listen', 'statement', 'sense', 'regard', 'passage', 'remember', 'letter', 'church', 'people', 'body', 'talk', 'work', 'translation', 'lack', 'concern', 'make', 'sick', 'point', 'throw', 'faith', 'faith', 'catch', 'meaning', 'offer', 'explanation', 'fire', 'cold', 'make', 'aware', 'child', 'eternal'], ['doesn', 'pain', 'deserve', 'die', 'lie', 'rape'], ['sale', 'mile', 'good', 'condition', 'good', 'condition', 'player', 'component', 'speaker', 'mount', 'door', 'car', 'maintain', 'clean', 'good', 'car', 'solid', 'body', 'spot', 'surface', 'spot', 'touch', 'make', 'car', 'problem', 'firm', 'car', 'average', 'cost', 'interested', 'call', 'email'], ['post', 'real', 'disease', 'disease', 'question', 'case', 'active', 'culture', 'reduce', 'hear', 'work', 'mechanism', 'common', 'minor', 'comm