In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
dataset_path = '/content/drive/MyDrive/processed_novels_sentences_new.csv'

Mounted at /content/drive


In [2]:
# Load the raw dataset
import pandas as pd
import re

# Load the dataset
df = pd.read_csv('/content/drive/MyDrive/processed_novels_sentences_new.csv')

# Clean up the sentences
df['Sentence'] = df['Sentence'].apply(lambda x: re.sub(r'\n+', ' ', x))
df['Sentence'] = df['Sentence'].apply(lambda x: re.sub(r'\s+', ' ', x).strip().lower())

# List of sentence strings for BERTopic input
dataset_as_list_of_strings = df['Sentence'].tolist()
print(f"Total sentences in dataset: {len(dataset_as_list_of_strings)}")

# Install necessary packages
!pip install bertopic hdbscan umap-learn sentence-transformers

# Import required libraries
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import umap
import hdbscan
from sentence_transformers import SentenceTransformer

# Define the embedding model
embedding_model = SentenceTransformer("all-MiniLM-L12-v2")

# Configure UMAP settings
umap_model = umap.UMAP(
    n_neighbors=7,
    n_components=2,
    min_dist=0.005022461555,
    random_state=42
)

# Configure HDBSCAN settings
hdbscan_model = hdbscan.HDBSCAN(
    min_cluster_size=281,
    min_samples=72,
    gen_min_span_tree=True
)

# Configure CountVectorizer settings
vectorizer_model = CountVectorizer(
    min_df=0.001503935747,
    stop_words="english"
)

# Initialize BERTopic model with specified parameters
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    min_topic_size=102,
    top_n_words=30
)

# Fit the model on the cleaned data
topics, probabilities = topic_model.fit_transform(dataset_as_list_of_strings)


Total sentences in dataset: 680822
Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.8/88.8 kB[0m [31m8.5 MB/s[0m eta [3

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
# Display general information about the topics
topic_info = topic_model.get_topic_info()
print(topic_info)

# Display the top words for each topic
# This loop goes through each topic and prints the top words
for topic_id in range(len(topic_info)):
    topic_words = topic_model.get_topic(topic_id)
    print(f"\nTopic {topic_id}:")
    for word, score in topic_words:
        print(f"{word} ({score:.4f})")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
looks (0.0046)
anne (0.0045)
marry (0.0044)
nod (0.0044)
oreo (0.0044)
truly (0.0043)

Topic 254:
sasha (0.2411)
sense (0.1908)
bonnie (0.1448)
sir (0.1028)
makes (0.0475)
pierce (0.0315)
yes (0.0219)
make (0.0197)
powerpoint (0.0131)
cruella (0.0114)
bale (0.0114)
guess (0.0108)
address (0.0108)
caller (0.0107)
papa (0.0096)
instagram (0.0090)
sirs (0.0089)
vil (0.0089)
making (0.0087)
piercingpine32 (0.0085)
nautical (0.0080)
doesn (0.0077)
sue (0.0074)
welp (0.0072)
awestruck (0.0070)
followhill (0.0067)
says (0.0066)
suppose (0.0063)
xellum (0.0061)
straightens (0.0060)

Topic 255:
lawyer (0.1593)
jamieson (0.1424)
lawyers (0.0911)
attorney (0.0726)
jamie (0.0532)
court (0.0210)
legal (0.0208)
spoke (0.0201)
attorneys (0.0147)
malcomb (0.0106)
draft (0.0092)
law (0.0088)
judge (0.0087)
papers (0.0085)
tony (0.0079)
case (0.0071)
trial (0.0070)
stuart (0.0068)
need (0.0065)
hire (0.0063)
review (0.0063)
robert (0.0060)

TypeError: 'bool' object is not iterable