# TASK ZERO 

### UNDERSTANDING THE DATA

In [29]:
# Import required libraries
import os
from collections import Counter
import re
import random
import json
from pathlib import Path

random.seed(42)


In [30]:
# Read the text files (keeps original behaviour for quick inspection)
base_path = r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\novels'

# Example: load two files (these variables are used by legacy analysis cells)
with open(os.path.join(base_path, 'metamorphosis.txt'), 'r', encoding='utf-8', errors='ignore') as f:
    metamorphosis_text = f.read()

with open(os.path.join(base_path, 'the_trial.txt'), 'r', encoding='utf-8', errors='ignore') as f:
    trial_text = f.read()

print("Files loaded successfully!")


Files loaded successfully!


In [31]:
# Basic statistics
def analyze_text(text, title):
    words = re.findall(r'\b\w+\b', text.lower())
    
    print(f"\n{'='*50}")
    print(f"Analysis for: {title}")
    print(f"{'='*50}")
    print(f"Total characters: {len(text):,}")
    print(f"Total words: {len(words):,}")
    print(f"Unique words: {len(set(words)):,}")
    
    # Top 10 most common words
    word_counts = Counter(words)
    print(f"\nTop 10 most common words:")
    for word, count in word_counts.most_common(10):
        print(f"  {word}: {count}")

analyze_text(metamorphosis_text, "The Metamorphosis")
analyze_text(trial_text, "The Trial")


Analysis for: The Metamorphosis
Total characters: 138,259
Total words: 25,602
Unique words: 3,001

Top 10 most common words:
  the: 1348
  to: 835
  and: 710
  he: 593
  of: 557
  his: 550
  in: 411
  was: 411
  it: 385
  that: 360

Analysis for: The Trial
Total characters: 469,532
Total words: 89,557
Unique words: 5,090

Top 10 most common words:
  the: 4930
  to: 2937
  he: 2077
  and: 2074
  of: 1742
  it: 1482
  that: 1446
  a: 1380
  you: 1343
  in: 1307


### FUNCTIONS FOR CLEANING THE DATASET

#### READ + GUTENBERG HEADERS



In [34]:

def read_text(path):
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()


def remove_gutenberg_headers(text):
    """Strip Project Gutenberg header/footer using common markers and fallbacks."""
    start = re.search(r"\*\*\*\s*START OF (THE )?PROJECT GUTENBERG EBOOK.*?\*\*\*", text, flags=re.IGNORECASE|re.DOTALL)
    end = re.search(r"\*\*\*\s*END OF (THE )?PROJECT GUTENBERG EBOOK.*?\*\*\*", text, flags=re.IGNORECASE|re.DOTALL)
    if start and end:
        return text[start.end():end.start()]
    if start:
        return text[start.end():]
    if end:
        return text[:end.start()]
    # fallback: strip leading license block up to first major section marker
    fallback = re.search(r"\n(?:CONTENTS|TABLE OF CONTENTS|CHAPTER [IVXLC0-9]+\.|CHAPTER\s+I\b|BOOK I\b)", text, flags=re.IGNORECASE)
    if fallback:
        return text[fallback.start():]
    return text

#### BASIC CLEANUP



In [35]:
def basic_cleanup(text):
    # normalize newlines
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    # remove obvious Gutenberg boilerplate lines and short ALL-CAPS headers
    lines = text.splitlines()
    cleaned = []
    for ln in lines:
        s = ln.strip()
        if not s:
            cleaned.append('')
            continue
        low = s.lower()
        if 'project gutenberg' in low or 'this ebook' in low or 'http' in low:
            continue
        # skip short all-caps lines that are likely headers/footers
        if re.match(r'^[A-Z0-9 \-\(\)\/\:]{1,60}$', s) and len(s.split()) <= 6:
            continue
        cleaned.append(ln)
    text = '\n'.join(cleaned)
    # collapse multiple blank lines
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()


#### REMOVE CHAPTER HEADINGS



In [36]:
def remove_chapter_headings(text):
    # More aggressive removal of chapter/book/part headings in many formats.
    # 1) Remove explicit lines starting with chapter/book/part (case-insensitive),
    #    but only if they're short (likely a heading) to avoid removing body text.
    def _drop_short_heading(m):
        line = m.group(0).strip()
        words = re.findall(r"\w+['-]?\w*", line)
        if len(words) <= 6:
            return ''
        return m.group(0)
    text = re.sub(r'(?im)^[ \t]*(?:CHAPTER|CHAP\.?|BOOK|PART)\b[^\n]*\n', _drop_short_heading, text)

    # 2) Remove standalone lines that look like chapter headings even when not
    #    fully ALL-CAPS (e.g. "Chapter Two", "Chapter I."). Match short lines
    #    that begin with chapter/chap/book/part and contain only a few words/punct.
    text = re.sub(r"(?im)^[ \t]*(?:chapter|chap\.?|book|part)\b[\sA-Za-z0-9\-\.,:\(\)]+\n", '', text)

    # 3) Keep the previous heuristic: drop short ALL-CAPS centered headings.
    def _drop_if_heading(m):
        line = m.group(0).strip()
        if len(line) <= 60 and len(line.split()) <= 6 and re.match(r"^[A-Z0-9 ,:\'\"\-()]+$", line):
            return ''
        return m.group(0)
    text = re.sub(r'(?m)^[ \t]*[A-Z0-9 ,:\'\"\-()]{1,60}\n', _drop_if_heading, text)

    # Collapse any resulting runs of blank lines introduced by removals
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text


#### CLEAN TEXT



In [37]:
def clean_text(text):
    text = remove_gutenberg_headers(text)
    text = basic_cleanup(text)
    text = remove_chapter_headings(text)
    # normalize common Unicode punctuation
    repl = {'\u2018': "'", '\u2019': "'", '\u201c': '"', '\u201d': '"', '\u2013': '-', '\u2014': ' - '}
    for k, v in repl.items():
        text = text.replace(k, v)
    # normalize spaces
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()

#### SPLIT SENTENCES



In [38]:
def split_sentences(text, min_words=6):
    """Heuristic sentence splitter that avoids breaking on many abbreviations.
    Returns sentences with at least `min_words` words."""
    parts = re.split(r'(?<=[\.\?\!])\s+(?=(?:["\'\(]?[A-Z0-9]))', text)
    if len(parts) < 20:
        parts = re.split(r'(?<=[\.\?\!])\s+', text)
    sentences = []
    for p in parts:
        s = p.strip()
        if not s:
            continue
        wc = len(re.findall(r'\b\w+\b', s))
        if wc < min_words:
            continue
        # drop enumerated list items
        if re.match(r'^\d+[\.)]\s+', s):
            continue
        s = s.strip(' "\'\-–—')
        sentences.append(s)
    return sentences


  s = s.strip(' "\'\-–—')


#### BUILD DATASET



In [39]:
def build_class1_dataset(novels_dir, output_dir, sample_size=100):
    novels_dir = Path(novels_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    summary = {}
    for txt in sorted(novels_dir.glob('*.txt')):
        raw = read_text(txt)
        cleaned = clean_text(raw)
        sentences = split_sentences(cleaned, min_words=6)
        sampled = random.sample(sentences, min(sample_size, len(sentences)))
        # write cleaned text and samples
        (output_dir / f"{txt.stem}_cleaned.txt").write_text(cleaned, encoding='utf-8')
        (output_dir / f"{txt.stem}_samples.json").write_text(json.dumps({'samples': sampled, 'num_sentences': len(sentences)}, indent=2), encoding='utf-8')
        summary[txt.stem] = {'num_sentences': len(sentences), 'num_samples': len(sampled)}
    (output_dir / 'task_zero_summary.json').write_text(json.dumps(summary, indent=2), encoding='utf-8')
    return summary

### FINAL OUTPUT GENERATION FOR DATA CLEANING

In [45]:

base = r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\novels'
out = r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class1'
summary = build_class1_dataset(base, out, sample_size=100)
print('Class-1 cleaning complete. Summary:', summary)

Class-1 cleaning complete. Summary: {'heart_of_darkness': {'num_sentences': 1829, 'num_samples': 100}, 'lord_jim': {'num_sentences': 5529, 'num_samples': 100}, 'metamorphosis': {'num_sentences': 724, 'num_samples': 100}, 'the_trial': {'num_sentences': 3013, 'num_samples': 100}, 'typhoon': {'num_sentences': 1429, 'num_samples': 100}}


### FINDING RELEVANT TOPIC ASSOICIATED WITH EACH NOVEL

In [47]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

#### TF-IDF VECTORIZATION

In [49]:
def vectorize_sentences(sentences):
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),
        stop_words="english",
        min_df=5,        # ignore extremely rare words
        max_df=0.7,      # ignore overly common words
        max_features=5000
    )
    
    tfidf_matrix = vectorizer.fit_transform(sentences)
    feature_names = vectorizer.get_feature_names_out()
    
    return tfidf_matrix, feature_names


#### KMEANS CLUSTERING

In [50]:
def cluster_sentences(tfidf_matrix, k=7):
    kmeans = KMeans(
        n_clusters=k,
        random_state=42,
        n_init=10
    )
    
    labels = kmeans.fit_predict(tfidf_matrix)
    centroids = kmeans.cluster_centers_
    
    return labels, centroids


#### Extract Top Keywords per Cluster

In [51]:
def get_top_terms_per_cluster(centroids, feature_names, top_n=15):
    top_terms = {}
    
    for cluster_id, centroid in enumerate(centroids):
        top_indices = centroid.argsort()[::-1][:top_n]
        terms = [feature_names[i] for i in top_indices]
        top_terms[cluster_id] = terms
    
    return top_terms


#### Inspect Clusters (keywords + example sentences)

In [52]:
def inspect_clusters(sentences, labels, top_terms, samples_per_cluster=5):
    for cluster_id, terms in top_terms.items():
        print(f"\n=== Cluster {cluster_id} ===")
        print("Top terms:", terms)
        
        cluster_sentences = [
            s for s, label in zip(sentences, labels) if label == cluster_id
        ]
        
        print(f"Number of sentences: {len(cluster_sentences)}")
        print("Sample sentences:")
        
        for s in cluster_sentences[:samples_per_cluster]:
            print("-", s)


#### Run everything together

In [53]:
def extract_topics(sentences, k=7):
    tfidf_matrix, feature_names = vectorize_sentences(sentences)
    labels, centroids = cluster_sentences(tfidf_matrix, k=k)
    top_terms = get_top_terms_per_cluster(centroids, feature_names)
    
    inspect_clusters(sentences, labels, top_terms)
    
    return labels, top_terms


#### RUN TOPIC DISCOVERY ON ALL NOVELS



In [54]:
# Load sentences from cleaned novels and run topic discovery per novel

output_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class1')
novels_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\novels')

# Collect all novels with their sentences
all_novels_topics = {}

for cleaned_file in sorted(output_dir.glob('*_cleaned.txt')):
    novel_name = cleaned_file.stem.replace('_cleaned', '')
    
    print(f"\n{'='*70}")
    print(f"NOVEL: {novel_name.upper()}")
    print(f"{'='*70}")
    
    # Read cleaned text and split into sentences
    cleaned_text = cleaned_file.read_text(encoding='utf-8')
    sentences = split_sentences(cleaned_text, min_words=6)
    
    print(f"Total sentences: {len(sentences):,}")
    
    # Run topic discovery (adjust k based on novel size if needed)
    k = min(8, len(sentences) // 100)  # ~1 cluster per 100 sentences, max 8
    k = max(5, k)  # at least 5 clusters
    
    print(f"Discovering {k} topics...\n")
    
    try:
        labels, top_terms = extract_topics(sentences, k=k)
        all_novels_topics[novel_name] = {
            'labels': labels,
            'top_terms': top_terms,
            'num_sentences': len(sentences)
        }
    except Exception as e:
        print(f"Error processing {novel_name}: {e}")
        continue

print(f"\n\n{'='*70}")
print("TOPIC DISCOVERY COMPLETE")
print(f"{'='*70}")
print(f"Processed {len(all_novels_topics)} novels")



NOVEL: HEART_OF_DARKNESS
Total sentences: 1,829
Discovering 8 topics...


=== Cluster 0 ===
Top terms: ['said', 'did', 'mr', 'think', 'just', 'company', 'way', 'tone', 'like', 'good', 'talked', 'attack', 'suddenly', 'tell', 've']
Number of sentences: 118
Sample sentences:
- Between us there was, as I have already said somewhere, the bond of the
sea.
- For some reason or
other we did not begin that game of dominoes.
- And this also," said Marlow suddenly, "has been one of the dark places
of the earth."

He was the only man of us who still "followed the sea." The worst that
could be said of him was that he did not represent his class.
- His remark did not seem at all surprising.
- No one took the trouble to grunt even; and
presently he said, very slow - "I was thinking of very old times, when
the Romans first came here, nineteen hundred years ago - the other day
....

=== Cluster 1 ===
Top terms: ['like', 'little', 'came', 'river', 'men', 'think', 'got', 'eyes', 'black', 'look', 'great'

### TOPIC EXTRACTION (EXPORT FOR GEMINI)

In [None]:
# Export topics from TF-IDF clustering for Gemini API generation
# This creates JSON files with topics for each novel

from pathlib import Path
import json

output_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class1')
topics_output = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\topics')
topics_output.mkdir(parents=True, exist_ok=True)

all_topics_export = {}

for cleaned_file in sorted(output_dir.glob('*_cleaned.txt')):
    novel_name = cleaned_file.stem.replace('_cleaned', '')
    
    print(f"\n{'='*60}")
    print(f"EXTRACTING TOPICS: {novel_name.upper()}")
    print(f"{'='*60}")
    
    # Read cleaned text and split into sentences
    cleaned_text = cleaned_file.read_text(encoding='utf-8')
    sentences = split_sentences(cleaned_text, min_words=6)
    
    print(f"Total sentences: {len(sentences):,}")
    
    # Determine number of topics (5-10 range)
    k = min(10, max(5, len(sentences) // 150))
    print(f"Extracting {k} topics...")
    
    try:
        # Run TF-IDF + KMeans clustering
        tfidf_matrix, feature_names = vectorize_sentences(sentences)
        labels, centroids = cluster_sentences(tfidf_matrix, k=k)
        top_terms = get_top_terms_per_cluster(centroids, feature_names, top_n=15)
        
        # Build export structure
        topics_export = {
            'novel': novel_name,
            'num_sentences': len(sentences),
            'num_topics': k,
            'topics': []
        }
        
        for cluster_id, terms in top_terms.items():
            cluster_sentences = [s for s, label in enumerate(labels) if label == cluster_id]
            
            topics_export['topics'].append({
                'topic_id': cluster_id,
                'keywords': terms[:10],  # top 10 keywords
                'num_sentences': len(cluster_sentences)
            })
            
            print(f"  Topic {cluster_id}: {', '.join(terms[:5])}")
        
        # Save per-novel topics
        novel_json = topics_output / f"{novel_name}_topics.json"
        novel_json.write_text(json.dumps(topics_export, indent=2, ensure_ascii=False), encoding='utf-8')
        print(f"✓ Saved to {novel_json.name}")
        
        all_topics_export[novel_name] = topics_export
        
    except Exception as e:
        print(f"✗ Error: {e}")
        continue

# Save master file
master_file = topics_output / 'all_novels_topics.json'
master_file.write_text(json.dumps(all_topics_export, indent=2, ensure_ascii=False), encoding='utf-8')

print(f"\n{'='*60}")
print(f"TOPIC EXTRACTION COMPLETE")
print(f"{'='*60}")
print(f"Processed {len(all_topics_export)} novels")
print(f"Output: {topics_output}")


#### TOPIC FINDING USING BERT LOGIC

In [55]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import re
from pathlib import Path

nltk.download("punkt")
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eisas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### Load & prepare a novel

In [56]:
def load_and_split_novel(path):
    text = Path(path).read_text(encoding="utf-8", errors="ignore")

    # Basic safety cleanup (metadata should already be removed)
    text = re.sub(r"\s+", " ", text)

    sentences = sent_tokenize(text)

    # Remove very short / junk sentences
    sentences = [
        s for s in sentences
        if len(s.split()) >= 8
    ]

    return sentences


##### Build the BERTopic model (key part)

In [63]:
def build_topic_model(min_df=2):
    """Build BERTopic model with configurable min_df for flexibility."""
    embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

    vectorizer_model = CountVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        min_df=min_df,  # flexible minimum
        max_df=0.95     # ignore very common terms
    )

    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        calculate_probabilities=True,
        verbose=True
    )

    return topic_model


##### Run topic extraction on one novel

In [58]:
def extract_topics(sentences, topic_model):
    topics, probs = topic_model.fit_transform(sentences)
    return topic_model, topics, probs


##### Reduce to 5–10 core topics

In [59]:
def reduce_topics(topic_model, sentences, target_topics=8):
    topic_model.reduce_topics(
        sentences,
        nr_topics=target_topics
    )
    return topic_model

##### Inspect and export topics

In [60]:
def print_topics(topic_model):
    topics = topic_model.get_topics()

    for topic_id, terms in topics.items():
        if topic_id == -1:
            continue  # skip outliers

        keywords = [term for term, _ in terms[:10]]
        print(f"\nTopic {topic_id}")
        print("Keywords:", keywords)

#### RUN BERT TOPIC EXTRACTION ON ALL NOVELS

In [65]:
# Run BERTopic-based topic extraction on all cleaned novels
from pathlib import Path
import json

output_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class1')
topics_output = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\topics_bert')
topics_output.mkdir(parents=True, exist_ok=True)

all_bert_topics = {}

for cleaned_file in sorted(output_dir.glob('*_cleaned.txt')):
    novel_name = cleaned_file.stem.replace('_cleaned', '')
    
    print(f"\n{'='*70}")
    print(f"NOVEL: {novel_name.upper()}")
    print(f"{'='*70}")
    
    try:
        # Load and split into sentences
        sentences = load_and_split_novel(cleaned_file)
        print(f"Total sentences (≥8 words): {len(sentences):,}")
        
        if len(sentences) < 50:
            print(f"⚠ Skipping {novel_name}: too few sentences for meaningful clustering")
            continue
        
        # Build fresh topic model for this novel (BERTopic models can't be reused)
        # Use conservative min_df (BERTopic creates topic-level docs, not sentence-level)
        # so min_df must be small enough to work with ~5-10 topic clusters
        adaptive_min_df = 2  # safe for all corpus sizes
        print(f"Building BERTopic model (min_df={adaptive_min_df})...")
        topic_model_fresh = build_topic_model(min_df=adaptive_min_df)
        
        # Extract topics using BERTopic
        print("Running BERTopic extraction...")
        topic_model, topics, probs = extract_topics(sentences, topic_model_fresh)
        
        # Reduce to 5-10 core topics
        target_topics = min(10, max(5, len(set(topics)) // 2))
        print(f"Reducing to {target_topics} core topics...")
        topic_model = reduce_topics(topic_model, sentences, target_topics=target_topics)
        
        # Print topics to console
        print_topics(topic_model)
        
        # Export to JSON
        topics_dict = topic_model.get_topics()
        topics_export = {
            'novel': novel_name,
            'num_sentences': len(sentences),
            'num_topics': len([t for t in topics_dict.keys() if t != -1]),
            'topics': []
        }
        
        for topic_id, terms in topics_dict.items():
            if topic_id == -1:
                continue  # skip outliers
            
            keywords = [term for term, score in terms[:15]]
            topic_sentences = [s for s, t in zip(sentences, topics) if t == topic_id]
            
            topics_export['topics'].append({
                'topic_id': topic_id,
                'keywords': keywords,
                'num_sentences': len(topic_sentences),
                'sample_sentences': topic_sentences[:3]
            })
        
        # Save per-novel topics
        novel_json = topics_output / f"{novel_name}_topics_bert.json"
        novel_json.write_text(json.dumps(topics_export, indent=2, ensure_ascii=False), encoding='utf-8')
        print(f" Saved topics to {novel_json.name}")
        
        all_bert_topics[novel_name] = topics_export
        
    except Exception as e:
        print(f" Error processing {novel_name}: {e}")
        import traceback
        traceback.print_exc()
        continue

# Save master file with all novels
master_file = topics_output / 'all_novels_topics_bert.json'
master_file.write_text(json.dumps(all_bert_topics, indent=2, ensure_ascii=False), encoding='utf-8')

print(f"\n\n{'='*70}")
print("BERT TOPIC EXTRACTION COMPLETE")
print(f"{'='*70}")
print(f"Processed {len(all_bert_topics)} novels")
print(f"Topics saved to: {topics_output}")
print(f"Master file: {master_file.name}")



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/101 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

2026-01-29 19:04:36,814 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:36,815 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:36,835 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:36,837 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:37,081 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:37,085 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,162 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:37,207 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:37,208 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:37,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,326 - BERTopic - Representation - Completed ✓



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

2026-01-29 19:04:36,814 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:36,815 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:36,835 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:36,837 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:37,081 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:37,085 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,162 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:37,207 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:37,208 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:37,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,326 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['trial', 'lawyer', 'lawyers', 'acquittal', 'judges', 'men', 'proceedings', 'accused', 'defendant', 'cases']

Topic 1
Keywords: ['mr', 'watching', 'men', 'said looking', 'wiped', 'mr said', 'cassock', 'looking round', 'greatly', 'glanced']
 Saved topics to the_trial_topics_bert.json

NOVEL: TYPHOON
Total sentences (≥8 words): 1,308
Building BERTopic model (min_df=2)...



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

2026-01-29 19:04:36,814 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:36,815 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:36,835 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:36,837 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:37,081 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:37,085 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,162 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:37,207 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:37,208 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:37,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,326 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['trial', 'lawyer', 'lawyers', 'acquittal', 'judges', 'men', 'proceedings', 'accused', 'defendant', 'cases']

Topic 1
Keywords: ['mr', 'watching', 'men', 'said looking', 'wiped', 'mr said', 'cassock', 'looking round', 'greatly', 'glanced']
 Saved topics to the_trial_topics_bert.json

NOVEL: TYPHOON
Total sentences (≥8 words): 1,308
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

2026-01-29 19:04:36,814 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:36,815 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:36,835 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:36,837 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:37,081 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:37,085 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,162 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:37,207 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:37,208 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:37,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,326 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['trial', 'lawyer', 'lawyers', 'acquittal', 'judges', 'men', 'proceedings', 'accused', 'defendant', 'cases']

Topic 1
Keywords: ['mr', 'watching', 'men', 'said looking', 'wiped', 'mr said', 'cassock', 'looking round', 'greatly', 'glanced']
 Saved topics to the_trial_topics_bert.json

NOVEL: TYPHOON
Total sentences (≥8 words): 1,308
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:42,298 - BERTopic - Embedding - Transforming documents to embeddings.



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

2026-01-29 19:04:36,814 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:36,815 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:36,835 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:36,837 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:37,081 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:37,085 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,162 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:37,207 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:37,208 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:37,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,326 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['trial', 'lawyer', 'lawyers', 'acquittal', 'judges', 'men', 'proceedings', 'accused', 'defendant', 'cases']

Topic 1
Keywords: ['mr', 'watching', 'men', 'said looking', 'wiped', 'mr said', 'cassock', 'looking round', 'greatly', 'glanced']
 Saved topics to the_trial_topics_bert.json

NOVEL: TYPHOON
Total sentences (≥8 words): 1,308
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:42,298 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

2026-01-29 19:04:36,814 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:36,815 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:36,835 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:36,837 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:37,081 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:37,085 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,162 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:37,207 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:37,208 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:37,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,326 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['trial', 'lawyer', 'lawyers', 'acquittal', 'judges', 'men', 'proceedings', 'accused', 'defendant', 'cases']

Topic 1
Keywords: ['mr', 'watching', 'men', 'said looking', 'wiped', 'mr said', 'cassock', 'looking round', 'greatly', 'glanced']
 Saved topics to the_trial_topics_bert.json

NOVEL: TYPHOON
Total sentences (≥8 words): 1,308
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:42,298 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]


NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

2026-01-29 19:04:36,814 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:36,815 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:36,835 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:36,837 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:37,081 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:37,085 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,162 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:37,207 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:37,208 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:37,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,326 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['trial', 'lawyer', 'lawyers', 'acquittal', 'judges', 'men', 'proceedings', 'accused', 'defendant', 'cases']

Topic 1
Keywords: ['mr', 'watching', 'men', 'said looking', 'wiped', 'mr said', 'cassock', 'looking round', 'greatly', 'glanced']
 Saved topics to the_trial_topics_bert.json

NOVEL: TYPHOON
Total sentences (≥8 words): 1,308
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:42,298 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

2026-01-29 19:04:49,955 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:49,956 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:49,971 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:49,973 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:50,076 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:50,084 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:50,145 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:50,173 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:50,174 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:50,175 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:50,278 - BERTopic - Representation - Completed ✓



NOVEL: HEART_OF_DARKNESS
Total sentences (≥8 words): 1,678
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:17,992 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/53 [00:00<?, ?it/s]

2026-01-29 19:03:26,153 - BERTopic - Embedding - Completed ✓
2026-01-29 19:03:26,154 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:03:26,166 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:03:26,167 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:03:26,277 - BERTopic - Cluster - Completed ✓
2026-01-29 19:03:26,281 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,358 - BERTopic - Representation - Completed ✓
2026-01-29 19:03:26,409 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:03:26,410 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:03:26,411 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:03:26,490 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['like', 'man', 'know', 'came', 'little', 'river', 'looked', 'long', 'men', 'white']

Topic 1
Keywords: ['rivets', 'envy', 'excited', 'heard kurtz', 'excuse', 'delay', 'sole purpose', 'ripe', 'time ripe', 'favour']
 Saved topics to heart_of_darkness_topics_bert.json

NOVEL: LORD_JIM
Total sentences (≥8 words): 5,178
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:03:31,508 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/162 [00:00<?, ?it/s]

2026-01-29 19:04:02,320 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:02,321 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:02,391 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:02,392 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:02,763 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:02,769 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:02,882 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:02,912 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:02,913 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:02,914 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:03,040 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['man', 'said', 'old', 'men', 'away', 'say', 'white', 'cornelius', 'thing', 'people']

Topic 1
Keywords: ['addressing', 'believing', 'child started', 'clawed', 'choked', 'did dare', 'crammed', 'telling just', 'sorcerer', 'didn care']
 Saved topics to lord_jim_topics_bert.json

NOVEL: METAMORPHOSIS
Total sentences (≥8 words): 702
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:07,746 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

2026-01-29 19:04:12,451 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:12,452 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:12,462 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:12,464 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:12,499 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:12,503 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,531 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:12,555 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:12,556 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:12,557 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:12,606 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['really', 'legs', 'feel', 'lay', 'wanted', 'food', 'window', 'great', 'set', 'pain']

Topic 1
Keywords: ['gregor father', 'gregor mother', 'gregor room', 'gregor sister', 'heard', 'wanted', 'furniture', 'help', 'straight', 'new']
 Saved topics to metamorphosis_topics_bert.json

NOVEL: THE_TRIAL
Total sentences (≥8 words): 3,201
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:17,161 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/101 [00:00<?, ?it/s]

2026-01-29 19:04:36,814 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:36,815 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:36,835 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:36,837 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:37,081 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:37,085 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,162 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:37,207 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:37,208 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:37,209 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:37,326 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['trial', 'lawyer', 'lawyers', 'acquittal', 'judges', 'men', 'proceedings', 'accused', 'defendant', 'cases']

Topic 1
Keywords: ['mr', 'watching', 'men', 'said looking', 'wiped', 'mr said', 'cassock', 'looking round', 'greatly', 'glanced']
 Saved topics to the_trial_topics_bert.json

NOVEL: TYPHOON
Total sentences (≥8 words): 1,308
Building BERTopic model (min_df=2)...


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
2026-01-29 19:04:42,298 - BERTopic - Embedding - Transforming documents to embeddings.


Running BERTopic extraction...


Batches:   0%|          | 0/41 [00:00<?, ?it/s]

2026-01-29 19:04:49,955 - BERTopic - Embedding - Completed ✓
2026-01-29 19:04:49,956 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-01-29 19:04:49,971 - BERTopic - Dimensionality - Completed ✓
2026-01-29 19:04:49,973 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-01-29 19:04:50,076 - BERTopic - Cluster - Completed ✓
2026-01-29 19:04:50,084 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:50,145 - BERTopic - Representation - Completed ✓
2026-01-29 19:04:50,173 - BERTopic - Topic reduction - Reducing number of topics
2026-01-29 19:04:50,174 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(3).
2026-01-29 19:04:50,175 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-01-29 19:04:50,278 - BERTopic - Representation - Completed ✓


Reducing to 5 core topics...

Topic 0
Keywords: ['ship', 'macwhirr', 'captain', 'captain macwhirr', 'sea', 'end', 'way', 'room', 'went', 'black']

Topic 1
Keywords: ['mr jukes', 'caused', 'louder', 'jukes turned', 'like man', 'catching', 'ventilators', 'hastily', 'meaning', 'doorway']
 Saved topics to typhoon_topics_bert.json


BERT TOPIC EXTRACTION COMPLETE
Processed 5 novels
Topics saved to: c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\topics_bert
Master file: all_novels_topics_bert.json


#### GEMINI-BASED APPROACH (GOOGLE AI STUDIO)

### SETUP: Gemini API Configuration

In [19]:
import google.generativeai as genai
import os
import json
import time
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()

# Configure Gemini API
api_key = os.environ.get('GEMINI_API_KEY')
if not api_key:
    raise ValueError("GEMINI_API_KEY not found in environment variables!")

genai.configure(api_key=api_key)


MODEL_NAME = 'gemini-3-flash-preview'  

# Initialize model
model = genai.GenerativeModel(MODEL_NAME)

print("✓ Gemini API configured successfully")
print(f"✓ Model: {MODEL_NAME}")
print(f"✓ API Key: {api_key[:20]}...")


✓ Gemini API configured successfully
✓ Model: gemini-3-flash-preview
✓ API Key: AIzaSyD9iULbli9fennY...


### TASK 1: Topic Extraction Using Gemini

In [62]:
def extract_topics_gemini(novel_text, novel_name, num_topics=8):
    """
    Use Gemini to extract main topics from a novel.
    
    Args:
        novel_text: Full text of the novel
        novel_name: Name of the novel
        num_topics: Number of topics to extract (5-10 range)
    
    Returns:
        List of topic strings
    """
    prompt = f"""Analyze the following novel excerpt and identify {num_topics} main themes or topics.

Novel: {novel_name}

Text excerpt (first 15000 characters):
{novel_text[:15000]}

Please provide exactly {num_topics} distinct topics/themes that are central to this novel. 
Format your response as a JSON array of strings, like this:
["topic1", "topic2", "topic3", ...]

Each topic should be 2-4 words describing a key theme, subject, or motif in the novel.
Examples: "colonial exploitation", "moral ambiguity", "existential dread", "social alienation"

Return ONLY the JSON array, no additional text."""

    try:
        response = model.generate_content(prompt)
        topics_text = response.text.strip()
        
        # Extract JSON from response
        if '```json' in topics_text:
            topics_text = topics_text.split('```json')[1].split('```')[0].strip()
        elif '```' in topics_text:
            topics_text = topics_text.split('```')[1].split('```')[0].strip()
        
        topics = json.loads(topics_text)
        
        return topics
    
    except Exception as e:
        print(f"Error extracting topics: {e}")
        print(f"Response was: {response.text[:500]}")
        return None

print("✓ Topic extraction function ready")

✓ Topic extraction function ready


In [16]:
# Extract topics for all novels using Gemini
output_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class1')
topics_output = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\topics_gemini')
topics_output.mkdir(parents=True, exist_ok=True)

# Author mapping
author_map = {
    'heart_of_darkness': 'Joseph Conrad',
    'lord_jim': 'Joseph Conrad',
    'metamorphosis': 'Franz Kafka',
    'the_trial': 'Franz Kafka',
    'typhoon': 'Joseph Conrad'
}

all_topics_gemini = {}

print("="*70)
print("TASK 1: TOPIC EXTRACTION USING GEMINI")
print("="*70)

for cleaned_file in sorted(output_dir.glob('*_cleaned.txt')):
    novel_name = cleaned_file.stem.replace('_cleaned', '')
    author = author_map.get(novel_name, 'Unknown')
    
    print(f"\n{'='*70}")
    print(f"NOVEL: {novel_name} by {author}")
    print(f"{'='*70}")
    
    # Read cleaned text
    novel_text = cleaned_file.read_text(encoding='utf-8')
    
    # Extract topics using Gemini
    print("Asking Gemini to extract topics...")
    topics = extract_topics_gemini(novel_text, novel_name, num_topics=8)
    
    if topics:
        print(f"✓ Extracted {len(topics)} topics:")
        for i, topic in enumerate(topics, 1):
            print(f"  {i}. {topic}")
        
        # Save topics
        topic_data = {
            'novel': novel_name,
            'author': author,
            'topics': topics,
            'num_topics': len(topics)
        }
        
        topic_file = topics_output / f"{novel_name}_topics.json"
        topic_file.write_text(json.dumps(topic_data, indent=2, ensure_ascii=False), encoding='utf-8')
        print(f"✓ Saved to {topic_file.name}")
        
        all_topics_gemini[novel_name] = topic_data
    else:
        print(f"✗ Failed to extract topics")
    
    # Rate limiting
    time.sleep(2)

# Save master file
master_file = topics_output / 'all_topics.json'
master_file.write_text(json.dumps(all_topics_gemini, indent=2, ensure_ascii=False), encoding='utf-8')

print(f"\n{'='*70}")
print("TOPIC EXTRACTION COMPLETE")
print(f"{'='*70}")
print(f"Processed {len(all_topics_gemini)} novels")
print(f"Output: {topics_output}")

TASK 1: TOPIC EXTRACTION USING GEMINI

NOVEL: heart_of_darkness by Joseph Conrad
Asking Gemini to extract topics...
✓ Extracted 8 topics:
  1. colonial exploitation
  2. savagery versus civilization
  3. the nature of darkness
  4. moral ambiguity
  5. the lure of exploration
  6. fascination of the abomination
  7. devotion to efficiency
  8. imperialist hypocrisy
✓ Saved to heart_of_darkness_topics.json

NOVEL: lord_jim by Joseph Conrad
Asking Gemini to extract topics...
✓ Extracted 8 topics:
  1. lost honour
  2. romantic idealism
  3. moral failure
  4. identity and reputation
  5. the seafaring life
  6. heroism and cowardice
  7. moral ambiguity
  8. the weight of the past
✓ Saved to lord_jim_topics.json

NOVEL: metamorphosis by Franz Kafka
Asking Gemini to extract topics...
✓ Extracted 8 topics:
  1. social alienation
  2. existential absurdity
  3. dehumanization of labor
  4. familial obligation
  5. loss of identity
  6. physical transformation
  7. communication breakdown
  

### TASK 2: Class 2 - Generic Paragraph Generation (500 paragraphs on topics)

In [17]:
def generate_generic_paragraphs(topics, novel_name, n_paragraphs=500, min_words=100, max_words=200, batch_size=20):
    """
    Generate generic paragraphs on given topics (no author style mimicking).
    
    Args:
        topics: List of topic strings
        novel_name: Name of the novel (for file naming)
        n_paragraphs: Total paragraphs to generate
        min_words: Minimum words per paragraph
        max_words: Maximum words per paragraph
        batch_size: Paragraphs per API call
    
    Returns:
        Path to output file
    """
    output_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class2')
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{novel_name}_generic.jsonl"
    
    # Remove existing file if present
    if output_file.exists():
        output_file.unlink()
    
    topics_str = ", ".join(topics)
    generated_count = 0
    
    print(f"Generating {n_paragraphs} generic paragraphs...")
    print(f"Topics: {topics_str[:100]}...")
    
    while generated_count < n_paragraphs:
        to_generate = min(batch_size, n_paragraphs - generated_count)
        
        prompt = f"""Write {to_generate} distinct paragraphs about the following topics: {topics_str}

Requirements:
- Each paragraph should be {min_words}-{max_words} words
- Write in a clear, general prose style (not mimicking any specific author)
- Each paragraph should explore different aspects of the topics
- Make each paragraph standalone and coherent
- Focus on thematic exploration, not storytelling

Return ONLY the paragraphs, separated by three dashes (---), like this:
paragraph 1 text here...
---
paragraph 2 text here...
---
paragraph 3 text here..."""

        try:
            response = model.generate_content(prompt)
            text = response.text.strip()
            
            # Split by delimiter
            paragraphs = [p.strip() for p in text.split('---') if p.strip()]
            
            # Save paragraphs
            with output_file.open('a', encoding='utf-8') as f:
                for para in paragraphs[:to_generate]:
                    # Basic word count check
                    word_count = len(para.split())
                    if word_count < min_words * 0.8:  # Allow 20% tolerance
                        continue
                    
                    entry = {
                        'novel': novel_name,
                        'class': 'class2_generic',
                        'text': para,
                        'word_count': word_count
                    }
                    f.write(json.dumps(entry, ensure_ascii=False) + '\n')
                    generated_count += 1
                    
                    if generated_count >= n_paragraphs:
                        break
            
            print(f"  Progress: {generated_count}/{n_paragraphs} paragraphs")
            time.sleep(1)  # Rate limiting
            
        except Exception as e:
            print(f"  Error in batch: {e}")
            time.sleep(2)
            continue
    
    print(f"✓ Generated {generated_count} paragraphs")
    return output_file

print("✓ Class 2 generation function ready")

✓ Class 2 generation function ready


In [19]:
# Run Class 2 generation for all novels
print("="*70)
print("TASK 2: CLASS 2 - GENERIC PARAGRAPH GENERATION")
print("="*70)

topics_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\topics_gemini')

for topic_file in sorted(topics_dir.glob('*_topics.json')):
    if topic_file.name == 'all_topics.json':
        continue
    
    # Load topics
    with topic_file.open() as f:
        topic_data = json.load(f)
    
    novel_name = topic_data['novel']
    topics = topic_data['topics']
    
    print(f"\n{'='*70}")
    print(f"NOVEL: {novel_name}")
    print(f"{'='*70}")
    
    # Generate 500 generic paragraphs
    output_file = generate_generic_paragraphs(
        topics=topics,
        novel_name=novel_name,
        n_paragraphs=500,
        min_words=100,
        max_words=200,
        batch_size=20
    )
    
    print(f"✓ Saved to: {output_file}")

print(f"\n{'='*70}")
print("CLASS 2 GENERATION COMPLETE")
print(f"{'='*70}")

TASK 2: CLASS 2 - GENERIC PARAGRAPH GENERATION

NOVEL: heart_of_darkness
Generating 500 generic paragraphs...
Topics: colonial exploitation, savagery versus civilization, the nature of darkness, moral ambiguity, the lu...
  Progress: 20/500 paragraphs
  Progress: 40/500 paragraphs
  Progress: 60/500 paragraphs
  Progress: 80/500 paragraphs
  Progress: 100/500 paragraphs
  Progress: 120/500 paragraphs
  Progress: 140/500 paragraphs
  Progress: 160/500 paragraphs
  Progress: 180/500 paragraphs
  Progress: 200/500 paragraphs
  Progress: 220/500 paragraphs
  Progress: 240/500 paragraphs
  Progress: 260/500 paragraphs
  Progress: 280/500 paragraphs
  Progress: 300/500 paragraphs
  Progress: 320/500 paragraphs
  Progress: 340/500 paragraphs
  Progress: 360/500 paragraphs
  Progress: 380/500 paragraphs
  Progress: 400/500 paragraphs
  Error in batch: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.

KeyboardInterrupt: 

### RESUME CLASS 2 GENERATION (Continue from where it left off)

In [7]:
def count_existing_paragraphs(output_file):
    """Count how many paragraphs already exist in a JSONL file."""
    if not output_file.exists():
        return 0
    
    count = 0
    with output_file.open('r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                count += 1
    return count


In [5]:
def resume_generic_paragraphs(novel_name, topics, target_total=500, min_words=100, max_words=200, batch_size=20):
    """
    Resume generating paragraphs for a specific novel.
    Only generates the remaining paragraphs needed to reach target_total.
    """
    output_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class2')
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{novel_name}_generic.jsonl"
    
    # Count existing paragraphs
    existing_count = count_existing_paragraphs(output_file)
    print(f"Novel: {novel_name}")
    print(f"Existing paragraphs: {existing_count}")
    print(f"Target total: {target_total}")
    
    if existing_count >= target_total:
        print(f"✓ Already complete! ({existing_count}/{target_total})")
        return output_file
    
    remaining = target_total - existing_count
    print(f"Generating {remaining} more paragraphs...\n")
    
    topics_str = ", ".join(topics)
    generated_count = 0
    
    while generated_count < remaining:
        to_generate = min(batch_size, remaining - generated_count)
        
        prompt = f"""Write {to_generate} distinct paragraphs about the following topics: {topics_str}

Requirements:
- Each paragraph should be {min_words}-{max_words} words
- Write in a clear, general prose style (not mimicking any specific author)
- Each paragraph should explore different aspects of the topics
- Make each paragraph standalone and coherent
- Focus on thematic exploration, not storytelling

Return ONLY the paragraphs, separated by three dashes (---), like this:
paragraph 1 text here...
---
paragraph 2 text here...
---
paragraph 3 text here..."""

        try:
            response = model.generate_content(prompt)
            text = response.text.strip()
            
            # Split by delimiter
            paragraphs = [p.strip() for p in text.split('---') if p.strip()]
            
            # Append paragraphs (not overwrite!)
            with output_file.open('a', encoding='utf-8') as f:
                for para in paragraphs[:to_generate]:
                    # Basic word count check
                    word_count = len(para.split())
                    if word_count < min_words * 0.8:  # Allow 20% tolerance
                        continue
                    
                    entry = {
                        'novel': novel_name,
                        'class': 'class2_generic',
                        'text': para,
                        'word_count': word_count
                    }
                    f.write(json.dumps(entry, ensure_ascii=False) + '\n')
                    generated_count += 1
                    
                    if generated_count >= remaining:
                        break
            
            total_now = existing_count + generated_count
            print(f"  Progress: {total_now}/{target_total} paragraphs (added {generated_count})")
            time.sleep(1.5)  # Slightly longer delay to avoid rate limits
            
        except Exception as e:
            error_msg = str(e)
            print(f"  Error in batch: {error_msg[:200]}")
            
            # If rate limit error, show how to resume
            if "429" in error_msg or "quota" in error_msg.lower():
                print(f"\n⚠ Rate limit hit!")
                print(f"Progress saved: {existing_count + generated_count}/{target_total}")
                print(f"To resume, run this cell again after waiting.")
                break
            
            time.sleep(3)
            continue
    
    final_count = count_existing_paragraphs(output_file)
    print(f"\n✓ Final count: {final_count}/{target_total} paragraphs")
    return output_file

print("✓ Resume function ready")
print("Usage: resume_generic_paragraphs('novel_name', topics_list, target_total=500)")

✓ Resume function ready
Usage: resume_generic_paragraphs('novel_name', topics_list, target_total=500)


In [20]:
# RESUME GENERATION FOR SPECIFIC NOVEL
# Edit the novel_name below to resume generation for any incomplete novel

# Load topics for the novel
topics_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\topics_gemini')

# Specify which novel to resume (change this as needed)
novel_to_resume = 'typhoon'  # Change to: 'lord_jim', 'metamorphosis', 'the_trial', 'typhoon'

# Load topics for this novel
topic_file = topics_dir / f"{novel_to_resume}_topics.json"
with topic_file.open() as f:
    topic_data = json.load(f)

print(f"{'='*70}")
print(f"RESUMING GENERATION FOR: {novel_to_resume}")
print(f"{'='*70}\n")

# Resume generation
resume_generic_paragraphs(
    novel_name=novel_to_resume,
    topics=topic_data['topics'],
    target_total=500,
    min_words=100,
    max_words=200,
    batch_size=20
)

RESUMING GENERATION FOR: typhoon

Novel: typhoon
Existing paragraphs: 326
Target total: 500
Generating 174 more paragraphs...

  Progress: 346/500 paragraphs (added 20)
  Progress: 366/500 paragraphs (added 40)
  Progress: 386/500 paragraphs (added 60)
  Progress: 406/500 paragraphs (added 80)
  Progress: 426/500 paragraphs (added 100)
  Progress: 446/500 paragraphs (added 120)
  Progress: 466/500 paragraphs (added 140)
  Progress: 486/500 paragraphs (added 160)
  Progress: 500/500 paragraphs (added 174)

✓ Final count: 500/500 paragraphs


WindowsPath('c:/Users/eisas/OneDrive/Desktop/PROJECTS/Precog_task/output/class2/typhoon_generic.jsonl')

In [22]:
# CHECK STATUS OF ALL NOVELS
# Run this to see which novels are incomplete

output_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class2')
topics_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\topics_gemini')

print("="*70)
print("CLASS 2 GENERATION STATUS")
print("="*70)

novel_names = ['heart_of_darkness', 'lord_jim', 'metamorphosis', 'the_trial', 'typhoon']

for novel_name in novel_names:
    output_file = output_dir / f"{novel_name}_generic.jsonl"
    count = count_existing_paragraphs(output_file)
    status = "✓ COMPLETE" if count >= 500 else f"⚠ INCOMPLETE ({500 - count} remaining)"
    print(f"{novel_name:20s}: {count:3d}/500  {status}")

print("="*70)

CLASS 2 GENERATION STATUS
heart_of_darkness   : 500/500  ✓ COMPLETE
lord_jim            : 500/500  ✓ COMPLETE
metamorphosis       : 500/500  ✓ COMPLETE
the_trial           : 500/500  ✓ COMPLETE
typhoon             : 500/500  ✓ COMPLETE


### TASK 3: Class 3 - Style-Matched Paragraph Generation (500 paragraphs mimicking author)

In [None]:
def generate_style_matched_paragraphs(topics, novel_name, author, style_sample, n_paragraphs=500, min_words=100, max_words=200, batch_size=15):
    """
    Generate paragraphs mimicking the author's writing style.
    
    Args:
        topics: List of topic strings
        novel_name: Name of the novel
        author: Author name
        style_sample: Sample text from the author (for style reference)
        n_paragraphs: Total paragraphs to generate
        min_words: Minimum words per paragraph
        max_words: Maximum words per paragraph
        batch_size: Paragraphs per API call
    
    Returns:
        Path to output file
    """
    output_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class3_styled')
    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / f"{novel_name}_styled.jsonl"
    
    # Remove existing file if present
    if output_file.exists():
        output_file.unlink()
    
    topics_str = ", ".join(topics)
    generated_count = 0
    
    print(f"Generating {n_paragraphs} style-matched paragraphs...")
    print(f"Author: {author}")
    print(f"Topics: {topics_str[:100]}...")
    
    while generated_count < n_paragraphs:
        to_generate = min(batch_size, n_paragraphs - generated_count)
        
        prompt = f"""You are a skilled writer tasked with writing in the style of {author}.

Here is a sample of {author}'s writing style from their work:
{style_sample[:3000]}

Now, write {to_generate} distinct paragraphs in {author}'s distinctive writing style, exploring these topics: {topics_str}

Requirements:
- Each paragraph should be {min_words}-{max_words} words
- Mimic {author}'s sentence structure, vocabulary, tone, and literary techniques
- Maintain {author}'s characteristic voice and perspective
- Each paragraph should explore different aspects of the topics
- Make each paragraph standalone and coherent

Return ONLY the paragraphs, separated by three dashes (---), like this:
paragraph 1 text here...
---
paragraph 2 text here...
---
paragraph 3 text here..."""

        try:
            response = model.generate_content(prompt)
            text = response.text.strip()
            
            # Split by delimiter
            paragraphs = [p.strip() for p in text.split('---') if p.strip()]
            
            # Save paragraphs
            with output_file.open('a', encoding='utf-8') as f:
                for para in paragraphs[:to_generate]:
                    # Basic word count check
                    word_count = len(para.split())
                    if word_count < min_words * 0.8:  # Allow 20% tolerance
                        continue
                    
                    entry = {
                        'novel': novel_name,
                        'author': author,
                        'class': 'class3_styled',
                        'text': para,
                        'word_count': word_count
                    }
                    f.write(json.dumps(entry, ensure_ascii=False) + '\n')
                    generated_count += 1
                    
                    if generated_count >= n_paragraphs:
                        break
            
            print(f"  Progress: {generated_count}/{n_paragraphs} paragraphs")
            time.sleep(1)  # Rate limiting
            
        except Exception as e:
            print(f"  Error in batch: {e}")
            time.sleep(2)
            continue
    
    print(f"✓ Generated {generated_count} paragraphs")
    return output_file

print("✓ Class 3 generation function ready")

In [None]:
# Run Class 3 generation for all novels
print("="*70)
print("TASK 3: CLASS 3 - STYLE-MATCHED PARAGRAPH GENERATION")
print("="*70)

topics_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\topics_gemini')
class1_dir = Path(r'c:\Users\eisas\OneDrive\Desktop\PROJECTS\Precog_task\output\class1')

for topic_file in sorted(topics_dir.glob('*_topics.json')):
    if topic_file.name == 'all_topics.json':
        continue
    
    # Load topics
    with topic_file.open() as f:
        topic_data = json.load(f)
    
    novel_name = topic_data['novel']
    author = topic_data['author']
    topics = topic_data['topics']
    
    # Load style sample from cleaned novel
    cleaned_file = class1_dir / f"{novel_name}_cleaned.txt"
    style_sample = cleaned_file.read_text(encoding='utf-8')
    
    print(f"\n{'='*70}")
    print(f"NOVEL: {novel_name} by {author}")
    print(f"{'='*70}")
    
    # Generate 500 style-matched paragraphs
    output_file = generate_style_matched_paragraphs(
        topics=topics,
        novel_name=novel_name,
        author=author,
        style_sample=style_sample,
        n_paragraphs=500,
        min_words=100,
        max_words=200,
        batch_size=15
    )
    
    print(f"✓ Saved to: {output_file}")

print(f"\n{'='*70}")
print("CLASS 3 GENERATION COMPLETE")
print(f"{'='*70}")
print("\n🎉 ALL TASKS COMPLETE! 🎉")
print("\nSummary:")
print("✓ Task 1: Topics extracted using Gemini")
print("✓ Task 2: 500 generic paragraphs per novel (class2_generic/)")
print("✓ Task 3: 500 style-matched paragraphs per novel (class3_styled/)")