In [38]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm

def load_docs_with_embeddings(
    book_ids=None,
    books_dir=Path("../data/books"),
    embeddings_dir=Path("../data/books/embeddings"),
):
    """Load text chunks, their book ids and their embeddings, row-aligned.

    For every book id the function expects three files:

    * ``<embeddings_dir>/<book_id>_embeddings.npy`` - one embedding per chunk,
    * ``<embeddings_dir>/<book_id>_metadata.csv`` - ``start_index`` /
      ``end_index`` character offsets of each chunk inside the book text,
    * ``<books_dir>/<book_id>.txt`` - the UTF-8 encoded book text.

    A book is only added to the result once *all* of its files have been
    read, so ``docs[i]``, ``doc_ids[i]`` and ``embeddings[i]`` always describe
    the same chunk. Books with a missing file are reported and skipped.

    Parameters
    ----------
    book_ids : iterable, optional
        Ids of the books to load. When ``None``, every
        ``*_embeddings.npy`` file in ``embeddings_dir`` is used, in sorted
        order so that repeated runs yield the same document order.
    books_dir : path-like
        Directory holding the ``<book_id>.txt`` files.
    embeddings_dir : path-like
        Directory holding the ``.npy`` and metadata ``.csv`` files.

    Returns
    -------
    docs : list of str
        Text of every chunk.
    doc_ids : list
        Book id of every chunk (same length as ``docs``).
    embeddings : numpy.ndarray or None
        Embeddings stacked along the first axis, or ``None`` when no book
        could be loaded.

    Raises
    ------
    ValueError
        If a book's metadata and embeddings disagree on the number of chunks.
    """
    books_dir = Path(books_dir)
    embeddings_dir = Path(embeddings_dir)
    embeddings_suffix = "_embeddings"

    if book_ids is None:
        # Strip only the trailing "_embeddings" so that ids which themselves
        # contain underscores survive intact; sort for a reproducible order.
        book_ids = sorted(
            embeddings_file.stem[: -len(embeddings_suffix)]
            for embeddings_file in embeddings_dir.glob(f"*{embeddings_suffix}.npy")
        )

    docs = []
    doc_ids = []
    embedding_chunks = []

    for book_id in tqdm(book_ids):
        embeddings_file = embeddings_dir / f"{book_id}_embeddings.npy"
        metadata_file = embeddings_dir / f"{book_id}_metadata.csv"
        text_file = books_dir / f"{book_id}.txt"

        if not embeddings_file.exists():
            print(f"File {embeddings_file} not found")
            continue
        if not metadata_file.exists():
            print(f"File {metadata_file} not found")
            continue
        try:
            text = text_file.read_text(encoding="utf-8")
        except FileNotFoundError:
            print(f"File {book_id}.txt not found")
            continue

        metadata = pd.read_csv(metadata_file)
        book_embeddings = np.load(embeddings_file)
        if len(metadata) != len(book_embeddings):
            raise ValueError(
                f"Book {book_id}: {len(metadata)} metadata rows but "
                f"{len(book_embeddings)} embeddings"
            )

        # Iterate over the two columns directly: iterrows() builds a Series
        # per row and may upcast the integer offsets to floats.
        for start, end in zip(metadata["start_index"], metadata["end_index"]):
            docs.append(text[int(start):int(end)])
            doc_ids.append(book_id)
        # Only keep the embeddings once the matching docs were appended.
        embedding_chunks.append(book_embeddings)

    # Concatenate once at the end instead of re-copying the array per book.
    embeddings = np.concatenate(embedding_chunks) if embedding_chunks else None
    return docs, doc_ids, embeddings

docs, doc_ids, embeddings = load_docs_with_embeddings()


100%|██████████| 455/455 [00:56<00:00,  8.10it/s]


In [39]:
from bertopic import BERTopic
from bertopic.representation import PartOfSpeech

# Load the fitted topic model from disk when it is available; otherwise fit it
# on the loaded documents and cache it, so the notebook can be re-run from a
# fresh kernel without un-commenting code.
TOPIC_MODEL_PATH = Path("topic_model.pkl")

if TOPIC_MODEL_PATH.exists():
    # NOTE(review): the file is a pickle - only load model files you trust.
    topic_model = BERTopic.load(str(TOPIC_MODEL_PATH))
else:
    repr_model = PartOfSpeech()
    topic_model = BERTopic(representation_model=repr_model, verbose=True)
    # Pass the precomputed embeddings so the documents are not re-embedded.
    topics, _probs = topic_model.fit_transform(docs, embeddings=embeddings)
    topic_model.save(str(TOPIC_MODEL_PATH))




In [7]:
# Writes the in-memory `topic_model` to "topic_model.pkl" - the same path the
# preceding cell loads the model from.
# NOTE(review): this cell's execution count (7) is lower than its neighbours',
# so it looks like a leftover from an earlier session - confirm it is still needed.
topic_model.save("topic_model.pkl")




In [None]:
# Label every document with the trope of the book it comes from, then compare
# how the topics are distributed across tropes.
genre_csv = pd.read_csv("lit_goodreads_genre_filtered.csv")

# One lookup table instead of filtering the whole frame once per book.
# `keep="first"` mirrors taking the first matching row for a title; ids are
# compared as strings because `doc_ids` are derived from file names.
trope_by_title = (
    genre_csv.assign(title_id=genre_csv["title_id"].astype(str))
    .drop_duplicates(subset="title_id", keep="first")
    .set_index("title_id")["Trope"]
)

# Fail with the offending ids rather than a bare IndexError on the first miss.
missing_books = sorted({str(book_id) for book_id in doc_ids} - set(trope_by_title.index))
if missing_books:
    raise KeyError(f"No 'Trope' entry in the genre CSV for book ids: {missing_books}")

book_genre_map = {book_id: trope_by_title[str(book_id)] for book_id in set(doc_ids)}
tropes = [book_genre_map[doc_id] for doc_id in doc_ids]
assert len(tropes) == len(docs), "every document needs exactly one class label"

topics_per_class = topic_model.topics_per_class(docs, tropes)
topic_model.visualize_topics_per_class(topics_per_class)

















283955 283955


32it [26:50, 20.14s/it] 