In [38]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm

def load_docs_with_embeddings(
    book_ids = None,
    books_dir=Path("../data/books"),
    embeddings_dir=Path("../data/books/embeddings")
):
    """Load text passages, their book ids and their embeddings, row-aligned.

    For every book id the function reads three files:

    * ``{embeddings_dir}/{book_id}_embeddings.npy`` - the embedding array,
    * ``{embeddings_dir}/{book_id}_metadata.csv``   - a table with
      ``start_index`` / ``end_index`` columns,
    * ``{books_dir}/{book_id}.txt``                 - the book text that the
      index columns slice into.

    A book contributes to the result only when all of its inputs are usable,
    so ``docs[i]``, ``doc_ids[i]`` and ``embeddings[i]`` always refer to the
    same passage.

    Args:
        book_ids: Iterable of book ids to load. When ``None``, ids are
            derived from the ``*.npy`` files found in ``embeddings_dir``.
        books_dir: Directory containing the ``{book_id}.txt`` files.
        embeddings_dir: Directory containing the embeddings and metadata.

    Returns:
        A ``(docs, doc_ids, embeddings)`` tuple: the list of passages, the
        list of book ids (one per passage) and the concatenated embedding
        array, or ``None`` when no book could be loaded.

    Raises:
        FileNotFoundError: if a book has embeddings and text but its
            metadata CSV is missing (raised by ``pd.read_csv``).
    """
    books_dir = Path(books_dir)
    embeddings_dir = Path(embeddings_dir)

    if book_ids is None:
        # Strip only the trailing "_embeddings" part so that ids which
        # themselves contain underscores are kept intact.
        # Deliberately left in glob order: models fitted earlier rely on the
        # documents arriving in the same order (assumption - confirm).
        book_ids = [
            embeddings_file.stem.rsplit("_", 1)[0]
            for embeddings_file in embeddings_dir.glob("*.npy")
        ]

    docs = []
    doc_ids = []
    # Collected per book and concatenated once at the end; concatenating
    # inside the loop copies the whole accumulated array on every iteration.
    embedding_chunks = []

    for book_id in tqdm(book_ids):
        embeddings_file = embeddings_dir / f"{book_id}_embeddings.npy"
        if not embeddings_file.exists():
            print(f"File {embeddings_file} not found")
            continue

        # Read the text *before* touching the embeddings: if the text is
        # missing the book is skipped entirely and must not leave orphaned
        # vectors behind.
        try:
            with open(books_dir / f"{book_id}.txt", "r", encoding="utf-8") as f:
                text = f.read()
        except FileNotFoundError:
            print(f"File {book_id}.txt not found")
            continue

        metadata = pd.read_csv(embeddings_dir / f"{book_id}_metadata.csv")
        book_embeddings = np.load(embeddings_file)

        # One embedding row is expected per metadata row; anything else would
        # silently shift every later passage against its vector.
        if book_embeddings.shape[0] != len(metadata):
            print(
                f"Skipping {book_id}: {book_embeddings.shape[0]} embeddings "
                f"but {len(metadata)} metadata rows"
            )
            continue

        for start, end in zip(metadata["start_index"], metadata["end_index"]):
            docs.append(text[start:end])
        doc_ids.extend([book_id] * len(metadata))
        embedding_chunks.append(book_embeddings)

    embeddings = np.concatenate(embedding_chunks) if embedding_chunks else None
    return docs, doc_ids, embeddings

docs, doc_ids, embeddings = load_docs_with_embeddings()


100%|██████████| 455/455 [00:56<00:00,  8.10it/s]


In [39]:
from bertopic import BERTopic
from bertopic.representation import PartOfSpeech

# Fitting BERTopic on the whole corpus is expensive, so reuse the saved model
# when it exists and only fit (and then persist) when it does not. This keeps
# the cell runnable on a fresh checkout instead of relying on commented-out code.
# NOTE(review): a ".pkl" model is presumably pickle-based - only load files
# that were produced locally or come from a trusted source.
topic_model_path = Path("topic_model.pkl")
if topic_model_path.exists():
    topic_model = BERTopic.load(str(topic_model_path))
else:
    repr_model = PartOfSpeech()
    topic_model = BERTopic(representation_model=repr_model, verbose=True)
    topics, _probs = topic_model.fit_transform(docs, embeddings=embeddings)
    topic_model.save(str(topic_model_path))




In [7]:
# Writes the in-memory model to "topic_model.pkl", the same path the model is
# loaded from in the cell above, overwriting the existing file.
# NOTE(review): this cell's execution count is older than its neighbours, so it
# looks like a leftover from the fitting run - confirm whether it is still needed.
topic_model.save("topic_model.pkl")




In [40]:
genre_csv = pd.read_csv("lit_goodreads_genre_filtered.csv")

# Build the title_id -> Trope lookup in a single pass instead of re-filtering
# the whole frame once per book. keep="first" reproduces the previous
# "first matching row wins" behaviour when a title appears more than once.
trope_by_title = (
    genre_csv
    .drop_duplicates(subset="title_id", keep="first")
    .set_index("title_id")["Trope"]
    .to_dict()
)

# Fail with the offending ids rather than an anonymous IndexError.
missing_ids = [book_id for book_id in set(doc_ids) if book_id not in trope_by_title]
if missing_ids:
    raise KeyError(
        f"No 'Trope' entry in lit_goodreads_genre_filtered.csv for "
        f"{len(missing_ids)} book id(s), e.g. {missing_ids[:10]}"
    )

book_genre_map = {book_id: trope_by_title[book_id] for book_id in set(doc_ids)}
# One class label per document, in the same order as `docs`.
tropes = [book_genre_map[doc_id] for doc_id in doc_ids]

topics_per_class = topic_model.topics_per_class(docs, tropes)
topic_model.visualize_topics_per_class(topics_per_class)

















283955 283955


37it [32:30, 52.72s/it] 


In [41]:
# Display the per-topic summary table (Topic, Count, Name, Representation,
# Representative_Docs) of the loaded model.
topic_model.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,51135,-1_chapter_one_more_man,"[chapter, one, more, man, like, little, such, ...","[“Perhaps, sir,” said he, “you may want a litt..."
1,0,2180,0_monkey_pig_sage_friar,"[monkey, pig, sage, friar, demon, bodhisattva,...",[Everyone in Heaven and Earth knows that I am ...
2,1,1896,1_madame_francs_order_fashion,"[madame, francs, order, fashion, monsieur, eve...",[Instead of bringing people to dine every day ...
3,2,1695,2_scout_rifle_savage_woods,"[scout, rifle, savage, woods, warrior, chief, ...",[The Mingos know him better by the death he gi...
4,3,1564,3_pater_tunic_tablet_wax,"[pater, tunic, tablet, wax, marble, slaves, pa...","[‘A bit of both, I suppose,’ said Jonathan, as..."
...,...,...,...,...,...
2280,2279,10,2279_eventfulness_wordage_postscript_newborn,"[eventfulness, wordage, postscript, newborn, m...",[Our gallant friend was dead. Next the Count p...
2281,2280,10,2280_raft_jezail_coffin_cholera,"[raft, jezail, coffin, cholera, marksman, clif...",[As the sun rose and the morning mists smoked ...
2282,2281,10,2281_monsieur_hostler_cabaret_convent,"[monsieur, hostler, cabaret, convent, horse, m...","[If not, I will apply to the chancellor, I wil..."
2283,2282,10,2282_svartalves_paranoid_ballroom_defenses,"[svartalves, paranoid, ballroom, defenses, cry...",[“Keep an eye out. If you see them sending the...


In [47]:
# Display the per-document table: each entry of `docs` with its assigned topic,
# that topic's representation and the assignment probability.
# NOTE(review): `docs` must be in the same order the model was fitted with - confirm.
topic_model.get_document_info(docs)


Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,"About the Author The author of two novels, Ho...",50,50_chapter_copyright_author_publishers,"[chapter, copyright, author, publishers, publi...",[ALSO BY RAINBOW ROWELL LANDLINE FANGIRL ELEAN...,chapter - copyright - author - publishers - pu...,0.642400,False
1,"I have no words I can rely on because, much to...",303,303_race_car_zebra_driver,"[race, car, zebra, driver, racing, track, dog,...",[“Hold!” Denny turned around and placed his ha...,race - car - zebra - driver - racing - track -...,0.213414,False
2,I’ve seen it on TV. A documentary I saw about ...,303,303_race_car_zebra_driver,"[race, car, zebra, driver, racing, track, dog,...",[“Hold!” Denny turned around and placed his ha...,race - car - zebra - driver - racing - track -...,0.576407,False
3,It takes humanlike willpower to hold back on t...,-1,-1_chapter_one_more_man,"[chapter, one, more, man, like, little, such, ...","[“Perhaps, sir,” said he, “you may want a litt...",chapter - one - more - man - like - little - s...,0.000000,False
4,"It’s cheap, and sometimes he takes a container...",-1,-1_chapter_one_more_man,"[chapter, one, more, man, like, little, such, ...","[“Perhaps, sir,” said he, “you may want a litt...",chapter - one - more - man - like - little - s...,0.000000,False
...,...,...,...,...,...,...,...,...
283950,"The doors were opened, and he went in stiffly,...",112,112_dragons_ferals_dragon_harness,"[dragons, ferals, dragon, harness, covert, egg...","[Temeraire, if you have not seen our signal, a...",dragons - ferals - dragon - harness - covert -...,0.078469,False
283951,He had given his own personal assurances to th...,112,112_dragons_ferals_dragon_harness,"[dragons, ferals, dragon, harness, covert, egg...","[Temeraire, if you have not seen our signal, a...",dragons - ferals - dragon - harness - covert -...,0.680578,False
283952,"His captain, Richard Clark, was lying on a cot...",112,112_dragons_ferals_dragon_harness,"[dragons, ferals, dragon, harness, covert, egg...","[Temeraire, if you have not seen our signal, a...",dragons - ferals - dragon - harness - covert -...,0.601261,False
283953,"""The surgeons think now it was the American dr...",112,112_dragons_ferals_dragon_harness,"[dragons, ferals, dragon, harness, covert, egg...","[Temeraire, if you have not seen our signal, a...",dragons - ferals - dragon - harness - covert -...,0.576276,False
