This notebook merges the trope vocabulary with the cleaned books data set. The merged output is used later in the TF-IDF process.

In [45]:
import re
from functools import lru_cache

import pandas as pd

In [50]:
# load the cleaned books and the trope vocabulary
books = pd.read_csv("../data/clean_best_books.csv")
tropes = pd.read_csv("../data/model_trope_vocabulary.csv")

# restrict the vocabulary to rows flagged include_in_model; .copy() detaches
# the result from the frame that was read from disk
tropes = tropes.loc[tropes["include_in_model"].eq(True)].copy()

In [51]:
# show the trope vocabulary rows kept for the model
tropes

Unnamed: 0,trope_id,name,category,description,raw_tags,include_in_model,notes
0,action_adventure,Action & Adventure,genre,Plot-driven stories focused on physical action...,"Action, Adventure",True,
1,contemporary_fiction,Contemporary / General Fiction,genre,Stories set roughly in the present day with re...,"Christian Contemporary Fiction, Contemporary, ...",True,
2,fantasy,Fantasy,genre,"Stories with magical or supernatural elements,...","Christian Fantasy, Comic Fantasy, Dark Fantasy...",True,
3,science_fiction,Science Fiction,genre,"Stories centered on speculative technology, sc...","Cyberpunk, Hard Science Fiction, M M Science F...",True,
4,dystopian_post_apocalyptic,Dystopian / Post-Apocalyptic,genre,"Stories set in collapsed, oppressive, or radic...","Aftermath, Apocalypse, Dystopia, Dystopian, Po...",True,
5,horror,Horror,genre,"Stories intended to frighten or unsettle, ofte...","Erotic Horror, Gothic, Gothic Horror, Gothic R...",True,
6,mystery_crime_thriller,Mystery / Crime / Thriller,genre,"Stories driven by investigation, crime, or sus...","Cozy Mystery, Crime, Golden Age Mystery, Histo...",True,
7,romance,Romance,genre,Stories where a romantic relationship is the p...,"African American Romance, Category Romance, Ch...",True,
8,historical_fiction,Historical Fiction,genre,"Stories set primarily in a past era, often eng...","Alternate History, Christian Historical Fictio...",True,
9,paranormal_supernatural,Paranormal / Supernatural,genre,"Stories featuring ghosts, magic, or supernatur...","Angels, Demons, Erotic Paranormal Romance, M M...",True,


In [52]:
#build a phrase list for each trope
def make_trope_patterns(tropes_df):
    """Build the search phrases for every trope.

    Parameters
    ----------
    tropes_df : pandas.DataFrame
        Must have a ``trope_id`` column. ``raw_tags`` (comma-separated
        phrases) and ``name`` are used when present.

    Returns
    -------
    dict
        Maps each ``trope_id`` to a list of stripped, lowercased phrases:
        the entries of ``raw_tags`` in their original order, followed by the
        trope ``name`` if it is not already among them. A trope with neither
        tags nor a name maps to an empty list.
    """

    def _cell_text(value):
        # An empty CSV cell arrives as NaN/None; str() would turn it into the
        # literal "nan"/"None", which would then be used as a search phrase.
        if (
            not isinstance(value, str)
            and pd.api.types.is_scalar(value)
            and pd.isna(value)
        ):
            return ""
        return str(value)

    trope_patterns = {}
    for _, row in tropes_df.iterrows():
        trope_id = row["trope_id"]

        # split raw_tags into individual phrases, dropping empty pieces
        raw_tags = _cell_text(row.get("raw_tags", ""))
        tags = [t.strip().lower() for t in raw_tags.split(",") if t.strip()]

        # also include the main trope name (stripped so stray whitespace in
        # the CSV cannot prevent a match)
        name = _cell_text(row.get("name", "")).strip().lower()
        if name and name not in tags:
            tags.append(name)

        trope_patterns[trope_id] = tags
    return trope_patterns

# phrase lists keyed by trope_id, built from the trope vocabulary frame
trope_patterns = make_trope_patterns(tropes)


In [53]:
# display the dictionary mapping each trope_id to its list of search phrases
trope_patterns

{'action_adventure': ['action', 'adventure', 'action & adventure'],
 'contemporary_fiction': ['christian contemporary fiction',
  'contemporary',
  'contemporary romance',
  'm m contemporary',
  'urban',
  'young adult contemporary',
  'contemporary / general fiction'],
 'fantasy': ['christian fantasy',
  'comic fantasy',
  'dark fantasy',
  'epic fantasy',
  'fairy tale',
  'fantasy',
  'fantasy of manners',
  'fantasy romance',
  'heroic fantasy',
  'high fantasy',
  'historical fantasy',
  'isekai',
  'low fantasy',
  'm m fantasy',
  'paranormal urban fantasy',
  'sci fi fantasy',
  'science fiction fantasy',
  'urban fantasy',
  'young adult fantasy'],
 'science_fiction': ['cyberpunk',
  'hard science fiction',
  'm m science fiction',
  'military science fiction',
  'sci fi fantasy',
  'sci-fi',
  'science fiction',
  'science fiction fantasy',
  'science fiction romance',
  'space opera',
  'steampunk',
  'young adult science fiction'],
 'dystopian_post_apocalyptic': ['aftermat

In [54]:
@lru_cache(maxsize=None)
def _whole_phrase_regex(phrase):
    """Compile (once per phrase) a regex matching ``phrase`` as a whole phrase.

    Lookarounds are used rather than word-boundary escapes so that phrases
    beginning or ending with punctuation still anchor correctly.
    """
    return re.compile(r"(?<!\w)" + re.escape(phrase) + r"(?!\w)")


def _field_as_text(row, key):
    """Return ``row[key]`` as a string, mapping a missing value to ``""``."""
    value = row.get(key, "")
    # str(NaN) is "nan"; without this guard a missing blurb would be searched
    # as if the book's text were the word "nan".
    if (
        not isinstance(value, str)
        and pd.api.types.is_scalar(value)
        and pd.isna(value)
    ):
        return ""
    return str(value)


def infer_tropes_for_row(row, trope_patterns, whole_words=True):
    """Flag which tropes are mentioned in a book's blurb or genres.

    Parameters
    ----------
    row : mapping or pandas.Series
        Anything with ``.get``; the ``blurb`` and ``genres`` fields are read.
        Missing fields or missing values are treated as empty text.
    trope_patterns : dict
        Maps trope id -> iterable of phrases. Phrases are compared against
        the lowercased text, so they should already be lowercase.
    whole_words : bool, default True
        When True a phrase only counts if it is not embedded in a longer
        word, so "action" no longer fires on "transaction" and "urban" no
        longer fires on "suburban". Note that this also means inflected forms
        (e.g. "adventures" for the phrase "adventure") do not match. Pass
        False to use plain substring matching instead.

    Returns
    -------
    dict
        Maps every trope id in ``trope_patterns`` to 1 if any of its phrases
        was found, else 0.
    """
    # combine description + genres into one lowercased text field
    text = (_field_as_text(row, "blurb") + " " + _field_as_text(row, "genres")).lower()

    labels = {}
    for trope_id, tags in trope_patterns.items():
        if whole_words:
            hit = any(
                phrase and _whole_phrase_regex(phrase).search(text)
                for phrase in tags
            )
        else:
            hit = any(phrase and phrase in text for phrase in tags)
        labels[trope_id] = int(bool(hit))
    return labels

# one record per book: its 0/1 trope flags followed by the goodreads_id key
rows = [
    {
        **infer_tropes_for_row(book, trope_patterns),
        "goodreads_id": book["goodreads_id"],
    }
    for _, book in books.iterrows()
]

trope_labels_df = pd.DataFrame(rows)


In [55]:
# preview the first rows of the per-book trope label frame
# (one 0/1 column per trope plus goodreads_id)
trope_labels_df.head()

Unnamed: 0,action_adventure,contemporary_fiction,fantasy,science_fiction,dystopian_post_apocalyptic,horror,mystery_crime_thriller,romance,historical_fiction,paranormal_supernatural,...,faith_centered_lead,cw_violence_gore,cw_abuse,cw_bullying_harassment,cw_self_harm_suicide,cw_substance_use,cw_medical_trauma,cw_kidnapping_captivity,cw_trauma_ptsd,goodreads_id
0,1,0,1,1,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,2767052
1,1,0,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,2657
3,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1885
4,0,1,1,0,0,0,1,1,0,1,...,0,1,0,0,0,0,0,0,0,41865


In [56]:
#merge the clean data set with the rows for each trope
# The label frame holds one row per book row, so any goodreads_id repeated in
# `books` is repeated in the labels as well. Merging both as-is would pair
# every duplicate with every duplicate (k*k rows per repeated id). Keeping a
# single label row per id preserves exactly one output row per book row.
unique_trope_labels = trope_labels_df.drop_duplicates(
    subset="goodreads_id", keep="first"
)
books_with_tropes = books.merge(
    unique_trope_labels,
    on="goodreads_id",
    how="left",
    validate="many_to_one",  # raise if the label keys are somehow not unique
)

# a left merge against unique keys must not add or drop book rows
assert len(books_with_tropes) == len(books)

In [57]:
# display the merged frame: the book columns plus the trope label columns
books_with_tropes

Unnamed: 0,goodreads_id,title,blurb,genres,action_adventure,contemporary_fiction,fantasy,science_fiction,dystopian_post_apocalyptic,horror,...,neurodivergent_lead,faith_centered_lead,cw_violence_gore,cw_abuse,cw_bullying_harassment,cw_self_harm_suicide,cw_substance_use,cw_medical_trauma,cw_kidnapping_captivity,cw_trauma_ptsd
0,2767052,The Hunger Games,winning means fame and fortune.losing means ce...,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...",1,0,1,1,1,0,...,0,0,1,0,0,0,0,0,0,0
1,2,Harry Potter and the Order of the Phoenix,there is a door at the end of a silent corrido...,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...",1,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,2657,To Kill a Mockingbird,the unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ...",0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1885,Pride and Prejudice,alternate cover edition of isbn 9780679783268s...,"['Classics', 'Fiction', 'Romance', 'Historical...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,41865,Twilight,about three things i was absolutely positive.\...,"['Young Adult', 'Fantasy', 'Romance', 'Vampire...",0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52581,11492014,Fractured,the fateful trilogy continues with fractured. ...,"['Vampires', 'Paranormal', 'Young Adult', 'Rom...",0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
52582,11836711,Anasazi,"'anasazi', sequel to 'the thirteenth chime' by...","['Mystery', 'Young Adult']",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52583,10815662,Marked,--readers favorite awards winner 2011--sixteen...,"['Fantasy', 'Young Adult', 'Paranormal', 'Ange...",1,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
52584,11330278,Wayward Son,a powerful tremor unearths an ancient secretbu...,"['Fiction', 'Mystery', 'Historical Fiction', '...",1,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0


In [63]:
# export the merged books + trope labels frame as a csv
# note: the relative path writes into the current working directory, and
# index=False leaves the row index out of the file
books_with_tropes.to_csv("merged_trope_vocabulary.csv", index=False)