In [17]:
import glob
import pickle
from typing import List

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load every chunk pickle matching ./chunks/sherlock_holmes/0*.pkl.
# sorted() makes the load order deterministic; plain glob order depends on the filesystem.
pkl_files = sorted(glob.glob('./chunks/sherlock_holmes/0*.pkl'))


def load_pickle(path):
    """Unpickle a single file and close its handle, even if loading fails.

    NOTE: pickle.load can execute arbitrary code -- only load files you produced yourself.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


loaded_data = [load_pickle(file) for file in pkl_files]

In [18]:
# Minimum number of distinct aliases a character needs in order to be kept.
min_len = 3

def get_character_set(book_name: str, book_number: str, series_dir: str = 'sherlock_holmes') -> pd.DataFrame:
  """Collect the distinct proper-name aliases of each person entity in one book.

  Reads ``./booknlp/output/{series_dir}/{book_name}.entities`` (tab-separated),
  keeps rows whose ``cat`` is ``'PER'`` and ``prop`` is ``'PROP'``, lower-cases the
  ``text`` column and groups the mentions by their ``COREF`` value.

  Args:
    book_name: File stem of the ``.entities`` file.
    book_number: Prefix prepended (with ``'_'``) to every COREF value to form
      ``character_id``.
    series_dir: Sub-directory of ``./booknlp/output`` that holds the file.

  Returns:
    DataFrame with columns ``character_id`` and ``characters`` (a list of the
    distinct lower-cased aliases), restricted to characters that have at least
    ``min_len`` distinct aliases.
  """
  entities = pd.read_csv(f'./booknlp/output/{series_dir}/{book_name}.entities', sep='\t')
  is_named_person = (entities['cat'] == 'PER') & (entities['prop'] == 'PROP')
  # Work on an explicit copy so the column assignments below do not write into a
  # filtered view of `entities` (chained assignment).
  mentions = entities.loc[is_named_person].copy()
  mentions['characters'] = mentions['text'].str.lower()
  mentions['character_id'] = book_number + '_' + mentions['COREF'].astype(str)
  grouped = (
    mentions.groupby('character_id')['characters']
    .apply(lambda aliases: list(set(aliases)))
    .reset_index()
  )
  has_enough_aliases = grouped['characters'].apply(lambda aliases: len(aliases) >= min_len)
  return grouped[has_enough_aliases]

In [9]:
# NOTE(review): none of these book names appear in `book_titles` (defined in a later
# cell), while get_character_set reads from ./booknlp/output/sherlock_holmes/. This
# cell's execution count (In [9]) is also lower than that of the cells around it.
# Presumably a leftover from another corpus -- confirm, then remove it or point it at
# the right directory; as written it may not survive Restart Kernel -> Run All.
# NOTE(review): unlike the names below, the first one has no numeric prefix -- confirm
# the file name.
eye_of_the_world_characters = get_character_set('the_eye_of_the_world', '01')
great_hunt_characters = get_character_set('02_the_great_hunt', '02')
dragon_reborn_characters = get_character_set('03_the_dragon_reborn', '03')
shadow_rising_characters = get_character_set('04_shadow_rising', '04')
fires_of_heaven_characters = get_character_set('05_fires_of_heaven', '05')
lord_of_chaos_characters = get_character_set('06_lord_of_chaos', '06')
crown_of_swords_characters = get_character_set('07_crown_of_swords', '07')
path_of_daggers_characters = get_character_set('08_the_path_of_daggers', '08')
winter_heart_characters = get_character_set('09_winters_heart', '09')
# crossroads_of_twilight_characters = get_character_set('10_crossroads_of_twilight', '10')
# knife_of_dreams_characters = get_character_set('11_knife_of_dreams', '11')
# gathering_storm_characters = get_character_set('12_the_gathering_storm', '12')
# towers_of_midnight_characters = get_character_set('13_towers_of_midnight', '13')
# memory_of_light_characters = get_character_set('14_a_memory_of_light', '14')


In [23]:
# File stems of the books to process; collect_characters uses the part before the
# first '_' as the book number.
book_titles = [
    '01_a_study_in_scarlet',
    '02_the_sign_of_four',
    '03_the_hound_of_the_baskervilles',
    '04_the_valley_of_fear',
]

In [30]:

def collect_characters(book_titles: list) -> pd.DataFrame:
  """Build one table with the character alias sets of every listed book.

  For each title the book number is taken as the text before the first ``'_'``,
  ``get_character_set`` is called, and the result is tagged with the title.
  Progress is printed per book.

  Args:
    book_titles: Book file stems such as ``'01_a_study_in_scarlet'``.

  Returns:
    DataFrame with columns ``book_title``, ``character_id`` and ``characters``.
    Each book's rows keep the index they had in its per-book frame. An empty
    ``book_titles`` yields an empty frame with those three columns.
  """
  columns = ['book_title', 'character_id', 'characters']
  per_book_frames = []
  for book_title in book_titles:
    print(f'Processing {book_title}')
    book_number = book_title.split('_')[0]
    df = get_character_set(book_title, book_number)
    print(f'Found {len(df)} characters')
    # assign() returns a new frame, so the helper's result is not mutated.
    per_book_frames.append(df.assign(book_title=book_title))
  if not per_book_frames:
    return pd.DataFrame(columns=columns)
  # Concatenate once at the end instead of re-copying the growing frame per book.
  return pd.concat(per_book_frames)[columns]

In [31]:
# Build the combined character table for every title in book_titles.
all_characters = collect_characters(book_titles)

Processing 01_a_study_in_scarlet
Found 13 characters
Processing 02_the_sign_of_four
Found 14 characters
Processing 03_the_hound_of_the_baskervilles
Found 11 characters
Processing 04_the_valley_of_fear
Found 28 characters


In [32]:
# Preview the first rows of the combined character table.
all_characters.head()

Unnamed: 0,book_title,character_id,characters
3,01_a_study_in_scarlet,01_119,"[holmes, mr. sherlock holmes, sherlock holmes,..."
6,01_a_study_in_scarlet,01_122,"[john ferrier talks, ferrier, old john ferrier..."
7,01_a_study_in_scarlet,01_123,"[watson, john watson, _ john h. watson]"
22,01_a_study_in_scarlet,01_138,"[edgar allen poe ’s dupin, poe, dupin]"
26,01_a_study_in_scarlet,01_142,"[mr. drebber, mr. enoch j. drebber, enoch dreb..."


In [33]:
# Initialize the SentenceTransformer model.
# Prefer Apple's MPS backend when it is available, then CUDA, then CPU, so the
# notebook also runs on machines that have no MPS device.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# Pass the device to the constructor so the model is created on the chosen device.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [35]:
# Function to merge characters based on similarity
def merge_characters(df1, df2, threshold=0.8):
    """Merge the character alias sets of ``df2`` into those of ``df1``.

    Each row is represented by the mean of the embeddings that the module-level
    ``model`` produces for the row's aliases. A ``df2`` row is folded into every
    ``df1`` row whose cosine similarity with it is strictly greater than
    ``threshold``. ``df2`` rows that matched no ``df1`` row are kept as entries
    of their own. Neither input frame is modified.

    Args:
        df1: DataFrame with ``character_id`` and ``characters`` columns, where
            ``characters`` holds a list of alias strings per row.
        df2: DataFrame with the same two columns.
        threshold: Similarity a pair must exceed to be merged.

    Returns:
        dict mapping ``character_id`` to the set of aliases for that character.
    """
    def _row_embeddings(df):
        # One vector per row: the mean over the embeddings of that row's aliases.
        return np.vstack([np.mean(model.encode(aliases), axis=0) for aliases in df['characters']])

    ids1 = df1['character_id'].tolist()
    ids2 = df2['character_id'].tolist()
    aliases2 = df2['characters'].tolist()

    # Every character of the first frame gets an entry, merged or not.
    merged_characters = {}
    for character_id, aliases in zip(ids1, df1['characters']):
        merged_characters[character_id] = set(aliases)

    df2_merged_ids = set()
    # Skip the similarity step when either side is empty (nothing to compare).
    if ids1 and ids2:
        # Embed each frame once and compare all pairs in a single call.
        similarity = cosine_similarity(_row_embeddings(df1), _row_embeddings(df2))
        for row, primary_character_id in enumerate(ids1):
            for col in np.flatnonzero(similarity[row] > threshold):
                merged_characters[primary_character_id].update(aliases2[col])
                # add(), not update(): update() with a string would insert the id's
                # individual characters, so the check below would never match.
                df2_merged_ids.add(ids2[col])

    # Add characters from the second dataframe that were not merged
    for character_id, aliases in zip(ids2, aliases2):
        if character_id not in df2_merged_ids:
            merged_characters[character_id] = set(aliases)

    return merged_characters

In [37]:
# Start from the first book's characters, then fold in each later book in order.
merged_characters = all_characters.loc[all_characters['book_title'] == book_titles[0]].copy()

for book_title in book_titles[1:]:
    print(f'Merging characters from {book_title}')
    next_book_characters = all_characters.loc[all_characters['book_title'] == book_title].copy()
    merged_characters_dict = merge_characters(merged_characters, next_book_characters)
    # Rebuild the running table from the id -> alias-set mapping for the next round.
    merged_characters = pd.DataFrame(
        [(character_id, list(aliases)) for character_id, aliases in merged_characters_dict.items()],
        columns=['character_id', 'characters'],
    )

# Show the first rows of the merged table.
merged_characters.head()

Merging characters from 02_the_sign_of_four
Merging characters from 03_the_hound_of_the_baskervilles
Merging characters from 04_the_valley_of_fear


Unnamed: 0,character_id,characters
0,01_119,"[holmes, mr. sherlock holmes, mr. sherlock hol..."
1,01_122,"[john ferrier talks, old john ferrier, john, f..."
2,01_123,"[watson, john watson, _ john h. watson]"
3,01_138,"[edgar allen poe ’s dupin, poe, dupin]"
4,01_142,"[mr. drebber, mr. enoch j. drebber, enoch dreb..."


In [40]:
# Flatten the merged table into a plain {character_id: aliases} mapping and display it.
merged_characters_dict = dict(zip(merged_characters['character_id'], merged_characters['characters']))
merged_characters_dict

{'01_119': ['holmes',
  'mr. sherlock holmes',
  'mr. sherlock holmes mr. sherlock holmes',
  'sherlock holmes',
  'mr . sherlock holmes',
  'dear mr. holmes',
  'mr. holmes',
  'mr. sherlock'],
 '01_122': ['john ferrier talks',
  'old john ferrier',
  'john',
  'ferrier',
  'john ferrier'],
 '01_123': ['watson', 'john watson', '_ john h. watson'],
 '01_138': ['edgar allen poe ’s dupin', 'poe', 'dupin'],
 '01_142': ['mr. drebber',
  'mr. enoch j. drebber',
  'enoch drebber',
  'brother drebber',
  'mr. enoch drebber',
  'young drebber',
  'enoch j. drebber',
  'drebber'],
 '01_146': ['mr. stangerson',
  'joseph stangerson',
  'a mr. stangerson',
  'mr. joseph stangerson',
  'stangerson'],
 '01_149': ['rachel', 'rache', 'miss rachel'],
 '01_183': ['jefferson hope', 'hope', 'mr. jefferson hope'],
 '01_192': ['brother stangerson', 'young stangerson', 'stangerson'],
 '01_193': ['ferrier',
  'lucy ferrier',
  'old ferrier',
  'lucy',
  'sweet lucy',
  'poor lucy',
  'little lucy ferrier'],


In [41]:
# Save the merged characters to a .pkl file; the context manager closes the handle
# (and flushes the data) even if pickling fails.
with open('./characters/sherlock_holmes_characters.pkl', 'wb') as pkl_file:
    pickle.dump(merged_characters_dict, pkl_file)