In [7]:
import glob
import pickle

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the object has been read, instead of being left for the garbage collector.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserialising; only point this at files you produced yourself.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Load every pickle in the current directory whose name matches '0*.pkl'.
# The paths are sorted because glob returns them in filesystem order, which
# would make the order of `loaded_data` differ between machines/runs.
pkl_files = sorted(glob.glob('0*.pkl'))
loaded_data = [load_pickle(path) for path in pkl_files]

In [8]:
# Minimum number of distinct names a character must have to be kept.
min_len = 3

def get_character_set(book_name: str, book_number: str) -> pd.DataFrame:
  """Build the per-character name table for one book.

  Reads the tab-separated file ``./booknlp/output/{book_name}.entities``,
  keeps the rows whose ``cat`` is ``'PER'`` and whose ``prop`` is ``'PROP'``,
  lower-cases their ``text`` and groups the distinct names by ``COREF`` id.

  Args:
    book_name: Stem of the ``.entities`` file to read.
    book_number: Prefix put in front of every ``COREF`` id
      (``f'{book_number}_{COREF}'``) to form ``character_id``.

  Returns:
    A DataFrame with columns ``character_id`` and ``characters``, where
    ``characters`` is the alphabetically sorted list of distinct lower-cased
    names. Only characters with at least ``min_len`` (module-level) distinct
    names are returned; the index is not reset after that filter.
  """
  entities = pd.read_csv(f'./booknlp/output/{book_name}.entities', sep='\t')
  named_people = entities[(entities['cat'] == 'PER') & (entities['prop'] == 'PROP')]
  # A missing `text` value would otherwise end up as a float NaN inside the
  # name list, which can neither be sorted with strings nor encoded later.
  named_people = named_people.dropna(subset=['text'])

  # Build a fresh frame instead of assigning columns on the filtered slice;
  # that avoids pandas' SettingWithCopyWarning / chained-assignment pitfalls.
  mentions = pd.DataFrame({
      'character_id': book_number + '_' + named_people['COREF'].astype(str),
      'characters': named_people['text'].str.lower(),
  })

  # sorted(set(...)) rather than list(set(...)): set iteration order of strings
  # changes between interpreter runs, sorted output is reproducible.
  grouped = (
      mentions.groupby('character_id')['characters']
      .apply(lambda names: sorted(set(names)))
      .reset_index()
  )
  return grouped[grouped['characters'].map(len) >= min_len]

In [9]:
# One character table per book; `book_number` becomes the prefix of every
# character_id so ids from different books cannot collide.
eye_of_the_world_characters = get_character_set(book_name='the_eye_of_the_world', book_number='01')
great_hunt_characters = get_character_set(book_name='02_the_great_hunt', book_number='02')
dragon_reborn_characters = get_character_set(book_name='03_the_dragon_reborn', book_number='03')
shadow_rising_characters = get_character_set(book_name='04_shadow_rising', book_number='04')
fires_of_heaven_characters = get_character_set(book_name='05_fires_of_heaven', book_number='05')
lord_of_chaos_characters = get_character_set(book_name='06_lord_of_chaos', book_number='06')
crown_of_swords_characters = get_character_set(book_name='07_crown_of_swords', book_number='07')
path_of_daggers_characters = get_character_set(book_name='08_the_path_of_daggers', book_number='08')
winter_heart_characters = get_character_set(book_name='09_winters_heart', book_number='09')
# Books 10-14 are currently disabled:
# crossroads_of_twilight_characters = get_character_set(book_name='10_crossroads_of_twilight', book_number='10')
# knife_of_dreams_characters = get_character_set(book_name='11_knife_of_dreams', book_number='11')
# gathering_storm_characters = get_character_set(book_name='12_the_gathering_storm', book_number='12')
# towers_of_midnight_characters = get_character_set(book_name='13_towers_of_midnight', book_number='13')
# memory_of_light_characters = get_character_set(book_name='14_a_memory_of_light', book_number='14')


In [10]:
# Initialize the SentenceTransformer model on the best device this machine has.
# 'mps' (Apple Metal) is still preferred, but the notebook no longer crashes on
# machines without it: it falls back to CUDA, then to the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The device is passed to the constructor so the model is created on it
# directly, rather than being moved with .to() after loading.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [11]:
# Function to merge characters based on similarity
def merge_characters(df1, df2, threshold=0.8):
    """Merge the characters of ``df2`` into those of ``df1`` by name similarity.

    Every row's list of names is encoded with the module-level ``model`` and
    averaged into a single vector. A ``df2`` character whose vector has a
    cosine similarity strictly greater than ``threshold`` with a ``df1``
    character has its names added to that ``df1`` character. A ``df2``
    character may be merged into more than one ``df1`` character; one that is
    merged into none is carried over under its own id.

    Args:
        df1: DataFrame with ``character_id`` and ``characters`` (list of
            names) columns; its ids are the keys that survive a merge.
        df2: DataFrame with the same two columns.
        threshold: Similarity that must be exceeded for a merge.

    Returns:
        dict mapping character_id -> set of names: first every ``df1`` id,
        then every ``df2`` id that was not merged.

    Neither input DataFrame is modified.
    """
    def mean_embeddings(frame):
        # One row per character: the mean of the embeddings of its names.
        # Kept in a local array so nothing is written back into `frame`.
        return np.vstack([
            np.mean(model.encode(names), axis=0) for names in frame['characters']
        ])

    primary_ids = df1['character_id'].tolist()
    candidate_ids = df2['character_id'].tolist()
    candidate_names = df2['characters'].tolist()

    # Every df1 character starts out with its own names.
    merged_characters = {
        character_id: set(names)
        for character_id, names in zip(primary_ids, df1['characters'])
    }
    df2_merged_ids = set()

    # cosine_similarity cannot take an empty matrix, and there is nothing to
    # compare when either side has no rows.
    if primary_ids and candidate_ids:
        # Each centroid is computed once and all pairs are scored in one call.
        similarity = cosine_similarity(mean_embeddings(df1), mean_embeddings(df2))
        for i, j in zip(*np.nonzero(similarity > threshold)):
            merged_characters[primary_ids[i]].update(candidate_names[j])
            # add(), not update(): update() on a string would insert its
            # individual characters, so the id would never be found below.
            df2_merged_ids.add(candidate_ids[j])

    # Add characters from the second dataframe that were not merged
    for character_id, names in zip(candidate_ids, candidate_names):
        if character_id not in df2_merged_ids:
            merged_characters[character_id] = set(names)

    return merged_characters


In [12]:
def _merged_to_frame(merged):
    """Turn a {character_id: set of names} dict back into a two-column frame.

    The result has the ``character_id`` / ``characters`` columns that
    ``merge_characters`` reads, with each set converted to a list.
    """
    rows = [(character_id, list(names)) for character_id, names in merged.items()]
    return pd.DataFrame(rows, columns=['character_id', 'characters'])


# Fold the books together one at a time: each merge result is converted back
# to a DataFrame and used as the left-hand side of the next merge.
first_merge = merge_characters(eye_of_the_world_characters, great_hunt_characters)

merged_df = _merged_to_frame(first_merge)
second_merge = merge_characters(merged_df, dragon_reborn_characters)

merged_df = _merged_to_frame(second_merge)
third_merge = merge_characters(merged_df, shadow_rising_characters)

merged_df = _merged_to_frame(third_merge)
fourth_merge = merge_characters(merged_df, fires_of_heaven_characters)

merged_df = _merged_to_frame(fourth_merge)
fifth_merge = merge_characters(merged_df, lord_of_chaos_characters)

merged_df = _merged_to_frame(fifth_merge)
sixth_merge = merge_characters(merged_df, crown_of_swords_characters)

merged_df = _merged_to_frame(sixth_merge)
seventh_merge = merge_characters(merged_df, path_of_daggers_characters)

merged_df = _merged_to_frame(seventh_merge)
eighth_merge = merge_characters(merged_df, winter_heart_characters)

In [13]:
# Save the merged characters to a .pkl file. The `with` block closes (and so
# flushes) the file as soon as the dump is done, instead of leaving the
# handle open until it happens to be garbage-collected.
with open('merged_characters.pkl', 'wb') as output_file:
    pickle.dump(eighth_merge, output_file)