In [8]:
import pandas as pd
from itertools import combinations
import re
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import networkx as nx
from fa2_modified import ForceAtlas2
import numpy as np
import pickle as pkl
from textblob import TextBlob

# Load the dataset
df = pd.read_csv('data/avatar.csv', encoding='ISO-8859-1')

# Dictionary to store character connections
character_connections = {}
# Dictionary to store connections by chapter
chapter_connections = {}
# Dictionary to store character connections by book
character_connections_by_book = {}


scene_characters = set()  # Characters in the current scene


# Name replacements for normalization
name_replacements = {
    'young zuko': 'zuko',
    'young azula': 'azula',
    'young katara': 'katara',
    'young sokka': 'sokka',
    'young toph': 'toph',
    'young aang': 'aang',
    'king bumi': 'bumi',
    'avatar roku': 'roku',
    'avatar kyoshi': 'kyoshi',
    'avatar kuruk': 'kuruk',
    'avatar yangchen': 'yangchen',
    'aang:': 'aang',
    'sha-mo:': 'sha-mo',    
}

# Exclude these words from being counted as characters
invalid_characters = {'together', 'both'}

# Function to normalize character names
def normalize_name(name):
    # Convert to lowercase, strip spaces, and standardize
    normalized = name.lower().strip()
    return name_replacements.get(normalized, normalized)

# Function to split multiple characters and normalize their names
def split_characters(character):
    # Replace "Team Avatar" with its members
    if 'team avatar' in character.lower():
        return ['sokka', 'katara', 'aang', 'toph']
    
    # Use regex to split by commas or 'and', and normalize each name
    names = [normalize_name(name) for name in re.split(r',|\band\b', character)]
    # Filter out invalid characters
    return [name for name in names if name not in invalid_characters]

In [9]:
# Iterate over each row to detect scene boundaries
for _, row in df.iterrows():
    # Check if the row indicates a new scene
    if row['character'] == 'Scene Description':
        # Create character pairs for the completed scene
        pairs = combinations(scene_characters, 2)
        for pair in pairs:
            pair = tuple(sorted(pair))  # Ensure consistent ordering
            if pair in character_connections:
                character_connections[pair] += 1
            else:
                character_connections[pair] = 1

        # Reset the scene_characters for the next scene
        scene_characters = set()
    else:
        # Check for multiple characters and add each to the current scene
        characters = split_characters(row['character'])
        scene_characters.update(characters)

# Convert to DataFrame for analysis
connections_df = pd.DataFrame(list(character_connections.items()), columns=['pair', 'count'])

# Display the connections
print(connections_df)

# Save the DataFrame to a CSV file
connections_df.to_csv('character_connections.csv', index=False)


                pair  count
0    (katara, sokka)    386
1       (iroh, zuko)    112
2     (aang, katara)    411
3      (aang, sokka)    409
4      (aang, kanna)      3
..               ...    ...
963     (bumi, iroh)      1
964    (iroh, pakku)      1
965     (ozai, suki)      1
966      (iroh, mai)      1
967      (mai, toph)      1

[968 rows x 2 columns]


In [10]:
# Iterate over each chapter
for (book, chapter), chapter_group in df.groupby(['book', 'chapter']):
    scene_characters = set()  # Characters in the current scene
    chapter_key = (book, chapter)  # Use book and chapter as a key
    chapter_connections[chapter_key] = {}

    # Iterate over rows within the chapter
    for _, row in chapter_group.iterrows():
        if row['character'] == 'Scene Description':
            # Create character pairs for the completed scene
            pairs = combinations(scene_characters, 2)
            for pair in pairs:
                pair = tuple(sorted(pair))
                if pair in chapter_connections[chapter_key]:
                    chapter_connections[chapter_key][pair] += 1
                else:
                    chapter_connections[chapter_key][pair] = 1

            # Reset the scene_characters for the next scene
            scene_characters = set()
        else:
            # Check for multiple characters and add each to the current scene
            characters = split_characters(row['character'])
            scene_characters.update(characters)

# Convert chapter-based connections to a DataFrame
chapter_connections_list = []
for (book, chapter), connections in chapter_connections.items():
    for pair, count in connections.items():
        chapter_connections_list.append({
            'book': book,
            'chapter': chapter,
            'pair': pair,
            'count': count
        })

chapter_connections_df = pd.DataFrame(chapter_connections_list)

# Display the connections
print(chapter_connections_df)

# Save the DataFrame to a CSV file
chapter_connections_df.to_csv('chapter_character_connections.csv', index=False)


       book                               chapter  \
0     Earth                      Appa's Lost Days   
1     Earth                      Appa's Lost Days   
2     Earth                      Appa's Lost Days   
3     Earth                      Appa's Lost Days   
4     Earth                      Appa's Lost Days   
...     ...                                   ...   
1594  Water  Winter Solstice, Part 2: Avatar Roku   
1595  Water  Winter Solstice, Part 2: Avatar Roku   
1596  Water  Winter Solstice, Part 2: Avatar Roku   
1597  Water  Winter Solstice, Part 2: Avatar Roku   
1598  Water  Winter Solstice, Part 2: Avatar Roku   

                                pair  count  
0                   (ghashiun, toph)      1  
1          (ghashiun, sandbender #2)      1  
2          (ghashiun, sandbender #1)      1  
3     (sandbender #1, sandbender #2)      1  
4            (ghashiun, merchant #1)      1  
...                              ...    ...  
1594                    (aang, roku)     

In [11]:
# Iterate over each book
for book, book_group in df.groupby('book'):
    scene_characters = set()  # Characters in the current scene
    character_connections = {}  # Connections for this book

    # Iterate over each row within the book
    for _, row in book_group.iterrows():
        if row['character'] == 'Scene Description':
            # Create character pairs for the completed scene
            pairs = combinations(scene_characters, 2)
            for pair in pairs:
                pair = tuple(sorted(pair))  # Ensure consistent ordering
                if pair in character_connections:
                    character_connections[pair] += 1
                else:
                    character_connections[pair] = 1

            # Reset the scene_characters for the next scene
            scene_characters = set()
        else:
            # Add characters to the current scene
            characters = split_characters(row['character'])
            scene_characters.update(characters)

    # Store connections for this book
    character_connections_by_book[book] = character_connections

# Convert connections_by_book to a DataFrame
book_connections = []

for book, connections in character_connections_by_book.items():
    for pair, count in connections.items():
        book_connections.append({'book': book, 'pair': pair, 'count': count})

book_connections_df = pd.DataFrame(book_connections)

# Display the book-based connections
print(book_connections_df)

# Save to CSV for further analysis
book_connections_df.to_csv('character_connections_by_book.csv', index=False)


       book                   pair  count
0     Earth         (aang, katara)    113
1     Earth          (aang, pakku)      1
2     Earth        (katara, pakku)      1
3     Earth           (iroh, zuko)     55
4     Earth       (azula, captain)      2
...     ...                    ...    ...
1071  Water            (yue, zuko)      1
1072  Water  (aang, baboon spirit)      1
1073  Water            (aang, koh)      3
1074  Water            (yue, zhao)      1
1075  Water            (iroh, yue)      1

[1076 rows x 3 columns]


In [12]:
# Prepare the attributes dictionary
character_attributes = {}

# Sentiment Analysis Function
def calculate_sentiment(text):
    return TextBlob(text).sentiment.polarity if pd.notna(text) else 0

# Iterate through each row
for _, row in df.iterrows():
    if row['character'] == 'Scene Description':  # New scene boundary
        # Increment scene counts for all characters in the scene
        for character in scene_characters:
            if character in character_attributes:
                character_attributes[character]['scene_count'] += 1
        scene_characters = set()  # Reset for the next scene
    else:
        # Track characters in the current scene
        characters = split_characters(row['character'])
        scene_characters.update(characters)

        for character in characters:
            if character not in character_attributes:
                character_attributes[character] = {
                    'dialogue_count': 0,
                    'scene_count': 0,
                    'episode_count': set(),
                    'arc_presence': set(),
                    'sentiment_sum': 0,
                    'sentiment_count': 0
                }

            # Increment dialogue count by the number of words in the dialogue
            dialogue = row['character_words']
            word_count = len(dialogue.split()) if pd.notna(dialogue) else 0
            character_attributes[character]['dialogue_count'] += word_count

            # Track episodes and arcs
            character_attributes[character]['episode_count'].add((row['book'], row['chapter']))
            character_attributes[character]['arc_presence'].add(row['book'])

            # Update sentiment analysis
            sentiment = calculate_sentiment(dialogue)
            character_attributes[character]['sentiment_sum'] += sentiment
            character_attributes[character]['sentiment_count'] += 1

# Process the attributes into a DataFrame
processed_attributes = []
for character, attributes in character_attributes.items():
    processed_attributes.append({
        'character': character,
        'dialogue_count': attributes['dialogue_count'],
        'scene_count': attributes['scene_count'],
        'episode_count': len(attributes['episode_count']),
        'arc_presence': ', '.join(attributes['arc_presence']),
        'average_sentiment': (attributes['sentiment_sum'] / attributes['sentiment_count']) if attributes['sentiment_count'] > 0 else None
    })

character_df = pd.DataFrame(processed_attributes)

# Save to CSV
character_df.to_csv('character_attributes.csv', index=False)

# Display the DataFrame
print(character_df)


            character  dialogue_count  scene_count  episode_count  \
0              katara           15067          875             59   
1               sokka           18392         1009             59   
2                zuko            9277          466             47   
3                iroh            5276          198             36   
4                aang           17947         1120             60   
..                ...             ...          ...            ...   
345          yangchen              90            1              1   
346       lion turtle              78            4              2   
347  banished servant              33            1              1   
348    head of dai li              28            1              1   
349           qin lee              10            1              1   

           arc_presence  average_sentiment  
0    Fire, Water, Earth           0.059311  
1    Fire, Water, Earth           0.058501  
2    Fire, Water, Earth           0.