In [1]:
import pandas as pd
from itertools import combinations
import re
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import networkx as nx
from fa2_modified import ForceAtlas2
import numpy as np
import pickle as pkl
from textblob import TextBlob

# Load the dataset
df = pd.read_csv('data/avatar.csv', encoding='ISO-8859-1')

# Dictionary to store character connections
character_connections = {}
# Dictionary to store connections by chapter
chapter_connections = {}
# Dictionary to store character connections by book
character_connections_by_book = {}


scene_characters = set()  # Characters in the current scene


# Name replacements for normalization
name_replacements = {
    'young zuko': 'zuko',
    'young azula': 'azula',
    'young katara': 'katara',
    'young sokka': 'sokka',
    'young toph': 'toph',
    'young aang': 'aang',
    'king bumi': 'bumi',
    'avatar roku': 'roku',
    'avatar kyoshi': 'kyoshi',
    'avatar kuruk': 'kuruk',
    'avatar yangchen': 'yangchen',
    'aang:': 'aang',
    'sha-mo:': 'sha-mo',    
}

# Exclude these words from being counted as characters
invalid_characters = {'together', 'both'}

# Function to normalize character names
def normalize_name(name):
    # Convert to lowercase, strip spaces, and standardize
    normalized = name.lower().strip()
    return name_replacements.get(normalized, normalized)

# Function to split multiple characters and normalize their names
def split_characters(character):
    # Replace "Team Avatar" with its members
    if 'team avatar' in character.lower():
        return ['sokka', 'katara', 'aang', 'toph']
    
    # Use regex to split by commas or 'and', and normalize each name
    names = [normalize_name(name) for name in re.split(r',|\band\b', character)]
    # Filter out invalid characters
    return [name for name in names if name not in invalid_characters]

In [2]:
# Iterate over each row to detect scene boundaries
for _, row in df.iterrows():
    # Check if the row indicates a new scene
    if row['character'] == 'Scene Description':
        # Create character pairs for the completed scene
        pairs = combinations(scene_characters, 2)
        for pair in pairs:
            pair = tuple(sorted(pair))  # Ensure consistent ordering
            if pair in character_connections:
                character_connections[pair] += 1
            else:
                character_connections[pair] = 1

        # Reset the scene_characters for the next scene
        scene_characters = set()
    else:
        # Check for multiple characters and add each to the current scene
        characters = split_characters(row['character'])
        scene_characters.update(characters)

# Convert to DataFrame for analysis
connections_df = pd.DataFrame(list(character_connections.items()), columns=['pair', 'count'])

# Display the connections
print(connections_df)

# Save the DataFrame to a CSV file
connections_df.to_csv('character_connections.csv', index=False)


                pair  count
0    (katara, sokka)    386
1       (iroh, zuko)    112
2     (aang, katara)    411
3      (aang, sokka)    409
4      (aang, kanna)      3
..               ...    ...
963     (bumi, iroh)      1
964    (iroh, pakku)      1
965     (ozai, suki)      1
966      (mai, toph)      1
967      (iroh, mai)      1

[968 rows x 2 columns]


In [3]:
# Iterate over each chapter
for (book, chapter), chapter_group in df.groupby(['book', 'chapter']):
    scene_characters = set()  # Characters in the current scene
    chapter_key = (book, chapter)  # Use book and chapter as a key
    chapter_connections[chapter_key] = {}

    # Iterate over rows within the chapter
    for _, row in chapter_group.iterrows():
        if row['character'] == 'Scene Description':
            # Create character pairs for the completed scene
            pairs = combinations(scene_characters, 2)
            for pair in pairs:
                pair = tuple(sorted(pair))
                if pair in chapter_connections[chapter_key]:
                    chapter_connections[chapter_key][pair] += 1
                else:
                    chapter_connections[chapter_key][pair] = 1

            # Reset the scene_characters for the next scene
            scene_characters = set()
        else:
            # Check for multiple characters and add each to the current scene
            characters = split_characters(row['character'])
            scene_characters.update(characters)

# Convert chapter-based connections to a DataFrame
chapter_connections_list = []
for (book, chapter), connections in chapter_connections.items():
    for pair, count in connections.items():
        chapter_connections_list.append({
            'book': book,
            'chapter': chapter,
            'pair': pair,
            'count': count
        })

chapter_connections_df = pd.DataFrame(chapter_connections_list)

# Display the connections
print(chapter_connections_df)

# Save the DataFrame to a CSV file
chapter_connections_df.to_csv('chapter_character_connections.csv', index=False)


       book                               chapter  \
0     Earth                      Appa's Lost Days   
1     Earth                      Appa's Lost Days   
2     Earth                      Appa's Lost Days   
3     Earth                      Appa's Lost Days   
4     Earth                      Appa's Lost Days   
...     ...                                   ...   
1594  Water  Winter Solstice, Part 2: Avatar Roku   
1595  Water  Winter Solstice, Part 2: Avatar Roku   
1596  Water  Winter Solstice, Part 2: Avatar Roku   
1597  Water  Winter Solstice, Part 2: Avatar Roku   
1598  Water  Winter Solstice, Part 2: Avatar Roku   

                                pair  count  
0                   (ghashiun, toph)      1  
1          (ghashiun, sandbender #1)      1  
2          (ghashiun, sandbender #2)      1  
3     (sandbender #1, sandbender #2)      1  
4            (ghashiun, merchant #1)      1  
...                              ...    ...  
1594         (great fire sage, roku)     

In [4]:
# Iterate over each book
for book, book_group in df.groupby('book'):
    scene_characters = set()  # Characters in the current scene
    character_connections = {}  # Connections for this book

    # Iterate over each row within the book
    for _, row in book_group.iterrows():
        if row['character'] == 'Scene Description':
            # Create character pairs for the completed scene
            pairs = combinations(scene_characters, 2)
            for pair in pairs:
                pair = tuple(sorted(pair))  # Ensure consistent ordering
                if pair in character_connections:
                    character_connections[pair] += 1
                else:
                    character_connections[pair] = 1

            # Reset the scene_characters for the next scene
            scene_characters = set()
        else:
            # Add characters to the current scene
            characters = split_characters(row['character'])
            scene_characters.update(characters)

    # Store connections for this book
    character_connections_by_book[book] = character_connections

# Convert connections_by_book to a DataFrame
book_connections = []

for book, connections in character_connections_by_book.items():
    for pair, count in connections.items():
        book_connections.append({'book': book, 'pair': pair, 'count': count})

book_connections_df = pd.DataFrame(book_connections)

# Display the book-based connections
print(book_connections_df)

# Save to CSV for further analysis
book_connections_df.to_csv('character_connections_by_book.csv', index=False)


       book                   pair  count
0     Earth         (aang, katara)    113
1     Earth        (katara, pakku)      1
2     Earth          (aang, pakku)      1
3     Earth           (iroh, zuko)     55
4     Earth       (azula, captain)      2
...     ...                    ...    ...
1071  Water            (yue, zuko)      1
1072  Water  (aang, baboon spirit)      1
1073  Water            (aang, koh)      3
1074  Water            (yue, zhao)      1
1075  Water            (iroh, yue)      1

[1076 rows x 3 columns]


In [14]:
from transformers import pipeline
from itertools import combinations
import pandas as pd

# Load the BERT sentiment analysis model
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", framework="pt")

# Function to calculate sentiment using BERT
def calculate_bert_sentiment(text, positive_bias=1.5, negative_bias=1.5):
    """
    Calculate sentiment using a BERT model and apply bias.
    Positive sentiments are amplified by `positive_bias`.
    Negative sentiments are amplified by `negative_bias`.
    """
    if pd.notna(text) and text.strip():
        result = sentiment_analyzer(text[:512])[0]  # Analyze up to 512 characters
        label = result["label"]
        score = float(result["score"])

        if "POSITIVE" in label.upper():
            return score * positive_bias
        elif "NEGATIVE" in label.upper():
            return -score * negative_bias
        return 0  # Neutral sentiment
    return 0  # No valid text

# Iterate over each book
character_connections_by_book = {}
for book, book_group in df.groupby('book'):
    scene_characters = set()  # Characters in the current scene
    scene_sentiment_sum = 0  # Total sentiment for the current scene
    total_sentences = 0  # Count of sentences for normalization
    character_connections = {}  # Connections for this book

    # Iterate over each row within the book
    for _, row in book_group.iterrows():
        if row['character'] == 'Scene Description':
            if total_sentences > 0:
                # Calculate average sentiment for the completed scene
                average_scene_sentiment = scene_sentiment_sum / total_sentences
            else:
                # No character dialogue, sentiment is neutral
                average_scene_sentiment = 0

            # Create character pairs for the completed scene
            pairs = combinations(scene_characters, 2)
            for pair in pairs:
                pair = tuple(sorted(pair))  # Ensure consistent ordering
                if pair in character_connections:
                    # Update weight and average sentiment
                    character_connections[pair]['count'] += 1
                    character_connections[pair]['sentiment'] += average_scene_sentiment
                else:
                    character_connections[pair] = {
                        'count': 1,
                        'sentiment': average_scene_sentiment
                    }

            # Reset the scene_characters and sentiment trackers for the next scene
            scene_characters = set()
            scene_sentiment_sum = 0
            total_sentences = 0
        else:
            # Add characters to the current scene
            characters = split_characters(row['character'])
            scene_characters.update(characters)

            # Calculate BERT sentiment for this character's full text if available
            character_text = row['character_words']
            if pd.notna(character_text) and character_text.strip():
                sentiment = calculate_bert_sentiment(character_text, positive_bias=1.5, negative_bias=1.5)
                scene_sentiment_sum += sentiment
                total_sentences += 1

    # Normalize sentiment scores (average over the number of scenes)
    for pair in character_connections:
        connection = character_connections[pair]
        connection['sentiment'] /= connection['count']

    # Store connections for this book
    character_connections_by_book[book] = character_connections

# Convert connections_by_book to a DataFrame
book_connections = []

for book, connections in character_connections_by_book.items():
    for pair, data in connections.items():
        book_connections.append({
            'book': book,
            'pair': pair,
            'count': data['count'],
            'average_sentiment_between_characters': data['sentiment']
        })

book_connections_df = pd.DataFrame(book_connections)

# Display the book-based connections with sentiment
print(book_connections_df)

# Save to CSV for further analysis
book_connections_df.to_csv('character_connections_by_book_with_sentiment.csv', index=False)


Device set to use mps:0


       book                   pair  count  \
0     Earth         (aang, katara)    113   
1     Earth        (katara, pakku)      1   
2     Earth          (aang, pakku)      1   
3     Earth           (iroh, zuko)     55   
4     Earth       (azula, captain)      2   
...     ...                    ...    ...   
1071  Water            (yue, zuko)      1   
1072  Water  (aang, baboon spirit)      1   
1073  Water            (aang, koh)      3   
1074  Water            (yue, zhao)      1   
1075  Water            (iroh, yue)      1   

      average_sentiment_between_characters  
0                                -0.096103  
1                                 0.904363  
2                                 0.904363  
3                                -0.009508  
4                                -0.074584  
...                                    ...  
1071                              0.213319  
1072                              0.744977  
1073                              0.259550  
1074     

In [19]:
from transformers import pipeline
import pandas as pd

# Load the sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", 
                              model="distilbert-base-uncased-finetuned-sst-2-english", 
                              framework="pt")

# Prepare the attributes dictionary
character_attributes = {}

# Keep track of the current arc (book) and scene characters
current_book = None
scene_characters = set()

# Sentiment Analysis Function using BERT
def calculate_sentiment(text):
    if pd.notna(text) and text.strip():
        # Analyze sentiment using the BERT pipeline
        result = sentiment_analyzer(text[:512])  # Truncate to 512 tokens for BERT
        label = result[0]['label']
        # Map BERT labels to categories
        if "positive" in label.lower():
            return "positive"
        elif "negative" in label.lower():
            return "negative"
        else:
            return "neutral"
    return "neutral"

# Iterate through each row in the DataFrame
for _, row in df.iterrows():
    if pd.notna(row['book']):
        current_book = row['book']

    if row['character'] == 'Scene Description':
        # At the end of the scene, increment scene counts for all characters in that scene
        for character in scene_characters:
            if character in character_attributes:
                character_attributes[character]['scene_count'] += 1
        scene_characters = set()  # Reset for the next scene
    else:
        # Track characters in the current scene
        characters = split_characters(row['character'])
        scene_characters.update(characters)

        for character in characters:
            if character not in character_attributes:
                character_attributes[character] = {
                    'dialogue_count': 0,
                    'scene_count': 0,
                    'episode_count': set(),
                    'arc_presence': set(),
                    'positive_dialogue': 0,
                    'negative_dialogue': 0,
                    'neutral_dialogue': 0
                }

            # Increment dialogue count by the number of words in the dialogue
            dialogue = row['character_words']
            word_count = len(dialogue.split()) if pd.notna(dialogue) else 0
            character_attributes[character]['dialogue_count'] += word_count

            # Track episodes
            character_attributes[character]['episode_count'].add((row['book'], row['chapter']))

            # Track arc presence if there's a known current_book
            if current_book is not None:
                character_attributes[character]['arc_presence'].add(current_book)

            # Update sentiment analysis
            sentiment_category = calculate_sentiment(dialogue)
            if sentiment_category == "positive":
                character_attributes[character]['positive_dialogue'] += word_count
            elif sentiment_category == "negative":
                character_attributes[character]['negative_dialogue'] += word_count
            else:
                character_attributes[character]['neutral_dialogue'] += word_count

# Manually update Momo and Appa to appear in all arcs
# First, find all arcs (books) present in the DataFrame
all_arcs = set(df['book'].dropna())

for character_name in ['momo', 'appa']:
    if character_name in character_attributes:
        character_attributes[character_name]['arc_presence'] = all_arcs

# Process the attributes into a DataFrame
processed_attributes = []
for character, attributes in character_attributes.items():
    total_dialogue = attributes['dialogue_count']
    processed_attributes.append({
        'character': character,
        'dialogue_count': total_dialogue,
        'scene_count': attributes['scene_count'],
        'episode_count': len(attributes['episode_count']),
        'arc_presence': ', '.join(sorted(attributes['arc_presence'])),
        'positive_proportion': (attributes['positive_dialogue'] / total_dialogue) if total_dialogue > 0 else 0,
        'negative_proportion': (attributes['negative_dialogue'] / total_dialogue) if total_dialogue > 0 else 0,
        'neutral_proportion': (attributes['neutral_dialogue'] / total_dialogue) if total_dialogue > 0 else 0
    })

character_df = pd.DataFrame(processed_attributes)

# Save to CSV
character_df.to_csv('character_attributes.csv', index=False)

# Display the DataFrame
print(character_df)


Device set to use mps:0


            character  dialogue_count  scene_count  episode_count  \
0              katara           15067          875             59   
1               sokka           18392         1009             59   
2                zuko            9277          466             47   
3                iroh            5276          198             36   
4                aang           17947         1120             60   
..                ...             ...          ...            ...   
345          yangchen              90            1              1   
346       lion turtle              78            4              2   
347  banished servant              33            1              1   
348    head of dai li              28            1              1   
349           qin lee              10            1              1   

           arc_presence  positive_proportion  negative_proportion  \
0    Earth, Fire, Water             0.405057             0.594943   
1    Earth, Fire, Water          

In [20]:
from transformers import pipeline
import pandas as pd

# Load the sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", 
                              model="distilbert-base-uncased-finetuned-sst-2-english", 
                              framework="pt")

# Prepare the attributes dictionary
character_attributes_by_arc = {}

# Keep track of the current arc (book) and scene characters
current_book = None
scene_characters = set()

# Sentiment Analysis Function using BERT
def calculate_sentiment(text):
    if pd.notna(text) and text.strip():
        # Analyze sentiment using the BERT pipeline
        result = sentiment_analyzer(text[:512])  # Truncate to 512 tokens for BERT
        label = result[0]['label']
        # Map BERT labels to categories
        if "positive" in label.lower():
            return "positive"
        elif "negative" in label.lower():
            return "negative"
        else:
            return "neutral"
    return "neutral"

# Iterate through each row in the DataFrame
for _, row in df.iterrows():
    if pd.notna(row['book']):
        current_book = row['book']  # Update the current book (arc)

    if row['character'] == 'Scene Description':
        # At the end of the scene, increment scene counts for all characters in that scene
        for character in scene_characters:
            if character in character_attributes_by_arc[current_book]:
                character_attributes_by_arc[current_book][character]['scene_count'] += 1
        scene_characters = set()  # Reset for the next scene
    else:
        # Track characters in the current scene
        characters = split_characters(row['character'])
        scene_characters.update(characters)

        for character in characters:
            # Initialize character data for this arc if not already done
            if current_book not in character_attributes_by_arc:
                character_attributes_by_arc[current_book] = {}
            if character not in character_attributes_by_arc[current_book]:
                character_attributes_by_arc[current_book][character] = {
                    'dialogue_count': 0,
                    'scene_count': 0,
                    'episode_count': set(),
                    'positive_dialogue': 0,
                    'negative_dialogue': 0,
                    'neutral_dialogue': 0
                }

            # Increment dialogue count by the number of words in the dialogue
            dialogue = row['character_words']
            word_count = len(dialogue.split()) if pd.notna(dialogue) else 0
            character_attributes_by_arc[current_book][character]['dialogue_count'] += word_count

            # Track episodes
            character_attributes_by_arc[current_book][character]['episode_count'].add(row['chapter'])

            # Update sentiment analysis
            sentiment_category = calculate_sentiment(dialogue)
            if sentiment_category == "positive":
                character_attributes_by_arc[current_book][character]['positive_dialogue'] += word_count
            elif sentiment_category == "negative":
                character_attributes_by_arc[current_book][character]['negative_dialogue'] += word_count
            else:
                character_attributes_by_arc[current_book][character]['neutral_dialogue'] += word_count

# Manually update Momo and Appa to appear in all arcs
# First, find all arcs (books) present in the DataFrame
all_arcs = set(df['book'].dropna())

for arc in all_arcs:
    if arc not in character_attributes_by_arc:
        character_attributes_by_arc[arc] = {}
    for character_name in ['momo', 'appa']:
        if character_name not in character_attributes_by_arc[arc]:
            character_attributes_by_arc[arc][character_name] = {
                'dialogue_count': 0,
                'scene_count': 0,
                'episode_count': set(),
                'positive_dialogue': 0,
                'negative_dialogue': 0,
                'neutral_dialogue': 0
            }

# Process the attributes into a DataFrame per arc
processed_data = []

for arc, characters in character_attributes_by_arc.items():
    for character, attributes in characters.items():
        total_dialogue = attributes['dialogue_count']
        processed_data.append({
            'arc': arc,
            'character': character,
            'dialogue_count': total_dialogue,
            'scene_count': attributes['scene_count'],
            'episode_count': len(attributes['episode_count']),
            'positive_proportion': (attributes['positive_dialogue'] / total_dialogue) if total_dialogue > 0 else 0,
            'negative_proportion': (attributes['negative_dialogue'] / total_dialogue) if total_dialogue > 0 else 0,
            'neutral_proportion': (attributes['neutral_dialogue'] / total_dialogue) if total_dialogue > 0 else 0
        })

# Create a DataFrame from the processed data
character_arc_df = pd.DataFrame(processed_data)

# Save to CSV
character_arc_df.to_csv('character_attributes_by_arc.csv', index=False)

# Display the DataFrame
print(character_arc_df)


Device set to use mps:0


       arc       character  dialogue_count  scene_count  episode_count  \
0    Water          katara            6275          368             20   
1    Water           sokka            5980          374             20   
2    Water            zuko            1773          122             13   
3    Water            iroh            1814           85             13   
4    Water            aang            7810          508             20   
..     ...             ...             ...          ...            ...   
428   Fire  head of dai li              28            1              1   
429   Fire         qin lee              10            1              1   
430   Fire        engineer              30            2              1   
431   Fire     crew member              12            1              1   
432   Fire            ursa              45            1              1   

     positive_proportion  negative_proportion  neutral_proportion  
0               0.409084             0.5909