In [None]:
import pandas as pd
from itertools import combinations
import re


# Load the dataset
df = pd.read_csv('data/avatar.csv', encoding='ISO-8859-1')

# Dictionary to store character connections & chapters 
character_connections = {}
chapter_connections = {}
# Dictionary to store character connections by book
character_connections_by_book = {}


scene_characters = set()  # Characters in the current scene


# Name replacements for normalization
name_replacements = {
    'young zuko': 'zuko',
    'young azula': 'azula',
    'young katara': 'katara',
    'young sokka': 'sokka',
    'young toph': 'toph',
    'young aang': 'aang',
    'king bumi': 'bumi',
    'avatar roku': 'roku',
    'avatar kyoshi': 'kyoshi',
    'avatar kuruk': 'kuruk',
    'avatar yangchen': 'yangchen',
    'aang:': 'aang',
    'sha-mo:': 'sha-mo',    
}

# Exclude these words from being counted as characters
invalid_characters = {'together', 'both'}

# Function to normalize character names
def normalize_name(name):
    # Convert to lowercase, strip spaces, and standardize
    normalized = name.lower().strip()
    return name_replacements.get(normalized, normalized)

# Function to split multiple characters and normalize their names
def split_characters(character):
    # Replace "Team Avatar" with its members
    if 'team avatar' in character.lower():
        return ['sokka', 'katara', 'aang', 'toph']
    
    # Use regex to split by commas or 'and', and normalize each name
    names = [normalize_name(name) for name in re.split(r',|\band\b', character)]
    # Filter out invalid characters
    return [name for name in names if name not in invalid_characters]

# Scene Boundary Detection and Character Network Analysis

This script processes a dataset to detect scene boundaries and analyze character interactions in a narrative. It identifies character pairs who appear together in the same scene and calculates their connection frequency. The results are saved to a CSV file for further analysis.

In [2]:
# Iterate over each row to detect scene boundaries
for _, row in df.iterrows():
    # Check if the row indicates a new scene
    if row['character'] == 'Scene Description':
        # Create character pairs for the completed scene
        pairs = combinations(scene_characters, 2)
        for pair in pairs:
            pair = tuple(sorted(pair))  # Ensure consistent ordering
            if pair in character_connections:
                character_connections[pair] += 1
            else:
                character_connections[pair] = 1

        # Reset the scene_characters for the next scene
        scene_characters = set()
    else:
        # Check for multiple characters and add each to the current scene
        characters = split_characters(row['character'])
        scene_characters.update(characters)

# Convert to DataFrame for analysis
connections_df = pd.DataFrame(list(character_connections.items()), columns=['pair', 'count'])

# Display the connections
print(connections_df)

# Save the DataFrame to a CSV file
connections_df.to_csv('character_connections.csv', index=False)


                pair  count
0    (katara, sokka)    386
1       (iroh, zuko)    112
2     (aang, katara)    411
3      (aang, sokka)    409
4      (aang, kanna)      3
..               ...    ...
963    (iroh, pakku)      1
964     (bumi, iroh)      1
965     (ozai, suki)      1
966      (mai, toph)      1
967      (iroh, mai)      1

[968 rows x 2 columns]


This script combines natural language processing and network analysis to explore character interactions and sentiments across scenes in multiple books. It uses a BERT-based sentiment analysis model to evaluate dialogue and calculate average sentiment within scenes. Character connections are quantified based on co-occurrence and sentiment, and the results are stored in a book-specific dataset for further analysis.

In [None]:
from transformers import pipeline
from itertools import combinations
import pandas as pd

# Load the BERT sentiment analysis model
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", framework="pt")

# Function to calculate sentiment using BERT
def calculate_bert_sentiment(text, positive_bias=1.5, negative_bias=1.5):
    """
    Calculate sentiment using a BERT model and apply bias.
    Positive sentiments are amplified by `positive_bias`.
    Negative sentiments are amplified by `negative_bias`.
    """
    if pd.notna(text) and text.strip():
        result = sentiment_analyzer(text[:512])[0]  # Analyze up to 512 characters
        label = result["label"]
        score = float(result["score"])

        if "POSITIVE" in label.upper():
            return score * positive_bias
        elif "NEGATIVE" in label.upper():
            return -score * negative_bias
        return 0  # Neutral sentiment
    return 0  # No valid text


def calculate_sentiment(text):
    if pd.notna(text) and text.strip():
        result = sentiment_analyzer(text[:512])  # Truncate to 512 tokens for BERT
        label = result[0]['label']
        score = result[0]['score']
        
        # Map the labels to sentiments
        if label == "LABEL_1":  # Positive
            return "positive"
        elif label == "LABEL_0":  # Negative
            return "negative"
        else:  # Add a fallback for unexpected labels
            return "neutral"
    return "neutral"


# Iterate over each book
character_connections_by_book = {}
for book, book_group in df.groupby('book'):
    scene_characters = set()  # Characters in the current scene
    scene_sentiment_sum = 0  # Total sentiment for the current scene
    total_sentences = 0  # Count of sentences for normalization
    character_connections = {}  # Connections for this book

    # Iterate over each row within the book
    for _, row in book_group.iterrows():
        if row['character'] == 'Scene Description':
            if total_sentences > 0:
                # Calculate average sentiment for the completed scene
                average_scene_sentiment = scene_sentiment_sum / total_sentences
            else:
                # No character dialogue, sentiment is neutral
                average_scene_sentiment = 0

            # Create character pairs for the completed scene
            pairs = combinations(scene_characters, 2)
            for pair in pairs:
                pair = tuple(sorted(pair))  
                if pair in character_connections:
                    # Update weight and average sentiment
                    character_connections[pair]['count'] += 1
                    character_connections[pair]['sentiment'] += average_scene_sentiment
                else:
                    character_connections[pair] = {
                        'count': 1,
                        'sentiment': average_scene_sentiment
                    }

            # Reset the scene_characters and sentiment trackers for the next scene
            scene_characters = set()
            scene_sentiment_sum = 0
            total_sentences = 0
        else:
            # Add characters to the current scene
            characters = split_characters(row['character'])
            scene_characters.update(characters)

            # Calculate BERT sentiment for this character's full text if available
            character_text = row['character_words']
            if pd.notna(character_text) and character_text.strip():
                sentiment = calculate_bert_sentiment(character_text, positive_bias=1.5, negative_bias=1.5)
                scene_sentiment_sum += sentiment
                total_sentences += 1

    # Normalize sentiment scores (average over the number of scenes)
    for pair in character_connections:
        connection = character_connections[pair]
        connection['sentiment'] /= connection['count']

    # Store connections for this book
    character_connections_by_book[book] = character_connections

# Convert connections_by_book to a DataFrame
book_connections = []

for book, connections in character_connections_by_book.items():
    for pair, data in connections.items():
        book_connections.append({
            'book': book,
            'pair': pair,
            'count': data['count'],
            'average_sentiment_between_characters': data['sentiment']
        })

book_connections_df = pd.DataFrame(book_connections)

# Display the book-based connections with sentiment
print(book_connections_df)

# Save to CSV for further analysis
book_connections_df.to_csv('character_connections_by_book.csv', index=False)


Device set to use mps:0


       book                   pair  count  \
0     Earth         (aang, katara)    113   
1     Earth          (aang, pakku)      1   
2     Earth        (katara, pakku)      1   
3     Earth           (iroh, zuko)     55   
4     Earth       (azula, captain)      2   
...     ...                    ...    ...   
1071  Water            (yue, zuko)      1   
1072  Water  (aang, baboon spirit)      1   
1073  Water            (aang, koh)      3   
1074  Water            (yue, zhao)      1   
1075  Water            (iroh, yue)      1   

      average_sentiment_between_characters  
0                                -0.096103  
1                                 0.904363  
2                                 0.904363  
3                                -0.009508  
4                                -0.074584  
...                                    ...  
1071                              0.213319  
1072                              0.744977  
1073                              0.259550  
1074     

This script performs character-level analysis by integrating sentiment evaluation and tracking their presence across scenes, episodes, and story arcs. Using a RoBERTa-based sentiment analysis model, it categorizes dialogue sentiment as positive, negative, or neutral. It calculates various metrics such as dialogue counts, scene counts, episode appearances, arc presence, and sentiment proportions for each character. The processed data is saved into a CSV file for detailed exploration.

In [5]:
from transformers import pipeline
import pandas as pd

# Load the sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment", framework="pt")


# Prepare the attributes dictionary
character_attributes = {}

# Keep track of the current arc (book) and scene characters
current_book = None
scene_characters = set()

# Sentiment Analysis Function using BERT
def calculate_sentiment(text):
    if pd.notna(text) and text.strip():
        result = sentiment_analyzer(text[:512])  # Truncate to 512 tokens for BERT
        label = result[0]['label']
        score = result[0]['score']
        
        # Map the labels to sentiments
        if label == "LABEL_1":  # Positive
            return "positive"
        elif label == "LABEL_0":  # Negative
            return "negative"
        else:  # Add a fallback for unexpected labels
            return "neutral"
    return "neutral"


# Iterate through each row in the DataFrame
for _, row in df.iterrows():
    if pd.notna(row['book']):
        current_book = row['book']

    if row['character'] == 'Scene Description':
        # At the end of the scene, increment scene counts for all characters in that scene
        for character in scene_characters:
            if character in character_attributes:
                character_attributes[character]['scene_count'] += 1
        scene_characters = set()  # Reset for the next scene
    else:
        # Track characters in the current scene
        characters = split_characters(row['character'])
        scene_characters.update(characters)

        for character in characters:
            if character not in character_attributes:
                character_attributes[character] = {
                    'dialogue_count': 0,
                    'scene_count': 0,
                    'episode_count': set(),
                    'arc_presence': set(),
                    'positive_dialogue': 0,
                    'negative_dialogue': 0,
                    'neutral_dialogue': 0
                }

            # Increment dialogue count by the number of words in the dialogue
            dialogue = row['character_words']
            word_count = len(dialogue.split()) if pd.notna(dialogue) else 0
            character_attributes[character]['dialogue_count'] += word_count

            # Track episodes
            character_attributes[character]['episode_count'].add((row['book'], row['chapter']))

            # Track arc presence if there's a known current_book
            if current_book is not None:
                character_attributes[character]['arc_presence'].add(current_book)

            # Update sentiment analysis
            sentiment_category = calculate_sentiment(dialogue)
            if sentiment_category == "positive":
                character_attributes[character]['positive_dialogue'] += word_count
            elif sentiment_category == "negative":
                character_attributes[character]['negative_dialogue'] += word_count
            else:
                character_attributes[character]['neutral_dialogue'] += word_count

# Manually update Momo and Appa to appear in all arcs
# First, find all arcs (books) present in the DataFrame
all_arcs = set(df['book'].dropna())

for character_name in ['momo', 'appa']:
    if character_name in character_attributes:
        character_attributes[character_name]['arc_presence'] = all_arcs

# Process the attributes into a DataFrame
processed_attributes = []
for character, attributes in character_attributes.items():
    total_dialogue = attributes['dialogue_count']
    processed_attributes.append({
        'character': character,
        'dialogue_count': total_dialogue,
        'scene_count': attributes['scene_count'],
        'episode_count': len(attributes['episode_count']),
        'arc_presence': ', '.join(sorted(attributes['arc_presence'])),
        'positive_proportion': (attributes['positive_dialogue'] / total_dialogue) if total_dialogue > 0 else 0,
        'negative_proportion': (attributes['negative_dialogue'] / total_dialogue) if total_dialogue > 0 else 0,
        'neutral_proportion': (attributes['neutral_dialogue'] / total_dialogue) if total_dialogue > 0 else 0
    })

character_df = pd.DataFrame(processed_attributes)

# Save to CSV
character_df.to_csv('character_attributes.csv', index=False)

# Display the DataFrame
print(character_df)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Device set to use mps:0


            character  dialogue_count  scene_count  episode_count  \
0              katara           15067          875             59   
1               sokka           18392         1009             59   
2                zuko            9277          466             47   
3                iroh            5276          198             36   
4                aang           17947         1120             60   
..                ...             ...          ...            ...   
345          yangchen              90            1              1   
346       lion turtle              78            4              2   
347  banished servant              33            1              1   
348    head of dai li              28            1              1   
349           qin lee              10            1              1   

           arc_presence  positive_proportion  negative_proportion  \
0    Earth, Fire, Water             0.488817             0.319307   
1    Earth, Fire, Water          

In [6]:
from transformers import pipeline

# Load the sentiment analysis pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment", framework="pt")


# Prepare the attributes dictionary
character_attributes_by_arc = {}

# Keep track of the current arc (book) and scene characters
current_book = None
scene_characters = set()

def calculate_sentiment(text):
    if pd.notna(text) and text.strip():
        result = sentiment_analyzer(text[:512])  # Truncate to 512 tokens for BERT
        label = result[0]['label']
        score = result[0]['score']
        
        # Map the labels to sentiments
        if label == "LABEL_1":  # Positive
            return "positive"
        elif label == "LABEL_0":  # Negative
            return "negative"
        else:  # Add a fallback for unexpected labels
            return "neutral"
    return "neutral"


# Iterate through each row in the DataFrame
for _, row in df.iterrows():
    if pd.notna(row['book']):
        current_book = row['book']  # Update the current book (arc)

    if row['character'] == 'Scene Description':
        # At the end of the scene, increment scene counts for all characters in that scene
        for character in scene_characters:
            if character in character_attributes_by_arc[current_book]:
                character_attributes_by_arc[current_book][character]['scene_count'] += 1
        scene_characters = set()  # Reset for the next scene
    else:
        # Track characters in the current scene
        characters = split_characters(row['character'])
        scene_characters.update(characters)

        for character in characters:
            # Initialize character data for this arc if not already done
            if current_book not in character_attributes_by_arc:
                character_attributes_by_arc[current_book] = {}
            if character not in character_attributes_by_arc[current_book]:
                character_attributes_by_arc[current_book][character] = {
                    'dialogue_count': 0,
                    'scene_count': 0,
                    'episode_count': set(),
                    'positive_dialogue': 0,
                    'negative_dialogue': 0,
                    'neutral_dialogue': 0
                }

            # Increment dialogue count by the number of words in the dialogue
            dialogue = row['character_words']
            word_count = len(dialogue.split()) if pd.notna(dialogue) else 0
            character_attributes_by_arc[current_book][character]['dialogue_count'] += word_count

            # Track episodes
            character_attributes_by_arc[current_book][character]['episode_count'].add(row['chapter'])

            # Update sentiment analysis
            sentiment_category = calculate_sentiment(dialogue)
            if sentiment_category == "positive":
                character_attributes_by_arc[current_book][character]['positive_dialogue'] += word_count
            elif sentiment_category == "negative":
                character_attributes_by_arc[current_book][character]['negative_dialogue'] += word_count
            else:
                character_attributes_by_arc[current_book][character]['neutral_dialogue'] += word_count

# Manually update Momo and Appa to appear in all arcs
# First, find all arcs (books) present in the DataFrame
all_arcs = set(df['book'].dropna())

for arc in all_arcs:
    if arc not in character_attributes_by_arc:
        character_attributes_by_arc[arc] = {}
    for character_name in ['momo', 'appa']:
        if character_name not in character_attributes_by_arc[arc]:
            character_attributes_by_arc[arc][character_name] = {
                'dialogue_count': 0,
                'scene_count': 0,
                'episode_count': set(),
                'positive_dialogue': 0,
                'negative_dialogue': 0,
                'neutral_dialogue': 0
            }

# Process the attributes into a DataFrame per arc
processed_data = []

for arc, characters in character_attributes_by_arc.items():
    for character, attributes in characters.items():
        total_dialogue = attributes['dialogue_count']
        processed_data.append({
            'arc': arc,
            'character': character,
            'dialogue_count': total_dialogue,
            'scene_count': attributes['scene_count'],
            'episode_count': len(attributes['episode_count']),
            'positive_proportion': (attributes['positive_dialogue'] / total_dialogue) if total_dialogue > 0 else 0,
            'negative_proportion': (attributes['negative_dialogue'] / total_dialogue) if total_dialogue > 0 else 0,
            'neutral_proportion': (attributes['neutral_dialogue'] / total_dialogue) if total_dialogue > 0 else 0
        })

# Create a DataFrame from the processed data
character_arc_df = pd.DataFrame(processed_data)

# Save to CSV
character_arc_df.to_csv('character_attributes_by_arc.csv', index=False)

# Display the DataFrame
print(character_arc_df)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Device set to use mps:0


       arc       character  dialogue_count  scene_count  episode_count  \
0    Water          katara            6275          368             20   
1    Water           sokka            5980          374             20   
2    Water            zuko            1773          122             13   
3    Water            iroh            1814           85             13   
4    Water            aang            7810          508             20   
..     ...             ...             ...          ...            ...   
428   Fire  head of dai li              28            1              1   
429   Fire         qin lee              10            1              1   
430   Fire        engineer              30            2              1   
431   Fire     crew member              12            1              1   
432   Fire            ursa              45            1              1   

     positive_proportion  negative_proportion  neutral_proportion  
0               0.487171             0.2917


This script uses the NRC Emotion Lexicon to analyze emotions and sentiments in text. It processes character dialogues across episodes, calculates emotion counts and proportions, and aggregates results into a dataset. The output is saved as a CSV file for further exploration.

In [7]:
from collections import defaultdict

# Load the NRC Emotion Lexicon
def load_nrc_lexicon(filepath):
    lexicon = defaultdict(lambda: {"positive": 0, "negative": 0, "anger": 0, "fear": 0, 
                                    "anticipation": 0, "trust": 0, "surprise": 0, 
                                    "sadness": 0, "joy": 0, "disgust": 0})
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            term, category, association = line.strip().split('\t')
            if int(association) == 1:  # Only include words with a positive association
                lexicon[term][category] = 1
    return lexicon

# Tokenize text into words
def tokenize(text):
    return re.findall(r'\b\w+\b', text.lower())

# Analyze sentiment/emotion for a block of text
def analyze_text_sentiment(text, lexicon):
    emotion_counts = defaultdict(int)

    # Tokenize and count words
    words = tokenize(text)
    for word in words:
        for emotion, value in lexicon[word].items():
            emotion_counts[emotion] += value

    # Calculate proportions
    total_emotions = sum(emotion_counts.values())
    if total_emotions > 0:
        proportions = {k: v / total_emotions for k, v in emotion_counts.items()}
    else:
        proportions = {k: 0 for k in emotion_counts.keys()}

    return emotion_counts, proportions

# Process each episode (chapter)
def process_episode(data, book, chapter, lexicon):
    # Filter rows for the given book and chapter
    episode_data = data[(data['book'] == book) & (data['chapter'] == chapter)]

    # Concatenate all character dialogues
    episode_text = ' '.join(episode_data['character_words'].dropna())
    
    # Analyze sentiment/emotions
    counts, proportions = analyze_text_sentiment(episode_text, lexicon)
    return counts, proportions


# Filter out rows where the character is "Scene Description"
df = df[df['character'] != "Scene Description"]

# Keep only necessary columns
df = df[['book', 'chapter', 'chapter_num', 'character_words']]

# Load the NRC Emotion Lexicon (update the file path)
nrc_lexicon_path = 'NRC-Emotion-Lexicon/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt'
nrc_lexicon = load_nrc_lexicon(nrc_lexicon_path)

# Analyze all episodes
results = []

for (book, chapter), group in df.groupby(['book', 'chapter']):
    counts, proportions = process_episode(df, book, chapter, nrc_lexicon)
    results.append({
        'book': book,
        'chapter': chapter,
        'positive': proportions.get('positive', 0),
        'negative': proportions.get('negative', 0),
        'anger': proportions.get('anger', 0),
        'fear': proportions.get('fear', 0),
        'anticipation': proportions.get('anticipation', 0),
        'trust': proportions.get('trust', 0),
        'surprise': proportions.get('surprise', 0),
        'sadness': proportions.get('sadness', 0),
        'joy': proportions.get('joy', 0),
        'disgust': proportions.get('disgust', 0)
    })

# Convert results to a DataFrame
emotion_results_df = pd.DataFrame(results)

# Save to a CSV file
emotion_results_df.to_csv('episode_emotions.csv', index=False)

# Display the DataFrame
print(emotion_results_df)


     book                                    chapter  positive  negative  \
0   Earth                           Appa's Lost Days  0.203125  0.152344   
1   Earth                                 Avatar Day  0.255446  0.112871   
2   Earth                                Bitter Work  0.245750  0.091190   
3   Earth                  City of Walls and Secrets  0.248521  0.118343   
4   Earth                                Lake Laogai  0.258907  0.109264   
..    ...                                        ...       ...       ...   
56  Water                     The Warriors of Kyoshi  0.247807  0.138158   
57  Water                    The Waterbending Master  0.279294  0.086677   
58  Water                    The Waterbending Scroll  0.253579  0.124744   
59  Water  Winter Solstice, Part 1: The Spirit World  0.286585  0.115854   
60  Water       Winter Solstice, Part 2: Avatar Roku  0.275154  0.141684   

       anger      fear  anticipation     trust  surprise   sadness       joy  \
0   0.0

This script processes IMDb ratings for episodes, aggregates ratings by unique episodes, and identifies the top 10 best and worst-rated episodes. The results are sorted and saved as separate CSV files for analysis.

In [20]:
import pandas as pd

# Load the dataset
imdb_df = pd.read_csv('data/avatar.csv', encoding='ISO-8859-1')

# Ensure the `imdb_rating` column is treated as numeric
imdb_df['imdb_rating'] = pd.to_numeric(imdb_df['imdb_rating'], errors='coerce')

# Drop rows with missing or invalid ratings
imdb_df = imdb_df.dropna(subset=['imdb_rating'])

# Aggregate by unique episodes using `book`, `chapter`, and `chapter_num`
# Take the mean `imdb_rating` for episodes with multiple rows
aggregated_df = imdb_df.groupby(['book', 'chapter', 'chapter_num'], as_index=False).agg({
    'imdb_rating': 'mean'
})

# Sort by rating
sorted_imdb = aggregated_df.sort_values(by='imdb_rating', ascending=False)

# Top 10 best-rated episodes
top_10_best = sorted_imdb.head(10)

# Top 10 worst-rated episodes
top_10_worst = sorted_imdb.tail(10)


# Save results to separate CSV files
top_10_best.to_csv('top_10_best_rated_episodes.csv', index=False)
top_10_worst.to_csv('top_10_worst_rated_episodes.csv', index=False)

# Display the results
print("Top 10 Best-Rated Episodes:")
print(top_10_best)

print("\nTop 10 Worst-Rated Episodes:")
print(top_10_worst)



Top 10 Best-Rated Episodes:
     book                                    chapter  chapter_num  imdb_rating
24   Fire    Sozin's Comet, Part 3: Into the Inferno           20          9.8
25   Fire         Sozin's Comet, Part 4: Avatar Aang           21          9.8
10  Earth                  The Crossroads of Destiny           20          9.6
19  Earth                                 Zuko Alone            7          9.5
26   Fire               The Avatar and the Fire Lord            6          9.5
23   Fire     Sozin's Comet, Part 2: The Old Masters           19          9.5
52  Water             The Siege of the North, Part 1           19          9.4
32   Fire  The Day of Black Sun, Part 2: The Eclipse           11          9.4
30   Fire                   The Boiling Rock, Part 2           15          9.2
18  Earth                    The Tales of Ba Sing Se           15          9.2

Top 10 Worst-Rated Episodes:
     book                   chapter  chapter_num  imdb_rating
57  Water  