In [None]:
# data manipulation
import pandas as pd
import itertools
from collections import Counter

# data visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# social network analysis
import networkx as nx

# text preprocessing
import spacy
import re

# machine learning
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.decomposition import LatentDirichletAllocation

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the character demographics dataset.
# NOTE: the file also contains characters from Avatar: LOK (the second
# installment in the universe), so it is filtered against the script later on.
demo = pd.read_csv(
    '../data/avatar_characters_data.csv',
    encoding='utf-8',
    encoding_errors='replace',  # undecodable bytes are replaced instead of raising
)
demo

In [None]:
# Load the script dataset.
script = pd.read_csv(
    '../data/avatar.csv',
    encoding='utf-8',
    encoding_errors='replace',  # undecodable bytes are replaced instead of raising
)
script

In [None]:
# Distinct values of the script's `character` column, in order of first appearance.
avatar_chars = pd.unique(script['character'])
print(avatar_chars)

### Clustering Characteristics

In [None]:
# Build one row per character holding the list of distinct chapters they speak in.
# The 'Scene Description' rows are narration, not a character, so they are dropped up front.
script_rd = script.loc[
    script['character'] != 'Scene Description',
    ['chapter', 'character'],
].copy()

# To match the naming convention of the demographics dataset, strip the title
# prefixes. This has to happen BEFORE de-duplicating and grouping: otherwise a
# prefixed and an unprefixed spelling of the same character end up as two rows
# with the same name, each holding only part of the chapter list (which also
# skews epi_cnt and creates self-pairs in the Jaccard step).
subs = ['Avatar ', 'Young ']
pattern = '|'.join(map(re.escape, subs))
script_rd['character'] = script_rd['character'].str.replace(pattern, '', regex=True)

# One (chapter, character) row per appearance, then collect chapters per character.
script_rd = script_rd.drop_duplicates()
script_rd = script_rd.groupby('character')['chapter'].apply(list).reset_index()

# Count chapters per character and keep only characters seen in more than one chapter.
script_rd['epi_cnt'] = script_rd['chapter'].apply(len)
script_rd = script_rd[script_rd['epi_cnt'] > 1].reset_index(drop=True)
script_rd

In [None]:
# Jaccard index for every unordered pair of characters:
# |chapters both appear in| / |chapters either appears in|, rounded to 2 decimals.
chars = len(script_rd)
chapter_sets = [set(chapters) for chapters in script_rd['chapter']]
char_names = script_rd.iloc[:, 0].tolist()  # column 0 is the character name

sims = []
for i, j in itertools.combinations(range(chars), 2):
    char1, char2 = chapter_sets[i], chapter_sets[j]
    overlap = char1 & char2
    total = char1 | char2
    perc = round(len(overlap) / len(total), 2)
    res = (char_names[i], char_names[j], perc)
    sims.append(res)

In [None]:
# Edge list of character pairs with their Jaccard similarity.
pair_columns = ['source', 'target', 'jaccard']
group = pd.DataFrame(data=sims, columns=pair_columns)
group

In [None]:
# Toph is one of the few characters listed with a last name, which breaks the
# first-name-only convention used for matching against the script.
demo['Character Name'] = demo['Character Name'].str.replace('Toph Beifong', 'Toph', regex=False)

# Names to keep: every label used in the script, plus the same labels with the
# 'Avatar ' / 'Young ' prefixes removed. script_rd strips those prefixes, so
# matching on the raw labels alone would silently drop characters that only
# ever appear with a prefix in the script.
script_names = {name for name in avatar_chars if isinstance(name, str)}
script_names |= {re.sub(r'Avatar |Young ', '', name) for name in script_names}

# .copy() so that adding a column below writes to an independent frame rather
# than to a filtered view of `demo`.
demo_rd = demo.loc[
    demo['Character Name'].isin(script_names),
    ['Character Name', 'Ethnicity', 'Weapon of choice', 'First appearance'],
].copy()

# The chapter title is the first double-quoted span of 'First appearance';
# rows without a quoted title get NaN.
demo_rd['cln_appearance'] = demo_rd['First appearance'].str.extract(r'"(.*?)"', expand=False)

demo_rd

In [None]:
# Lookup table: one row per distinct (chapter, book, book_num) combination.
ep_bk = script.loc[:, ['chapter', 'book', 'book_num']].drop_duplicates()
ep_bk

In [None]:
# Attach book / book_num of each character's first-appearance chapter.
# Inner join: characters whose first appearance does not match a chapter title are dropped.
demo_rd = pd.merge(
    demo_rd,
    ep_bk,
    how='inner',
    left_on='cln_appearance',
    right_on='chapter',
)

In [None]:
# The book of first appearance is now on every row; drop the helper columns
# that were only needed for the join.
demo_rd = demo_rd.drop(columns=['chapter', 'First appearance'])
demo_rd

In [None]:
# Placeholder columns marking whether a pair shares ethnicity, weapon of choice
# and book of first appearance. None means "not evaluated yet".
for flag_col in ('mat_eth', 'mat_weapon', 'mat_book'):
    group[flag_col] = None
group

In [None]:
# Fill the match flags for every pair where BOTH characters have demographics.
#
# Fixes over the previous version:
#   * `len(temp) == 1` only skipped pairs with exactly one demographic row.
#     Pairs with zero rows fell through and were written as 0/0/0 instead of
#     staying None, so the later dropna on mat_book never removed them.
#   * Duplicate demographic rows for one character could make a half-missing
#     pair look complete; the lookup below keeps one row per character.
#   * Two missing values are not evidence of a match, so a trait only counts
#     as matching when both values are present and equal.
traits = demo_rd.drop_duplicates(subset='Character Name').set_index('Character Name')
flag_sources = {
    'mat_eth': 'Ethnicity',
    'mat_weapon': 'Weapon of choice',
    'mat_book': 'book',
}

for index, row in group.iterrows():
    char1 = row['source']
    char2 = row['target']
    # Skip self-pairs and pairs where either character has no demographic row;
    # their flags stay None.
    if char1 == char2 or char1 not in traits.index or char2 not in traits.index:
        continue
    for flag_col, trait_col in flag_sources.items():
        val1 = traits.at[char1, trait_col]
        val2 = traits.at[char2, trait_col]
        is_match = pd.notna(val1) and pd.notna(val2) and val1 == val2
        group.at[index, flag_col] = 1 if is_match else 0

In [None]:
# Overall pair score = Jaccard similarity + the three match flags.
score_cols = ['jaccard', 'mat_eth', 'mat_weapon', 'mat_book']
group['total'] = group.loc[:, score_cols].sum(axis=1)
group

In [None]:
# Keep only pairs that were actually evaluated (mat_book is set) and that have
# a non-zero overall score.
group = group[group['mat_book'].notna()]
group = group.loc[group['total'] != 0]

In [None]:
# Renumber the rows after filtering and make the score a plain float column.
group = group.reset_index(drop=True).astype({'total': float})

In [None]:
# Ten highest-scoring pairs. Side characters tend to score highly because they
# appear in few, and the same, episodes.
max_connections = group.nlargest(10, 'total')
max_connections

In [None]:
# Distribution of pair scores. The skew around 1 is probably due to side
# characters that share two episodes but have no matching characteristics.
fig_scores = px.histogram(
    group,
    x='total',
    nbins=8,
    text_auto=True,
    template='simple_white',
)
fig_scores.show()

In [None]:
# The 20 characters that appear in the most chapters.
max_eps = script_rd.nlargest(20, 'epi_cnt')
max_eps

In [None]:
# Character graph: one edge per scored pair, with the overall score stored in
# the 'total' edge attribute.
G = nx.from_pandas_edgelist(
    group,
    source='source',
    target='target',
    edge_attr='total',
)

In [None]:
# Draw the whole graph. Observation when authored: two distinct clusters, one
# that seems to hold the main cast and one made of side characters.
fig = plt.figure(figsize=(18, 14))
ax = fig.gca()
nx.draw_networkx(G, ax=ax, with_labels=True)
ax.set_title('Network')
plt.show()

In [None]:
# Louvain community detection, weighted by the pair score and seeded for
# reproducibility. Observation when authored: four groups, roughly a water
# tribe group, a fire nation group, a group of other meaningful characters,
# and a group for everyone else.
comms = nx.community.louvain_communities(G, seed=42, weight='total')
comms

In [None]:
# Plot each community on its own and record simple size statistics for it.
community_features = []
for i, comm in enumerate(comms):
    subgraph = G.subgraph(comm)

    plt.figure(figsize=(20, 18))
    nx.draw_networkx(subgraph, with_labels=True, node_size=2000, edge_cmap=plt.cm.Greys)
    plt.title(f'Network {i}')
    plt.show()

    # Features: average degree, number of edges, number of nodes.
    num_nodes = subgraph.number_of_nodes()
    num_edges = subgraph.number_of_edges()
    degree_total = sum(degree for _, degree in subgraph.degree())
    avg_degree = degree_total / len(subgraph)
    community_features.append([avg_degree, num_edges, num_nodes])

In [None]:
# One [avg_degree, num_edges, num_nodes] entry per community, in the same order
# as the community plots above.
community_features

In [None]:
# Re-display the demographics table. Breakdowns of interest per community:
# ethnicity, book of appearance, histogram of episodes, weapon of choice.
demo_rd

In [None]:
# Re-display the per-character chapter lists and episode counts.
script_rd

In [None]:
# Join chapter lists with demographics. Inner join, so only characters present
# in both tables remain.
merged_df = pd.merge(script_rd, demo_rd, left_on='character', right_on='Character Name')
merged_df = merged_df.drop(columns='Character Name')
merged_df

In [None]:
# Per-community breakdown plots, used to check the hypothesis that the clusters
# are a water group, a fire group, other meaningful characters, and side characters.
# Each panel: (column to count, trace name, subplot row, subplot column).
panels = [
    ('Ethnicity', 'Ethnicity Breakdown', 1, 1),
    ('Weapon of choice', 'Weapon of Choice Breakdown', 1, 2),
    ('book', 'Book Appearance Breakdown', 2, 1),
]
for each in comms:
    df_temp = merged_df[merged_df['character'].isin(each)]
    fig_plt = make_subplots(rows=2, cols=2, subplot_titles=('Ethnicity', 'Weapon of Choice', 'Book of First Appearance'))
    for column, trace_name, panel_row, panel_col in panels:
        trace = go.Histogram(x=df_temp[column], texttemplate='%{y}', name=trace_name)
        fig_plt.add_trace(trace, row=panel_row, col=panel_col)

    fig_plt.update_layout(height=800, width=1600, template='simple_white')
    fig_plt.show()

### Using Text to Make Communities

In [None]:
# Re-display the raw script table before the text-processing steps.
script

In [None]:
# Keep only rows with spoken words, restricted to the columns needed for text analysis.
text_columns = ['id', 'book', 'chapter', 'character', 'character_words']
script_pre = script.loc[script['character_words'].notna(), text_columns].copy()

# Clean up the character labels: remove title prefixes and colons.
subs = ['Avatar ', 'Young ', ':']
pattern = '|'.join(map(re.escape, subs))
script_pre['character'] = script_pre['character'].str.replace(pattern, '', regex=True)

script_pre

In [None]:
# Text pre-processing with spaCy: drop punctuation, whitespace and stop words,
# then lemmatize and lowercase what is left.
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Return `text` as a space-joined string of lowercased lemmas.

    Tokens flagged by spaCy as punctuation, whitespace or stop words are
    left out. Uses the module-level `nlp` pipeline.
    """
    return " ".join(
        token.lemma_.lower()
        for token in nlp(text)
        if not (token.is_punct or token.is_space or token.is_stop)
    )

In [None]:
def _drop_single_char_words(text):
    """Remove whitespace-separated words that are only one character long."""
    return ' '.join([word for word in text.split() if len(word) > 1])

# Cleaned text, its token list, and the token count for every spoken line.
script_pre['text_cln'] = script_pre['character_words'].map(preprocess_text).map(_drop_single_char_words)
script_pre['split_text'] = script_pre['text_cln'].str.split()
script_pre['word_count'] = script_pre['split_text'].map(len)

In [None]:
# Inspect the added text_cln, split_text and word_count columns.
script_pre

In [None]:
# Total cleaned-word count per character, and the ten largest.
script_grp = script_pre.groupby('character', as_index=False)['word_count'].agg('sum')
top_10_char_word = script_grp.nlargest(n=10, columns='word_count')

In [None]:
# Bar chart of the ten characters with the most cleaned words.
top_10_grph = px.bar(
    top_10_char_word,
    x='character',
    y='word_count',
    template='simple_white',
    text_auto=True,
    title='Top 10 Characters by Word Count',
)
top_10_grph.show()

In [None]:
# Characters with at least 50 cleaned words overall; only these are modelled.
has_enough_words = script_grp['word_count'] >= 50
chars_include = script_grp.loc[has_enough_words, 'character'].tolist()
chars_include

In [None]:
# One document per (character, chapter): all of that character's cleaned lines
# in the chapter joined by spaces.
script_full_ep = (
    script_pre
    .groupby(['character', 'chapter'], as_index=False)['text_cln']
    .agg(' '.join)
)
# Restrict to the characters that passed the word-count threshold.
keep_rows = script_full_ep['character'].isin(chars_include)
script_full_ep_lm = script_full_ep[keep_rows]
script_full_ep_lm

In [None]:
# Number of LDA topics to fit.
no_topics = 50

# Bag-of-words counts per document. Terms found in more than 95% of documents
# or in fewer than 2 documents are ignored.
tf_vect = CountVectorizer(min_df=2, max_df=0.95)
tf = tf_vect.fit_transform(script_full_ep_lm['text_cln'])
tf_feature_names = tf_vect.get_feature_names_out()

In [None]:
# Fit LDA on the count matrix. lda_output holds one row per document and one
# column per topic.
lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    learning_method='online',
    max_iter=5,
    random_state=42,
)
lda_output = lda_model.fit_transform(tf)

In [None]:
# Document-topic matrix as a DataFrame with 1-based column names Topic_1 .. Topic_N.
topic_names = ['Topic_' + str(number) for number in range(1, lda_model.n_components + 1)]
df_topic = pd.DataFrame(data=lda_output, columns=topic_names)

In [None]:
# Renumber rows 0..n-1 so they line up with df_topic when the two are concatenated.
script_full_ep_lm = script_full_ep_lm.reset_index(drop=True)

In [None]:
# Documents side by side with their topic weights (aligned on the row index).
df_char_topic = pd.concat([script_full_ep_lm, df_topic], axis='columns')
df_char_topic

In [None]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted words of every topic of a fitted topic model.

    Topics are numbered from 1 so that the printed "Topic n:" labels refer to
    the same topic as the ``Topic_n`` columns built from ``n_components``
    elsewhere in this notebook (previously the printout was 0-based and
    therefore off by one against those columns).

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, one row of per-word weights per topic.
    feature_names : sequence of str
        Word for each column index of ``components_``.
    no_top_words : int
        How many words to print per topic.
    """
    for topic_number, topic in enumerate(model.components_, start=1):
        # argsort is ascending; reverse it and keep the first `no_top_words` indices.
        top_word_ids = topic.argsort()[::-1][:no_top_words]
        print("Topic %d:" % topic_number)
        print(" ".join(feature_names[i] for i in top_word_ids))

In [None]:
# Print the 15 highest-weighted words of each LDA topic.
display_topics(lda_model, tf_feature_names, 15)

In [None]:
# Binarise the topic weights: 1.0 where the weight is not below 0.10, else 0.0.
# The first three columns (character, chapter, text_cln) are not topic weights.
topic_weights = df_char_topic.iloc[:, 3:]
df_char_topic_mod = (~(topic_weights < .10)).astype(float)
df_char_topic_mod

In [None]:
# Document keys plus the binarised topic flags, and how many topics each document hits.
df_final = df_char_topic[['character', 'chapter']].join(df_char_topic_mod)
df_final['topic_sum'] = df_final.iloc[:, 2:].sum(axis=1)
df_final

In [None]:
# Total number of topic occurrences per character across all their documents.
top_topic_grp = df_final.groupby('character', as_index=False)['topic_sum'].agg('sum')

In [None]:
# Ten largest totals; keep='all' retains every character tied with the last kept value.
top_topic_sum = top_topic_grp.nlargest(n=10, columns='topic_sum', keep='all')

In [None]:
# Bar chart of the characters with the most topic occurrences.
top_topic_grph = px.bar(
    top_topic_sum,
    x='character',
    y='topic_sum',
    template='simple_white',
    text_auto=True,
    title='Top 10 Characters by Topic Occurrence Count',
)
top_topic_grph.show()

In [None]:
# For every character, list the topics that are flagged in at least one of
# their documents.
# The topic columns are selected by name (topic_names) rather than by the fixed
# positions 2:52, which were only right for exactly 50 topics: with any other
# no_topics the slice would either cut off topics or include topic_sum.
uni_topics = []
for each in df_final['character'].unique():
    temp = df_final.loc[df_final['character'] == each, topic_names]
    summed = temp.sum()
    topics = list(summed[summed > 0].index)
    res = (each, topics)
    uni_topics.append(res)

In [None]:
# One row per character: their distinct topics and how many there are.
df_uni = pd.DataFrame(data=uni_topics, columns=['character', 'topics'])
df_uni['num_uni_topics'] = df_uni['topics'].map(len)
df_uni

In [None]:
# Bar chart of the ten characters covering the most distinct topics.
top_uni = df_uni.nlargest(10, 'num_uni_topics')
uni_topics_grph = px.bar(
    top_uni,
    x='character',
    y='num_uni_topics',
    template='simple_white',
    text_auto=True,
    title='Top Ten Characters by Unique Topic Count',
)
uni_topics_grph.show()

In [None]:
# Build the character pairs that share at least one topic and count the shared topics.
# A character with an empty topic list explodes to a single NaN row. Those rows
# are dropped because NaN keys match each other in a merge and would create
# spurious pairs with a shared-topic count of 0.
df_net = df_uni[['character', 'topics']].explode('topics').dropna(subset=['topics'])
df_net = pd.merge(df_net, df_net, on='topics')
# The self-merge yields (A, A), (A, B) and (B, A). Keeping character_x <
# character_y removes self-pairs and leaves every unordered pair exactly once,
# so per-pair statistics are no longer double counted. The undirected graph
# built from this edge list has the same edges either way.
df_net = df_net[df_net['character_x'] < df_net['character_y']]
df_net_grp = df_net.groupby(['character_x', 'character_y'], as_index=False)['topics'].count()
df_net_grp.columns = ['source', 'target', 'topics']
df_net_grp

In [None]:
# How many topics character pairs have in common.
topic_hist_grph = px.histogram(
    df_net_grp,
    x='topics',
    text_auto=True,
    template='simple_white',
    title='Distribution of Topics by Pairs',
)
topic_hist_grph.show()

In [None]:
# Topic graph restricted to pairs sharing at least 3 topics; the shared-topic
# count is stored in the 'topics' edge attribute.
strong_pairs = df_net_grp[df_net_grp['topics'] >= 3]
K = nx.from_pandas_edgelist(strong_pairs, source='source', target='target', edge_attr='topics')

In [None]:
# Draw the >= 3 shared topics graph.
fig = plt.figure(figsize=(20, 18))
ax = fig.gca()
nx.draw_networkx(K, ax=ax, with_labels=True, node_size=2000, edge_cmap=plt.cm.Greys)
ax.set_title('Network of Connections for Nodes with at least 3 Matched Topics')
plt.show()

In [None]:
# Rebuild the topic graph with a looser threshold (>= 2 shared topics) for
# community detection.
K = nx.from_pandas_edgelist(df_net_grp[df_net_grp['topics'] >= 2], source='source', target='target', edge_attr='topics')
# The only edge attribute on K is 'topics'. The previous weight='total' named
# an attribute that does not exist on this graph, so every edge silently got
# the default weight of 1 and the shared-topic counts were ignored.
comms = nx.community.louvain_communities(K, weight='topics', seed=42)
comms

In [None]:
# Plot each topic-based community separately and collect its size statistics.
community_features = []
for i, comm in enumerate(comms):
    subgraph = K.subgraph(comm)

    plt.figure(figsize=(20, 18))
    nx.draw_networkx(subgraph, with_labels=True, node_size=2000, edge_cmap=plt.cm.Greys)
    plt.title(f'Network {i}')
    plt.show()

    # Features: average degree (rounded to a whole number), number of edges, number of nodes.
    num_nodes = subgraph.number_of_nodes()
    num_edges = subgraph.number_of_edges()
    degree_total = sum(degree for _, degree in subgraph.degree())
    avg_degree = round(degree_total / len(subgraph), 0)
    community_features.append([avg_degree, num_edges, num_nodes])

In [None]:
# One [avg_degree, num_edges, num_nodes] entry per topic-based community, in
# the same order as the community plots above.
community_features

In [None]:
# Per-community breakdown plots for the topic-based communities, to check the
# hypothesis that the clusters are a water group, a fire group, other
# meaningful characters, and side characters.
# Some characters are not found in both datasets, so they are missing from merged_df.
breakdowns = (
    # (column to count, trace name, (subplot row, subplot column))
    ('Ethnicity', 'Ethnicity Breakdown', (1, 1)),
    ('Weapon of choice', 'Weapon of Choice Breakdown', (1, 2)),
    ('book', 'Book Appearance Breakdown', (2, 1)),
)
for each in comms:
    df_temp = merged_df[merged_df['character'].isin(each)]
    fig_plt = make_subplots(rows=2, cols=2, subplot_titles=('Ethnicity', 'Weapon of Choice', 'Book of First Appearance'))
    for source_col, label, (grid_row, grid_col) in breakdowns:
        fig_plt.add_trace(
            go.Histogram(x=df_temp[source_col], texttemplate='%{y}', name=label),
            row=grid_row,
            col=grid_col,
        )

    fig_plt.update_layout(height=800, width=1600, template='simple_white')
    fig_plt.show()

### Deep Learning to Classify Character Lines

In [None]:
# Re-display the per-(character, chapter) documents used as classifier input.
script_full_ep_lm

In [None]:
# Characters to classify, and their per-chapter documents.
chars_dl = [
    'Sokka', 'Toph', 'Aang', 'Katara', 'Suki', 'Zuko',
    'Azula', 'Iroh', 'Ty Lee', 'Mai', 'Ozai',
]
is_target_char = script_full_ep_lm['character'].isin(chars_dl)
script_dl = script_full_ep_lm.loc[is_target_char]

In [None]:
# Inspect the documents kept for the selected characters.
script_dl

In [None]:
# 1. Split the raw documents first. Fitting TF-IDF on all rows before the split
#    would let the vocabulary and IDF weights see the test documents (data
#    leakage). Same test_size and random_state as before.
texts_train, texts_test, y_train, y_test = train_test_split(
    script_dl['text_cln'], script_dl['character'], test_size=0.2, random_state=42
)

# 2. Learn the TF-IDF vocabulary and weights on the training documents only,
#    then apply that fitted transform to the test documents.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# 3. Model Training
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=42)
mlp.fit(X_train, y_train)

# 4. Prediction and Evaluation
y_pred = mlp.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Fix the label order explicitly and pass it to BOTH the matrix and the display.
# Without `labels=`, confusion_matrix only covers classes that occur in y_test
# or y_pred, so a character missing from the test split would make the matrix
# smaller than the label list and the axis labels would no longer match.
# sorted() is used instead of chars_dl.sort() so the shared list is not mutated.
cm_labels = sorted(chars_dl)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
disp.plot()
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Same classification task, but with every spoken line as its own sample
# instead of one document per chapter.
is_target_line = script_pre['character'].isin(chars_dl)
ln_dl = script_pre.loc[is_target_line]
ln_dl

In [None]:
# 1. Split the raw lines first. Fitting TF-IDF on all rows before the split
#    would let the vocabulary and IDF weights see the test lines (data
#    leakage). Same test_size and random_state as before.
texts_train, texts_test, y_train, y_test = train_test_split(
    ln_dl['text_cln'], ln_dl['character'], test_size=0.2, random_state=42
)

# 2. Learn the TF-IDF vocabulary and weights on the training lines only, then
#    apply that fitted transform to the test lines.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# 3. Model Training
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=200, random_state=42)
mlp.fit(X_train, y_train)

# 4. Prediction and Evaluation
y_pred = mlp.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Fix the label order explicitly and pass it to BOTH the matrix and the display.
# Without `labels=`, confusion_matrix only covers classes that occur in y_test
# or y_pred, so a class missing from the test split would make the matrix
# smaller than the label list and the axis labels would no longer match.
# sorted() is used instead of chars_dl.sort() so the shared list is not mutated.
cm_labels = sorted(chars_dl)
cm = confusion_matrix(y_test, y_pred, labels=cm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
disp.plot()
plt.title("Confusion Matrix")
plt.show()