In [1]:
import warnings
warnings.filterwarnings('ignore')     # to avoid warning messages

In [2]:
!pip install plotly 
!pip install community 
!pip install python-louvain
!pip install colorlover

Collecting plotly
  Downloading plotly-5.12.0-py2.py3-none-any.whl (15.2 MB)
[K     |████████████████████████████████| 15.2 MB 668 kB/s eta 0:00:01
[?25hCollecting tenacity>=6.2.0
  Downloading tenacity-8.1.0-py3-none-any.whl (23 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.12.0 tenacity-8.1.0
Collecting community
  Downloading community-1.0.0b1.tar.gz (2.2 kB)
Building wheels for collected packages: community
  Building wheel for community (setup.py) ... [?25ldone
[?25h  Created wheel for community: filename=community-1.0.0b1-py3-none-any.whl size=2153 sha256=8cf4efeedfc974f2cce734f064a76d8a075e0e706004d7381869647b4b0d9476
  Stored in directory: /Users/ArunRam/Library/Caches/pip/wheels/e5/32/7b/aebaa96975f39189b78d7b123cf90e5fd2f0418953833b3043
Successfully built community
Installing collected packages: community
Successfully installed community-1.0.0b1
Collecting python-louvain
  Downloading python-louvain-0.16.tar.gz (204 kB)
[K     |█████

In [3]:
%matplotlib inline

import networkx as nx

from decorator import decorator

from networkx.utils import create_random_state, create_py_random_state

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import os

# Remove scientific notations and display numbers with 2 decimal points instead
pd.options.display.float_format = '{:,.2f}'.format        

# Update default background style of plots
sns.set_style(style='darkgrid')

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot

import plotly.graph_objs as go

import plotly

import plotly.express as px
init_notebook_mode(connected=True)

In [4]:
os.listdir("raw_data_books/")

['book1.csv', 'book2.csv', 'book3.csv', 'book4.csv', 'book5.csv']

In [5]:
book1 = pd.read_csv("raw_data_books/book1.csv")

In [6]:
book1.shape

(684, 5)

In [7]:
book1.head()

Unnamed: 0,Person 1,Person 2,Type,weight,book
0,Addam-Marbrand,Jaime-Lannister,Undirected,3,1
1,Addam-Marbrand,Tywin-Lannister,Undirected,6,1
2,Aegon-I-Targaryen,Daenerys-Targaryen,Undirected,5,1
3,Aegon-I-Targaryen,Eddard-Stark,Undirected,4,1
4,Aemon-Targaryen-(Maester-Aemon),Alliser-Thorne,Undirected,4,1


In [8]:
book2 = pd.read_csv("raw_data_books/book2.csv")

book3 = pd.read_csv("raw_data_books/book3.csv")

book4 = pd.read_csv("raw_data_books/book4.csv")

book5 = pd.read_csv("raw_data_books/book5.csv")

In [9]:
books = [book1, book2, book3, book4, book5]

books_combined = pd.DataFrame()

for book in books:
    books_combined = pd.concat([books_combined, book])

# Grouping the data by Person 2 and Person 1 to avoid multiple entries with the same characters 
books_combined = books_combined.groupby(["Person 2", "Person 1"], as_index = False)["weight"].sum()

In [10]:
# DESCRIPTIVE ANALYTICS

books_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Person 2  2823 non-null   object
 1   Person 1  2823 non-null   object
 2   weight    2823 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 66.3+ KB


In [11]:
books_combined.describe()

Unnamed: 0,weight
count,2823.0
mean,11.56
std,19.98
min,3.0
25%,3.0
50%,5.0
75%,11.0
max,334.0


In [None]:
# OBSERVATIONS

# There are **2823 edges** in total, or 2823 co-occurrences of characters.
#The **minimum weight is 3** (meaning every co-occurrence pair has been observed at least thrice), and the **maximum weight is 334**.
# The **mean weight is 11.56**, meaning that on average, two co-occurring characters are mentioned around 12 times together. **The median of 5** also implies that **it is the maximum weight which is more likely the outlier,** which is also affirmed by the fact that 75% of the weight values are 11 or lower.

In [13]:
books_combined[books_combined["weight"] == 334]

Unnamed: 0,Person 2,Person 1,weight
1570,Robert-Baratheon,Eddard-Stark,334


## **Creating a Graph Network (for each book as well as all books combined)**


In [15]:
G1 = nx.from_pandas_edgelist(book1, 'Person 1', "Person 2", edge_attr = "weight", create_using = nx.Graph())

G2 = nx.from_pandas_edgelist(book2, 'Person 1', "Person 2", edge_attr = "weight", create_using = nx.Graph())

G3 = nx.from_pandas_edgelist(book3, 'Person 1', "Person 2", edge_attr = "weight", create_using = nx.Graph())

G4 = nx.from_pandas_edgelist(book4, 'Person 1', "Person 2", edge_attr = "weight", create_using = nx.Graph())

G5 = nx.from_pandas_edgelist(book5, 'Person 1', "Person 2", edge_attr = "weight", create_using = nx.Graph())

G = nx.from_pandas_edgelist(books_combined, 'Person 1', "Person 2", edge_attr = "weight", create_using = nx.Graph())

In [48]:
# Number of nodes & edges in combined graph network

nx.info(G)
#print(list(G.degree()),'\n')
sorted(list(G.degree()), key = lambda x:x[1], reverse = True)

[('Tyrion-Lannister', 122),
 ('Jon-Snow', 114),
 ('Jaime-Lannister', 101),
 ('Cersei-Lannister', 97),
 ('Stannis-Baratheon', 89),
 ('Arya-Stark', 84),
 ('Catelyn-Stark', 75),
 ('Sansa-Stark', 75),
 ('Eddard-Stark', 74),
 ('Robb-Stark', 74),
 ('Daenerys-Targaryen', 73),
 ('Joffrey-Baratheon', 69),
 ('Theon-Greyjoy', 66),
 ('Robert-Baratheon', 65),
 ('Bran-Stark', 54),
 ('Tywin-Lannister', 48),
 ('Petyr-Baelish', 47),
 ('Brienne-of-Tarth', 45),
 ('Samwell-Tarly', 44),
 ('Barristan-Selmy', 41),
 ('Renly-Baratheon', 38),
 ('Sandor-Clegane', 36),
 ('Varys', 34),
 ('Davos-Seaworth', 33),
 ('Gregor-Clegane', 33),
 ('Mance-Rayder', 33),
 ('Tommen-Baratheon', 33),
 ('Margaery-Tyrell', 30),
 ('Rodrik-Cassel', 30),
 ('Jeor-Mormont', 28),
 ('Loras-Tyrell', 28),
 ('Roose-Bolton', 28),
 ('Pycelle', 27),
 ('Aemon-Targaryen-(Maester-Aemon)', 26),
 ('Jorah-Mormont', 26),
 ('Asha-Greyjoy', 25),
 ('Bronn', 25),
 ('Edmure-Tully', 24),
 ('Lysa-Arryn', 24),
 ('Victarion-Greyjoy', 24),
 ('Janos-Slynt', 23),


### **Creating functions to calculate the number of unique connections per character, Degree Centrality, Eigenvector Centrality, and Betweenness Centrality**

In [34]:
# The number of unique connections

def numUniqueConnec(G):
    numUniqueConnection = list(G.degree())
    
    numUniqueConnection = sorted(numUniqueConnection, key = lambda x:x[1], reverse = True)
    
    numUniqueConnection = pd.DataFrame.from_dict(numUniqueConnection)
    
    numUniqueConnection.columns = (["Character", "NumUniqueHCPConnections"])
    
    return numUniqueConnection

In [35]:
numUniqueConnec(G)

Unnamed: 0,Character,NumUniqueHCPConnections
0,Tyrion-Lannister,122
1,Jon-Snow,114
2,Jaime-Lannister,101
3,Cersei-Lannister,97
4,Stannis-Baratheon,89
...,...,...
791,Wynton-Stout,1
792,Bael-the-Bard,1
793,Yorko-Terys,1
794,Yurkhaz-zo-Yunzak,1


In [49]:
# Degree Centrality

def deg_central(G):
    deg_centrality = nx.degree_centrality(G)
    
    deg_centrality_sort = sorted(deg_centrality.items(), key = lambda x:x[1], reverse = True)
    
    deg_centrality_sort = pd.DataFrame.from_dict(deg_centrality_sort)
    
    deg_centrality_sort.columns = (["Character", "Degree Centrality"])
    
    return deg_centrality_sort

In [51]:
deg_cen_sort = deg_central(G)
deg_cen_sort

Unnamed: 0,Character,Degree Centrality
0,Tyrion-Lannister,0.15
1,Jon-Snow,0.14
2,Jaime-Lannister,0.13
3,Cersei-Lannister,0.12
4,Stannis-Baratheon,0.11
...,...,...
791,Wynton-Stout,0.00
792,Bael-the-Bard,0.00
793,Yorko-Terys,0.00
794,Yurkhaz-zo-Yunzak,0.00


In [52]:
#Eigenvector Centrality

def eigen_central(G):
    eigen_centrality = nx.eigenvector_centrality(G, weight = "weight")
    
    eigen_centrality_sort = sorted(eigen_centrality.items(), key = lambda x:x[1], reverse = True)
    
    eigen_centrality_sort = pd.DataFrame.from_dict(eigen_centrality_sort)
    
    eigen_centrality_sort.columns = (["Character", "EigenVector Centrality"])
    
    return eigen_centrality_sort

eigen_central(G)

Unnamed: 0,Character,EigenVector Centrality
0,Tyrion-Lannister,0.38
1,Cersei-Lannister,0.36
2,Joffrey-Baratheon,0.34
3,Robert-Baratheon,0.28
4,Eddard-Stark,0.28
...,...,...
791,Simon-Toyne,0.00
792,Hugh-Hungerford,0.00
793,Murch,0.00
794,Torwold-Browntooth,0.00


In [55]:
#Betweenness Centrality

def betweenness_central(G):
    betweenness_centrality = nx.betweenness_centrality(G, weight = "weight")
    
    betweenness_centrality_sort = sorted(betweenness_centrality.items(), key = lambda x:x[1], reverse = True)
    
    betweenness_centrality_sort = pd.DataFrame.from_dict(betweenness_centrality_sort)
    
    betweenness_centrality_sort.columns = (["Character", "Betweenness Centrality"])
    
    return betweenness_centrality_sort

betweenness_central(G)

Unnamed: 0,Character,Betweenness Centrality
0,Jon-Snow,0.13
1,Theon-Greyjoy,0.12
2,Jaime-Lannister,0.12
3,Daenerys-Targaryen,0.09
4,Stannis-Baratheon,0.09
...,...,...
791,Yandry,0.00
792,Bael-the-Bard,0.00
793,Yorko-Terys,0.00
794,Yurkhaz-zo-Yunzak,0.00


In [56]:
def draw_plotly_network_graph(Graph_obj, filter = None, filter_nodesbydegree = None):
    G_dup = Graph_obj.copy()

    degrees = nx.classes.degree(G_dup)
    
    degree_df = pd.DataFrame(degrees)
    
    if filter is not None:
        top = deg_centrality_sort[:filter_nodesbydegree]["Character"].values
        
        G_dup.remove_nodes_from([node
                             for node in G_dup.nodes
                             if node not in top
                            ]) # Filter out the nodes that fewer connections

    pos = nx.spring_layout(G_dup)

    for n, p in pos.items():
        G_dup.nodes[n]['pos'] = p

    edge_trace = go.Scatter(
        x = [],
        y = [],
        line = dict(width = 0.5, color = '#888'),
        hoverinfo = 'none',
        mode = 'lines')

    for edge in G_dup.edges():
        x0, y0 = G_dup.nodes[edge[0]]['pos']
        
        x1, y1 = G_dup.nodes[edge[1]]['pos']
        
        edge_trace['x'] += tuple([x0, x1, None])
        
        edge_trace['y'] += tuple([y0, y1, None])

    node_trace = go.Scatter(
        x = [],
        y = [],
        text = [],
        mode = 'markers',
        hoverinfo = 'text',
        marker = dict(
            showscale = True,
            colorscale = 'RdBu',
            reversescale = True,
            color = [],
            size = 15,
            colorbar = dict(
                thickness = 10,
                title = 'Node Connections',
                xanchor = 'left',
                titleside = 'right'
            ),
            line = dict(width = 0)))

    for node in G_dup.nodes():
        x, y = G_dup.nodes[node]['pos']
        
        node_trace['x'] += tuple([x])
        
        node_trace['y'] += tuple([y])

    for node, adjacencies in enumerate(G_dup.adjacency()):
        node_trace['marker']['color'] += tuple([int(degree_df[degree_df[0] == adjacencies[0]][1].values)])
        
        node_info = adjacencies[0] + '<br /># of connections: ' + str(int(degree_df[degree_df[0] == adjacencies[0]][1].values))
        
        node_trace['text'] += tuple([node_info])

    fig = go.Figure(data = [edge_trace, node_trace],
                 layout = go.Layout(
                    title = '<br>GOT network connections',
                    titlefont = dict(size = 20),
                    showlegend = False,
                    hovermode = 'closest',
                    margin = dict(b = 20, l = 5, r = 5, t = 0),
                    annotations=[ dict(
                        text = "",
                        showarrow = False,
                        xref = "paper", yref = "paper") ],
                    xaxis = dict(showgrid = False, zeroline = False, showticklabels = False),
                    yaxis = dict(showgrid = False, zeroline = False, showticklabels = False)))

    iplot(fig)

In [57]:
draw_plotly_network_graph(Graph_obj = G, filter = None, filter_nodesbydegree = None)

# Note: This cell will take sometime to run

### **All Books Combined**

In [60]:
draw_plotly_network_graph(Graph_obj = G, filter = "Yes", filter_nodesbydegree = 50)

NameError: name 'deg_centrality_sort' is not defined

#### **Summary - Book 1**

1. **Eddard Stark** is the most connected character, followed by **Robert Baratheon**. 
2. Tyrion, Catelyn, and Jon are in the top 5 characters.
3. Rob, Sansa, and Bran are all well-connected too, but the first book mostly revolves around Ed Stark and Robert Baratheon.
4. Cersei Lannister, Joffrey Baratheon, Jamie Lannister, Arya Stark, Daenerys, and Drogo are the other well-connected characters in this book.

The above findings make sense considering the plot of Book 1. Robert Baratheon, the king of the seven kingdoms, visits the House of Stark to offer Eddard Stark the position of Hand of the King, which Stark accepts. Eddard Stark's two daughters Arya and Sansa, also accompany him to the King's Landing, while his son Robb Stark looks after the House of Stark in Eddard's absence. The book eventually ends with the death of Robert Baratheon and the execution of Ed Stark by the new king Joffrey Baratheon. Robert and Eddard's importance in the story and their links to other characters in the book makes it logical that they are the two most connected characters in Book 1 of the series, with the **highest Degree Centrality measures** as seen in the table below. Tyrion Lannister, already the next most important character in Book 1, gains prominence in the coming books and becomes the most connected character in the book series overall.

In [59]:
deg_central(G1)[:20]

Unnamed: 0,Character,Degree Centrality
0,Eddard-Stark,0.35
1,Robert-Baratheon,0.27
2,Tyrion-Lannister,0.25
3,Catelyn-Stark,0.23
4,Jon-Snow,0.2
5,Robb-Stark,0.19
6,Sansa-Stark,0.19
7,Bran-Stark,0.17
8,Cersei-Lannister,0.16
9,Joffrey-Baratheon,0.16
