In [1]:
# Restrictions: ONLY named individuals

import pandas as pd

json_file_path = './ice-and-fire.json'
df = pd.read_json(json_file_path)
df.drop('predecessor', axis=1, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1345 entries, 0 to 1344
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           1345 non-null   object
 1   allegiance     1186 non-null   object
 2   father         942 non-null    object
 3   successor      194 non-null    object
 4   url            1345 non-null   object
 5   fatherRumored  11 non-null     object
 6   mother         490 non-null    object
 7   spouse         524 non-null    object
 8   motherRumored  2 non-null      object
dtypes: object(9)
memory usage: 94.7+ KB


In [2]:
# Remove datapoints that have no relation at all.
# Old code and technically not necessarily anymore, already done in node.js
data_without_relation = df[(df['father'].isnull()) \
                           & (df['mother'].isnull()) \
                           & (df['successor'].isnull()) \
                           & (df['spouse'].isnull())]

data_with_relation = df[~df['name'].isin(data_without_relation['name'])]
data_with_relation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1316 entries, 0 to 1344
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           1316 non-null   object
 1   allegiance     1173 non-null   object
 2   father         942 non-null    object
 3   successor      194 non-null    object
 4   url            1316 non-null   object
 5   fatherRumored  11 non-null     object
 6   mother         490 non-null    object
 7   spouse         524 non-null    object
 8   motherRumored  2 non-null      object
dtypes: object(9)
memory usage: 102.8+ KB


In [3]:
duplicate_names = df[df['name'].duplicated()]
duplicate_names['name'].value_counts()

Series([], Name: name, dtype: int64)

In [4]:
# Weed out double fathers.
# Some are actually double, because they are rumoured
multiple_fathers = df[df['father'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
multiple_fathers['father']

Series([], Name: father, dtype: object)

In [5]:
# Weed out double moms.
# Some are actually double, because they are rumoured
multiple_mothers = df[df['mother'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
multiple_mothers['mother']

Series([], Name: mother, dtype: object)

In [6]:
# Weed out erratic spouses.
multiple_spouses = df[df['spouse'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
multiple_spouses[['name', 'spouse']]

Unnamed: 0,name,spouse
5,Adrack Humble,"[Rock wife, Salt wife]"
82,Alys Karstark,"[Sigorn, Thenns]"
103,Alyssa Velaryon,"[Aenys I Targaryen, Rogar Baratheon]"
108,Amerei Frey,"[Pate of the Blue Fork, Lancel Lannister]"
290,Craster,"[Dyah, Ferny, Nella, Gilly]"
293,Cregan Stark,"[Arra Norrey, Alysanne Blackwood, Lynara Stark]"
310,Daemon Targaryen,"[Rhea Royce, Laena Velaryon, Rhaenyra Targaryen]"
328,Dalton Greyjoy,"[Tess, Kayce, Lysa Farman]"
342,Davos Dayne,"[Nymeria, Ny Sar]"
369,Donella Hornwood,"[Halys Hornwood, Ramsay Snow]"


In [7]:
# Datapoints with children
data_with_children = data_with_relation[
    (~data_with_relation['successor'].isnull()) 
    | (data_with_relation['name'].isin(data_with_relation['mother']))
    | (data_with_relation['name'].isin(data_with_relation['father']))]

data_with_children.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 462 entries, 0 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           462 non-null    object
 1   allegiance     374 non-null    object
 2   father         255 non-null    object
 3   successor      194 non-null    object
 4   url            462 non-null    object
 5   fatherRumored  2 non-null      object
 6   mother         148 non-null    object
 7   spouse         290 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 36.1+ KB


In [8]:
# Datapoints for whom spouse is the only relationship

data_with_spouses = data_with_relation[(~data_with_relation['spouse'].isnull()) 
                                      & (df['father'].isnull()) 
                                       & (df['mother'].isnull()) 
                                       & (df['successor'].isnull())]

data_with_spouses.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 257 entries, 5 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           257 non-null    object
 1   allegiance     242 non-null    object
 2   father         0 non-null      object
 3   successor      0 non-null      object
 4   url            257 non-null    object
 5   fatherRumored  0 non-null      object
 6   mother         0 non-null      object
 7   spouse         257 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 20.1+ KB


  data_with_spouses = data_with_relation[(~data_with_relation['spouse'].isnull())


In [9]:
data_with_spouses.head()

Unnamed: 0,name,allegiance,father,successor,url,fatherRumored,mother,spouse,motherRumored
5,Adrack Humble,"[House Humble, House Greyjoy]",,,http://awoiaf.westeros.org/index.php/Adrack_Hu...,,,"[Rock wife, Salt wife]",
22,Aelinor Penrose,"[House Penrose, House Targaryen]",,,http://awoiaf.westeros.org/index.php/Aelinor_P...,,,[Aerys I Targaryen],
47,Alarra Massey,"[House Massey, House Velaryon]",,,http://awoiaf.westeros.org/index.php/Alarra_Ma...,,,[Aethan Velaryon],
50,Alayne,[House Baelish],,,http://awoiaf.westeros.org/index.php/Alayne_Ba...,,,[Baelish (lord)],
74,Alyn Tarbeck,[House Tarbeck],,,http://awoiaf.westeros.org/index.php/Alyn_Tarbeck,,,[Jeyne Westerling (wife of Maegor I)],


In [10]:
# Get spouses that have a spouse present in the dataset with all 
# parent-child relationships

parent_child_spouses = df.explode('spouse').dropna(subset=['spouse'])['spouse'].unique()
data_spouse_family = data_with_spouses[data_with_spouses['name'].isin(parent_child_spouses)]
data_spouse_family.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 47 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           192 non-null    object
 1   allegiance     182 non-null    object
 2   father         0 non-null      object
 3   successor      0 non-null      object
 4   url            192 non-null    object
 5   fatherRumored  0 non-null      object
 6   mother         0 non-null      object
 7   spouse         192 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 15.0+ KB


In [11]:
data_spouse_family.head()

Unnamed: 0,name,allegiance,father,successor,url,fatherRumored,mother,spouse,motherRumored
47,Alarra Massey,"[House Massey, House Velaryon]",,,http://awoiaf.westeros.org/index.php/Alarra_Ma...,,,[Aethan Velaryon],
74,Alyn Tarbeck,[House Tarbeck],,,http://awoiaf.westeros.org/index.php/Alyn_Tarbeck,,,[Jeyne Westerling (wife of Maegor I)],
78,Alys Arryn (wife of Rhaegel),"[House Arryn, House Targaryen]",,,http://awoiaf.westeros.org/index.php/Alys_Arry...,,,[Rhaegel Targaryen],
79,Alys Beesbury,"[House Beesbury, House Tyrell]",,,http://awoiaf.westeros.org/index.php/Alys_Bees...,,,[Leo Tyrell (son of Victor)],
80,Alys Frey,[House Frey],,,http://awoiaf.westeros.org/index.php/Alys_Frey,,,[Jared Frey],


In [12]:
all_usable_data = pd.concat([data_with_children, data_spouse_family])
len(all_usable_data)

654

In [13]:
all_usable_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 654 entries, 0 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           654 non-null    object
 1   allegiance     556 non-null    object
 2   father         255 non-null    object
 3   successor      194 non-null    object
 4   url            654 non-null    object
 5   fatherRumored  2 non-null      object
 6   mother         148 non-null    object
 7   spouse         482 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 51.1+ KB


In [14]:
first_layer = all_usable_data[(all_usable_data['father'].isnull()) & (all_usable_data['mother'].isnull())]
first_layer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 47 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           392 non-null    object
 1   allegiance     344 non-null    object
 2   father         0 non-null      object
 3   successor      80 non-null     object
 4   url            392 non-null    object
 5   fatherRumored  0 non-null      object
 6   mother         0 non-null      object
 7   spouse         324 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 30.6+ KB


In [15]:
all_usable_data.explode('allegiance').dropna(subset=['allegiance']).value_counts('allegiance').head(50)

allegiance
House Frey                        87
House Targaryen                   63
House Stark                       55
House Lannister                   47
Blacks                            29
House Tyrell                      25
House Baratheon                   18
House Hightower                   18
House Velaryon                    16
House Arryn                       13
Greens                            13
House Martell                     12
House Greyjoy                     11
House Waynwood                    10
House Tarbeck                      9
House Tully                        9
House Florent                      9
House Royce                        8
House Manderly                     8
House Glover                       8
House Westerling                   8
House Blackwood                    7
House Redwyne                      7
House Stokeworth                   7
City Watch of King's Landing       7
House Swann                        6
House Locke                

In [16]:
first_layer[first_layer['successor'].notna()]['successor'].head(100)

58          [Alekyne Florent, Davos Seaworth]
65           [Edwell Celtigar, Osmund Strong]
73      [Lord Stokeworth, Maegor I Targaryen]
107                            [Lord Hayford]
131                       [Arlan V Durrandon]
                        ...                  
1257                        [Torgon Greyiron]
1263                      [Urrigon Hightower]
1273                                 [Qyburn]
1322                         [Barbrey Dustin]
1331                         [Rohanne Webber]
Name: successor, Length: 80, dtype: object

In [17]:
# Transform nodes into partnerships (= single or couple), 
# so that they will be easier to use in a hierarchical structure

all_with_spouse = all_usable_data[all_usable_data['spouse'].notnull()]
all_with_spouse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 482 entries, 24 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           482 non-null    object
 1   allegiance     467 non-null    object
 2   father         155 non-null    object
 3   successor      61 non-null     object
 4   url            482 non-null    object
 5   fatherRumored  1 non-null      object
 6   mother         102 non-null    object
 7   spouse         482 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 37.7+ KB


In [18]:
all_no_spouse = all_usable_data[all_usable_data['spouse'].isna()]
all_no_spouse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172 entries, 0 to 1331
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           172 non-null    object
 1   allegiance     89 non-null     object
 2   father         100 non-null    object
 3   successor      133 non-null    object
 4   url            172 non-null    object
 5   fatherRumored  1 non-null      object
 6   mother         46 non-null     object
 7   spouse         0 non-null      object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 13.4+ KB


In [19]:
all_with_spouse.reset_index(inplace=True)
all_relations = all_with_spouse.explode('spouse').dropna(subset=['spouse'])
all_relations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 519 entries, 0 to 481
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   index          519 non-null    int64 
 1   name           519 non-null    object
 2   allegiance     500 non-null    object
 3   father         188 non-null    object
 4   successor      77 non-null     object
 5   url            519 non-null    object
 6   fatherRumored  1 non-null      object
 7   mother         119 non-null    object
 8   spouse         519 non-null    object
 9   motherRumored  0 non-null      object
dtypes: int64(1), object(9)
memory usage: 44.6+ KB


In [26]:
# all_relations.reset_index(inplace=True)
all_relations_with_spouse = all_relations.merge(all_relations, how='left', left_on='spouse', right_on='name', suffixes=('', '_spouse'))

# trick to be able to drop duplicates even of some values are lists
# all_relations_with_spouse = all_relations_with_spouse.loc[all_relations_with_spouse.astype(str).drop_duplicates().index]
all_relations_with_spouse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 527 entries, 0 to 827
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 527 non-null    int64  
 1   name                  527 non-null    object 
 2   allegiance            502 non-null    object 
 3   father                208 non-null    object 
 4   successor             86 non-null     object 
 5   url                   527 non-null    object 
 6   fatherRumored         1 non-null      object 
 7   mother                130 non-null    object 
 8   spouse                527 non-null    object 
 9   motherRumored         0 non-null      object 
 10  index_spouse          417 non-null    float64
 11  name_spouse           417 non-null    object 
 12  allegiance_spouse     397 non-null    object 
 13  father_spouse         241 non-null    object 
 14  successor_spouse      70 non-null     object 
 15  url_spouse            4

In [29]:
all_relations_with_spouse[all_relations_with_spouse['name_spouse'].isna()].head(20)

Unnamed: 0,index,name,allegiance,father,successor,url,fatherRumored,mother,spouse,motherRumored,index_spouse,name_spouse,allegiance_spouse,father_spouse,successor_spouse,url_spouse,fatherRumored_spouse,mother_spouse,spouse_spouse,motherRumored_spouse
0,24,Aelora Targaryen,[House Targaryen],Rhaegel Targaryen,[Maekar I Targaryen],http://awoiaf.westeros.org/index.php/Aelora_Ta...,,Alys Arryn (wife of Rhaegel),Aelor Targaryen,,,,,,,,,,,
1,26,Aemma Arryn,"[House Arryn, House Targaryen]",Rodrik Arryn,[Alicent Hightower],http://awoiaf.westeros.org/index.php/Aemma_Arryn,,Daella Targaryen (daughter of Jaehaerys I),Viserys I Targaryen,,,,,,,,,,,
8,45,Alannys Harlaw,"[House Greyjoy, House Harlaw]",House Harlaw,,http://awoiaf.westeros.org/index.php/Alannys_H...,,,Balon Greyjoy,,,,,,,,,,,
15,62,Alicent Hightower,"[House Hightower, House Targaryen, Greens]",Otto Hightower,"[Helaena Targaryen, Daemon Targaryen]",http://awoiaf.westeros.org/index.php/Alicent_H...,,,Viserys I Targaryen,,,,,,,,,,,
26,81,Alys Harroway,"[House Harroway, House Targaryen]",Lucas Harroway,"[Ceryse Hightower, Tyanna of the Tower]",http://awoiaf.westeros.org/index.php/Alys_Harr...,,,Maegor I Targaryen,,,,,,,,,,,
35,99,Alysanne Targaryen,[House Targaryen],Aenys I Targaryen,[Aemma Arryn],http://awoiaf.westeros.org/index.php/Alysanne_...,,Alyssa Velaryon,Jaehaerys I Targaryen,,,,,,,,,,,
44,102,Alyssa Targaryen,[House Targaryen],Jaehaerys I Targaryen,,http://awoiaf.westeros.org/index.php/Alyssa_Ta...,,Alysanne Targaryen,Baelon Targaryen (son of Jaehaerys I),,,,,,,,,,,
45,103,Alyssa Velaryon,"[House Velaryon, House Targaryen, House Barath...",Aethan Velaryon,,http://awoiaf.westeros.org/index.php/Alyssa_Ve...,,Alarra Massey,Aenys I Targaryen,,,,,,,,,,,
55,107,Ambrose Butterwell,"[House Butterwell, House Targaryen, House Blac...",,[Lord Hayford],http://awoiaf.westeros.org/index.php/Ambrose_B...,,,Lady Frey (wife of Ambrose Butterwell),,,,,,,,,,,
73,143,Arya Flint,"[House Flint of the mountains, House Stark]",,,http://awoiaf.westeros.org/index.php/Arya_Flint,,,Rodrik Stark (son of Beron),,,,,,,,,,,


In [24]:
# all_relations_with_spouse.reset_index(inplace=True)
# all_no_spouse.reset_index(inplace=True)
final_data = pd.concat([all_relations_with_spouse, all_no_spouse])
final_data.info()

# Save final dataset
# all_usable_data.to_json('./ice_and_fire_final.json', orient='records')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 699 entries, 0 to 171
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   level_0               527 non-null    float64
 1   index                 699 non-null    int64  
 2   name                  699 non-null    object 
 3   allegiance            591 non-null    object 
 4   father                308 non-null    object 
 5   successor             219 non-null    object 
 6   url                   699 non-null    object 
 7   fatherRumored         2 non-null      object 
 8   mother                176 non-null    object 
 9   spouse                527 non-null    object 
 10  motherRumored         0 non-null      object 
 11  index_spouse          417 non-null    float64
 12  name_spouse           417 non-null    object 
 13  allegiance_spouse     397 non-null    object 
 14  father_spouse         241 non-null    object 
 15  successor_spouse      7