In [1]:
# Restrictions: ONLY named individuals

import pandas as pd

json_file_path = './ice-and-fire.json'
df = pd.read_json(json_file_path)

# Really only focus on family relations
df.drop('predecessor', axis=1, inplace=True)
df.drop('successor', axis=1, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1345 entries, 0 to 1344
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           1345 non-null   object
 1   allegiance     1186 non-null   object
 2   father         942 non-null    object
 3   url            1345 non-null   object
 4   fatherRumored  11 non-null     object
 5   mother         490 non-null    object
 6   spouse         523 non-null    object
 7   motherRumored  2 non-null      object
dtypes: object(8)
memory usage: 84.2+ KB


In [2]:
# Remove datapoints that have no relation at all.
# Old code and technically not necessarily anymore, already done in node.js
data_without_relation = df[(df['father'].isnull()) \
                           & (df['mother'].isnull()) \
                           & (df['spouse'].isnull())]

data_with_relation = df[~df['name'].isin(data_without_relation['name'])]
data_with_relation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1247 entries, 0 to 1344
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           1247 non-null   object
 1   allegiance     1139 non-null   object
 2   father         942 non-null    object
 3   url            1247 non-null   object
 4   fatherRumored  11 non-null     object
 5   mother         490 non-null    object
 6   spouse         523 non-null    object
 7   motherRumored  2 non-null      object
dtypes: object(8)
memory usage: 87.7+ KB


In [3]:
duplicate_names = df[df['name'].duplicated()]
duplicate_names['name'].value_counts()

Series([], Name: name, dtype: int64)

In [4]:
# Weed out double fathers.
# Some are actually double, because they are rumoured
multiple_fathers = df[df['father'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
multiple_fathers['father']

Series([], Name: father, dtype: object)

In [5]:
# Weed out double moms.
# Some are actually double, because they are rumoured
multiple_mothers = df[df['mother'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
multiple_mothers['mother']

Series([], Name: mother, dtype: object)

In [6]:
# Weed out erratic spouses.
multiple_spouses = df[df['spouse'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
multiple_spouses[['name', 'spouse']]

Unnamed: 0,name,spouse
82,Alys Karstark,"[Sigorn, Thenns]"
103,Alyssa Velaryon,"[Aenys I Targaryen, Rogar Baratheon]"
108,Amerei Frey,"[Pate of the Blue Fork, Lancel Lannister]"
290,Craster,"[Dyah, Ferny, Nella, Gilly]"
293,Cregan Stark,"[Arra Norrey, Alysanne Blackwood, Lynara Stark]"
310,Daemon Targaryen,"[Rhea Royce, Laena Velaryon, Rhaenyra Targaryen]"
328,Dalton Greyjoy,"[Tess, Kayce, Lysa Farman]"
342,Davos Dayne,"[Nymeria, Ny Sar]"
369,Donella Hornwood,"[Halys Hornwood, Ramsay Snow]"
401,Elaena Targaryen,"[Ossifer Plumm, Ronnel Penrose, Michael Manwoody]"


In [7]:
# Datapoints with children
data_with_children = data_without_relation[
    (data_without_relation['name'].isin(data_with_relation['mother']))
    | (data_without_relation['name'].isin(data_with_relation['father']))]

data_with_children.head()

Unnamed: 0,name,allegiance,father,url,fatherRumored,mother,spouse,motherRumored
73,Alyn Stokeworth,[House Stokeworth],,http://awoiaf.westeros.org/index.php/Alyn_Stok...,,,,
131,Arrec Durrandon,,,http://awoiaf.westeros.org/index.php/Arrec_Dur...,,,,
180,Benedict I Justman,,,http://awoiaf.westeros.org/index.php/Benedict_...,,,,
220,Brandon Stark (Shipwright),,,http://awoiaf.westeros.org/index.php/Brandon_S...,,,,
223,Brandon Stark (father of Walton),[House Stark],,http://awoiaf.westeros.org/index.php/Brandon_S...,,,,


In [8]:
# data_with_relation[data_with_relation['name'] == 'Aelor Targaryen']
data_with_relation['father']

0       Manfred Hightower (Aegon's Conquest)
1                             Damon Marbrand
2                             Eustace Osgrey
3                            Laenor Velaryon
4                            House Whitehead
                        ...                 
1340                                     NaN
1341                              Yohn Royce
1342                              Tytos Frey
1343                                     NaN
1344                              Tytos Frey
Name: father, Length: 1247, dtype: object

In [9]:
# Datapoints for whom spouse is the only relationship

data_with_spouses = data_with_relation[(~data_with_relation['spouse'].isnull()) 
                                      & (df['father'].isnull()) 
                                       & (df['mother'].isnull())]

data_with_spouses.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 22 to 1343
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           268 non-null    object
 1   allegiance     252 non-null    object
 2   father         0 non-null      object
 3   url            268 non-null    object
 4   fatherRumored  0 non-null      object
 5   mother         0 non-null      object
 6   spouse         268 non-null    object
 7   motherRumored  0 non-null      object
dtypes: object(8)
memory usage: 18.8+ KB


  data_with_spouses = data_with_relation[(~data_with_relation['spouse'].isnull())


In [10]:
data_with_spouses.head()

Unnamed: 0,name,allegiance,father,url,fatherRumored,mother,spouse,motherRumored
22,Aelinor Penrose,"[House Penrose, House Targaryen]",,http://awoiaf.westeros.org/index.php/Aelinor_P...,,,[Aerys I Targaryen],
47,Alarra Massey,"[House Massey, House Velaryon]",,http://awoiaf.westeros.org/index.php/Alarra_Ma...,,,[Aethan Velaryon],
50,Alayne,[House Baelish],,http://awoiaf.westeros.org/index.php/Alayne_Ba...,,,[Baelish (lord)],
58,Alester Florent,"[House Florent, House Tyrell, House Baratheon ...",,http://awoiaf.westeros.org/index.php/Alester_F...,,,[Melara Crane],
74,Alyn Tarbeck,[House Tarbeck],,http://awoiaf.westeros.org/index.php/Alyn_Tarbeck,,,[Jeyne Westerling (wife of Maegor I)],


In [11]:
# Get spouses that have a spouse present in the dataset with all 
# parent-child relationships

parent_child_spouses = df.explode('spouse').dropna(subset=['spouse'])['spouse'].unique()
data_spouse_family = data_with_spouses[data_with_spouses['name'].isin(parent_child_spouses)]
data_spouse_family.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 202 entries, 47 to 1343
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           202 non-null    object
 1   allegiance     192 non-null    object
 2   father         0 non-null      object
 3   url            202 non-null    object
 4   fatherRumored  0 non-null      object
 5   mother         0 non-null      object
 6   spouse         202 non-null    object
 7   motherRumored  0 non-null      object
dtypes: object(8)
memory usage: 14.2+ KB


In [12]:
data_spouse_family.head()

Unnamed: 0,name,allegiance,father,url,fatherRumored,mother,spouse,motherRumored
47,Alarra Massey,"[House Massey, House Velaryon]",,http://awoiaf.westeros.org/index.php/Alarra_Ma...,,,[Aethan Velaryon],
58,Alester Florent,"[House Florent, House Tyrell, House Baratheon ...",,http://awoiaf.westeros.org/index.php/Alester_F...,,,[Melara Crane],
74,Alyn Tarbeck,[House Tarbeck],,http://awoiaf.westeros.org/index.php/Alyn_Tarbeck,,,[Jeyne Westerling (wife of Maegor I)],
78,Alys Arryn (wife of Rhaegel),"[House Arryn, House Targaryen]",,http://awoiaf.westeros.org/index.php/Alys_Arry...,,,[Rhaegel Targaryen],
79,Alys Beesbury,"[House Beesbury, House Tyrell]",,http://awoiaf.westeros.org/index.php/Alys_Bees...,,,[Leo Tyrell (son of Victor)],


In [13]:
all_usable_data = pd.concat([data_with_children, data_spouse_family, data_with_relation])
len(all_usable_data)

1485

In [14]:
all_usable_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1485 entries, 73 to 1344
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           1485 non-null   object
 1   allegiance     1346 non-null   object
 2   father         942 non-null    object
 3   url            1485 non-null   object
 4   fatherRumored  11 non-null     object
 5   mother         490 non-null    object
 6   spouse         725 non-null    object
 7   motherRumored  2 non-null      object
dtypes: object(8)
memory usage: 104.4+ KB


In [15]:
first_layer = all_usable_data[(all_usable_data['father'].isnull()) & (all_usable_data['mother'].isnull())]
first_layer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 73 to 1343
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           506 non-null    object
 1   allegiance     459 non-null    object
 2   father         0 non-null      object
 3   url            506 non-null    object
 4   fatherRumored  0 non-null      object
 5   mother         0 non-null      object
 6   spouse         470 non-null    object
 7   motherRumored  0 non-null      object
dtypes: object(8)
memory usage: 35.6+ KB


In [16]:
all_usable_data.explode('allegiance').dropna(subset=['allegiance']).value_counts('allegiance').head(50)

allegiance
House Frey                        163
House Targaryen                   142
House Stark                       110
House Lannister                    82
Blacks                             53
House Tyrell                       52
House Hightower                    45
House Martell                      36
House Baratheon                    36
House Greyjoy                      32
House Arryn                        28
House Velaryon                     28
Greens                             23
House Tully                        18
House Blackwood                    18
House Bracken                      17
House Manderly                     17
House Royce                        17
Night's Watch                      16
House Florent                      16
House Redwyne                      16
House Blackfyre                    14
House Waynwood                     14
House Tarbeck                      14
Citadel                            13
House Baratheon of Dragonstone     13
H

In [17]:
# first_layer[first_layer['successor'].notna()]['successor'].head(100)

In [18]:
# Transform nodes into partnerships (= single or couple), 
# so that they will be easier to use in a hierarchical structure

all_with_spouse = all_usable_data[all_usable_data['spouse'].notnull()]
all_with_spouse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 725 entries, 47 to 1343
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           725 non-null    object
 1   allegiance     695 non-null    object
 2   father         251 non-null    object
 3   url            725 non-null    object
 4   fatherRumored  2 non-null      object
 5   mother         149 non-null    object
 6   spouse         725 non-null    object
 7   motherRumored  0 non-null      object
dtypes: object(8)
memory usage: 51.0+ KB


In [19]:
all_no_spouse = all_usable_data[all_usable_data['spouse'].isna()]
all_no_spouse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 760 entries, 73 to 1344
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           760 non-null    object
 1   allegiance     651 non-null    object
 2   father         691 non-null    object
 3   url            760 non-null    object
 4   fatherRumored  9 non-null      object
 5   mother         341 non-null    object
 6   spouse         0 non-null      object
 7   motherRumored  2 non-null      object
dtypes: object(8)
memory usage: 53.4+ KB


In [20]:
all_relations = all_with_spouse.explode('spouse').dropna(subset=['spouse'])
all_relations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 761 entries, 47 to 1343
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           761 non-null    object
 1   allegiance     731 non-null    object
 2   father         291 non-null    object
 3   url            761 non-null    object
 4   fatherRumored  2 non-null      object
 5   mother         171 non-null    object
 6   spouse         761 non-null    object
 7   motherRumored  0 non-null      object
dtypes: object(8)
memory usage: 53.5+ KB


In [21]:
# all_relations.reset_index(inplace=True)
all_relations_with_spouse = pd.merge(all_relations, all_relations, how='left', left_on='spouse', right_on='name', suffixes=('', '_spouse'))
all_relations_with_spouse = all_relations_with_spouse.reset_index()
all_relations_with_spouse = pd.merge(all_relations_with_spouse, all_no_spouse, how='left', left_on='spouse', right_on='name', suffixes=('', '_no-spouse'))

all_relations_with_spouse.info(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1336 entries, 0 to 1335
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   index                    1336 non-null   int64 
 1   name                     1336 non-null   object
 2   allegiance               1241 non-null   object
 3   father                   478 non-null    object
 4   url                      1336 non-null   object
 5   fatherRumored            2 non-null      object
 6   mother                   273 non-null    object
 7   spouse                   1336 non-null   object
 8   motherRumored            0 non-null      object
 9   name_spouse              1212 non-null   object
 10  allegiance_spouse        1122 non-null   object
 11  father_spouse            563 non-null    object
 12  url_spouse               1212 non-null   object
 13  fatherRumored_spouse     2 non-null      object
 14  mother_spouse            282 non-null   

In [22]:
all_relations_with_spouse = all_relations_with_spouse.drop_duplicates(subset=['name', 'father', 'mother', 'spouse', 'name_spouse', 'father_spouse', 'mother_spouse'], ignore_index=True)
all_relations_with_spouse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549 entries, 0 to 548
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   index                    549 non-null    int64 
 1   name                     549 non-null    object
 2   allegiance               532 non-null    object
 3   father                   291 non-null    object
 4   url                      549 non-null    object
 5   fatherRumored            2 non-null      object
 6   mother                   171 non-null    object
 7   spouse                   549 non-null    object
 8   motherRumored            0 non-null      object
 9   name_spouse              443 non-null    object
 10  allegiance_spouse        429 non-null    object
 11  father_spouse            226 non-null    object
 12  url_spouse               443 non-null    object
 13  fatherRumored_spouse     2 non-null      object
 14  mother_spouse            141 non-null    o

In [23]:
all_relations_with_spouse[all_relations_with_spouse['name_spouse'].isna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 16 to 542
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   index                    106 non-null    int64 
 1   name                     106 non-null    object
 2   allegiance               103 non-null    object
 3   father                   57 non-null     object
 4   url                      106 non-null    object
 5   fatherRumored            0 non-null      object
 6   mother                   29 non-null     object
 7   spouse                   106 non-null    object
 8   motherRumored            0 non-null      object
 9   name_spouse              0 non-null      object
 10  allegiance_spouse        0 non-null      object
 11  father_spouse            0 non-null      object
 12  url_spouse               0 non-null      object
 13  fatherRumored_spouse     0 non-null      object
 14  mother_spouse            0 non-null      

In [24]:
all_relations_with_spouse[all_relations_with_spouse['name_no-spouse'].notna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 215 to 530
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   index                    24 non-null     int64 
 1   name                     24 non-null     object
 2   allegiance               22 non-null     object
 3   father                   21 non-null     object
 4   url                      24 non-null     object
 5   fatherRumored            0 non-null      object
 6   mother                   12 non-null     object
 7   spouse                   24 non-null     object
 8   motherRumored            0 non-null      object
 9   name_spouse              0 non-null      object
 10  allegiance_spouse        0 non-null      object
 11  father_spouse            0 non-null      object
 12  url_spouse               0 non-null      object
 13  fatherRumored_spouse     0 non-null      object
 14  mother_spouse            0 non-null      

In [25]:
all_relations_with_spouse[all_relations_with_spouse['name_spouse'].isna()].sort_values('name').head(20)

Unnamed: 0,index,name,allegiance,father,url,fatherRumored,mother,spouse,motherRumored,name_spouse,...,spouse_spouse,motherRumored_spouse,name_no-spouse,allegiance_no-spouse,father_no-spouse,url_no-spouse,fatherRumored_no-spouse,mother_no-spouse,spouse_no-spouse,motherRumored_no-spouse
215,404,Aelinor Penrose,"[House Penrose, House Targaryen]",,http://awoiaf.westeros.org/index.php/Aelinor_P...,,,Aerys I Targaryen,,,...,,,Aerys I Targaryen,,Daeron II Targaryen,http://awoiaf.westeros.org/index.php/Aerys_I_T...,,Myriah Martell,,
218,407,Aemma Arryn,"[House Arryn, House Targaryen]",Rodrik Arryn,http://awoiaf.westeros.org/index.php/Aemma_Arryn,,Daella Targaryen (daughter of Jaehaerys I),Viserys I Targaryen,,,...,,,Viserys I Targaryen,,Baelon Targaryen (son of Jaehaerys I),http://awoiaf.westeros.org/index.php/Viserys_I...,,Alyssa Targaryen,,
223,413,Aerion Targaryen[1],[House Targaryen],Daemion Targaryen,http://awoiaf.westeros.org/index.php/Aerion_Ta...,,,Valaena Velaryon,,,...,,,,,,,,,,
225,416,Alannys Harlaw,"[House Greyjoy, House Harlaw]",House Harlaw,http://awoiaf.westeros.org/index.php/Alannys_H...,,,Balon Greyjoy,,,...,,,Balon Greyjoy,,Quellon Greyjoy,http://awoiaf.westeros.org/index.php/Balon_Gre...,,Lady Sunderly (wife of Quellon Greyjoy),,
227,420,Alayne,[House Baelish],,http://awoiaf.westeros.org/index.php/Alayne_Ba...,,,Baelish (lord),,,...,,,,,,,,,,
229,424,Aliandra Martell,[House Martell],Qoren Martell,http://awoiaf.westeros.org/index.php/Aliandra_...,,,Drazenko Rogare,,,...,,,,,,,,,,
230,425,Alicent Hightower,"[House Hightower, House Targaryen, Greens]",Otto Hightower,http://awoiaf.westeros.org/index.php/Alicent_H...,,,Viserys I Targaryen,,,...,,,Viserys I Targaryen,,Baelon Targaryen (son of Jaehaerys I),http://awoiaf.westeros.org/index.php/Viserys_I...,,Alyssa Targaryen,,
233,436,Alys Harroway,"[House Harroway, House Targaryen]",Lucas Harroway,http://awoiaf.westeros.org/index.php/Alys_Harr...,,,Maegor I Targaryen,,,...,,,,,,,,,,
235,438,Alys Karstark,"[House Karstark, House Thenn]",Rickard Karstark,http://awoiaf.westeros.org/index.php/Alys_Kars...,,,Thenns,,,...,,,,,,,,,,
241,454,Alysanne Targaryen,[House Targaryen],Aenys I Targaryen,http://awoiaf.westeros.org/index.php/Alysanne_...,,Alyssa Velaryon,Jaehaerys I Targaryen,,,...,,,,,,,,,,


In [26]:
# all_relations_with_spouse.reset_index(inplace=True)
# all_no_spouse.reset_index(inplace=True)
final_data = pd.concat([all_relations_with_spouse, all_no_spouse])
final_data.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 1344
Data columns (total 25 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    549 non-null    float64
 1   name                     1309 non-null   object 
 2   allegiance               1183 non-null   object 
 3   father                   982 non-null    object 
 4   url                      1309 non-null   object 
 5   fatherRumored            11 non-null     object 
 6   mother                   512 non-null    object 
 7   spouse                   549 non-null    object 
 8   motherRumored            2 non-null      object 
 9   name_spouse              443 non-null    object 
 10  allegiance_spouse        429 non-null    object 
 11  father_spouse            226 non-null    object 
 12  url_spouse               443 non-null    object 
 13  fatherRumored_spouse     2 non-null      object 
 14  mother_spouse           

In [27]:
# successors = all_usable_data[all_usable_data['successor'].notna()]['successor']
# successors.head(50)

In [28]:
# Save final dataset
all_usable_data.to_json('./ice_and_fire_final.json', orient='records')