In [1]:
# Restrictions: ONLY named individuals

import pandas as pd

json_file_path = './ice-and-fire.json'
df = pd.read_json(json_file_path)
df.drop('predecessor', axis=1, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1345 entries, 0 to 1344
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           1345 non-null   object
 1   allegiance     1186 non-null   object
 2   father         942 non-null    object
 3   successor      194 non-null    object
 4   url            1345 non-null   object
 5   fatherRumored  11 non-null     object
 6   mother         490 non-null    object
 7   spouse         523 non-null    object
 8   motherRumored  2 non-null      object
dtypes: object(9)
memory usage: 94.7+ KB


In [2]:
# Remove datapoints that have no relation at all.
# Old code and technically not necessarily anymore, already done in node.js
data_without_relation = df[(df['father'].isnull()) \
                           & (df['mother'].isnull()) \
                           & (df['successor'].isnull()) \
                           & (df['spouse'].isnull())]

data_with_relation = df[~df['name'].isin(data_without_relation['name'])]
data_with_relation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1315 entries, 0 to 1344
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           1315 non-null   object
 1   allegiance     1172 non-null   object
 2   father         942 non-null    object
 3   successor      194 non-null    object
 4   url            1315 non-null   object
 5   fatherRumored  11 non-null     object
 6   mother         490 non-null    object
 7   spouse         523 non-null    object
 8   motherRumored  2 non-null      object
dtypes: object(9)
memory usage: 102.7+ KB


In [3]:
duplicate_names = df[df['name'].duplicated()]
duplicate_names['name'].value_counts()

Series([], Name: name, dtype: int64)

In [4]:
# Weed out double fathers.
# Some are actually double, because they are rumoured
multiple_fathers = df[df['father'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
multiple_fathers['father']

Series([], Name: father, dtype: object)

In [5]:
# Weed out double moms.
# Some are actually double, because they are rumoured
multiple_mothers = df[df['mother'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
multiple_mothers['mother']

Series([], Name: mother, dtype: object)

In [6]:
# Weed out erratic spouses.
multiple_spouses = df[df['spouse'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
multiple_spouses[['name', 'spouse']]

Unnamed: 0,name,spouse
82,Alys Karstark,"[Sigorn, Thenns]"
103,Alyssa Velaryon,"[Aenys I Targaryen, Rogar Baratheon]"
108,Amerei Frey,"[Pate of the Blue Fork, Lancel Lannister]"
290,Craster,"[Dyah, Ferny, Nella, Gilly]"
293,Cregan Stark,"[Arra Norrey, Alysanne Blackwood, Lynara Stark]"
310,Daemon Targaryen,"[Rhea Royce, Laena Velaryon, Rhaenyra Targaryen]"
328,Dalton Greyjoy,"[Tess, Kayce, Lysa Farman]"
342,Davos Dayne,"[Nymeria, Ny Sar]"
369,Donella Hornwood,"[Halys Hornwood, Ramsay Snow]"
401,Elaena Targaryen,"[Ossifer Plumm, Ronnel Penrose, Michael Manwoody]"


In [7]:
# Datapoints with children
data_with_children = data_without_relation[
    (data_without_relation['name'].isin(data_with_relation['mother']))
    | (data_without_relation['name'].isin(data_with_relation['father']))]

data_with_children.head()

Unnamed: 0,name,allegiance,father,successor,url,fatherRumored,mother,spouse,motherRumored
495,Gerold Grafton,,,,http://awoiaf.westeros.org/index.php/Gerold_Gr...,,,,


In [8]:
# data_with_relation[data_with_relation['name'] == 'Aelor Targaryen']
data_with_relation['father']

0       Manfred Hightower (Aegon's Conquest)
1                             Damon Marbrand
2                             Eustace Osgrey
3                            Laenor Velaryon
4                            House Whitehead
                        ...                 
1340                                     NaN
1341                              Yohn Royce
1342                              Tytos Frey
1343                                     NaN
1344                              Tytos Frey
Name: father, Length: 1315, dtype: object

In [9]:
# Datapoints for whom spouse is the only relationship

data_with_spouses = data_with_relation[(~data_with_relation['spouse'].isnull()) 
                                      & (df['father'].isnull()) 
                                       & (df['mother'].isnull()) 
                                       & (df['successor'].isnull())]

data_with_spouses.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 256 entries, 22 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           256 non-null    object
 1   allegiance     241 non-null    object
 2   father         0 non-null      object
 3   successor      0 non-null      object
 4   url            256 non-null    object
 5   fatherRumored  0 non-null      object
 6   mother         0 non-null      object
 7   spouse         256 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 20.0+ KB


  data_with_spouses = data_with_relation[(~data_with_relation['spouse'].isnull())


In [10]:
data_with_spouses.head()

Unnamed: 0,name,allegiance,father,successor,url,fatherRumored,mother,spouse,motherRumored
22,Aelinor Penrose,"[House Penrose, House Targaryen]",,,http://awoiaf.westeros.org/index.php/Aelinor_P...,,,[Aerys I Targaryen],
47,Alarra Massey,"[House Massey, House Velaryon]",,,http://awoiaf.westeros.org/index.php/Alarra_Ma...,,,[Aethan Velaryon],
50,Alayne,[House Baelish],,,http://awoiaf.westeros.org/index.php/Alayne_Ba...,,,[Baelish (lord)],
74,Alyn Tarbeck,[House Tarbeck],,,http://awoiaf.westeros.org/index.php/Alyn_Tarbeck,,,[Jeyne Westerling (wife of Maegor I)],
78,Alys Arryn (wife of Rhaegel),"[House Arryn, House Targaryen]",,,http://awoiaf.westeros.org/index.php/Alys_Arry...,,,[Rhaegel Targaryen],


In [11]:
# Get spouses that have a spouse present in the dataset with all 
# parent-child relationships

parent_child_spouses = df.explode('spouse').dropna(subset=['spouse'])['spouse'].unique()
data_spouse_family = data_with_spouses[data_with_spouses['name'].isin(parent_child_spouses)]
data_spouse_family.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 47 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           192 non-null    object
 1   allegiance     182 non-null    object
 2   father         0 non-null      object
 3   successor      0 non-null      object
 4   url            192 non-null    object
 5   fatherRumored  0 non-null      object
 6   mother         0 non-null      object
 7   spouse         192 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 15.0+ KB


In [12]:
data_spouse_family.head()

Unnamed: 0,name,allegiance,father,successor,url,fatherRumored,mother,spouse,motherRumored
47,Alarra Massey,"[House Massey, House Velaryon]",,,http://awoiaf.westeros.org/index.php/Alarra_Ma...,,,[Aethan Velaryon],
74,Alyn Tarbeck,[House Tarbeck],,,http://awoiaf.westeros.org/index.php/Alyn_Tarbeck,,,[Jeyne Westerling (wife of Maegor I)],
78,Alys Arryn (wife of Rhaegel),"[House Arryn, House Targaryen]",,,http://awoiaf.westeros.org/index.php/Alys_Arry...,,,[Rhaegel Targaryen],
79,Alys Beesbury,"[House Beesbury, House Tyrell]",,,http://awoiaf.westeros.org/index.php/Alys_Bees...,,,[Leo Tyrell (son of Victor)],
80,Alys Frey,[House Frey],,,http://awoiaf.westeros.org/index.php/Alys_Frey,,,[Jared Frey],


In [13]:
all_usable_data = pd.concat([data_with_children, data_spouse_family, data_with_relation])
len(all_usable_data)

1508

In [14]:
all_usable_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1508 entries, 495 to 1344
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           1508 non-null   object
 1   allegiance     1354 non-null   object
 2   father         942 non-null    object
 3   successor      194 non-null    object
 4   url            1508 non-null   object
 5   fatherRumored  11 non-null     object
 6   mother         490 non-null    object
 7   spouse         715 non-null    object
 8   motherRumored  2 non-null      object
dtypes: object(9)
memory usage: 117.8+ KB


In [15]:
first_layer = all_usable_data[(all_usable_data['father'].isnull()) & (all_usable_data['mother'].isnull())]
first_layer.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 529 entries, 495 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           529 non-null    object
 1   allegiance     467 non-null    object
 2   father         0 non-null      object
 3   successor      80 non-null     object
 4   url            529 non-null    object
 5   fatherRumored  0 non-null      object
 6   mother         0 non-null      object
 7   spouse         460 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 41.3+ KB


In [16]:
all_usable_data.explode('allegiance').dropna(subset=['allegiance']).value_counts('allegiance').head(50)

allegiance
House Frey                        163
House Targaryen                   144
House Stark                       110
House Lannister                    81
Blacks                             54
House Tyrell                       50
House Hightower                    46
House Baratheon                    36
House Martell                      36
House Greyjoy                      32
House Velaryon                     28
House Arryn                        28
Greens                             25
House Tully                        19
Night's Watch                      18
House Blackwood                    18
House Royce                        17
House Bracken                      17
House Manderly                     16
Kingsguard                         16
House Redwyne                      16
House Florent                      15
House Waynwood                     14
Citadel                            14
House Tarbeck                      14
House Baratheon of Dragonstone     13
H

In [17]:
first_layer[first_layer['successor'].notna()]['successor'].head(100)

58          [Alekyne Florent, Davos Seaworth]
65           [Edwell Celtigar, Osmund Strong]
73      [Lord Stokeworth, Maegor I Targaryen]
107                            [Lord Hayford]
131                       [Arlan V Durrandon]
                        ...                  
1257                        [Torgon Greyiron]
1263                      [Urrigon Hightower]
1273                                 [Qyburn]
1322                         [Barbrey Dustin]
1331                         [Rohanne Webber]
Name: successor, Length: 80, dtype: object

In [18]:
# Transform nodes into partnerships (= single or couple), 
# so that they will be easier to use in a hierarchical structure

all_with_spouse = all_usable_data[all_usable_data['spouse'].notnull()]
all_with_spouse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 715 entries, 47 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           715 non-null    object
 1   allegiance     685 non-null    object
 2   father         251 non-null    object
 3   successor      61 non-null     object
 4   url            715 non-null    object
 5   fatherRumored  2 non-null      object
 6   mother         149 non-null    object
 7   spouse         715 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 55.9+ KB


In [19]:
all_no_spouse = all_usable_data[all_usable_data['spouse'].isna()]
all_no_spouse.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 793 entries, 495 to 1344
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           793 non-null    object
 1   allegiance     669 non-null    object
 2   father         691 non-null    object
 3   successor      133 non-null    object
 4   url            793 non-null    object
 5   fatherRumored  9 non-null      object
 6   mother         341 non-null    object
 7   spouse         0 non-null      object
 8   motherRumored  2 non-null      object
dtypes: object(9)
memory usage: 62.0+ KB


In [20]:
all_relations = all_with_spouse.explode('spouse').dropna(subset=['spouse'])
all_relations.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 750 entries, 47 to 1343
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           750 non-null    object
 1   allegiance     720 non-null    object
 2   father         291 non-null    object
 3   successor      77 non-null     object
 4   url            750 non-null    object
 5   fatherRumored  2 non-null      object
 6   mother         171 non-null    object
 7   spouse         750 non-null    object
 8   motherRumored  0 non-null      object
dtypes: object(9)
memory usage: 58.6+ KB


In [33]:
# all_relations.reset_index(inplace=True)
all_relations_with_spouse = pd.merge(all_relations, all_relations, how='left', left_on='spouse', right_on='name', suffixes=('', '_spouse'))
all_relations_with_spouse = all_relations_with_spouse.reset_index()
all_relations_with_spouse = pd.merge(all_relations_with_spouse, all_no_spouse, how='left', left_on='spouse', right_on='name', suffixes=('', '_no-spouse'))

all_relations_with_spouse.info(20)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1298 entries, 0 to 1297
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   index                    1298 non-null   int64 
 1   name                     1298 non-null   object
 2   allegiance               1203 non-null   object
 3   father                   470 non-null    object
 4   successor                112 non-null    object
 5   url                      1298 non-null   object
 6   fatherRumored            2 non-null      object
 7   mother                   270 non-null    object
 8   spouse                   1298 non-null   object
 9   motherRumored            0 non-null      object
 10  name_spouse              1174 non-null   object
 11  allegiance_spouse        1084 non-null   object
 12  father_spouse            555 non-null    object
 13  successor_spouse         129 non-null    object
 14  url_spouse               1174 non-null  

In [34]:
all_relations_with_spouse = all_relations_with_spouse.drop_duplicates(subset=['name', 'father', 'mother', 'spouse', 'name_spouse', 'father_spouse', 'mother_spouse'], ignore_index=True)
all_relations_with_spouse.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 549 entries, 0 to 548
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   index                    549 non-null    int64 
 1   name                     549 non-null    object
 2   allegiance               532 non-null    object
 3   father                   291 non-null    object
 4   successor                77 non-null     object
 5   url                      549 non-null    object
 6   fatherRumored            2 non-null      object
 7   mother                   171 non-null    object
 8   spouse                   549 non-null    object
 9   motherRumored            0 non-null      object
 10  name_spouse              443 non-null    object
 11  allegiance_spouse        429 non-null    object
 12  father_spouse            226 non-null    object
 13  successor_spouse         57 non-null     object
 14  url_spouse               443 non-null    o

In [39]:
all_relations_with_spouse[all_relations_with_spouse['name_spouse'].isna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106 entries, 14 to 541
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   index                    106 non-null    int64 
 1   name                     106 non-null    object
 2   allegiance               103 non-null    object
 3   father                   57 non-null     object
 4   successor                22 non-null     object
 5   url                      106 non-null    object
 6   fatherRumored            0 non-null      object
 7   mother                   29 non-null     object
 8   spouse                   106 non-null    object
 9   motherRumored            0 non-null      object
 10  name_spouse              0 non-null      object
 11  allegiance_spouse        0 non-null      object
 12  father_spouse            0 non-null      object
 13  successor_spouse         0 non-null      object
 14  url_spouse               0 non-null      

In [37]:
all_relations_with_spouse[all_relations_with_spouse['name_no-spouse'].notna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 204 to 529
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   index                    25 non-null     int64 
 1   name                     25 non-null     object
 2   allegiance               23 non-null     object
 3   father                   21 non-null     object
 4   successor                8 non-null      object
 5   url                      25 non-null     object
 6   fatherRumored            0 non-null      object
 7   mother                   12 non-null     object
 8   spouse                   25 non-null     object
 9   motherRumored            0 non-null      object
 10  name_spouse              0 non-null      object
 11  allegiance_spouse        0 non-null      object
 12  father_spouse            0 non-null      object
 13  successor_spouse         0 non-null      object
 14  url_spouse               0 non-null      

In [24]:
all_relations_with_spouse[all_relations_with_spouse['name_spouse'].isna()].sort_values('name').head(20)

Unnamed: 0,index,name,allegiance,father,successor,url,fatherRumored,mother,spouse,motherRumored,name_spouse,allegiance_spouse,father_spouse,successor_spouse,url_spouse,fatherRumored_spouse,mother_spouse,spouse_spouse,motherRumored_spouse
204,381,Aelinor Penrose,"[House Penrose, House Targaryen]",,,http://awoiaf.westeros.org/index.php/Aelinor_P...,,,Aerys I Targaryen,,,,,,,,,,
207,384,Aemma Arryn,"[House Arryn, House Targaryen]",Rodrik Arryn,[Alicent Hightower],http://awoiaf.westeros.org/index.php/Aemma_Arryn,,Daella Targaryen (daughter of Jaehaerys I),Viserys I Targaryen,,,,,,,,,,
212,390,Aerion Targaryen[1],[House Targaryen],Daemion Targaryen,,http://awoiaf.westeros.org/index.php/Aerion_Ta...,,,Valaena Velaryon,,,,,,,,,,
214,393,Alannys Harlaw,"[House Greyjoy, House Harlaw]",House Harlaw,,http://awoiaf.westeros.org/index.php/Alannys_H...,,,Balon Greyjoy,,,,,,,,,,
216,397,Alayne,[House Baelish],,,http://awoiaf.westeros.org/index.php/Alayne_Ba...,,,Baelish (lord),,,,,,,,,,
219,401,Aliandra Martell,[House Martell],Qoren Martell,,http://awoiaf.westeros.org/index.php/Aliandra_...,,,Drazenko Rogare,,,,,,,,,,
220,402,Alicent Hightower,"[House Hightower, House Targaryen, Greens]",Otto Hightower,"[Helaena Targaryen, Daemon Targaryen]",http://awoiaf.westeros.org/index.php/Alicent_H...,,,Viserys I Targaryen,,,,,,,,,,
223,413,Alys Harroway,"[House Harroway, House Targaryen]",Lucas Harroway,"[Ceryse Hightower, Tyanna of the Tower]",http://awoiaf.westeros.org/index.php/Alys_Harr...,,,Maegor I Targaryen,,,,,,,,,,
225,415,Alys Karstark,"[House Karstark, House Thenn]",Rickard Karstark,,http://awoiaf.westeros.org/index.php/Alys_Kars...,,,Thenns,,,,,,,,,,
231,431,Alysanne Targaryen,[House Targaryen],Aenys I Targaryen,[Aemma Arryn],http://awoiaf.westeros.org/index.php/Alysanne_...,,Alyssa Velaryon,Jaehaerys I Targaryen,,,,,,,,,,


In [25]:
# all_relations_with_spouse.reset_index(inplace=True)
# all_no_spouse.reset_index(inplace=True)
final_data = pd.concat([all_relations_with_spouse, all_no_spouse])
final_data.info()



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1342 entries, 0 to 1344
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 549 non-null    float64
 1   name                  1342 non-null   object 
 2   allegiance            1201 non-null   object 
 3   father                982 non-null    object 
 4   successor             210 non-null    object 
 5   url                   1342 non-null   object 
 6   fatherRumored         11 non-null     object 
 7   mother                512 non-null    object 
 8   spouse                549 non-null    object 
 9   motherRumored         2 non-null      object 
 10  name_spouse           443 non-null    object 
 11  allegiance_spouse     429 non-null    object 
 12  father_spouse         226 non-null    object 
 13  successor_spouse      57 non-null     object 
 14  url_spouse            443 non-null    object 
 15  fatherRumored_spouse 

In [26]:
# Save final dataset
# all_usable_data.to_json('./ice_and_fire_final.json', orient='records')