# Generate the association network

In [1]:
import pandas as pd
from helper.constantes import *
from ast import literal_eval

In [2]:
networks = pd.read_csv(cleaned_folder+"selected_chars.csv",converters={"associated_groups": literal_eval,"connection_label": literal_eval})

In [3]:
networks = networks.rename(columns={'Unnamed: 0':"id"})

In [4]:
networks_assoc = networks.loc[:,['name','associated_groups']]

In [5]:
networks_assoc.head()

Unnamed: 0,name,associated_groups
0,Aragog,[]
1,Ludovic Bagman,[British Ministry of Magic]
2,Bane,[Hogwarts School of Witchcraft and Wizardry]
3,Cuthbert Binns,[Hogwarts School of Witchcraft and Wizardry]
4,Regulus Black,"[Slytherin, Hogwarts School of Witchcraft and ..."


In [6]:
all_assoc = {}

def add_assoc_to_dico(index, list_groups, dico):
    for l in list_groups:
        if l in dico: 
            cur_set = dico[l]
        else:
            cur_set = set()
        cur_set.add(index)
        dico[l] = cur_set
networks_assoc.apply(lambda x: add_assoc_to_dico(x.name,x['associated_groups'],all_assoc),axis=1)
None

In [7]:
len(all_assoc["Hogwarts School of Witchcraft and Wizardry"])

109

In [8]:
len(all_assoc["Gryffindor"])

37

In [9]:
len(all_assoc["Slytherin"])

20

In [10]:
len(all_assoc["Hufflepuff"])

12

We can see that in the Hogwarts "association", all the students and teacher are included. This results in 109 different people, meaning that there will be $\frac{109\cdot 108}{2}$ different links in the network just for this association. This will quickly become not possible to visualise. We will therefore remove the Hogwarts school of Witchcraft and wizardry association. To get more meaningful results, we will add a bit of preprocessing about the association. We will also remove the different Hogwarts houses as they add many links and there are already explored in a previous visualisation.

The rest of the notebook will be split in the following way:
- Define the more insightful "association" and add their members. 
- Regenerate the dictionnary with the different person belonging to different associations 
- Generate and export the graph for the visualisation using NetworkX

In [11]:
def row_checking_predicate(predicate, df):
    return df[df.apply(lambda x: predicate(x),axis=1)]

## Association of interest
The associations or groups we will focus on are the following:
- Ministry of magic employee (*)
- Hogwarts staff
- Order of the Phoenix 1st generation (1st wave)
- Order of the Phoenix 2nd generate (2nd wave) 
- Weasley family (*)
- Potter family
- Death eaters (*)
- Dumbledore's Army (*)
- Gryffindor Quidditch team (*)
- House of Black 
- House of Gaunt 
- Order of Merlin (*)
- Slug club (*)
- Advanced guard (*)
- Hogwarts Headmasters

The first step we will have to do is to remove any of the association that are not in the above list. In our exploratory data anaalysis, we already started to do some cleaning (the name of the above association is copied from the exploratory data analysis results). This will become more intensive from now on. The association marked with (\*) correspond to association available in our dataset, we will therefore not double check tham. The others are handcrafted groups that will be made according to the Harry Potter fandom wiki website and wikipedia. 

In [12]:
lst = ["Advanced Guard", "Order of Merlin", "Gryffindor Quidditch team", "Dumbledore's Army", "Death Eaters", "Weasley family","British Ministry of Magic", "Potter family"]
networks_assoc['associated_groups'] = networks_assoc.apply(lambda x: [y for y in x['associated_groups'] if y in lst],axis=1)


In [13]:
def add_assoc_to_list(row, name):
    row.append(name)
    return row 

In [14]:
def add_association_to_people(people_list, df, association_name):
    lower_people_list = [x.lower() for x in people_list]
    people_in_df = row_checking_predicate(lambda x: x['name'].lower() in lower_people_list, df)
    print(f"Number of matching people in the dataframe: {len(people_in_df)}")
    df['associated_groups'] = df.apply(lambda x: add_assoc_to_list(x['associated_groups'], association_name)  if x['name'].lower() in lower_people_list else x['associated_groups'],axis=1)
    people_with_new_name = row_checking_predicate(lambda x: association_name in x['associated_groups'], df)
    print(f"Number of people with new association name: {len(people_with_new_name)}")
    print(f"Consistent numbers ? {len(people_with_new_name)==len(people_in_df)}")
    return df

We will start with Hogwarts teaching staff (based on this [wikipedia page](https://en.wikipedia.org/wiki/Hogwarts_staff))

In [15]:
hogwarts_staff = [
    "Albus Dumbledore", # Transfiguration then Headmaster
    "Alastor Moody", #Defense against the Dark Arts (DADA)
    "Argus Filch",#Caretaker
    "Aurora Sinistra",#Astronomy
    "Charity Burbage", # Muggle studies
    "Cuthbert Binns", # history of magic
    "Dolores Umbridge", #DADA
    "Filius Flitwick",  # Charms
    "Firenze", #divination
    "Gilderoy Lockhart", # DADA
    "Horace Slughorn", # Potions
    "Irma Pince", #Librarian
    "Minerva McGonagall",# transfiguration, headmistress
    "Pomona Sprout", #herbology
    "Poppy Pomfrey", # matron
    "Quirinus Quirrell", #DADA
    "Remus Lupin", # DADA
    "Rolanda Hooch",# Flying
    "Rubeus Hagrid",# Care of magical creatures
    "Septima Vector", # arithmancy
    "Severus Snape", # Potions, DADA, Headmasters
    "Silvanus Kettleburn", # care of magical creatures
    "Sybill Trelawney", # Divination
    "Wilhelmina Grubbly-Plank", #Care of magical creatures
]

In [16]:
networks_assoc = add_association_to_people(hogwarts_staff, networks_assoc, "Hogwarts staff")

Number of matching people in the dataframe: 22
Number of people with new association name: 22
Consistent numbers ? True


In [17]:
# based on the following list: https://harrypotter.fandom.com/wiki/Hogwarts_Headmaster
hogwarts_headmasters = [
    "Albus Dumbledore",
    "Dolores Umbridge",
    "Minerva McGonagall",
    "Severus Snape",
    "Armando Dippet",
    "Phineas Black"
]

In [18]:
networks_assoc = add_association_to_people(hogwarts_headmasters, networks_assoc, "Hogwarts headmasters")

Number of matching people in the dataframe: 4
Number of people with new association name: 4
Consistent numbers ? True


In [19]:
gaunt_house = [
    "Salazar Slytherin",
    "Lord Voldemort",
    "Marvolo Gaunt",
    "Morfin Gaunt",
    "Merope Gaunt"
]

In [20]:
networks_assoc = add_association_to_people(gaunt_house, networks_assoc, "House of Gaunt")

Number of matching people in the dataframe: 5
Number of people with new association name: 5
Consistent numbers ? True


In [22]:
ootp_1 = [
      "Albus Dumbledore",
    "Alastor Moody",
    "Aberforth Dumbledore",
    "Arabella Fig",
    "Alice Longbottom",
    "Benjy Fenwick",
    "Caradoc Dearborn",
    "Dedalus Diggle",
    "Dorcas Meadowes",
    "Edgar Bones",
    "Elphias Doge",
    "Emmeline Vance",
    "Fabian Prewett",
    "Frank Longbottom",
    "Gideon Prewett",
    "James Potter",
    "Lily Potter",
    "Marlene McKinnon",
    "Mundungus Fletcher",
    "Peter Pettigrew",
    "Remus Lupin",
    "Rubeus Hagrid",
    "Severus Snape",
    "Sirius Black",
    "Sturgis Podmore"
]

In [23]:
ootp_2 = [
    "Albus Dumbledore",
    "Alastor Moody",
    "Kingsley Shacklebot",
    "Aberforth Dumbledore",
    "Arabella Fig",
    "Dedalus Diggle",
    "Elphias Doge",
    "Emmeline Vance",
    "Minerva McGonagall",
    "Mundungus Fletcher",
    "Remus Lupin",
    "Rubeus Hagrid",
    "Severus Snape",
    "Sirius Black",
    "Sturgis Podmore",
    "Arthur Weasley",
    "Bill Weasley",
    "Charlie Weasley",
    "Hestia Jones",
    "Molly Weasley",
    "Nymphadora Tonks",
    "Fleur Delacour",
    "George Weasley",
    "Fred Weasley",
    "Harry Potter",
    "Hermione Granger",
    "Ron Weasley"
]

In [24]:
networks_assoc = add_association_to_people(ootp_1, networks_assoc, "1st Order of the Phoenix")

Number of matching people in the dataframe: 17
Number of people with new association name: 17
Consistent numbers ? True


In [25]:
networks_assoc = add_association_to_people(ootp_2, networks_assoc, "2nd Order of the Phoenix")

Number of matching people in the dataframe: 23
Number of people with new association name: 23
Consistent numbers ? True


In [26]:
black = [
    "Rodolphus Lestrange",
    "Ted Tonks",
    "Bellatrix Lestrange",
    "Narcissa Malfoy",
    "Lucius Malfoy",
    "Andromeda Tonks",
    "Sirius Black",
    "Regulus Black",
    "Phineas Nigellus Black"
]

In [27]:
networks_assoc = add_association_to_people(black, networks_assoc, "House of Black")

Number of matching people in the dataframe: 8
Number of people with new association name: 8
Consistent numbers ? True


In [28]:
all_assoc = {}
networks_assoc.apply(lambda x: add_assoc_to_dico(x.name,x['associated_groups'],all_assoc),axis=1)
None

In [29]:
all_perm = [(x,y,k) for k,v in all_assoc.items() for x in list(v) for y in list(v)  if x != y]
edges_assoc = pd.DataFrame(all_perm, columns=['source', 'target', 'association']).groupby(['source','target'])['association'].apply(list).reset_index()

### Generate graph in networkx
We will now generate the graph using networkx, get the (x,y) coordinates of the Kamada Kawai Layout (as Sigma.js cannot infer coordinates by itself) and use Louvain community detection method to attribute a color to a node.

In [40]:
import networkx as nx
import networkx.algorithms.community as nx_comm
from networkx.readwrite import json_graph
import json

In [57]:
def add_nodes_attributes_and_save(filepath, edge_list, node_info):
    edge_list = edge_list.reset_index().rename(columns={'index':'id'})
    # Create graph from pandas edgelist
    g=nx.from_pandas_edgelist(edge_list, source='source', target='target',edge_attr='id',create_using=nx.Graph())
    node_info.fillna("",inplace=True)
    # Detect the community using Louvain algorithm and collect this new attribute as a dictionnary
    communities = {k:idx for idx, cur in enumerate(nx_comm.louvain_communities(g)) for k in cur }
    node_attr = node_info.to_dict(orient='index') 
    nx.set_node_attributes(g, node_attr)
    nx.set_node_attributes(g, communities, "community")
    # Get the x-y coordinates of the Kamada Kawai Style layout
    nx.set_node_attributes(g,{k:{"x":v[0], "y":v[1], "color":"#EEEEEE","size":2} for k,v in nx.kamada_kawai_layout(g).items()})
    # Same the graph as JSON
    json_net = json_graph.node_link_data(g)
    edges = json_net['links']
    json_net["edges"]=edges.copy()
    del json_net['links']
    
    with open(filepath, 'w') as f:
        json.dump(json_net, f)

In [58]:
add_nodes_attributes_and_save(cleaned_folder + "association_network.json", edges_assoc, networks)

[{1, 99, 131, 165, 134, 103, 75, 107, 43, 12, 20, 84, 22, 25, 91, 92, 93}, {66, 3, 135, 73, 137, 146, 86, 153, 154, 95, 98, 163, 35, 38, 167, 166, 41, 170, 175, 55, 120, 62}, {144, 4, 101, 70, 164, 140, 61}, {67, 5, 141, 83, 21, 85, 27, 155, 168, 40, 104, 109, 173, 113, 125, 63}, {64, 68, 7, 71, 9, 139, 13, 143, 16, 151, 24, 152, 162, 100, 36, 37, 169, 106, 105, 108, 172, 46, 110, 112, 114, 116, 118, 56, 123}, {96, 26, 11, 44, 45, 14, 47, 79, 19, 88, 121, 90, 60, 159}]


  node_attr = node_info.to_dict(orient='index')
