In [None]:
import json
import os
import pickle
import urllib.parse
import urllib.request

import numpy as np
import pandas as pd
import regex as re
import requests

## Supplementary notebook: 

# Collecting and cleaning our dataset

In this notebook the main dataset used for exploring our topic is constructed. This includes the names of all characters related to the Wizarding World including some chosen features, and the links between the characters. 

### Construct queries

The first step is to collect the names of all the characters in the Wizarding World. To access as few pages as possible, we use the fact that every character has a gender (male, female, unknown) and that there is a category for each gender in which all characters of that gender are listed. This lets us extract the names of all characters. 

Note that because there is a limit on the number of items a single query can return, the function below requests the category members in batches of up to 500 until all names have been retrieved. 

In [None]:
def individuals_by_gender(category_name):
    '''
    Return the titles of all pages listed in one category of the wiki.

    A single request returns at most 500 category members, so the function
    keeps requesting further batches for as long as the response contains a
    ``continue`` block.

    Parameters
    ----------
    category_name : str
        Category name without the ``Category:`` prefix, e.g. ``'Females'``.

    Returns
    -------
    list of str
        Page titles in the order the API returned them.
    '''
    baseurl = "https://harrypotter.fandom.com/api.php?"
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": "Category:{}".format(category_name),
        "cmlimit": 500,  # number of category items returned per request (max is 500)
        "format": "json",
    }

    individuals = []
    while True:
        # urlencode escapes characters such as '|' and spaces, which can occur
        # in continuation tokens and category names.
        url = baseurl + urllib.parse.urlencode(params)
        with urllib.request.urlopen(url) as wikiresponse:
            query = json.loads(wikiresponse.read().decode('utf-8'))

        individuals.extend(page['title'] for page in query['query']['categorymembers'])

        # No 'continue' block means the last batch has been received.
        continuation = query.get('continue')
        if not continuation:
            break
        # Feed the continuation values back into the next request.
        params.update(continuation)

    return individuals


In [None]:
# Fetch the character names listed under each of the three gender categories.
females, males, unknowns = (
    individuals_by_gender(category)
    for category in (
        'Females',
        'Males',
        "Individuals_of_unknown_or_undetermined_gender",
    )
)

#### Get character information
The names of all characters have now been retrieved and can be combined into a single list of all characters. 

In [None]:
# Copies of the name lists with every space replaced by an underscore.
females_no_whitespace = ["_".join(title.split(" ")) for title in females]
males_no_whitespace = ["_".join(title.split(" ")) for title in males]
unknowns_no_whitespace = ["_".join(title.split(" ")) for title in unknowns]

In [None]:
# Merge the per-gender lists (females first, then males, then unknowns).
all_names = [*females, *males, *unknowns]
all_names_no_whitespace = [
    *females_no_whitespace,
    *males_no_whitespace,
    *unknowns_no_whitespace,
]
all_names[:10]

Some characters are defined as being "unidentified" meaning there is not much information and perhaps no specific name. We choose to keep these as mentioned in the explainer notebook. 

In [None]:
## These characters had errors in their character information and were not very interesting, so they are removed
drop_names = ['Chief Snatcher', 'Painting of a giraffe']
drop_names_no_whitespace = [n.replace(" ", "_") for n in drop_names]

all_names = [name for name in all_names if name not in drop_names]
# Filter the underscore list itself; filtering `all_names` here would
# overwrite it with names that still contain spaces.
all_names_no_whitespace = [
    name for name in all_names_no_whitespace if name not in drop_names_no_whitespace
]

#### Get text file for each character
The character-page for each character is now queried using the Wiki-API and saved as a txt-file. 


In [None]:
# Settings shared by every request.
baseurl = "https://harrypotter.fandom.com/api.php"
# Same folder the later cells of this notebook read the character files from.
characters_dir = "Data_files/Characters"
os.makedirs(characters_dir, exist_ok=True)

# Characters whose page text could not be extracted or written to disk.
failed_names = []

for name in all_names:
    params = {
        "action": "query",
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "*",
        "titles": name,
        "format": "json",
    }

    # Passing the parameters separately lets requests percent-encode the
    # title, so names containing '&', '?', '#' or '+' are queried correctly.
    wikiresponse = requests.get(baseurl, params=params)
    wikijson = wikiresponse.json()

    try:
        # The "pages" mapping is keyed by page id; use its first entry.
        pages = wikijson["query"]["pages"]
        page_id = list(pages.keys())[0]
        text = pages[page_id]['revisions'][0]['slots']['main']['*']
    except (KeyError, IndexError, TypeError):
        # The response does not contain page content for this title.
        failed_names.append(name)
        continue

    name_ = name.replace(' ', '_')
    try:
        with open(os.path.join(characters_dir, name_ + '.txt'), 'w') as f:
            f.write(text)
    except (OSError, UnicodeEncodeError):
        # E.g. the name is not a valid file name or the text cannot be encoded.
        failed_names.append(name)

# Report skipped characters instead of hiding them.
if failed_names:
    print('{} character pages could not be saved:'.format(len(failed_names)))
    print(failed_names)

#### Get links and features for each character
For each character the relevant features are found using regex expressions to extract the relevant part of the page. The links are identified by finding the links to other characters on their character page.

As mentioned in the explainer notebook, the information for some of the characters was very inconsistent, so the features are manually corrected below.

In [None]:
# The blood-types are very inconsistent in their naming, so they are renamed
# We are only interested in the blood-types: Muggles, Muggle-borns, Squibs, Pure-blood and Half-blood 
# so the rest are categorized as unknown. 

def correct_blood_type(string):
    """
    Map a raw blood-status value to one of the categories used in the analysis.

    The raw value may consist of several parts separated by ``|``. Every part
    is stripped of surrounding whitespace and compared case-insensitively
    against the known spellings of each category, so all categories are
    matched in the same way.

    Parameters
    ----------
    string : str
        Raw blood-status value.

    Returns
    -------
    str
        One of ``"Muggle"``, ``"Muggle-born"``, ``"Pure-blood"``,
        ``"Half-blood"``, ``"Squib"`` or ``'unknown'``.
    """
    parts = {part.strip().lower() for part in string.split("|")}

    # Checked in order: the first category with a matching part wins.
    categories = (
        ("Muggle", {"non-magic people", "no-maj", "muggle"}),
        ("Muggle-born", {"muggle-born"}),
        ("Pure-blood", {"pure", "pure-blood"}),
        ("Half-blood", {"half-blood"}),
        ("Squib", {"squib"}),
    )
    for label, spellings in categories:
        if parts & spellings:
            return label

    return 'unknown'

In [None]:
# The species are a bit too specific (e.g. Boarhound, bulldog and dog should all just be dogs)
# so this is manually corrected as well 

def correct_species(string):
    """
    Collapse overly specific species names into a broader species label.

    The rules are tried in order and the first one that applies decides the
    result. Depending on the rule, the keywords are compared with the
    space-separated words of `string`, with its hyphen-separated parts, or
    with the complete string. If no rule applies, `string` is returned
    unchanged.
    """
    words = set(string.split(" "))
    hyphen_parts = set(string.split("-"))
    whole = {string}

    # (label to return, candidate tokens, keywords that trigger the label)
    rules = (
        ("Owl", words, {"owl", 'Owl'}),
        ("Troll", words, {"Troll", "troll"}),
        ("Cat", words, {"Cat"}),
        ("Crab", words, {"Crab", 'crab'}),
        ("Dragon", whole, {"Chinese Fireball", 'Dragon', 'Norwegian Ridgeback',
                           "Hungarian Horntail", 'Common Welsh Green'}),
        ("Veela", hyphen_parts, {"Veela"}),
        ("Chicken", words, {"Chicken"}),
        ("Dog", whole, {"Bulldog", "Dog", "Boarhound"}),
        ("Hummingbird", whole, {"Wikipedia:Hummingbird"}),
    )

    for label, candidates, keywords in rules:
        if candidates & keywords:
            return label

    return string

In [None]:
# The gender-types are very inconsistent in their naming, so they are renamed

def correct_gender(string):
    """
    Normalise a raw gender value to ``"Female"``, ``"Male"`` or ``'unknown'``.

    The comparison ignores letter case and surrounding whitespace, so the
    singular and plural spellings of both genders are recognised however
    they are capitalised.

    Parameters
    ----------
    string : str
        Raw gender value.

    Returns
    -------
    str
        ``"Female"``, ``"Male"`` or ``'unknown'``.
    """
    normalised = string.strip().lower()

    # Females
    if normalised in ('female', 'females'):
        return "Female"

    # Males
    if normalised in ('male', 'males'):
        return "Male"

    # Otherwise
    return 'unknown'

In [None]:
# find links 
def get_links(text, name, names):
    """
    Find the known characters that a page links to.

    Every ``[[target]]``, ``[[target|label]]`` and ``[[target#section]]``
    occurrence in `text` is inspected; a target is kept when it is one of
    `names` and differs from `name`.

    Parameters
    ----------
    text : str
        Page text to search.
    name : str
        Name of the character the page belongs to (excluded from the result).
    names : collection of str
        All character names that count as valid link targets.

    Returns
    -------
    list of str
        The distinct linked names in sorted order.
    """
    pattern = r'\[\[(.*?)(?:\|.*?|#.*?)?\]\]'

    # A set gives constant-time membership tests; reuse it if the caller
    # already passed one.
    known_names = names if isinstance(names, (set, frozenset)) else set(names)

    targets = re.findall(pattern, text)
    links = {target for target in targets if target != name and target in known_names}

    # return unique links
    return sorted(links)

# find features
def get_features(text):
    """
    Extract species, gender, house and blood status from a character page.

    Each feature is taken from the first case-insensitive match of its
    pattern in `text`. Species, gender and blood status are passed through
    their correction functions; the house is used as captured. A feature
    without a match is reported as "unknown".

    Returns a dict with the keys 'species', 'gender', 'house' and 'blood'.
    """
    def first_capture(pattern):
        # Captured group of the first match of `pattern`, or None if absent.
        hit = re.search(pattern, text, re.IGNORECASE)
        return hit.group(1) if hit else None

    raw_species = first_capture(r'species\s*=\s*\[\[(.*?)(?:\|.*?)?\]\]')
    raw_gender = first_capture(r'gender\s*=\s*(.*?)[^\w]')
    raw_house = first_capture(r'house\s*=\s*\[\[(.*?)[^\w]')
    raw_blood = first_capture(r'blood\s*=\s*\[\[(.*?)\]\]')

    return {
        'species': "unknown" if raw_species is None else correct_species(raw_species),
        'gender': "unknown" if raw_gender is None else correct_gender(raw_gender),
        'house': "unknown" if raw_house is None else raw_house,
        'blood': "unknown" if raw_blood is None else correct_blood_type(raw_blood),
    }


In [None]:
attributes = {}
links = {}

# Set membership keeps the per-link lookup in get_links constant-time.
known_names = set(all_names)

for name in all_names:
    name_ = name.replace(' ', '_')
    try:
        with open("Data_files/Characters/"+ name_ +".txt") as f:
            # The text is the string form of the list of lines in the file.
            text = str(f.readlines())
    except (OSError, UnicodeDecodeError):
        # No readable page file for this character: report it and move on.
        print(name)
        continue

    attributes[name] = get_features(text)
    links[name] = get_links(text, name, known_names)

#### Create dataset
The attributes and links are now combined into a dataframe, which is saved for future use. The attributes and links are also saved as dictionaries, as this makes constructing the networks very simple. 

In [None]:
# Create dataframe: one row per character, with the name as a regular column
HP_enriched_character_df = (
    pd.DataFrame.from_dict(attributes)
    .T
    .reset_index(level=0)
    .rename(columns={"index": "name"})
)

# Save
HP_enriched_character_df.to_csv("Data_files/Data/HP_enriched_character_df.csv", index=False)

# Show
HP_enriched_character_df.head(5)

In [None]:
# Store the attribute and link dictionaries as JSON files.
for target_path, payload in (
    ('Data_files/Data/Network/attributes.json', attributes),
    ('Data_files/Data/Network/links.json', links),
):
    with open(target_path, 'w') as outfile:
        json.dump(payload, outfile)

### Creating links based on family ties


In [None]:
def get_family_links(text, name, names):
    """
    Find the known characters linked from the family field of a page.

    The family field is located with `family_pattern`; within it every
    ``[[target]]``, ``[[target|label]]`` and ``[[target#section]]`` link is
    inspected, and a target is kept when it is one of `names` and differs
    from `name`.

    Parameters
    ----------
    text : str
        Page text to search.
    name : str
        Name of the character the page belongs to (excluded from the result).
    names : collection of str
        All character names that count as valid link targets.

    Returns
    -------
    list of str
        The distinct linked family members in sorted order.

    Raises
    ------
    StopIteration
        If `text` contains no match for the family field.
    """
    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown escape sequences through (which triggers a warning).
    # The field is taken to end at the first "'|" after "family = ";
    # NOTE(review): this presumably relies on `text` being the string form of
    # a list of lines, as built by the calling cell - confirm before changing.
    family_pattern = r"(family = (.*?\')\|)"
    individual_pattern = r'\[\[(.*?)(?:\|.*?|\#.*?)?\]\]'

    # next() raises StopIteration when the page has no family field.
    family = next(re.finditer(family_pattern, text, re.MULTILINE)).group()

    # A set gives constant-time membership tests; reuse it if the caller
    # already passed one.
    known_names = names if isinstance(names, (set, frozenset)) else set(names)

    family_members = re.findall(individual_pattern, family)
    links = {member for member in family_members if member != name and member in known_names}

    # return unique links
    return sorted(links)

In [None]:
family_links = {}
mssin_cnt = 0

# Set membership keeps the per-link lookup in get_family_links constant-time.
known_names = set(all_names)

for name in all_names:
    try:
        name_ = name.replace(' ', '_')
        with open("Data_files/Characters/"+ name_ +".txt") as f:
            # The text is the string form of the list of lines in the file,
            # which is the format get_family_links searches.
            text = str(f.readlines())
        family_links[name] = get_family_links(text, name, known_names)
    except (OSError, UnicodeDecodeError, StopIteration):
        # Unreadable/missing page file, or no family field found on the page.
        # Naming the exceptions keeps KeyboardInterrupt and real bugs visible.
        mssin_cnt += 1
print('{} entries with zero familial ties'.format(mssin_cnt))
print('{} entries left in the graph'.format(len(all_names) - mssin_cnt))

In [None]:
# Write the family-tie links to their own JSON file.
with open('Data_files/Data/Network/family_links.json', 'w') as outfile:
    outfile.write(json.dumps(family_links))