# MPs data

In [1]:
import requests
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup

tqdm.pandas()

In [2]:
def parse_person_json(json_dict):
    # Every entry has an ID
    person_id = json_dict['id']
    
    # Sometimes, an entry has no name information
    # In these cases, the entry just serves as a redirect to another entry
    if 'other_names' not in json_dict.keys():
        redirect_id = json_dict['redirect'] # Save redirect IDs to resolve later
    else:
        redirect_id = None
    
    if 'other_names' in json_dict.keys():
        # Often people will have multiple name entries, stored as separate dicts
        # e.g. Tony Blair has an entry as 'Anthony Blair', another as 'Tony Blair', and one as 'The Prime Minister'
        # We want to collapse these down to one name dictionary
        name_dict = {k:v for name_dict in json_dict['other_names'] for k,v in name_dict.items()}
        if 'given_name' in name_dict.keys(): # Extract the first/given name
            first_name = name_dict['given_name']
        else:
            first_name = None
        if 'family_name' in name_dict.keys(): # And the surname/family name
            family_name = name_dict['family_name']
        else:
            family_name = None
    else:
        first_name,family_name = None,None
    
    return person_id,redirect_id,first_name,family_name

In [3]:
url = 'https://raw.githubusercontent.com/mysociety/parlparse/master/members/people.json'

response = requests.get(url)
response_json = response.json()

people_json_list = response_json['persons']

people_dict = {'person_id':[],
               'redirect_id':[],
               'first_name':[],
               'family_name':[]}

for person_json in tqdm(people_json_list):
    person_id,redirect_id,first_name,family_name = parse_person_json(person_json)
    for variable in people_dict.keys():
        people_dict[variable].append(eval(variable))

people_df = pd.DataFrame(people_dict)
display(people_df)

100%|█████████████████████████████████████████████████████████████████████████| 14004/14004 [00:00<00:00, 54925.43it/s]


Unnamed: 0,person_id,redirect_id,first_name,family_name
0,uk.org.publicwhip/person/10001,,Diane,Abbott
1,uk.org.publicwhip/person/10002,,Gerry,Adams
2,uk.org.publicwhip/person/10003,,Irene,Adams
3,uk.org.publicwhip/person/10004,,Nick,Ainger
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth
...,...,...,...,...
13999,uk.org.publicwhip/person/26060,,Ian,
14000,uk.org.publicwhip/person/26061,,Paulette,Hamilton
14001,uk.org.publicwhip/person/26062,,Jonathan,
14002,uk.org.publicwhip/person/26063,,Paul,


In [4]:
url = 'https://raw.githubusercontent.com/mysociety/parlparse/master/members/dates-of-birth.xml'

response = requests.get(url)
response_xml = response.content.decode()

bs = BeautifulSoup(response_xml)

dob_dict = {'person_id':[],
            'dob':[]}

for info in bs.find_all('personinfo'):
    person_id = info['id']
    dob = pd.to_datetime(info['date_of_birth'])
    
    for variable in dob_dict.keys():
        dob_dict[variable].append(eval(variable))
    
dob_df = pd.DataFrame(dob_dict)
people_df.merge(dob_df, how='left')

Unnamed: 0,person_id,redirect_id,first_name,family_name,dob
0,uk.org.publicwhip/person/10001,,Diane,Abbott,1953-09-27
1,uk.org.publicwhip/person/10002,,Gerry,Adams,1948-10-06
2,uk.org.publicwhip/person/10003,,Irene,Adams,NaT
3,uk.org.publicwhip/person/10004,,Nick,Ainger,1949-10-24
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth,1952-06-19
...,...,...,...,...,...
13999,uk.org.publicwhip/person/26060,,Ian,,NaT
14000,uk.org.publicwhip/person/26061,,Paulette,Hamilton,NaT
14001,uk.org.publicwhip/person/26062,,Jonathan,,NaT
14002,uk.org.publicwhip/person/26063,,Paul,,NaT


In [5]:
# Also, in some editions of Hansard, MPs are referred to by their member_id rather than their person_id
# We want to therefore list all of the member_ids linked with each person_id
person_id_2_member_id = lambda person_id: [entry['id'] for entry in response_json['memberships'] if 'person_id' in entry.keys() if entry['person_id'] == person_id]
people_df['memberships'] = people_df.person_id.progress_apply(person_id_2_member_id)
display(people_df)

100%|███████████████████████████████████████████████████████████████████████████| 14004/14004 [01:47<00:00, 130.67it/s]


Unnamed: 0,person_id,redirect_id,first_name,family_name,memberships
0,uk.org.publicwhip/person/10001,,Diane,Abbott,"[uk.org.publicwhip/member/2069, uk.org.publicw..."
1,uk.org.publicwhip/person/10002,,Gerry,Adams,"[uk.org.publicwhip/member/2196, uk.org.publicw..."
2,uk.org.publicwhip/person/10003,,Irene,Adams,"[uk.org.publicwhip/member/2201, uk.org.publicw..."
3,uk.org.publicwhip/person/10004,,Nick,Ainger,"[uk.org.publicwhip/member/2321, uk.org.publicw..."
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth,"[uk.org.publicwhip/member/2323, uk.org.publicw..."
...,...,...,...,...,...
13999,uk.org.publicwhip/person/26060,,Ian,,[uk.org.publicwhip/lord/101673]
14000,uk.org.publicwhip/person/26061,,Paulette,Hamilton,[uk.org.publicwhip/member/42744]
14001,uk.org.publicwhip/person/26062,,Jonathan,,[uk.org.publicwhip/lord/101675]
14002,uk.org.publicwhip/person/26063,,Paul,,[uk.org.publicwhip/lord/101676]


In [6]:
# Then, we can use memberships to link people to post IDs
memberid2postid = {entry['id']:(entry['post_id'],entry['start_date'],entry['end_date']) for entry in response_json['memberships'] if all([key in entry.keys() for key in ['post_id','start_date','end_date']])}

# And the same JSON links post IDs to constituency names
mp_posts = [entry for entry in response_json['posts'] if 'role' in entry.keys() if entry['role']=='Member of Parliament']
postid2constituency = {entry['id']:entry['area']['name'] for entry in mp_posts}

# So then we can lookup constituency names from member IDs
memberid2constituency = {member_id:((start_date,end_date),postid2constituency[post_id]) for member_id,(post_id,start_date,end_date) in memberid2postid.items() if post_id in postid2constituency.keys()}

def member_id_2_constituency(memberships):
    member_ids_to_lookup = []
    for member_id in memberships:
        if member_id in memberid2constituency.keys():
            member_ids_to_lookup.append(member_id)
            
    constituencies = dict()
    for member_id in member_ids_to_lookup:
        dates,constituency = memberid2constituency[member_id]
        constituencies[dates] = constituency
    return constituencies

people_df['constituencies'] = people_df.memberships.progress_apply(member_id_2_constituency)
display(people_df)

100%|████████████████████████████████████████████████████████████████████████| 14004/14004 [00:00<00:00, 451646.15it/s]


Unnamed: 0,person_id,redirect_id,first_name,family_name,memberships,constituencies
0,uk.org.publicwhip/person/10001,,Diane,Abbott,"[uk.org.publicwhip/member/2069, uk.org.publicw...","{('1987-06-11', '1992-03-16'): 'Hackney North ..."
1,uk.org.publicwhip/person/10002,,Gerry,Adams,"[uk.org.publicwhip/member/2196, uk.org.publicw...","{('1983-06-09', '1987-05-18'): 'Belfast West',..."
2,uk.org.publicwhip/person/10003,,Irene,Adams,"[uk.org.publicwhip/member/2201, uk.org.publicw...","{('1990-11-29', '1992-03-16'): 'Paisley North'..."
3,uk.org.publicwhip/person/10004,,Nick,Ainger,"[uk.org.publicwhip/member/2321, uk.org.publicw...","{('1992-04-09', '1997-04-08'): 'Pembroke', ('1..."
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth,"[uk.org.publicwhip/member/2323, uk.org.publicw...","{('1992-04-09', '1997-04-08'): 'Coventry North..."
...,...,...,...,...,...,...
13999,uk.org.publicwhip/person/26060,,Ian,,[uk.org.publicwhip/lord/101673],{}
14000,uk.org.publicwhip/person/26061,,Paulette,Hamilton,[uk.org.publicwhip/member/42744],{}
14001,uk.org.publicwhip/person/26062,,Jonathan,,[uk.org.publicwhip/lord/101675],{}
14002,uk.org.publicwhip/person/26063,,Paul,,[uk.org.publicwhip/lord/101676],{}


In [7]:
# Similarly, we can get details on party affiliations across periods of time
memberid2partyid = {entry['id']:(entry['on_behalf_of_id'],entry['start_date'],entry['end_date']) for entry in response_json['memberships'] if all([key in entry.keys() for key in ['on_behalf_of_id','start_date','end_date']])}

# And the same JSON links post IDs to constituency names
parties = [entry for entry in response_json['organizations'] if 'classification' in entry.keys() if entry['classification']=='party']
partyid2party = {entry['id']:entry['name'] for entry in parties}

# So then we can lookup constituency names from member IDs
memberid2party = {member_id:((start_date,end_date),partyid2party[party_id]) for member_id,(party_id,start_date,end_date) in memberid2partyid.items() if party_id in partyid2party.keys()}

def member_id_2_party(memberships):
    member_ids_to_lookup = []
    for member_id in memberships:
        if member_id in memberid2party.keys():
            member_ids_to_lookup.append(member_id)
            
    parties = dict()
    for member_id in member_ids_to_lookup:
        dates,party = memberid2party[member_id]
        parties[dates] = party
    return parties

people_df['parties'] = people_df.memberships.progress_apply(member_id_2_party)
display(people_df)

100%|████████████████████████████████████████████████████████████████████████| 14004/14004 [00:00<00:00, 607685.25it/s]


Unnamed: 0,person_id,redirect_id,first_name,family_name,memberships,constituencies,parties
0,uk.org.publicwhip/person/10001,,Diane,Abbott,"[uk.org.publicwhip/member/2069, uk.org.publicw...","{('1987-06-11', '1992-03-16'): 'Hackney North ...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
1,uk.org.publicwhip/person/10002,,Gerry,Adams,"[uk.org.publicwhip/member/2196, uk.org.publicw...","{('1983-06-09', '1987-05-18'): 'Belfast West',...","{('1997-05-01', '2001-05-14'): 'Sinn Féin', ('..."
2,uk.org.publicwhip/person/10003,,Irene,Adams,"[uk.org.publicwhip/member/2201, uk.org.publicw...","{('1990-11-29', '1992-03-16'): 'Paisley North'...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
3,uk.org.publicwhip/person/10004,,Nick,Ainger,"[uk.org.publicwhip/member/2321, uk.org.publicw...","{('1992-04-09', '1997-04-08'): 'Pembroke', ('1...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
4,uk.org.publicwhip/person/10005,,Bob,Ainsworth,"[uk.org.publicwhip/member/2323, uk.org.publicw...","{('1992-04-09', '1997-04-08'): 'Coventry North...","{('1997-05-01', '2001-05-14'): 'Labour', ('200..."
...,...,...,...,...,...,...,...
13999,uk.org.publicwhip/person/26060,,Ian,,[uk.org.publicwhip/lord/101673],{},{}
14000,uk.org.publicwhip/person/26061,,Paulette,Hamilton,[uk.org.publicwhip/member/42744],{},{}
14001,uk.org.publicwhip/person/26062,,Jonathan,,[uk.org.publicwhip/lord/101675],{},{}
14002,uk.org.publicwhip/person/26063,,Paul,,[uk.org.publicwhip/lord/101676],{},{}


In [8]:
# Lastly, we want redirect IDs to contain the same details as entries to which they redirect
redirect = lambda row: people_df[people_df.person_id==row.redirect_id].iloc[0] if row.redirect_id is not None else row

people_df = people_df.apply(redirect, axis=1)

assert people_df.redirect_id.apply(lambda redirect_id: redirect_id is not None).sum()==0, "Still redirects required!"

people_df = people_df.drop('redirect_id', axis=1)
people_df = people_df.drop_duplicates('person_id')

In [9]:
people_df.to_csv('people.csv')