## Get postcodes for schools that are missing from the original list

In [2]:
import numpy as np
import pandas as pd
import osmnx as ox
import re

In [2]:
# Compiled once instead of on every call. Matches a UK-style postcode (with or
# without the single space before the final three characters) or the special
# case "GIR 0AA", in any letter case.
_POSTCODE_RE = re.compile(
    r"([A-Za-z][A-Ha-hJ-Yj-y]?[0-9][A-Za-z0-9]? ?[0-9][A-Za-z]{2}|[Gg][Ii][Rr] ?0[Aa]{2})"
)


def extract_postcode(x):
    """Return the first postcode found in ``x``, or ``""`` when there is none.

    Parameters
    ----------
    x : str or any
        Text to search (e.g. a geocoder address string). Non-string values
        such as ``None`` or a float NaN from a missing DataFrame cell are
        treated as "no postcode" instead of raising ``TypeError``.

    Returns
    -------
    str
        The matched postcode exactly as it appears in ``x`` (case and spacing
        are not normalised), or the empty string.
    """
    # Missing cells reach this function as NaN/None when it is mapped over a column.
    if not isinstance(x, str):
        return ""
    found = _POSTCODE_RE.search(x)
    return found.group(0) if found else ""

In [3]:
# Dataset whose rows reference schools through the "school_id" and "School" columns.
dataset = pd.read_csv("csv/dataset1_2.csv")
# Existing school list; later cells join it to the dataset on its "id" column.
postcodes = pd.read_csv("csv/schools.csv")

In [4]:
# Left-join every dataset row to the school list; rows without a matching
# school come back with a missing "id".
joined = dataset.merge(postcodes, how="left", left_on="school_id", right_on="id")

# Keep only the rows whose "id" is missing after the join, reduced to the
# columns needed to identify the school.
unmatched_rows = joined["id"].isna()
no_school_info = joined.loc[unmatched_rows, ["school_id", "School", "postcode"]]

# One row per distinct (school_id, School, postcode) combination.
no_school_info_schools = no_school_info.drop_duplicates()

In [7]:
# Ordered (old, new) substitutions that turn a dataset school name into a
# geocoder query. Order matters: later rules act on the output of earlier ones.
_QUERY_SUBSTITUTIONS = [
    (" (Grammar)", ""),
    (" (", ", "),
    (")", ""),
    ("Grammar School", "!!!"),      # park existing "Grammar School" behind a placeholder
    ("Grammar", "Grammar School"),  # expand any remaining bare "Grammar"
    ("!!!", "Grammar School"),      # restore the parked occurrences
    ("St Dominic's Grammar School for Girls", "St Dominic's Grammar School"),
    ("Regent House", "Regent House Grammar School"),
    (", Knock", ""),
    ("Rathmore", "Rathmore Grammar School"),
    ("Royal School, Armagh", "The Royal School Armagh"),
]


def _school_to_query(school_name):
    """Apply the substitutions in order and append the ", Northern Ireland" suffix."""
    query = school_name
    for old, new in _QUERY_SUBSTITUTIONS:
        query = query.replace(old, new)
    return query + ", Northern Ireland"


# One query string per row of `no_school_info_schools`, in the same order.
geocoder_queries = [_school_to_query(name) for name in no_school_info_schools["School"]]
gdf = ox.geocoder.geocode_to_gdf(geocoder_queries)

In [8]:
# Label each geocoder result with the school name it was looked up for. This
# relies on `gdf` having one row per entry of `no_school_info_schools`, in the
# same order as the queries were submitted.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if "School" not in gdf.columns:
    gdf.insert(0, "School", no_school_info_schools["School"].tolist())

In [11]:
# Attach the geocoder result to each missing school. The left side must be the
# de-duplicated `no_school_info_schools`: `no_school_info` still has one row
# per dataset row, which would repeat every school once per row in the output
# and in anything built from it.
schools_additional = pd.merge(no_school_info_schools, gdf, how="left", on="School")

In [13]:
# Fill "postcode" from the geocoder's "display_name" text; rows in which
# extract_postcode finds no postcode pattern get "".
schools_additional["postcode"] = schools_additional["display_name"].map(extract_postcode)

In [18]:
# Use the school list's column names ("id", "name") so the two frames can be stacked.
schools_additional = schools_additional.rename(columns={"school_id": "id", "School": "name"})

In [19]:
# Stack the original school list and the geocoded additions into one table.
# Columns that exist only in `postcodes` are left missing for the added rows.
# `keys=` was dropped: it labels the concatenated objects (not columns) with an
# outer index level, three labels were given for two frames, and
# ignore_index=True discards that level anyway.
schools_complete = pd.concat(
    [postcodes, schools_additional[["id", "name", "postcode"]]],
    ignore_index=True,
)

In [20]:
# Persist the additional schools (all columns) without writing the row index.
schools_additional.to_csv("csv/schools_additional.csv", index=False)

In [21]:
# Persist the combined school list without writing the row index.
schools_complete.to_csv("csv/schools_complete.csv", index=False)

## Test join

In [24]:
# Reload the combined list from disk to check the file that was written.
# NOTE(review): this rebinds `postcodes`, which earlier held csv/schools.csv;
# re-running earlier cells after this one will use the combined list instead.
postcodes = pd.read_csv("csv/schools_complete.csv")

In [29]:
# Repeat the left join against the reloaded school list and collapse exact
# duplicate rows; the resulting frame is the cell's displayed output.
(
    dataset
    .merge(postcodes, how="left", left_on="school_id", right_on="id")
    .drop_duplicates(ignore_index=True)
)

Unnamed: 0,year,School,Label,AOL,Pupils,Provider,subject,course_name,course_code,school_id,id,id_raw,name,city,county,postcode,status,type
0,11,St Patrick's Academy (Lisburn),A,LLW,15,Own School,ICT,OCN NI Level 2 Certificate in Information Tech...,601/8497/8,4230165,4230165,423-0165,St Patrick's Academy,LISBURN,ANTRIM,BT28 1TD,Roman Catholic Maintained,Secondary
1,11,Larne High School,A,LLW,35,Own School,ICT,OCN NI Level 2 Certificate in Information Tech...,601/8497/8,3210038,3210038,321-0038,Larne High School,LARNE,ANTRIM,BT40 1NT,Controlled,Secondary
2,11,Cullybackey College,A,LLW,15,Own School,ICT,OCN NI Level 2 Certificate in Information Tech...,601/8497/8,3210172,3210172,321-0172,Cullybackey College,BALLYMENA,ANTRIM,BT42 1BP,Controlled,Secondary
3,11,St Mary's High (Newry),A,LLW,17,Own School,ICT,OCN NI Level 2 Certificate in Information Tech...,601/8497/8,5230108,5230108,523-0108,St Mary's High School,NEWRY,DOWN,BT34 2DT,Roman Catholic Maintained,Secondary
4,11,St Colman's High School,A,LLW,47,Own School,ICT,OCN NI Level 2 Certificate in Information Tech...,601/8497/8,4230161,4230161,423-0161,St Colman's High School & Sixth Form College,BALLYNAHINCH,DOWN,BT24 8XR,Roman Catholic Maintained,Secondary
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
553,14,Campbell College,G,ST,4,Collaboration with other school,Computer Studies,WJEC Level 3 Advanced GCE in Computer Science,601/5345/3,1420020,1420020,,Campbell College,,,BT4 2ND,,
554,14,Royal Belfast Academical Institution,G,ST,13,Own School,Computer Studies,WJEC Level 3 Advanced GCE in Computer Science,601/5345/3,1420027,1420027,,Royal Belfast Academical Institution,,,BT1 6DL,,
555,14,Belfast High School,G,ST,5,Own School,Computer Studies,WJEC Level 3 Advanced GCE in Computer Science,601/5345/3,3420077,3420077,,Belfast High School,,,BT37 0PX,,
556,14,Strathearn School,G,ST,1,Collaboration with other school,Computer Studies,WJEC Level 3 Advanced GCE in Computer Science,601/5345/3,1420089,1420089,,Strathearn School,,,BT4 2AU,,
