## Cleaning Adm2

Author: Verity Hill (verity.hill@ed.ac.uk)

This notebook contains code to clean Administrative level 2 regions in the UK in order to map the locations provided in the sequence metadata as accurately as possible.

The cleaned locations file is provided in supplementary information. It is a manually generated file which converts locations found in the sequence metadata to mappable regions based on the GADM Adm2 regions. This involves correcting spelling mistakes, identifying the correct adm2 region for a more precise region (eg Solihull --> Birmingham) and merging some real adm2 regions together to form what is in the sequence metadata eg West Midlands.

NB Historical Northern Irish counties are used because data was consistently submitted using these.

In [14]:
##Importing necesssary modules

from collections import defaultdict
from collections import Counter
from collections import OrderedDict
import datetime as dt
import pandas as pd
import csv
import geopandas 



not_mappable = ["NA","WALES", "YORKSHIRE", "OTHER", "UNKNOWN", "UNKNOWN SOURCE", "NOT FOUND", "GIBRALTAR", "FALKLAND ISLANDS", "CITY CENTRE"]

clean_locs_file = "adm2_cleaning.csv"

In [20]:
#Reading in map files from GADM

UK = geopandas.read_file("../../data/mapping_files/gadm36_GBR_2.json")
NI = geopandas.read_file("../../data/mapping_files/NI_counties.geojson")
channels = geopandas.read_file("../../data/mapping_files/channel_islands.json")

ni_name = []
for i in range(len(NI["CountyName"])):
    ni_name.append("Northern Ireland C")

NI["NAME_2"] = NI["CountyName"]
NI["NAME_1"] = ni_name  

all_uk = UK.append(channels).append(NI)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [21]:
#Prepping NI counties

country_to_adm2_raw = defaultdict(list)

for i,j in zip(UK["NAME_1"], UK["NAME_2"]):
    if i != "Northern Ireland":
        country_to_adm2_raw[i].append(j)
    
for i in NI["CountyName"]:
    country_to_adm2_raw["Northern_Ireland"].append(i)

In [1]:
##Converts 1:1 metadata to real, and performs any merging of locations from the metadata required

mapping_dictionary = defaultdict(list)
straight_map = {}
multi_loc_dict = {}

with open(clean_locs_file) as f:
    next(f)
    for l in f:
        toks = l.strip("\n").split(",")
        toks [:] = [x for x in toks if x]
        metadata_loc = toks[0]
        real_locs = toks[1:]   

        if metadata_loc == 'RHONDDA CYNON TAF':
            straight_map[metadata_loc] = "RHONDDA, CYNON, TAFF" 
        else:
            if len(real_locs) == 1:
                straight_map[metadata_loc] = real_locs[0].upper()
            else:
                for i in real_locs:
                    multi_loc_dict[i.upper()] = metadata_loc.upper()
                    

NameError: name 'defaultdict' is not defined

In [23]:
#Real locations that have no sequences in the metadata, but I suspect only because data collectors have merged them together

metadata_merging = {}

metadata_merging["BLACKBURN WITH DARWEN"] = "LANCASHIRE"
metadata_merging["BLACKPOOL"] = "LANCASHIRE"
metadata_merging["BRIGHTON AND HOVE"] = "SUSSEX"
metadata_merging["DARLINGTON"] = "DURHAM"
metadata_merging["DERBY"] = "DERBYSHIRE"
metadata_merging["HARTLEPOOL"] = "DURHAM"
metadata_merging["ISLES OF SCILLY"] = "CORNWALL"
metadata_merging["KINGSTON UPON HULL"] = "EAST RIDING OF YORKSHIRE"
metadata_merging["LEICESTER"] = 'LEICESTERSHIRE'
metadata_merging["MEDWAY"] = "KENT"
metadata_merging["MIDDLESBROUGH"] = 'NORTH YORKSHIRE'
metadata_merging["MILTON KEYNES"] = "BUCKINGHAMSHIRE"
metadata_merging["PETERBROUGH"] = "CAMBRIDGESHIRE"
metadata_merging["PORTSMOUTH"] = "HAMPSHIRE"
metadata_merging["REDCAR AND CLEVELAND"] = "SOUTH YORKSHIRE"
metadata_merging["SOUTHAMPTON"] = "HAMPSHIRE"
metadata_merging["SOUTHEND-ON-SEA"] = 'ESSEX'
metadata_merging["STOCKTON-ON-TEES"] = 'DURHAM'
metadata_merging["SWINDON"] = "WILTSHIRE"
metadata_merging["TELFORD AND WREKIN"] = 'SHROPSHIRE'
metadata_merging["THURROCK"]  = "ESSEX"
metadata_merging["TORBAY"] = "DEVON"
metadata_merging["WARRINGTON"] = "CHESHIRE"
metadata_merging["YORK"] = "NORTH YORKSHIRE"

In [24]:
metadata_multi_loc = []

for location in all_uk["NAME_2"]:
    
    if location.upper() in multi_loc_dict.keys():
        metadata_multi_loc.append(multi_loc_dict[location.upper()])     
    
    else:
        metadata_multi_loc.append(location.upper())

In [25]:
all_uk["Multi_loc"] = metadata_multi_loc

merged_locs = all_uk.dissolve(by="Multi_loc")

In [None]:
sequence_to_clean_location_dict = {}

with open(metadata) as f:
    reader = csv.DictReader(f)
    data = [r for r in reader]
    for sequence in data:
        if sequence['country'] == "UK":
            
            seq_name = sequence['sequence_name']

            adm2 = sequence['adm2']
            
            if adm2 != "" and adm2 not in not_mappable:
                if adm2 in straight_map.keys():
                    new = straight_map[adm2]
                    cleaned_locs.add(adm2)
                    if new in multi_loc_dict.keys():
                        new = multi_loc_dict[new]

                elif adm2 in multi_loc_dict.keys():
                    new = multi_loc_dict[adm2]
                    cleaned_locs.add(adm2)

                else:
                    new = adm2
                
                
            else:
                new = "NA"
                
            sequence_to_clean_location_dict[seq_name] = new