Vi_05 – Entry clearance visas granted by country of nationality

In [1]:
from gssutils import *

# Only fetch the data when is_interactive() is true; otherwise `sheet` must already exist
# (presumably supplied by whatever runs this notebook non-interactively -- TODO confirm).
if is_interactive():
    landing_page = 'https://www.gov.uk/government/statistics/immigration-statistics-october-to-december-2017-data-tables'
    distribution_title = 'Entry clearance visas granted outside the UK data tables immigration statistics October to December 2017 volume 1'

    scraper = Scraper(landing_page)
    # Pick the distribution by its exact title, then read only the vi_05 worksheet.
    visas_distribution = scraper.distribution(title=distribution_title)
    sheet = visas_distribution.as_pandas(sheet_name='vi_05')

sheet

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,Table vi_05: Entry clearance visas granted by ...,,,,,,,,,,,,,,
1,Back to contents,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,
3,Geographical region,Country of nationality,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
4,*Total,*Total,2065424,2228261,2062634,1954605,1995353,2144621,2275417,2228762,2496488,2448563,2468347,2478973,2710350
5,Africa North,*Total Africa North,79138,87720,78194,79734,80331,87854,73838,81000,94654,90545,83540,81839,80532
6,Africa Sub-Saharan,*Total Africa Sub-Saharan,312395,275701,257988,240565,316245,332960,340493,322737,316422,298217,289329,236076,258073
7,America North,*Total America North,42403,44124,44730,47159,46587,49713,47355,45250,49181,48595,48370,45438,46030
8,America Central and South,*Total America Central and South,59612,60907,62848,59685,56535,56986,53613,54492,58077,69800,73813,66161,72351
9,Asia Central,*Total Asia Central,18531,23862,26359,22582,21828,22289,24891,24793,25496,25582,22980,19368,22196


In [2]:
# Row 3 of the raw sheet carries the real column headings; rows 0-2 are the title,
# the "Back to contents" link and a blank line.
# The guard makes the cell safe to re-run: once the headings have been promoted,
# repeating the rename would take its names from a data row and the drop of
# labels 0-3 would raise KeyError.
if 'Geographical region' not in sheet.columns:
    sheet = sheet.rename(columns=sheet.iloc[3]).drop(index=[0, 1, 2, 3])

# Discard rows whose region cell is an empty string.
sheet = sheet[sheet['Geographical region'] != '']
sheet

Unnamed: 0,Geographical region,Country of nationality,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
4,*Total,*Total,2065424,2228261,2062634,1954605,1995353,2144621,2275417,2228762,2496488,2448563,2468347,2478973,2710350
5,Africa North,*Total Africa North,79138,87720,78194,79734,80331,87854,73838,81000,94654,90545,83540,81839,80532
6,Africa Sub-Saharan,*Total Africa Sub-Saharan,312395,275701,257988,240565,316245,332960,340493,322737,316422,298217,289329,236076,258073
7,America North,*Total America North,42403,44124,44730,47159,46587,49713,47355,45250,49181,48595,48370,45438,46030
8,America Central and South,*Total America Central and South,59612,60907,62848,59685,56535,56986,53613,54492,58077,69800,73813,66161,72351
9,Asia Central,*Total Asia Central,18531,23862,26359,22582,21828,22289,24891,24793,25496,25582,22980,19368,22196
10,Asia East,*Total Asia East,195703,229295,234579,206939,194931,247960,300234,321271,410782,450758,528730,618746,693422
11,Asia South,*Total Asia South,594374,688303,621755,568849,583488,595578,597874,518176,535592,540326,566378,585985,655170
12,Asia South East,*Total Asia South East,121080,128449,134248,134630,137946,149295,163300,168241,186567,195186,199525,220169,244591
13,EU 14,*Total EU 14,47,35,45,43,17,17,17,12,13,4,5,6,22


In [3]:
# Unpivot: every column other than the two identifier columns becomes a (Year, Value) pair.
tidy = sheet.melt(
    id_vars=['Geographical region', 'Country of nationality'],
    var_name="Year",
    value_name="Value",
)

# Clean up *Total strings:
#   a bare "*Total" label becomes 'Rest of world';
#   a leading "*Total " prefix is removed, leaving the rest of the label.
# NOTE(review): the bare "*Total" row looks like the overall total, so confirm that
# 'Rest of world' is the intended label for it.
total_label_patterns = {
    r'^\*Total$': 'Rest of world',
    r'^\*Total ': '',
}
tidy = tidy.replace({'Country of nationality': total_label_patterns}, regex=True)

# While nationality and citizenship are different things, can we use the same citizenship details
# derived from IPS?
from io import BytesIO
import requests
import re
citizenship_csv = 'https://raw.githubusercontent.com/ONS-OpenData/ref_migration/master/codelists/citizenship.csv'
# Download the reference codelist. The timeout stops a stalled connection from hanging
# the notebook, and raise_for_status() prevents an HTTP error body from being parsed as CSV.
citizenship_response = requests.get(citizenship_csv, timeout=30)
citizenship_response.raise_for_status()
citizenship = pd.read_csv(BytesIO(citizenship_response.content))

# See https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/734677/user-guide-immigration-statistics.pdf
# section 19 for more details.

#display(list(set(tidy['Country of nationality'].unique()) - set(citizenship['Label'].unique())))

# The names don't directly match and while some are obvious 1-1, others aren't (e.g. St. Maarten & St. Martin)
# So keep track of countries and areas to create a separate codelist, then drop the areas/regions from this
# tidy table as they're determined by nationality.

# Every distinct label in the nationality column, whether a country or a region subtotal.
countries = set(tidy['Country of nationality'].unique())

# Map each label to the region it is listed under. A label is left out when it equals
# its own region or when its region is '*Total'.
parents = {}
for country in countries:
    is_this_country = tidy['Country of nationality'] == country
    regions = tidy.loc[is_this_country, 'Geographical region'].unique()
    # A label must never appear under more than one region.
    assert len(regions) <= 1
    if len(regions) != 1:
        continue
    region = regions[0]
    if country != region and region != '*Total':
        parents[country] = region

# Codelist rows are (label, notation, parent notation): the root first, then the
# regions directly under the root, then everything else under its region (or the root).
codelist = [('World', 'world', '')]
for region in sorted(set(parents.values())):
    codelist.append((region, pathify(region), pathify('World')))
for country in sorted(countries - set(parents.values())):
    parent_label = parents.get(country, "World")
    codelist.append((country, pathify(country), pathify(parent_label)))

# The region is now captured by the codelist hierarchy, so remove it from the tidy table.
del tidy['Geographical region']

from pathlib import Path
# Directory that receives the generated codelist CSV.
out = Path('out')
out.mkdir(exist_ok=True)

codelist_columns = ('Label', 'Notation', 'Parent Notation')
codelist_df = pd.DataFrame.from_records(codelist, columns=codelist_columns)
# Sort priority is the 1-based position of each entry in the codelist.
codelist_df['Sort Priority'] = codelist_df.index + 1
codelist_df['Description'] = ''

# Each notation must identify exactly one entry; show the repeated rows before failing.
repeated_notations = codelist_df[codelist_df.duplicated('Notation', keep='first')]
if len(repeated_notations) > 0:
    display(repeated_notations)
    assert False, "Notation not unique for countries codelist"

codelist_df.to_csv(out / 'ho-country-of-nationality.csv', index=False)

**TODO:** handle the data markers: `z` means `not applicable` and `:` means `not available`.

In [4]:
import numpy as np
# Keep only observations with a numeric value. Anything that cannot be parsed as a
# number (such as marker strings) is coerced to NaN and the row is dropped.
# pd.to_numeric replaces the earlier np.isreal filter, which NumPy documents as
# unreliable for strings and which let NaN through to crash astype(int).
tidy['Value'] = pd.to_numeric(tidy['Value'], errors='coerce')
tidy.dropna(subset=['Value'], inplace=True)
tidy['Value'] = tidy['Value'].astype(int)
tidy

Unnamed: 0,Country of nationality,Year,Value
0,Rest of world,2005,2065424
1,Africa North,2005,79138
2,Africa Sub-Saharan,2005,312395
3,America North,2005,42403
4,America Central and South,2005,59612
5,Asia Central,2005,18531
6,Asia East,2005,195703
7,Asia South,2005,594374
8,Asia South East,2005,121080
9,EU 14,2005,47
