Long-term international migration 2.02, country of last or next residence, UK and England and Wales

In [1]:
from gssutils import *
# Point the scraper at the ONS landing page for table 2.02; the URL is split
# across two adjacent string literals purely to keep the lines readable.
scraper = Scraper(
    'https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/'
    'internationalmigration/datasets/longterminternationalmigrationcountryoflastornextresidencetable202'
)
scraper

## Long-term international migration 2.02, country of last or next residence, UK and England and Wales

Nation of origin or destination of migrants. Estimates of Long-Term International Migration, annual table.

### Distributions

1. Long-term international migration 2.02, country of last or next residence, UK and England and Wales ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/internationalmigration/datasets/longterminternationalmigrationcountryoflastornextresidencetable202/current/2.02ltimcountryoflastornextresidence2004to2017.xls))


In [2]:
# Load the distribution as databaker tabs and keep the first one named
# 'Table 2.02' (next() raises StopIteration if no tab has that name).
tabs = scraper.distribution().as_databaker()
tab = next(sheet for sheet in tabs if sheet.name == 'Table 2.02')

In [3]:
# Anchor cell: the single 'Year' header; every other selection is relative to it.
cell = tab.filter('Year')
cell.assert_one()

# Residence labels are the non-empty cells to the right of the anchor on the
# anchor's own row and on the two rows beneath it.
header_row_0 = cell.fill(RIGHT).is_not_blank().is_not_whitespace()
header_row_1 = cell.shift(0,1).fill(RIGHT).is_not_blank().is_not_whitespace()
header_row_2 = cell.shift(0,2).fill(RIGHT).is_not_blank().is_not_whitespace()
Residence = header_row_0 | header_row_1 | header_row_2

# Remove headers that are not residence groupings.
for unwanted_label in ('Revisions', 'Original Estimates1', 'All'):
    Residence = Residence - tab.filter(contains_string(unwanted_label))

In [4]:
# Observations: starting one cell right of the 'Year' anchor, find the
# 'Estimate' header(s) below it and across that row to the right, then take the
# non-empty cells beneath them. String cells containing
# 'Statistically Significant Decrease' are excluded by the final filter.
observations = cell.shift(RIGHT).fill(DOWN).filter('Estimate').expand(RIGHT).filter('Estimate') \
                .fill(DOWN).is_not_blank().is_not_whitespace() \
                .filter(lambda x: type(x.value) != str or 'Statistically Significant Decrease' not in x.value)
# Drop any cell on the sheet (scanning down and right from A1) whose text
# contains 'Significant Change'.
observations = observations - (tab.excel_ref('A1').expand(DOWN).expand(RIGHT).filter(contains_string('Significant Change')))
# Numeric cells beneath any header containing 'Original Estimates' are kept as
# a separate selection and removed from the main observations.
original_estimates = tab.filter(contains_string('Original Estimates')).fill(DOWN).is_number()
observations = observations - original_estimates

In [5]:
# The cell immediately to the right of each observation; it feeds the 'CI'
# dimension below (presumably the confidence interval column -- confirm
# against the spreadsheet layout).
CI = observations.shift(RIGHT)

In [6]:
# Year labels: cells under the 'Year' anchor that also lie to the left of an
# observation, ignoring any string cell containing 'Significant Change?'.
cells_below_anchor = cell.fill(DOWN)
cells_left_of_observations = observations.fill(LEFT)
Year = (cells_below_anchor & cells_left_of_observations).filter(
    lambda c: not (type(c.value) == str and 'Significant Change?' in c.value)
)

In [7]:
# Geography and flow headings both sit in the column under the 'Year' anchor;
# pick each set out by its exact label.
anchor_column = cell.fill(DOWN)
Geography = anchor_column.one_of(['United Kingdom', 'England and Wales'])
Flow = anchor_column.one_of(['Inflow', 'Outflow', 'Balance'])

In [8]:
# Dimensions attached to each observation, tagged with the
# '2011 Census Revision' revision label.
revised_dimensions = [
    HDim(Year, 'Year', DIRECTLY, LEFT),
    HDim(Geography, 'Geography', CLOSEST, ABOVE),
    HDim(Residence, 'Residence', DIRECTLY, ABOVE),
    HDim(Flow, 'Migration Flow', CLOSEST, ABOVE),
    HDimConst('Measure Type', 'Count'),
    HDimConst('Unit', 'People (thousands)'),
    HDim(CI, 'CI', DIRECTLY, RIGHT),
    HDimConst('Revision', '2011 Census Revision'),
]
csObs = ConversionSegment(observations, revised_dimensions)
tidy_revised = csObs.topandas()




In [9]:
# Same dimension layout applied to the 'Original Estimates' cells, with the CI
# taken from the cell to the right of each original estimate and the revision
# label set to 'Original Estimate'.
original_ci = original_estimates.shift(RIGHT)
original_dimensions = [
    HDim(Year, 'Year', DIRECTLY, LEFT),
    HDim(Geography, 'Geography', CLOSEST, ABOVE),
    HDim(Residence, 'Residence', DIRECTLY, ABOVE),
    HDim(Flow, 'Migration Flow', CLOSEST, ABOVE),
    HDimConst('Measure Type', 'Count'),
    HDimConst('Unit', 'People (thousands)'),
    HDim(original_ci, 'CI', DIRECTLY, RIGHT),
    HDimConst('Revision', 'Original Estimate'),
]
csRevs = ConversionSegment(original_estimates, original_dimensions)
orig_estimates = csRevs.topandas()




In [10]:
# Stack the revised and original estimates row-wise (outer join on columns is
# the pd.concat default) and renumber the index.
frames = [tidy_revised, orig_estimates]
tidy = pd.concat(frames, ignore_index=True, sort=False)

In [11]:
# Observations with no residence header get the code 'all'.
# Assign the result back to the column: calling fillna(..., inplace=True) on
# tidy['Residence'] is chained assignment, which pandas deprecates and which
# leaves `tidy` unchanged when Copy-on-Write is enabled.
tidy['Residence'] = tidy['Residence'].fillna('all')

In [12]:
import numpy as np

# Blank observations carry no value: convert them to NaN and drop those rows.
# The replace result is assigned back to the column because
# tidy['OBS'].replace(..., inplace=True) is chained assignment; under pandas
# Copy-on-Write it would not touch `tidy`, so the blanks would survive the
# dropna below.
tidy['OBS'] = tidy['OBS'].replace('', np.nan)
tidy.dropna(subset=['OBS'], inplace=True)
if 'DATAMARKER' in tidy.columns:
    tidy.drop(columns=['DATAMARKER'], inplace=True)
tidy.rename(columns={'OBS': 'Value'}, inplace=True)
tidy['Value'] = tidy['Value'].astype(int)
# CI values are text: ':' becomes an empty string, values ending in '.0' are
# converted to int, and anything else is flagged with the sentinel 'ERR'.
tidy['CI'] = tidy['CI'].map(lambda x:
                            '' if x == ':' else int(x[:-2]) if x.endswith('.0') else 'ERR')

In [13]:
# Convert every column except the listed ones to a categorical and show its
# distinct categories so the code mappings can be checked by eye.
columns_left_as_is = ['Value', 'Year', 'CI']
for col in tidy.columns:
    if col in columns_left_as_is:
        continue
    tidy[col] = tidy[col].astype('category')
    display(col)
    display(tidy[col].cat.categories)

'Geography'

Index(['England and Wales', 'United Kingdom'], dtype='object')

'Residence'

Index(['Asia', 'Central and South America', 'East Asia', 'European Union EU15',
       'European Union EU2', 'European Union EU8', 'European Union Other',
       'European Union2', 'Middle East and Central Asia',
       'Non-European Union3', 'North Africa', 'North America', 'Oceania',
       'Other Europe3', 'Rest of the World', 'South Asia', 'South East Asia',
       'Sub-Saharan Africa', 'all'],
      dtype='object')

'Migration Flow'

Index(['Balance', 'Inflow', 'Outflow'], dtype='object')

'Measure Type'

Index(['Count'], dtype='object')

'Unit'

Index(['People (thousands)'], dtype='object')

'Revision'

Index(['2011 Census Revision', 'Original Estimate'], dtype='object')

In [14]:
# Spreadsheet label -> output code, one mapping per categorical column.
geography_codes = {
    'United Kingdom': 'K02000001',
    'England and Wales': 'K04000001',
}
residence_codes = {
    'Asia': 'asia',
    'Central and South America': 'central-and-south-america',
    'East Asia': 'east-asia',
    'European Union EU15': 'eu15',
    'European Union EU2': 'eu2',
    'European Union EU8': 'eu8',
    'European Union Other': 'eu-other',
    'European Union2': 'eu',
    'Middle East and Central Asia': 'middle-east-and-central-asia',
    'Non-European Union3': 'non-eu',
    'North Africa': 'north-africa',
    'North America': 'north-america',
    'Oceania': 'oceania',
    'Other Europe3': 'europe-exc-eu',
    'Rest of the World': 'rest-of-world',
    'South Asia': 'south-asia',
    'South East Asia': 'south-east-asia',
    'Sub-Saharan Africa': 'sub-saharan-africa',
}
flow_codes = {
    'Balance': 'balance',
    'Inflow': 'inflow',
    'Outflow': 'outflow',
}

# Apply the mappings in the same order as the columns are listed here.
for column_name, code_map in (
    ('Geography', geography_codes),
    ('Residence', residence_codes),
    ('Migration Flow', flow_codes),
):
    tidy[column_name] = tidy[column_name].cat.rename_categories(code_map)

# Final column order for the output table.
output_columns = ['Geography', 'Year', 'Residence', 'Migration Flow',
                  'Value', 'Measure Type', 'Unit', 'CI', 'Revision']
tidy = tidy[output_columns]

In [15]:
def _year_to_number(raw_year):
    """Parse a single year value with pandas, downcasting to an integer type."""
    return pd.to_numeric(raw_year, downcast='integer')


tidy['Year'] = tidy['Year'].apply(_year_to_number)

In [16]:
# Cast the Year column to int so it is stored and written out as an integer.
tidy['Year'] = tidy['Year'].astype(int)

In [17]:
# Display the finished table as a visual check before it is written out.
tidy

Unnamed: 0,Geography,Year,Residence,Migration Flow,Value,Measure Type,Unit,CI,Revision
0,K02000001,2004,all,inflow,589,Count,People (thousands),40,2011 Census Revision
1,K02000001,2004,eu,inflow,151,Count,People (thousands),24,2011 Census Revision
2,K02000001,2004,eu15,inflow,98,Count,People (thousands),18,2011 Census Revision
3,K02000001,2004,eu8,inflow,51,Count,People (thousands),16,2011 Census Revision
5,K02000001,2004,eu-other,inflow,2,Count,People (thousands),2,2011 Census Revision
6,K02000001,2004,non-eu,inflow,438,Count,People (thousands),32,2011 Census Revision
7,K02000001,2004,europe-exc-eu,inflow,19,Count,People (thousands),6,2011 Census Revision
8,K02000001,2004,asia,inflow,213,Count,People (thousands),25,2011 Census Revision
9,K02000001,2004,middle-east-and-central-asia,inflow,31,Count,People (thousands),12,2011 Census Revision
10,K02000001,2004,east-asia,inflow,57,Count,People (thousands),16,2011 Census Revision


In [18]:
from pathlib import Path

# Create the output directory (and any missing parents) if needed, then write
# the tidy table there without the DataFrame index.
destinationFolder = Path('out')
destinationFolder.mkdir(parents=True, exist_ok=True)

observations_path = destinationFolder / 'observations.csv'
tidy.to_csv(observations_path, index=False)

In [19]:
from gssutils.metadata import THEME

# Fill in the dataset metadata that the scraper does not supply itself.
dataset = scraper.dataset
dataset.family = 'migration'
dataset.theme = THEME['population']
dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'

# Write the generated TriG metadata next to the observations file; the file is
# opened in binary mode because generate_trig() output is written as bytes.
trig_path = destinationFolder / 'dataset.trig'
with trig_path.open('wb') as metadata:
    metadata.write(scraper.generate_trig())