Long-term international migration 2.05, Occupation

In [1]:
from gssutils import *

# Point the scraper at the ONS page for long-term international migration
# table 2.05. Adjacent string literals inside the parentheses are joined by
# Python, so no line-continuation backslash is required.
scraper = Scraper(
    'https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/'
    'internationalmigration/datasets/longterminternationalmigrationusualoccupationpriortomigrationtable205'
)
# Bare expression: the scraper's rich representation becomes the cell output.
scraper

## Long-term international migration 2.05, usual occupation prior to migration, UK and England and Wales

Regular job of migrants entering or leaving UK. Estimates of Long-Term International Migration, annual table.

### Distributions

1. Long-term international migration 2.05, usual occupation prior to migration, UK and England and Wales ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/internationalmigration/datasets/longterminternationalmigrationusualoccupationpriortomigrationtable205/current/2.05ltimusualoccupationpriortomigration1991to2017.xls))


In [2]:
# Load the sheets of the distribution and pick the data sheet by name.
SHEET_NAME = 'Table 2.05'
available_tabs = list(scraper.distribution().as_databaker())
tab = next((t for t in available_tabs if t.name == SHEET_NAME), None)
if tab is None:
    # Without a default, next() would raise a bare StopIteration with no
    # message; report which sheets were actually found instead.
    raise ValueError(
        'Sheet {!r} not found in distribution; available sheets: {}'.format(
            SHEET_NAME, [t.name for t in available_tabs]
        )
    )

In [3]:
def _is_not_significance_marker(candidate):
    """Keep a cell unless its value is a string containing 'Significant Change?'."""
    value = candidate.value
    return type(value) != str or 'Significant Change?' not in value


# The single cell reading 'Year' is the anchor that every other selection
# is expressed relative to; assert_one() fails fast if it is missing or repeated.
cell = tab.filter('Year')
cell.assert_one()

# Header labels to the right of the anchor, ignoring blank/whitespace cells.
Occupation = cell.fill(RIGHT).is_not_blank().is_not_whitespace()

# The anchor's column, minus the 'Significant Change?' marker rows.
Year = cell.expand(DOWN).filter(_is_not_significance_marker)

# Section labels found beneath the anchor.
Geography = cell.fill(DOWN).one_of(['United Kingdom', 'England and Wales'])
Flow = cell.fill(DOWN).one_of(['Inflow', 'Outflow', 'Balance'])

In [4]:
# Build the bag of observation cells for the main series.
# Starting one column to the right of the 'Year' anchor, look down for the
# 'Estimate' sub-header, widen that selection to the right keeping only the
# 'Estimate' headers, then take every non-blank, non-whitespace cell below them.
observations = cell.shift(RIGHT).fill(DOWN).filter('Estimate').expand(RIGHT).filter('Estimate') \
                .fill(DOWN).is_not_blank().is_not_whitespace() 
# Non-numeric cells to the right of any cell containing 'Significant Change?'.
Str =  tab.filter(contains_string('Significant Change?')).fill(RIGHT).is_not_number()
# Remove any cell whose text contains 'Significant Change', searching the
# selection obtained by expanding A1 down and then right.
observations = observations - (tab.excel_ref('A1').expand(DOWN).expand(RIGHT).filter(contains_string('Significant Change')))
# Numeric cells below any cell containing 'Original Estimates'; these are
# converted separately in a later cell, so they are excluded here.
original_estimates = tab.filter(contains_string('Original Estimates')).fill(DOWN).is_number()
observations = observations - original_estimates - Str
# The cell immediately to the right of each remaining observation
# (presumably its confidence interval -- confirm against the sheet layout).
CI = observations.shift(RIGHT)

In [5]:
# Dimensions attached to each observation in the main series; every row
# produced from this segment is tagged with the '2011 Census Revision' constant.
revised_dimensions = [
    HDim(Year, 'Year', DIRECTLY, LEFT),
    HDim(Geography, 'Geography', CLOSEST, ABOVE),
    HDim(Occupation, 'Occupation', CLOSEST, LEFT),
    HDim(Flow, 'Flow', CLOSEST, ABOVE),
    HDimConst('Measure Type', 'Count'),
    HDimConst('Unit', 'People (thousands)'),
    HDim(CI, 'CI', DIRECTLY, RIGHT),
    HDimConst('Revision', '2011 Census Revision'),
]
csObs = ConversionSegment(observations, revised_dimensions)
# Uncomment to inspect the selection visually:
# savepreviewhtml(csObs)
tidy_revised = csObs.topandas()




In [6]:
# Same dimension layout as the main series, applied to the cells found under
# the 'Original Estimates' heading and tagged 'Original Estimate'.
original_ci = original_estimates.shift(RIGHT)  # cell to the right of each estimate
original_dimensions = [
    HDim(Year, 'Year', DIRECTLY, LEFT),
    HDim(Geography, 'Geography', CLOSEST, ABOVE),
    HDim(Occupation, 'Occupation', CLOSEST, LEFT),
    HDim(Flow, 'Flow', CLOSEST, ABOVE),
    HDimConst('Measure Type', 'Count'),
    HDimConst('Unit', 'People (thousands)'),
    HDim(original_ci, 'CI', DIRECTLY, RIGHT),
    HDimConst('Revision', 'Original Estimate'),
]
csRevs = ConversionSegment(original_estimates, original_dimensions)
orig_estimates = csRevs.topandas()




In [7]:
# Stack the two frames row-wise into one table with a fresh 0..n-1 index,
# keeping the union of their columns in their existing order.
tidy = pd.concat(
    [tidy_revised, orig_estimates],
    axis=0,
    join='outer',
    ignore_index=True,
    sort=False,
)

In [8]:
import numpy as np


def _parse_ci(raw):
    """Convert the text of a CI cell into its final representation.

    ':' becomes an empty string, text ending in '.0' (e.g. '23.0') becomes
    the integer before the decimal point, and anything else is flagged with
    the sentinel 'ERR' so that it stands out in the output.
    """
    if raw == ':':
        return ''
    if raw.endswith('.0'):
        return int(raw[:-2])
    return 'ERR'


# Assign the result back rather than calling replace(..., inplace=True) on
# tidy['OBS']: mutating a column selection in place is chained assignment,
# which warns on recent pandas and has no effect on `tidy` under Copy-on-Write.
tidy['OBS'] = tidy['OBS'].replace('', np.nan)

# Drop rows whose observation is missing (including the blanks replaced above).
tidy.dropna(subset=['OBS'], inplace=True)

# The DATAMARKER column is not part of the output, so remove it if it exists.
if 'DATAMARKER' in tidy.columns:
    tidy.drop(columns=['DATAMARKER'], inplace=True)

tidy.rename(columns={'OBS': 'Value'}, inplace=True)
tidy['Value'] = tidy['Value'].astype(int)
tidy['CI'] = tidy['CI'].map(_parse_ci)

In [9]:
# Strip any trailing run of the characters 1-4 from the occupation labels
# (presumably footnote reference digits appended to the headers -- confirm
# against the source sheet).
TRAILING_MARKERS = '1234'
tidy['Occupation'] = tidy['Occupation'].str.rstrip(TRAILING_MARKERS)

In [10]:
# Every column except the numeric ones is converted to a categorical, and its
# name and distinct categories are displayed for review.
non_categorical = ['Value', 'Year', 'CI']
categorical_columns = [name for name in tidy.columns if name not in non_categorical]
for col in categorical_columns:
    tidy[col] = tidy[col].astype('category')
    display(col)
    display(tidy[col].cat.categories)

'Geography'

Index(['England and Wales', 'United Kingdom'], dtype='object')

'Occupation'

Index(['All persons', 'Children', 'Manual and clerical', 'Other adults',
       'Professional and managerial', 'Students'],
      dtype='object')

'Flow'

Index(['Balance', 'Inflow', 'Outflow'], dtype='object')

'Measure Type'

Index(['Count'], dtype='object')

'Unit'

Index(['People (thousands)'], dtype='object')

'Revision'

Index(['2011 Census Revision', 'Original Estimate'], dtype='object')

In [11]:
# Replacement labels for each categorical column: codes for Geography and
# lower-case hyphenated slugs for Occupation and Flow.
category_codes = {
    'Geography': {
        'United Kingdom': 'K02000001',
        'England and Wales': 'K04000001',
    },
    'Occupation': {
        'All persons': 'all-persons',
        'Children': 'children',
        'Manual and clerical': 'manual-and-clerical',
        'Other adults': 'other-adults',
        'Professional and managerial': 'professional-and-managerial',
        'Students': 'students',
    },
    'Flow': {
        'Balance': 'balance',
        'Inflow': 'inflow',
        'Outflow': 'outflow',
    },
}
for column_name, label_to_code in category_codes.items():
    tidy[column_name] = tidy[column_name].cat.rename_categories(label_to_code)

# Keep only the output columns, in their final order.
output_columns = ['Geography', 'Year', 'Occupation', 'Flow',
                  'Measure Type', 'Value', 'CI', 'Unit', 'Revision']
tidy = tidy[output_columns]

In [12]:
# Parse the Year labels as numbers in a single vectorised call instead of
# invoking pd.to_numeric once per row through apply(); downcast='integer'
# yields an integer dtype when every parsed value is a whole number.
tidy['Year'] = pd.to_numeric(tidy['Year'], downcast='integer')

In [13]:
# Cast Year to the platform integer dtype; all other columns keep their dtypes.
tidy = tidy.astype({'Year': int})

In [14]:
from pathlib import Path

# Output directory, created along with any missing parents; an existing
# directory is not an error.
destinationFolder = Path('out')
destinationFolder.mkdir(parents=True, exist_ok=True)

# Write the de-duplicated observations without the DataFrame index.
observations_path = destinationFolder / 'observations.csv'
deduplicated = tidy.drop_duplicates()
deduplicated.to_csv(observations_path, index=False)

In [15]:
from gssutils.metadata import THEME

# Classify the dataset before generating its metadata.
scraper.dataset.theme = THEME['population']
scraper.dataset.family = 'migration'

# Generate the metadata before touching the file: opening the file first
# truncates it, so a failure in generate_trig() would leave an empty
# dataset.trig behind. write_bytes() opens, writes and closes the file.
trig_bytes = scraper.generate_trig()
(destinationFolder / 'dataset.trig').write_bytes(trig_bytes)

In [16]:
# Bare expression: display the final tidy table as the cell output for a visual check.
tidy

Unnamed: 0,Geography,Year,Occupation,Flow,Measure Type,Value,CI,Unit,Revision
0,K02000001,1991,all-persons,inflow,Count,329,23,People (thousands),2011 Census Revision
1,K02000001,1991,professional-and-managerial,inflow,Count,94,11,People (thousands),2011 Census Revision
2,K02000001,1991,manual-and-clerical,inflow,Count,64,11,People (thousands),2011 Census Revision
3,K02000001,1991,students,inflow,Count,57,10,People (thousands),2011 Census Revision
4,K02000001,1991,other-adults,inflow,Count,58,9,People (thousands),2011 Census Revision
5,K02000001,1991,children,inflow,Count,56,10,People (thousands),2011 Census Revision
6,K02000001,1992,all-persons,inflow,Count,268,20,People (thousands),2011 Census Revision
7,K02000001,1992,professional-and-managerial,inflow,Count,74,10,People (thousands),2011 Census Revision
8,K02000001,1992,manual-and-clerical,inflow,Count,51,8,People (thousands),2011 Census Revision
9,K02000001,1992,students,inflow,Count,50,8,People (thousands),2011 Census Revision
