# Long-term international migration 2.04, main reason for migration

In [30]:
from gssutils import *
from databaker.framework import *

def left(s, amount):
    return s[:amount]

def right(s, amount):
    return s[-amount:]

def mid(s, offset, amount):
    return s[offset:offset+amount]

scraper = Scraper('https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/' \
                  'internationalmigration/datasets/' \
                  'longterminternationalmigrationmainreasonformigrationtable204')
scraper

## Long-term international migration 2.04, main reason for migration, UK and England and Wales

The primary purpose of migrants entering or leaving UK. Estimates of Long-Term International Migration, annual table.

### Distributions

1. Long-term international migration 2.04, main reason for migration, UK and England and Wales ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/internationalmigration/datasets/longterminternationalmigrationmainreasonformigrationtable204/current/2.04ltimmainreasonformigration1991to2017.xls))


In [31]:
tabs = scraper.distributions[0].as_databaker()

for i in tabs:
    print(i.name)

Contents and Notes
Table 2.04
Chart 2.04 TS


In [32]:
tidied_sheets = []

for tab in tabs:
    if not tab.name.startswith('Table 2.04'):
        continue

    year = tab.filter("Year").expand(DOWN).regex(r'[0-9]{4}(\.0)?').is_not_blank()
    obs = tab.filter("Year").fill(RIGHT).is_not_blank() | tab.filter("Year").shift(DOWN).fill(RIGHT).is_not_blank()
    flow = tab.filter("Year").expand(DOWN).one_of(['Inflow', 'Outflow'])
    reason = tab.filter("Year").expand(RIGHT).is_not_blank()
    reason2 = tab.filter("Year").shift(DOWN).fill(RIGHT).is_not_blank()
    geography = tab.filter("Year").expand(DOWN).one_of(['United Kingdom', 'England and Wales'])
    
    observations = year.fill(RIGHT) & obs.fill(DOWN) 
    observations_ci = observations.shift(RIGHT)
    
    dimensions = [
            HDim(year, 'Year', CLOSEST, ABOVE),
            HDim(flow, 'Migration Flow', CLOSEST, ABOVE),
            HDim(geography, 'Geography', CLOSEST, ABOVE),
            HDim(reason, 'Reason for Migration', CLOSEST, LEFT),
            HDimConst('Unit','People (thousands)'),
            HDim(observations_ci, 'CI', DIRECTLY, RIGHT),
            HDimConst('Measure Type', 'Count'),
            HDim(reason2, 'Reason2', CLOSEST, LEFT)
    ]
    
    tidy_sheet = ConversionSegment(tab, dimensions, observations)
    savepreviewhtml(tidy_sheet, fname="Preview.html")
    
    tidied_sheets.append(tidy_sheet.topandas())
    
import pandas as pd

df = pd.concat(tidied_sheets, ignore_index = True).fillna('')
df['Year'] = df.apply(lambda x: int(float(x['Year'])), axis = 1)
df['Reason for Migration'] = df.apply(lambda x: x['Reason for Migration'][:-1] if x['Reason for Migration'].endswith('2') else x['Reason for Migration'], axis = 1)
df['Reason for Migration'] = df.apply(lambda x: x['Reason for Migration'] if x['Reason2'] == '' else x['Reason for Migration'] + ' - ' + x['Reason2'], axis = 1)
df['Reason for Migration'] = df.apply(lambda x: x['Reason for Migration'][:-1] if x['Reason for Migration'].endswith('1') else x['Reason for Migration'], axis = 1)
df = df.drop(['Reason2'], axis = 1)
df.rename(columns={'OBS':'Value',
                   'DATAMARKER':'IPS Marker'}, 
                   inplace=True)
df

tablepart 'Table 2.04' written #injblock1005
javascript calculated





Unnamed: 0,Value,IPS Marker,Year,Migration Flow,Geography,Reason for Migration,Unit,CI,Measure Type
0,329,,1991,Inflow,United Kingdom,All reasons,People (thousands),23.0,Count
1,71,,1991,Inflow,United Kingdom,Work related - All,People (thousands),10.0,Count
2,50,,1991,Inflow,United Kingdom,Work related - Definite job,People (thousands),9.0,Count
3,21,,1991,Inflow,United Kingdom,Work related - Looking for work,People (thousands),4.0,Count
4,90,,1991,Inflow,United Kingdom,Accompany / join - Looking for work,People (thousands),14.0,Count
5,56,,1991,Inflow,United Kingdom,Formal study - Looking for work,People (thousands),10.0,Count
6,67,,1991,Inflow,United Kingdom,Other - Looking for work,People (thousands),7.0,Count
7,45,,1991,Inflow,United Kingdom,No reason stated - Looking for work,People (thousands),9.0,Count
8,268,,1992,Inflow,United Kingdom,All reasons,People (thousands),20.0,Count
9,76,,1992,Inflow,United Kingdom,Work related - All,People (thousands),10.0,Count


In [33]:
tidy = df[['Geography', 'Year', 'Reason for Migration', 'Migration Flow',
             'Measure Type','Value','CI','Unit','IPS Marker']]
tidy['IPS Marker'] = tidy.apply(lambda x: 'not-available' if x['IPS Marker'] == ':' else x['IPS Marker'], axis = 1)

from IPython.core.display import HTML
for col in tidy:
    if col not in ['Value', 'CI']:
        tidy[col] = tidy[col].astype('category')
        display(HTML(f"<h2>{col}</h2>"))
        display(tidy[col].cat.categories)

Index(['England and Wales', 'United Kingdom'], dtype='object')

Int64Index([1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
            2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
            2013, 2014, 2015, 2016, 2017],
           dtype='int64')

Index(['Accompany / join - Looking for work', 'All reasons',
       'Formal study - Looking for work',
       'No reason stated - Looking for work', 'Other - Looking for work',
       'Work related - All', 'Work related - Definite job',
       'Work related - Looking for work'],
      dtype='object')

Index(['Inflow', 'Outflow'], dtype='object')

Index(['Count'], dtype='object')

Index(['People (thousands)'], dtype='object')

Index(['', 'not-available'], dtype='object')

In [34]:
tidy['Geography'] = tidy['Geography'].cat.rename_categories({
    'United Kingdom': 'K02000001',
    'England and Wales': 'K04000001'
})
tidy['Migration Flow'].cat.categories = tidy['Migration Flow'].cat.categories.map(lambda x: pathify(x))

tidy

Unnamed: 0,Geography,Year,Reason for Migration,Migration Flow,Measure Type,Value,CI,Unit,IPS Marker
0,K02000001,1991,All reasons,inflow,Count,329,23.0,People (thousands),
1,K02000001,1991,Work related - All,inflow,Count,71,10.0,People (thousands),
2,K02000001,1991,Work related - Definite job,inflow,Count,50,9.0,People (thousands),
3,K02000001,1991,Work related - Looking for work,inflow,Count,21,4.0,People (thousands),
4,K02000001,1991,Accompany / join - Looking for work,inflow,Count,90,14.0,People (thousands),
5,K02000001,1991,Formal study - Looking for work,inflow,Count,56,10.0,People (thousands),
6,K02000001,1991,Other - Looking for work,inflow,Count,67,7.0,People (thousands),
7,K02000001,1991,No reason stated - Looking for work,inflow,Count,45,9.0,People (thousands),
8,K02000001,1992,All reasons,inflow,Count,268,20.0,People (thousands),
9,K02000001,1992,Work related - All,inflow,Count,76,10.0,People (thousands),


In [35]:
out = Path('out')
out.mkdir(exist_ok=True, parents=True)

tidy.drop_duplicates().to_csv('observations.csv', index = False)

In [36]:
from gssutils.metadata import THEME

scraper.dataset.family = 'migration'
scraper.dataset.theme = THEME['population']
with open('dataset.trig', 'wb') as metadata:
    metadata.write(scraper.generate_trig())