# International Passenger Survey 4.01, citizenship group by sex by age by country of last or next residence

Convert all tabs from the latest Excel spreadsheet available from https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence

In [1]:
from gssutils import *

# Landing page of the ONS dataset; the scraper is built from this URL.
DATASET_PAGE = (
    'https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/'
    'internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence'
)
scraper = Scraper(DATASET_PAGE)
scraper

## International Passenger Survey 4.01, citizenship group by sex by age by country of last or next residence

International Passenger Survey detailed estimates of Long-Term International Migration: Citizenship, sex and age by country of last or next residence. UK, Underlying datasheet 1.

### Distributions

1. International Passenger Survey 4.01, citizenship group by sex by age by country of last or next residence ([MS Excel Spreadsheet](https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence/2017/underlyingdatasheet4.01ipscitizenshipgroupbysexbyagebycountryoflastornextresidence2017.xls))


In [2]:
# Take the scraper's distribution and load it as databaker tabs.
distribution = scraper.distribution()
tabs = distribution.as_databaker()



Each tab is of the same form, with "software readable codes":

> The datasheets can be imported directly into suitable software. When importing the datasheets into other software import only rows 8 to 1448, starting at column F.

In [3]:
tidied_sheets = []

# Only worksheets whose name starts with 'Data' are converted.
for tab in (sheet for sheet in tabs if sheet.name.startswith('Data')):
    # The last four characters of cell A2 are read as the year.
    year = int(tab.excel_ref('A2').value[-4:])

    # F8 and F1448 bound the "software readable" region quoted above.
    first_cell = tab.excel_ref('F8')
    last_cell = tab.excel_ref('F1448')

    # Row codes: the cells filled down from F8 that are also within F1448 expanded upwards.
    codes = first_cell.fill(DOWN) & last_cell.expand(UP)
    observations = codes.fill(RIGHT).is_not_whitespace()

    # Column headers, split into confidence-interval columns (text ending in 'CI')
    # and the remaining (estimate) columns.
    headers = first_cell.shift(RIGHT).fill(RIGHT)
    ci_headers = headers.regex(r'^.*CI\s*$')
    est_headers = headers - ci_headers
    observations_est = observations & est_headers.fill(DOWN)
    observations_ci = observations & ci_headers.fill(DOWN)

    # Each estimate cell is paired with the CI cell directly to its right.
    dimensions = [
        HDimConst('Year', year),
        HDim(codes, 'Code', DIRECTLY, LEFT),
        HDim(est_headers, 'Country of Residence', DIRECTLY, ABOVE),
        HDim(observations_ci, 'CI', DIRECTLY, RIGHT),
        HDimConst('Measure Type', 'Count'),
        HDimConst('Unit', 'people-thousands')
    ]
    cs_est = ConversionSegment(observations_est, dimensions)
    # Debugging aid, left disabled: savepreviewhtml(cs_est)

    sheet_frame = cs_est.topandas()
    # TODO: data markers -- for now rows carrying a data marker are discarded.
    unmarked = pd.isna(sheet_frame['DATAMARKER'])
    tidy_sheet = (
        sheet_frame[unmarked]
        .drop(columns=['DATAMARKER'])
        .rename(columns={'OBS': 'Value'})
    )
    tidied_sheets.append(tidy_sheet)

tidy = pd.concat(tidied_sheets)
tidy







Unnamed: 0,Value,Year,Code,Country of Residence,CI,Measure Type,Unit
0,253.1,2017,"INFLOW, CIT All, Persons, Age All",RESC EU EST,29.6,Count,people-thousands
1,141.9,2017,"INFLOW, CIT All, Persons, Age All",RESC EU15 EST,22.5,Count,people-thousands
2,48.8,2017,"INFLOW, CIT All, Persons, Age All",RESC EU8 EST,11.6,Count,people-thousands
3,52.7,2017,"INFLOW, CIT All, Persons, Age All",RESC EU2 EST,13.0,Count,people-thousands
4,255,2017,"INFLOW, CIT All, Persons, Age All",RESC EEA EST,29.8,Count,people-thousands
5,259.3,2017,"INFLOW, CIT All, Persons, Age All",RESC EFTA EST,30.1,Count,people-thousands
6,347.6,2017,"INFLOW, CIT All, Persons, Age All",RESC Non-EU EST,26.5,Count,people-thousands
7,165.4,2017,"INFLOW, CIT All, Persons, Age All",RESC CW EST,18.5,Count,people-thousands
8,60.9,2017,"INFLOW, CIT All, Persons, Age All",RESC Old CW EST,13.1,Count,people-thousands
9,104.5,2017,"INFLOW, CIT All, Persons, Age All",RESC New CW EST,13.2,Count,people-thousands


In [5]:
def residence_country_code(s):
    """Turn a country column header into a bare residence code.

    The header is pathified, then the leading 'resc-' and trailing '-est'
    are removed and any '-/-' sequence is collapsed to a single '-'.

    Raises AssertionError (carrying the offending text) when the expected
    prefix or suffix is absent.
    """
    prefix, suffix = 'resc-', '-est'
    slug = pathify(s)
    assert slug.startswith(prefix), slug
    trimmed = slug[len(prefix):]
    assert trimmed.endswith(suffix), trimmed
    return trimmed[:-len(suffix)].replace('-/-', '-')

# Reduce each country column header to its bare residence code.
tidy['Country of Residence'] = tidy['Country of Residence'].apply(residence_country_code)

# 'Code' holds four ', '-separated parts (e.g. "INFLOW, CIT All, Persons, Age All"):
# migration flow, citizenship group, sex and age band.
codes_table = tidy['Code'].str.split(', ', expand=True)
tidy['Migration Flow'] = codes_table[0]
tidy['IPS Citizenship'] = codes_table[1]
tidy['Sex'] = codes_table[2]
tidy['Age'] = codes_table[3]

# Keep only the output columns, in their final order. The explicit .copy() makes
# the result an independent frame, so later column assignments do not trigger
# pandas' SettingWithCopyWarning.
tidy = tidy[['Year','Country of Residence','Migration Flow',
             'IPS Citizenship','Sex','Age',
             'Measure Type','Value','CI','Unit']].copy()
tidy

Unnamed: 0,Year,Country of Residence,Migration Flow,IPS Citizenship,Sex,Age,Measure Type,Value,CI,Unit
0,2017,eu,INFLOW,CIT All,Persons,Age All,Count,253.1,29.6,people-thousands
1,2017,eu15,INFLOW,CIT All,Persons,Age All,Count,141.9,22.5,people-thousands
2,2017,eu8,INFLOW,CIT All,Persons,Age All,Count,48.8,11.6,people-thousands
3,2017,eu2,INFLOW,CIT All,Persons,Age All,Count,52.7,13.0,people-thousands
4,2017,eea,INFLOW,CIT All,Persons,Age All,Count,255,29.8,people-thousands
5,2017,efta,INFLOW,CIT All,Persons,Age All,Count,259.3,30.1,people-thousands
6,2017,non-eu,INFLOW,CIT All,Persons,Age All,Count,347.6,26.5,people-thousands
7,2017,cw,INFLOW,CIT All,Persons,Age All,Count,165.4,18.5,people-thousands
8,2017,old-cw,INFLOW,CIT All,Persons,Age All,Count,60.9,13.1,people-thousands
9,2017,new-cw,INFLOW,CIT All,Persons,Age All,Count,104.5,13.2,people-thousands


In [6]:
from IPython.core.display import HTML
# Every column except the numeric 'Value' and 'CI' is turned into a categorical,
# and its distinct categories are displayed under a heading for inspection.
dimension_columns = [name for name in tidy.columns if name not in ('Value', 'CI')]
for name in dimension_columns:
    tidy[name] = tidy[name].astype('category')
    display(HTML(f"<h2>{name}</h2>"))
    display(tidy[name].cat.categories)

Int64Index([2017], dtype='int64')

Index(['afghanistan', 'albania', 'algeria', 'angola',
       'antarctica-not-otherwise-specified', 'argentina', 'asia', 'australia',
       'austria', 'azerbaijan',
       ...
       'turks-and-caicos-islands', 'uganda', 'ukraine', 'united-arab-emirates',
       'usa', 'uzbekistan', 'vietnam', 'yemen', 'zambia', 'zimbabwe'],
      dtype='object', length=154)

Index(['BALANCE', 'INFLOW', 'OUTFLOW'], dtype='object')

Index(['CIT All', 'CIT British', 'CIT British or British Overseas',
       'CIT Non-British', 'CIT Not British or British Overseas'],
      dtype='object')

Index(['Female', 'Male', 'Persons'], dtype='object')

Index(['AG1 0-16', 'AG1 17-18', 'AG1 19-21', 'AG1 22-59', 'AG1 60-64',
       'AG1 65 plus', 'AG2 0-14', 'AG2 15-24', 'AG2 25-44', 'AG2 45-59',
       'AG2 60-64', 'AG2 65 plus', 'AGQ 0-4', 'AGQ 10-14', 'AGQ 15-19',
       'AGQ 20-24', 'AGQ 25-29', 'AGQ 30-34', 'AGQ 35-39', 'AGQ 40-44',
       'AGQ 45-49', 'AGQ 5-9', 'AGQ 50-54', 'AGQ 55-59', 'AGQ 60-64',
       'AGQ 65-69', 'AGQ 70-74', 'AGQ 75-79', 'AGQ 80-84', 'AGQ 90 plus',
       'Age All'],
      dtype='object')

Index(['Count'], dtype='object')

Index(['people-thousands'], dtype='object')

In [8]:
# Rewrite the category labels of each dimension column.
# All renames go through `rename_categories`: assigning to `.cat.categories`
# directly is deprecated and no longer works in pandas 2.0.
tidy['Country of Residence'] = tidy['Country of Residence'].cat.rename_categories({
    'bahamas-the': 'bahamas'
})
tidy['Migration Flow'] = tidy['Migration Flow'].cat.rename_categories(pathify)
# Skip the first four characters (the 'CIT ' prefix) before pathifying.
tidy['IPS Citizenship'] = tidy['IPS Citizenship'].cat.rename_categories(
    lambda s: pathify(s[4:]))
tidy['Sex'] = tidy['Sex'].cat.rename_categories({
    'Female': 'F',
    'Male': 'M',
    'Persons': 'T'})
# 'Age All' becomes 'all'; any other label 'XXX rest' becomes '<xxx>/<rest>',
# e.g. 'AG1 0-16' is split into its 3-character grouping and its band.
tidy['Age'] = tidy['Age'].cat.rename_categories(
    lambda s: 'all' if s == 'Age All' else pathify(s[:3]) + '/' + pathify(s[4:]))
tidy

Unnamed: 0,Year,Country of Residence,Migration Flow,IPS Citizenship,Sex,Age,Measure Type,Value,CI,Unit
0,2017,eu,inflow,all,T,all,Count,253.1,29.6,people-thousands
1,2017,eu15,inflow,all,T,all,Count,141.9,22.5,people-thousands
2,2017,eu8,inflow,all,T,all,Count,48.8,11.6,people-thousands
3,2017,eu2,inflow,all,T,all,Count,52.7,13.0,people-thousands
4,2017,eea,inflow,all,T,all,Count,255,29.8,people-thousands
5,2017,efta,inflow,all,T,all,Count,259.3,30.1,people-thousands
6,2017,non-eu,inflow,all,T,all,Count,347.6,26.5,people-thousands
7,2017,cw,inflow,all,T,all,Count,165.4,18.5,people-thousands
8,2017,old-cw,inflow,all,T,all,Count,60.9,13.1,people-thousands
9,2017,new-cw,inflow,all,T,all,Count,104.5,13.2,people-thousands


In [9]:
# Create the output directory if needed and write the tidy observations as CSV
# (without the DataFrame index).
out = Path('out')
out.mkdir(exist_ok=True)
observations_path = out / 'observations.csv'
tidy.to_csv(observations_path, index=False)

In [10]:
from gssutils.metadata import THEME

# Set the dataset's family and theme, then write the generated TriG metadata
# into the output directory as bytes.
scraper.dataset.family = 'migration'
scraper.dataset.theme = THEME['population']
trig_path = out / 'dataset.trig'
with trig_path.open('wb') as trig_file:
    trig_file.write(scraper.generate_trig())