"Datasheet 4.01B:
  Migration to and from the United Kingdom by citizenship group, sex and age by country of last or next residence.
  Countries of last or next residence in groups."

In [1]:
from databaker.framework import *
import pandas as pd 

In [2]:
from pathlib import Path

sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

%run lib/scrape_ons.ipynb

metadata = scrape('https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/' \
                  'internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence')
metadata

{'about': 'International Passenger Survey detailed estimates of Long-Term International Migration: Citizenship, sex and age by country of last or next residence. UK, Underlying datasheet 1.',
 'fileURL': 'https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence/2016/underlyingdatasheet4.01ipscitizenshipgroupbysexbyagebycountryoflastornextresidence2016.xls',
 'mailto': 'mailto:migstatsunit@ons.gsi.gov.uk',
 'nextRelease': datetime.date(2018, 11, 29),
 'releaseDate': datetime.date(2017, 11, 30),
 'title': 'International Passenger Survey\xa04.01, citizenship\xa0group\xa0by sex by age by country of last or next residence'}

https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/
    internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence/2016/underlyingdatasheet4.01ipscitizenshipgroupbysexbyagebycountryoflastornextresidence2016.xls

In [3]:
inputFile = sourceFolder / 'data.xls'
response = session.get(metadata['fileURL'])
with open(inputFile, 'wb') as f:
  f.write(response.content)
tab = loadxlstabs(inputFile, sheetids='Datasheet 4.01B')[0]

Loading in\data.xls which has size 6391296 bytes
Table names: ['Datasheet 4.01B']


In [4]:
observations = tab.excel_ref("G7").expand(RIGHT).filter(contains_string("Estimate")).expand(DOWN).is_not_blank().is_not_whitespace()
observations

{<AA309 '.'>, <W278 '.'>, <AY823 '.'>, <G466 8.8>, <BY734 '.'>, <AO528 '.'>, <M81 0.1>, <AY880 '.'>, <K1365 '.'>, <CA1151 '.'>, <BY667 '.'>, <AY486 '.'>, <BY933 '.'>, <CA174 '.'>, <BI25 '.'>, <AS310 '.'>, <AW239 2.3>, <BS606 '.'>, <AG825 '.'>, <Y1062 '.'>, <BW32 0.9>, <CA987 '.'>, <M695 '.'>, <G1082 '.'>, <BU793 '.'>, <M346 '.'>, <AQ413 '.'>, <AU899 '.'>, <BC258 '.'>, <Q637 -0.8>, <BW1064 '.'>, <AC372 0.8>, <AA287 '.'>, <BE1090 '.'>, <BK191 '.'>, <AM1124 '.'>, <Y676 '.'>, <CE465 '.'>, <G1185 -0.5>, <BA1357 '.'>, <O887 '.'>, <CG779 '.'>, <BU979 '.'>, <U132 '.'>, <AY466 '.'>, <BY1129 '.'>, <CG745 -4.1>, <CA152 '.'>, <AG966 '.'>, <AO1173 '.'>, <U823 '.'>, <BU1105 '.'>, <M1075 '.'>, <AQ186 '.'>, <CC410 '.'>, <BG1029 0.2>, <AU188 '.'>, <M635 '.'>, <AO650 '.'>, <CE209 0.1>, <CA357 '.'>, <BK217 '.'>, <CG628 -1.1>, <BM1053 '.'>, <Y650 '.'>, <BA247 '.'>, <S438 '.'>, <AK1210 '.'>, <U1381 '.'>, <BM407 '.'>, <AO1051 '.'>, <Y497 '.'>, <AW318 '.'>, <CG248 '.'>, <AW1045 '.'>, <BQ1182 '.'>, <BK1254 '.

In [5]:
CI = observations.shift(RIGHT)
CI

{<CH1051 '.'>, <CD152 '.'>, <BL1069 '.'>, <BR914 '.'>, <AZ266 '.'>, <AT548 1.7>, <N365 '.'>, <BL403 '.'>, <T1304 '.'>, <AJ1290 '.'>, <CD82 '.'>, <CB75 '.'>, <AT972 '.'>, <R1111 2.2>, <BN734 '.'>, <CH209 1.1>, <BZ54 '.'>, <AL313 '.'>, <BN800 '.'>, <AB1011 '.'>, <V128 0.5>, <V1151 '.'>, <AT1083 '.'>, <Z754 '.'>, <J811 '.'>, <AJ493 '.'>, <V950 '.'>, <BV1196 '.'>, <AH1447 '.'>, <CF971 0.1>, <BX246 '.'>, <BZ1119 '.'>, <AV1238 '.'>, <CH449 '.'>, <P484 '.'>, <T834 '.'>, <L1021 '.'>, <AX1415 '.'>, <BH599 '.'>, <BT302 '.'>, <AX14 8.9>, <BB490 '.'>, <CB31 '.'>, <L74 '.'>, <N1195 '.'>, <AV856 '.'>, <CD1037 '.'>, <AP824 '.'>, <BT1076 '.'>, <BD1183 '.'>, <BB939 '.'>, <AP1408 '.'>, <R933 1.7>, <AF1417 '.'>, <P813 '.'>, <Z702 '.'>, <P107 '.'>, <V980 '.'>, <BR1027 0.9>, <AR970 '.'>, <J1359 '.'>, <BN445 '.'>, <Z1320 '.'>, <AJ1208 '.'>, <P54 '.'>, <BD201 '.'>, <AX98 '.'>, <BB360 '.'>, <AL1262 '.'>, <BX777 '.'>, <BH899 '.'>, <BJ211 '.'>, <J1418 '.'>, <AJ1000 0.4>, <V1008 0.5>, <AZ624 '.'>, <AV818 '.'>, <

In [6]:
label = tab.excel_ref('F9').expand(DOWN).is_not_blank() -  tab.excel_ref('C1449').expand(DOWN)
label

{<F1132 'BALANCE, CIT British, Female, AGQ 10-14'>, <F466 'INFLOW, CIT Not British or British Overseas, Female, AGQ 40-44'>, <F1404 'BALANCE, CIT Not British or British Overseas, Male, AGQ 90 plus'>, <F1345 'BALANCE, CIT British or British Overseas, Female, AG1 60-64'>, <F1030 'BALANCE, CIT All, Male, AG2 45-59'>, <F1135 'BALANCE, CIT British, Female, AGQ 25-29'>, <F666 'OUTFLOW, CIT British, Female, AGQ 80-84'>, <F860 'OUTFLOW, CIT British or British Overseas, Female, AGQ 90 plus'>, <F906 'OUTFLOW, CIT Not British or British Overseas, Male, AGQ 0-4'>, <F665 'OUTFLOW, CIT British, Female, AGQ 75-79'>, <F408 'INFLOW, CIT Not British or British Overseas, Persons, AGQ 70-74'>, <F545 'OUTFLOW, CIT All, Male, AG1 60-64'>, <F880 'OUTFLOW, CIT Not British or British Overseas, Persons, AGQ 30-34'>, <F866 'OUTFLOW, CIT British or British Overseas, Female, AG1 65 plus'>, <F1008 'BALANCE, CIT All, Male, AGQ 30-34'>, <F755 'OUTFLOW, CIT Non-British, Female, AGQ 45-49'>, <F1069 'BALANCE, CIT Britis

In [7]:
country = tab.excel_ref('G8').expand(RIGHT).is_not_blank()
country

{<X8 'RESC Bahrain CI'>, <AJ8 'RESC Brazil CI'>, <R8 'RESC Australia CI'>, <AM8 'RESC Bulgaria EST'>, <AU8 'RESC Chile EST'>, <BB8 'RESC Congo CI'>, <AE8 'RESC Bermuda EST'>, <AL8 'RESC Brunei CI'>, <K8 'RESC Albania EST'>, <BL8 'RESC Cyprus, Southern CI'>, <AQ8 'RESC Cambodia / Kampuchea EST'>, <BU8 'RESC Egypt EST'>, <BE8 'RESC Croatia EST'>, <AK8 'RESC Brunei EST'>, <AY8 'RESC Colombia EST'>, <AS8 'RESC Canada EST'>, <CD8 'RESC Fiji CI'>, <I8 'RESC Afghanistan EST'>, <AW8 'RESC China EST'>, <AB8 'RESC Barbados CI'>, <Y8 'RESC Bangladesh EST'>, <AT8 'RESC Canada CI'>, <BP8 'RESC Denmark CI'>, <AA8 'RESC Barbados EST'>, <BC8 'RESC Costa Rica EST'>, <T8 'RESC Austria CI'>, <CG8 'RESC France EST'>, <Q8 'RESC Australia EST'>, <CE8 'RESC Finland EST'>, <CH8 'RESC France CI'>, <AZ8 'RESC Colombia CI'>, <J8 'RESC Afghanistan CI'>, <BF8 'RESC Croatia CI'>, <BJ8 'RESC Cyprus, Northern CI'>, <AR8 'RESC Cambodia / Kampuchea CI'>, <BT8 'RESC Ecuador CI'>, <BN8 'RESC Czech Republic CI'>, <AH8 'RE

In [8]:
Dimensions = [
            HDimConst('Geography', 'K02000001'),
            HDimConst('Year', '2016'),
            HDimConst('Measure Type', 'Count'),
            HDimConst('Unit','People (thousands)'),
            HDim(CI,'CI',DIRECTLY,RIGHT),
            HDim(label,'label',DIRECTLY,LEFT),
            HDim(country,'Last or Next Residence',DIRECTLY, ABOVE)
            ]

In [9]:
c1 = ConversionSegment(observations, Dimensions, processTIMEUNIT=True)
# savepreviewhtml(c1)

In [10]:
new_table = c1.topandas()
new_table




Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence
0,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
1,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
2,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
3,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
4,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
5,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
6,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
7,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
8,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
9,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,


In [11]:
new_table.shape

(57680, 9)

In [12]:
new_table.count()

OBS                       57680
DATAMARKER                48378
Geography                 57680
Year                      57680
Measure Type              57680
Unit                      57680
CI                        57680
label                     57600
Last or Next Residence    57640
dtype: int64

In [13]:
new_table = new_table[new_table['Year'].isnull() == False]

In [14]:
new_table.dtypes

OBS                       object
DATAMARKER                object
Geography                 object
Year                      object
Measure Type              object
Unit                      object
CI                        object
label                     object
Last or Next Residence    object
dtype: object

In [15]:
new_table = new_table[new_table['OBS'] != '']

In [16]:
new_table.count()

OBS                       9302
DATAMARKER                   0
Geography                 9302
Year                      9302
Measure Type              9302
Unit                      9302
CI                        9302
label                     9302
Last or Next Residence    9302
dtype: int64

In [17]:
# new_table.to_csv('abc.csv')

In [18]:
new_table.head()

Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence
80,526.6,,K02000001,2016,Count,People (thousands),34.4,"INFLOW, CIT All, Persons, Age All",RESC All EST
81,1.0,,K02000001,2016,Count,People (thousands),1.4,"INFLOW, CIT All, Persons, Age All",RESC Afghanistan EST
83,0.4,,K02000001,2016,Count,People (thousands),0.5,"INFLOW, CIT All, Persons, Age All",RESC Algeria EST
85,25.1,,K02000001,2016,Count,People (thousands),6.4,"INFLOW, CIT All, Persons, Age All",RESC Australia EST
86,0.6,,K02000001,2016,Count,People (thousands),0.8,"INFLOW, CIT All, Persons, Age All",RESC Austria EST


In [19]:
# new_table['label'] = new_table['label'].astype(str)

In [20]:
new_table

Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence
80,526.6,,K02000001,2016,Count,People (thousands),34.4,"INFLOW, CIT All, Persons, Age All",RESC All EST
81,1,,K02000001,2016,Count,People (thousands),1.4,"INFLOW, CIT All, Persons, Age All",RESC Afghanistan EST
83,0.4,,K02000001,2016,Count,People (thousands),0.5,"INFLOW, CIT All, Persons, Age All",RESC Algeria EST
85,25.1,,K02000001,2016,Count,People (thousands),6.4,"INFLOW, CIT All, Persons, Age All",RESC Australia EST
86,0.6,,K02000001,2016,Count,People (thousands),0.8,"INFLOW, CIT All, Persons, Age All",RESC Austria EST
87,0.2,,K02000001,2016,Count,People (thousands),0.5,"INFLOW, CIT All, Persons, Age All",RESC Azerbaijan EST
88,0.8,,K02000001,2016,Count,People (thousands),1.0,"INFLOW, CIT All, Persons, Age All",RESC Bahrain EST
89,0.9,,K02000001,2016,Count,People (thousands),0.8,"INFLOW, CIT All, Persons, Age All",RESC Bangladesh EST
90,0.2,,K02000001,2016,Count,People (thousands),0.3,"INFLOW, CIT All, Persons, Age All",RESC Barbados EST
91,5.8,,K02000001,2016,Count,People (thousands),3.8,"INFLOW, CIT All, Persons, Age All",RESC Belgium EST


In [21]:
new_table['Flow'] = new_table.label.str.split(',').str[0]

In [22]:
new_table['Citizenship group'] = new_table.label.str.split(',').str[1]

In [23]:
new_table['Sex'] = new_table.label.str.split(',').str[2]

In [24]:
new_table['Age'] = new_table.label.str.split(',').str[3]

In [25]:
new_table

Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence,Flow,Citizenship group,Sex,Age
80,526.6,,K02000001,2016,Count,People (thousands),34.4,"INFLOW, CIT All, Persons, Age All",RESC All EST,INFLOW,CIT All,Persons,Age All
81,1,,K02000001,2016,Count,People (thousands),1.4,"INFLOW, CIT All, Persons, Age All",RESC Afghanistan EST,INFLOW,CIT All,Persons,Age All
83,0.4,,K02000001,2016,Count,People (thousands),0.5,"INFLOW, CIT All, Persons, Age All",RESC Algeria EST,INFLOW,CIT All,Persons,Age All
85,25.1,,K02000001,2016,Count,People (thousands),6.4,"INFLOW, CIT All, Persons, Age All",RESC Australia EST,INFLOW,CIT All,Persons,Age All
86,0.6,,K02000001,2016,Count,People (thousands),0.8,"INFLOW, CIT All, Persons, Age All",RESC Austria EST,INFLOW,CIT All,Persons,Age All
87,0.2,,K02000001,2016,Count,People (thousands),0.5,"INFLOW, CIT All, Persons, Age All",RESC Azerbaijan EST,INFLOW,CIT All,Persons,Age All
88,0.8,,K02000001,2016,Count,People (thousands),1.0,"INFLOW, CIT All, Persons, Age All",RESC Bahrain EST,INFLOW,CIT All,Persons,Age All
89,0.9,,K02000001,2016,Count,People (thousands),0.8,"INFLOW, CIT All, Persons, Age All",RESC Bangladesh EST,INFLOW,CIT All,Persons,Age All
90,0.2,,K02000001,2016,Count,People (thousands),0.3,"INFLOW, CIT All, Persons, Age All",RESC Barbados EST,INFLOW,CIT All,Persons,Age All
91,5.8,,K02000001,2016,Count,People (thousands),3.8,"INFLOW, CIT All, Persons, Age All",RESC Belgium EST,INFLOW,CIT All,Persons,Age All


In [26]:
new_table.count()

OBS                       9302
DATAMARKER                   0
Geography                 9302
Year                      9302
Measure Type              9302
Unit                      9302
CI                        9302
label                     9302
Last or Next Residence    9302
Flow                      9302
Citizenship group         9302
Sex                       9302
Age                       9302
dtype: int64

In [27]:
new_table.dtypes

OBS                       object
DATAMARKER                object
Geography                 object
Year                      object
Measure Type              object
Unit                      object
CI                        object
label                     object
Last or Next Residence    object
Flow                      object
Citizenship group         object
Sex                       object
Age                       object
dtype: object

In [28]:
new_table['Value'] = new_table['OBS'].astype(int)

In [29]:
new_table['Year'] = new_table['Year'].astype(int)

In [30]:
new_table['CI'] = new_table['CI'].astype(str)

In [31]:
new_table['CI'] = pd.to_numeric(new_table['CI'])

In [32]:
new_table['CI'] = new_table['CI'].astype(int)

In [33]:
# new_table['CI'] = pd.to_numeric(new_table['CI'], errors='coerce').fillna(0)

In [34]:
new_table.count()

OBS                       9302
DATAMARKER                   0
Geography                 9302
Year                      9302
Measure Type              9302
Unit                      9302
CI                        9302
label                     9302
Last or Next Residence    9302
Flow                      9302
Citizenship group         9302
Sex                       9302
Age                       9302
Value                     9302
dtype: int64

In [35]:
new_table.dtypes

OBS                       object
DATAMARKER                object
Geography                 object
Year                       int32
Measure Type              object
Unit                      object
CI                         int32
label                     object
Last or Next Residence    object
Flow                      object
Citizenship group         object
Sex                       object
Age                       object
Value                      int32
dtype: object

In [36]:
new_table['Last or Next Residence'] = new_table['Last or Next Residence'].str.rstrip('EST')

In [37]:
new_table = new_table[['Geography','Year','Flow','Citizenship group','Sex','Age','Last or Next Residence','Measure Type','Value','CI','Unit']]

In [38]:
new_table.dtypes

Geography                 object
Year                       int32
Flow                      object
Citizenship group         object
Sex                       object
Age                       object
Last or Next Residence    object
Measure Type              object
Value                      int32
CI                         int32
Unit                      object
dtype: object

In [39]:
new_table.head()

Unnamed: 0,Geography,Year,Flow,Citizenship group,Sex,Age,Last or Next Residence,Measure Type,Value,CI,Unit
80,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC All,Count,526,34,People (thousands)
81,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Afghanistan,Count,1,1,People (thousands)
83,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Algeria,Count,0,0,People (thousands)
85,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Australia,Count,25,6,People (thousands)
86,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Austria,Count,0,0,People (thousands)


In [40]:
destinationFolder = Path('out')
destinationFolder.mkdir(exist_ok=True, parents=True)

new_table.to_csv(destinationFolder / ('tidydata4.01B.csv'), index = False)

In [41]:
writeMetadata(metadata, 'ONS-LTIM-Passenger-survey-4.01B', 'Migration')

In [42]:
new_table.count()

Geography                 9302
Year                      9302
Flow                      9302
Citizenship group         9302
Sex                       9302
Age                       9302
Last or Next Residence    9302
Measure Type              9302
Value                     9302
CI                        9302
Unit                      9302
dtype: int64