"Datasheet 4.01D:
  Migration to and from the United Kingdom by citizenship group, sex and age by country of last or next residence.
  Countries of last or next residence in groups."

In [1]:
from databaker.framework import *
import pandas as pd 

In [2]:
from pathlib import Path

sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

%run lib/scrape_ons.ipynb

metadata = scrape('https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/' \
                  'internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence')
metadata

{'about': 'International Passenger Survey detailed estimates of Long-Term International Migration: Citizenship, sex and age by country of last or next residence. UK, Underlying datasheet 1.',
 'fileURL': 'https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence/2016/underlyingdatasheet4.01ipscitizenshipgroupbysexbyagebycountryoflastornextresidence2016.xls',
 'mailto': 'mailto:migstatsunit@ons.gsi.gov.uk',
 'nextRelease': datetime.date(2018, 11, 29),
 'releaseDate': datetime.date(2017, 11, 30),
 'title': 'International Passenger Survey\xa04.01, citizenship\xa0group\xa0by sex by age by country of last or next residence'}

https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/
    internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence/2016/underlyingdatasheet4.01ipscitizenshipgroupbysexbyagebycountryoflastornextresidence2016.xls

In [3]:
inputFile = sourceFolder / 'data.xls'
response = session.get(metadata['fileURL'])
with open(inputFile, 'wb') as f:
  f.write(response.content)
tab = loadxlstabs(inputFile, sheetids='Datasheet 4.01D')[0]

Loading in\data.xls which has size 6391296 bytes
Table names: ['Datasheet 4.01D']


In [4]:
observations = tab.excel_ref("G7").expand(RIGHT).filter(contains_string("Estimate")).expand(DOWN).is_not_blank().is_not_whitespace()
observations

{<BA1428 '.'>, <BK1311 '.'>, <AI1096 '.'>, <K835 '.'>, <BM943 '.'>, <BS696 '.'>, <AA801 '.'>, <CK1302 '.'>, <BC1424 '.'>, <BA12 '.'>, <AC1201 3.0>, <AG1132 '.'>, <G793 -0.3>, <BW524 '.'>, <AK1380 '.'>, <BY647 '.'>, <AQ1144 '.'>, <I649 '.'>, <CC590 '.'>, <K343 '.'>, <M586 '.'>, <BO153 '.'>, <AE1332 '.'>, <BM1297 '.'>, <BQ810 '.'>, <K1213 0.3>, <S881 '.'>, <BO1303 '.'>, <W1362 0.4>, <BC1149 0.2>, <AQ129 '.'>, <AG328 '.'>, <AC777 '.'>, <BG489 -1.2>, <BE203 '.'>, <BC959 '.'>, <CI57 '.'>, <K1212 '.'>, <CE166 '.'>, <AE852 '.'>, <W551 '.'>, <BQ1354 '.'>, <BY1142 '.'>, <BE949 '.'>, <BC1386 -0.2>, <W202 0.6>, <S145 '.'>, <BO1406 '.'>, <AC868 '.'>, <AW1134 '.'>, <BQ334 '.'>, <BS260 '.'>, <M101 '.'>, <Y836 '.'>, <CK663 '.'>, <AM137 0.1>, <BY1200 0.6>, <AE19 0.4>, <AW208 0.4>, <K1238 '.'>, <I1201 '.'>, <AM834 '.'>, <BG337 '.'>, <BI66 '.'>, <AA509 '.'>, <AE248 '.'>, <AI1439 '.'>, <CG700 '.'>, <CI293 '.'>, <BU551 '.'>, <BM1336 '.'>, <BA1316 -0.2>, <W28 '.'>, <I57 '.'>, <AA461 0.5>, <O1209 '.'>, <CK1

In [5]:
CI = observations.shift(RIGHT)
CI

{<BR720 '.'>, <BR55 '.'>, <V62 '.'>, <BX644 0.2>, <N414 '.'>, <AR371 '.'>, <AP73 1.2>, <BR220 '.'>, <CH501 '.'>, <CB617 '.'>, <V649 0.7>, <Z587 '.'>, <V554 '.'>, <L1390 1.1>, <CB511 '.'>, <AN926 '.'>, <AB1443 1.1>, <CD1199 '.'>, <N372 '.'>, <BX589 '.'>, <CL1093 '.'>, <T591 '.'>, <AT479 '.'>, <AX184 '.'>, <CD793 '.'>, <AV1193 7.7>, <R726 '.'>, <AD1195 0.5>, <V461 1.7>, <BF85 '.'>, <L284 '.'>, <H447 12.3>, <X1422 1.9>, <AD178 '.'>, <CJ271 '.'>, <AL625 '.'>, <L724 0.3>, <BT704 '.'>, <BP478 '.'>, <CD710 '.'>, <BF1105 '.'>, <BL196 '.'>, <P1441 '.'>, <BH1055 '.'>, <BJ404 '.'>, <N770 '.'>, <CD1243 '.'>, <AH352 '.'>, <AN231 '.'>, <Z110 '.'>, <AN397 0.8>, <AF1110 '.'>, <AB924 '.'>, <V1446 1.0>, <BF1009 '.'>, <BZ146 '.'>, <AT965 0.3>, <AT1316 '.'>, <AT696 '.'>, <AX1309 '.'>, <BD805 1.1>, <V1018 '.'>, <N345 '.'>, <R177 '.'>, <AP654 1.4>, <AN590 '.'>, <BZ1087 0.7>, <V861 0.5>, <CB615 '.'>, <AP1168 1.0>, <T841 0.2>, <BV430 '.'>, <BX8 'RESC United Arab Emirates CI'>, <BR9 '.'>, <AT783 '.'>, <CB442 '

In [6]:
label = tab.excel_ref('F9').expand(DOWN).is_not_blank() -  tab.excel_ref('C1449').expand(DOWN)
label

{<F227 'INFLOW, CIT Non-British, Persons, AG2 0-14'>, <F248 'INFLOW, CIT Non-British, Male, AGQ 70-74'>, <F951 'OUTFLOW, CIT Not British or British Overseas, Female, AGQ 65-69'>, <F639 'OUTFLOW, CIT British, Male, AG1 19-21'>, <F625 'OUTFLOW, CIT British, Male, AGQ 35-39'>, <F922 'OUTFLOW, CIT Not British or British Overseas, Male, AGQ 80-84'>, <F1114 'BALANCE, CIT British, Male, AGQ 80-84'>, <F676 'OUTFLOW, CIT British, Female, AG2 15-24'>, <F761 'OUTFLOW, CIT Non-British, Female, AGQ 75-79'>, <F527 'OUTFLOW, CIT All, Male, AGQ 25-29'>, <F163 'INFLOW, CIT British, Male, AG2 0-14'>, <F309 'INFLOW, CIT British or British Overseas, Persons, AGQ 55-59'>, <F351 'INFLOW, CIT British or British Overseas, Male, AG1 19-21'>, <F888 'OUTFLOW, CIT Not British or British Overseas, Persons, AGQ 70-74'>, <F168 'INFLOW, CIT British, Male, AG2 65 plus'>, <F1155 'BALANCE, CIT British, Female, AG2 0-14'>, <F1083 'BALANCE, CIT British, Persons, AGQ 85-89'>, <F976 'BALANCE, CIT All, Persons, AGQ 30-34'>, 

In [7]:
country = tab.excel_ref('G8').expand(RIGHT).is_not_blank()
country

{<BD8 'RESC Switzerland CI'>, <BH8 'RESC Taiwan CI'>, <K8 'RESC Pakistan EST'>, <AB8 'RESC Republic of Ireland CI'>, <BQ8 'RESC Turks and Caicos Islands EST'>, <BM8 'RESC Tunisia EST'>, <BC8 'RESC Switzerland EST'>, <CD8 'RESC Uzbekistan CI'>, <AR8 'RESC Slovenia CI'>, <CF8 'RESC Venezuela CI'>, <BE8 'RESC Syria EST'>, <AD8 'RESC Romania CI'>, <AH8 'RESC Saudi Arabia CI'>, <BW8 'RESC United Arab Emirates EST'>, <I8 'RESC Oman EST'>, <BU8 'RESC Ukraine EST'>, <Y8 'RESC Qatar EST'>, <CC8 'RESC Uzbekistan EST'>, <CA8 'RESC Uruguay EST'>, <AF8 'RESC Russia CI'>, <CG8 'RESC Vietnam EST'>, <AA8 'RESC Republic of Ireland EST'>, <BF8 'RESC Syria CI'>, <BT8 'RESC Uganda CI'>, <S8 'RESC Philippines EST'>, <AO8 'RESC Slovakia EST'>, <AL8 'RESC Seychelles CI'>, <AX8 'RESC Sri Lanka CI'>, <J8 'RESC Oman CI'>, <G8 'RESC All EST'>, <O8 'RESC Paraguay EST'>, <V8 'RESC Poland CI'>, <AK8 'RESC Seychelles EST'>, <BV8 'RESC Ukraine CI'>, <AJ8 'RESC Serbia CI'>, <BX8 'RESC United Arab Emirates CI'>, <CH8 '

In [8]:
Dimensions = [
            HDimConst('Geography', 'K02000001'),
            HDimConst('Year', '2016'),
            HDimConst('Measure Type', 'Count'),
            HDimConst('Unit','People (thousands)'),
            HDim(CI,'CI',DIRECTLY,RIGHT),
            HDim(label,'label',DIRECTLY,LEFT),
            HDim(country,'Last or Next Residence',DIRECTLY, ABOVE)
            ]

In [9]:
c1 = ConversionSegment(observations, Dimensions, processTIMEUNIT=True)
# savepreviewhtml(c1)

In [10]:
new_table = c1.topandas()
new_table




Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence
0,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
1,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
2,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
3,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
4,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
5,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
6,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
7,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
8,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
9,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,


In [11]:
new_table.shape

(60564, 9)

In [12]:
new_table.count()

OBS                       60564
DATAMARKER                48150
Geography                 60564
Year                      60564
Measure Type              60564
Unit                      60564
CI                        60564
label                     60480
Last or Next Residence    60522
dtype: int64

In [13]:
new_table = new_table[new_table['Year'].isnull() == False]

In [14]:
new_table.dtypes

OBS                       object
DATAMARKER                object
Geography                 object
Year                      object
Measure Type              object
Unit                      object
CI                        object
label                     object
Last or Next Residence    object
dtype: object

In [15]:
new_table = new_table[new_table['OBS'] != '']

In [16]:
new_table.count()

OBS                       12414
DATAMARKER                    0
Geography                 12414
Year                      12414
Measure Type              12414
Unit                      12414
CI                        12414
label                     12414
Last or Next Residence    12414
dtype: int64

In [17]:
# new_table.to_csv('abc.csv')

In [18]:
new_table.head()

Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence
84,526.6,,K02000001,2016,Count,People (thousands),34.4,"INFLOW, CIT All, Persons, Age All",RESC All EST
85,2.5,,K02000001,2016,Count,People (thousands),2.1,"INFLOW, CIT All, Persons, Age All",RESC Oman EST
86,11.8,,K02000001,2016,Count,People (thousands),4.6,"INFLOW, CIT All, Persons, Age All",RESC Pakistan EST
87,0.5,,K02000001,2016,Count,People (thousands),0.7,"INFLOW, CIT All, Persons, Age All",RESC Palestine EST
88,0.1,,K02000001,2016,Count,People (thousands),0.2,"INFLOW, CIT All, Persons, Age All",RESC Paraguay EST


In [19]:
# new_table['label'] = new_table['label'].astype(str)

In [20]:
new_table

Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence
84,526.6,,K02000001,2016,Count,People (thousands),34.4,"INFLOW, CIT All, Persons, Age All",RESC All EST
85,2.5,,K02000001,2016,Count,People (thousands),2.1,"INFLOW, CIT All, Persons, Age All",RESC Oman EST
86,11.8,,K02000001,2016,Count,People (thousands),4.6,"INFLOW, CIT All, Persons, Age All",RESC Pakistan EST
87,0.5,,K02000001,2016,Count,People (thousands),0.7,"INFLOW, CIT All, Persons, Age All",RESC Palestine EST
88,0.1,,K02000001,2016,Count,People (thousands),0.2,"INFLOW, CIT All, Persons, Age All",RESC Paraguay EST
89,0.3,,K02000001,2016,Count,People (thousands),0.5,"INFLOW, CIT All, Persons, Age All",RESC Peru EST
90,3.9,,K02000001,2016,Count,People (thousands),2.4,"INFLOW, CIT All, Persons, Age All",RESC Philippines EST
91,27.3,,K02000001,2016,Count,People (thousands),8.5,"INFLOW, CIT All, Persons, Age All",RESC Poland EST
92,11.8,,K02000001,2016,Count,People (thousands),5.5,"INFLOW, CIT All, Persons, Age All",RESC Portugal EST
93,1.3,,K02000001,2016,Count,People (thousands),1.1,"INFLOW, CIT All, Persons, Age All",RESC Qatar EST


In [21]:
new_table['Flow'] = new_table.label.str.split(',').str[0]

In [22]:
new_table['Citizenship group'] = new_table.label.str.split(',').str[1]

In [23]:
new_table['Sex'] = new_table.label.str.split(',').str[2]

In [24]:
new_table['Age'] = new_table.label.str.split(',').str[3]

In [25]:
new_table

Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence,Flow,Citizenship group,Sex,Age
84,526.6,,K02000001,2016,Count,People (thousands),34.4,"INFLOW, CIT All, Persons, Age All",RESC All EST,INFLOW,CIT All,Persons,Age All
85,2.5,,K02000001,2016,Count,People (thousands),2.1,"INFLOW, CIT All, Persons, Age All",RESC Oman EST,INFLOW,CIT All,Persons,Age All
86,11.8,,K02000001,2016,Count,People (thousands),4.6,"INFLOW, CIT All, Persons, Age All",RESC Pakistan EST,INFLOW,CIT All,Persons,Age All
87,0.5,,K02000001,2016,Count,People (thousands),0.7,"INFLOW, CIT All, Persons, Age All",RESC Palestine EST,INFLOW,CIT All,Persons,Age All
88,0.1,,K02000001,2016,Count,People (thousands),0.2,"INFLOW, CIT All, Persons, Age All",RESC Paraguay EST,INFLOW,CIT All,Persons,Age All
89,0.3,,K02000001,2016,Count,People (thousands),0.5,"INFLOW, CIT All, Persons, Age All",RESC Peru EST,INFLOW,CIT All,Persons,Age All
90,3.9,,K02000001,2016,Count,People (thousands),2.4,"INFLOW, CIT All, Persons, Age All",RESC Philippines EST,INFLOW,CIT All,Persons,Age All
91,27.3,,K02000001,2016,Count,People (thousands),8.5,"INFLOW, CIT All, Persons, Age All",RESC Poland EST,INFLOW,CIT All,Persons,Age All
92,11.8,,K02000001,2016,Count,People (thousands),5.5,"INFLOW, CIT All, Persons, Age All",RESC Portugal EST,INFLOW,CIT All,Persons,Age All
93,1.3,,K02000001,2016,Count,People (thousands),1.1,"INFLOW, CIT All, Persons, Age All",RESC Qatar EST,INFLOW,CIT All,Persons,Age All


In [26]:
new_table.count()

OBS                       12414
DATAMARKER                    0
Geography                 12414
Year                      12414
Measure Type              12414
Unit                      12414
CI                        12414
label                     12414
Last or Next Residence    12414
Flow                      12414
Citizenship group         12414
Sex                       12414
Age                       12414
dtype: int64

In [27]:
new_table.dtypes

OBS                       object
DATAMARKER                object
Geography                 object
Year                      object
Measure Type              object
Unit                      object
CI                        object
label                     object
Last or Next Residence    object
Flow                      object
Citizenship group         object
Sex                       object
Age                       object
dtype: object

In [28]:
new_table['Value'] = new_table['OBS'].astype(int)

In [29]:
new_table['Year'] = new_table['Year'].astype(int)

In [30]:
new_table['CI'] = new_table['CI'].astype(str)

In [31]:
new_table['CI'] = pd.to_numeric(new_table['CI'])

In [32]:
new_table['CI'] = new_table['CI'].astype(int)

In [33]:
# new_table['CI'] = pd.to_numeric(new_table['CI'], errors='coerce').fillna(0)

In [34]:
new_table.count()

OBS                       12414
DATAMARKER                    0
Geography                 12414
Year                      12414
Measure Type              12414
Unit                      12414
CI                        12414
label                     12414
Last or Next Residence    12414
Flow                      12414
Citizenship group         12414
Sex                       12414
Age                       12414
Value                     12414
dtype: int64

In [35]:
new_table.dtypes

OBS                       object
DATAMARKER                object
Geography                 object
Year                       int32
Measure Type              object
Unit                      object
CI                         int32
label                     object
Last or Next Residence    object
Flow                      object
Citizenship group         object
Sex                       object
Age                       object
Value                      int32
dtype: object

In [36]:
new_table['Last or Next Residence'] = new_table['Last or Next Residence'].str.rstrip('EST')

In [37]:
new_table = new_table[['Geography','Year','Flow','Citizenship group','Sex','Age','Last or Next Residence','Measure Type','Value','CI','Unit']]

In [38]:
new_table.dtypes

Geography                 object
Year                       int32
Flow                      object
Citizenship group         object
Sex                       object
Age                       object
Last or Next Residence    object
Measure Type              object
Value                      int32
CI                         int32
Unit                      object
dtype: object

In [39]:
new_table.head()

Unnamed: 0,Geography,Year,Flow,Citizenship group,Sex,Age,Last or Next Residence,Measure Type,Value,CI,Unit
84,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC All,Count,526,34,People (thousands)
85,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Oman,Count,2,2,People (thousands)
86,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Pakistan,Count,11,4,People (thousands)
87,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Palestine,Count,0,0,People (thousands)
88,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Paraguay,Count,0,0,People (thousands)


In [40]:
destinationFolder = Path('out')
destinationFolder.mkdir(exist_ok=True, parents=True)

new_table.to_csv(destinationFolder / ('tidydata4.01D.csv'), index = False)

In [41]:
writeMetadata(metadata, 'ONS-LTIM-Passenger-survey-4.01D', 'Migration')

In [42]:
new_table.count()

Geography                 12414
Year                      12414
Flow                      12414
Citizenship group         12414
Sex                       12414
Age                       12414
Last or Next Residence    12414
Measure Type              12414
Value                     12414
CI                        12414
Unit                      12414
dtype: int64