"Datasheet 4.01C:
  Migration to and from the United Kingdom by citizenship group, sex and age by country of last or next residence.
  Countries of last or next residence in groups."

In [1]:
from databaker.framework import *
import pandas as pd 

In [2]:
from pathlib import Path

sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

%run lib/scrape_ons.ipynb

metadata = scrape('https://www.ons.gov.uk/peoplepopulationandcommunity/populationandmigration/' \
                  'internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence')
metadata

{'about': 'International Passenger Survey detailed estimates of Long-Term International Migration: Citizenship, sex and age by country of last or next residence. UK, Underlying datasheet 1.',
 'fileURL': 'https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence/2016/underlyingdatasheet4.01ipscitizenshipgroupbysexbyagebycountryoflastornextresidence2016.xls',
 'mailto': 'mailto:migstatsunit@ons.gsi.gov.uk',
 'nextRelease': datetime.date(2018, 11, 29),
 'releaseDate': datetime.date(2017, 11, 30),
 'title': 'International Passenger Survey\xa04.01, citizenship\xa0group\xa0by sex by age by country of last or next residence'}

https://www.ons.gov.uk/file?uri=/peoplepopulationandcommunity/populationandmigration/
    internationalmigration/datasets/ipscitizenshipgroupbysexbyagebycountryoflastornextresidence/2016/underlyingdatasheet4.01ipscitizenshipgroupbysexbyagebycountryoflastornextresidence2016.xls

In [3]:
inputFile = sourceFolder / 'data.xls'
response = session.get(metadata['fileURL'])
with open(inputFile, 'wb') as f:
  f.write(response.content)
tab = loadxlstabs(inputFile, sheetids='Datasheet 4.01C')[0]

Loading in\data.xls which has size 6391296 bytes
Table names: ['Datasheet 4.01C']


In [4]:
observations = tab.excel_ref("G7").expand(RIGHT).filter(contains_string("Estimate")).expand(DOWN).is_not_blank().is_not_whitespace()
observations

{<BY1192 '.'>, <CM1333 '.'>, <M803 '.'>, <AI819 '.'>, <CA478 '.'>, <K61 '.'>, <AC453 '.'>, <AY1137 '.'>, <CM943 '.'>, <W270 '.'>, <BY659 '.'>, <Q1390 '.'>, <CS929 '.'>, <U544 -1.1>, <K695 '.'>, <AW893 -0.1>, <BQ1032 '.'>, <BW515 '.'>, <AC1147 '.'>, <U1383 '.'>, <BY41 '.'>, <AY544 '.'>, <BQ1220 -0.6>, <AE303 '.'>, <BW1348 '.'>, <BG749 '.'>, <AC191 '.'>, <BM444 '.'>, <W588 '.'>, <BY1069 '.'>, <BE51 '.'>, <AK1118 '.'>, <Y204 0.1>, <BM676 '.'>, <BS385 '.'>, <AW1108 '.'>, <O999 '.'>, <K393 0.7>, <BW153 '.'>, <CQ283 '.'>, <CU176 '.'>, <BG1131 '.'>, <BE928 -0.1>, <AQ644 '.'>, <BS779 '.'>, <AS432 0.2>, <CO1003 '.'>, <BC1403 '.'>, <BO923 '.'>, <W1226 '.'>, <BM370 '.'>, <AI1303 '.'>, <AI637 -0.2>, <BI898 '.'>, <BY589 '.'>, <CA920 '.'>, <CO122 '.'>, <CS816 -0.2>, <K643 '.'>, <U677 '.'>, <I1004 0.2>, <AK777 '.'>, <BA777 -0.3>, <AG1248 -0.3>, <AU1020 '.'>, <BA524 '.'>, <BA358 '.'>, <AA1125 '.'>, <AY467 '.'>, <Q121 '.'>, <BM752 '.'>, <W1048 '.'>, <BA881 '.'>, <AK501 '.'>, <CI832 '.'>, <O1411 '.'>, <

In [5]:
CI = observations.shift(RIGHT)
CI

{<AD633 '.'>, <CF476 '.'>, <CH1371 '.'>, <AD7 '+/-CI'>, <BP1394 '.'>, <AT1060 0.2>, <AB955 '.'>, <BF10 '.'>, <L1064 '.'>, <CD574 '.'>, <CX1239 '.'>, <N537 '.'>, <AH1336 '.'>, <AT535 '.'>, <BR166 '.'>, <AD475 '.'>, <X1071 '.'>, <CX203 '.'>, <AZ610 '.'>, <BX317 '.'>, <AD210 '.'>, <BD1125 '.'>, <Z1220 2.8>, <X35 '.'>, <AZ236 '.'>, <P1247 1.9>, <BP1272 '.'>, <AT337 '.'>, <AX599 '.'>, <AD1336 '.'>, <AJ816 0.4>, <BV877 '.'>, <BT713 '.'>, <H703 4.9>, <BP552 '.'>, <V604 '.'>, <CV1070 '.'>, <AF880 '.'>, <P391 '.'>, <BN917 '.'>, <AV334 '.'>, <CR802 '.'>, <CB390 '.'>, <BZ210 '.'>, <BN1349 '.'>, <BJ900 2.3>, <Z537 '.'>, <CR1093 1.2>, <BD1178 '.'>, <CR19 '.'>, <AZ1444 '.'>, <CD1191 '.'>, <L701 '.'>, <AR515 '.'>, <AN978 '.'>, <T1255 '.'>, <AT118 '.'>, <AV1125 1.7>, <BH161 '.'>, <AX368 '.'>, <T687 '.'>, <H933 13.1>, <CD700 '.'>, <L1186 '.'>, <CN281 '.'>, <AL772 '.'>, <CP816 '.'>, <X1318 '.'>, <CL531 '.'>, <BN732 '.'>, <AL138 '.'>, <BX606 '.'>, <CJ624 '.'>, <AR1105 '.'>, <CR937 1.4>, <T21 '.'>, <CR136

In [6]:
label = tab.excel_ref('F9').expand(DOWN).is_not_blank() -  tab.excel_ref('C1449').expand(DOWN)
label

{<F248 'INFLOW, CIT Non-British, Male, AGQ 70-74'>, <F19 'INFLOW, CIT All, Persons, AGQ 45-49'>, <F623 'OUTFLOW, CIT British, Male, AGQ 25-29'>, <F1353 'BALANCE, CIT Not British or British Overseas, Persons, Age All'>, <F1226 'BALANCE, CIT Non-British, Female, AGQ 0-4'>, <F1259 'BALANCE, CIT British or British Overseas, Persons, AGQ 5-9'>, <F1278 'BALANCE, CIT British or British Overseas, Persons, AG1 17-18'>, <F1170 'BALANCE, CIT Non-British, Persons, AGQ 40-44'>, <F183 'INFLOW, CIT British, Female, AGQ 65-69'>, <F1346 'BALANCE, CIT British or British Overseas, Female, AG1 65 plus'>, <F1122 'BALANCE, CIT British, Male, AG1 65 plus'>, <F1112 'BALANCE, CIT British, Male, AGQ 70-74'>, <F1345 'BALANCE, CIT British or British Overseas, Female, AG1 60-64'>, <F882 'OUTFLOW, CIT Not British or British Overseas, Persons, AGQ 40-44'>, <F57 'INFLOW, CIT All, Male, AGQ 75-79'>, <F99 'INFLOW, CIT All, Female, AG2 0-14'>, <F324 'INFLOW, CIT British or British Overseas, Persons, AG2 15-24'>, <F74 'I

In [7]:
country = tab.excel_ref('G8').expand(RIGHT).is_not_blank()
country

{<Z8 'RESC India CI'>, <P8 'RESC Greece CI'>, <AG8 'RESC Israel EST'>, <G8 'RESC All EST'>, <CC8 'RESC Moldova EST'>, <AB8 'RESC Indonesia CI'>, <BO8 'RESC Malawi EST'>, <H8 'RESC All CI'>, <CA8 'RESC Mexico EST'>, <AH8 'RESC Israel CI'>, <Q8 'RESC Guinea EST'>, <CF8 'RESC Mongolia CI'>, <BW8 'RESC Malta EST'>, <BX8 'RESC Malta CI'>, <BR8 'RESC Malaysia CI'>, <BN8 'RESC Macao CI'>, <S8 'RESC Haiti EST'>, <BS8 'RESC Maldives EST'>, <BA8 'RESC Kuwait EST'>, <BB8 'RESC Kuwait CI'>, <CW8 'RESC Norway EST'>, <M8 'RESC Gibraltar EST'>, <BY8 'RESC Mauritius EST'>, <CP8 'RESC Nepal CI'>, <AC8 'RESC Iran EST'>, <CL8 'RESC Myanmar / Burma CI'>, <AD8 'RESC Iran CI'>, <CE8 'RESC Mongolia EST'>, <T8 'RESC Haiti CI'>, <BT8 'RESC Maldives CI'>, <BQ8 'RESC Malaysia EST'>, <X8 'RESC Hungary CI'>, <BC8 'RESC Latvia EST'>, <R8 'RESC Guinea CI'>, <AP8 'RESC Japan CI'>, <AQ8 'RESC Jordan EST'>, <CJ8 'RESC Morocco CI'>, <AL8 'RESC Ivory Coast CI'>, <AW8 'RESC Korea, South / Republic EST'>, <BD8 'RESC Latvia

In [8]:
Dimensions = [
            HDimConst('Geography', 'K02000001'),
            HDimConst('Year', '2016'),
            HDimConst('Measure Type', 'Count'),
            HDimConst('Unit','People (thousands)'),
            HDim(CI,'CI',DIRECTLY,RIGHT),
            HDim(label,'label',DIRECTLY,LEFT),
            HDim(country,'Last or Next Residence',DIRECTLY, ABOVE)
            ]

In [9]:
c1 = ConversionSegment(observations, Dimensions, processTIMEUNIT=True)
# savepreviewhtml(c1)

In [10]:
new_table = c1.topandas()
new_table




Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence
0,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
1,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
2,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
3,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
4,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
5,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
6,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
7,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
8,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,
9,,Estimate,K02000001,2016,Count,People (thousands),+/-CI,,


In [11]:
new_table.shape

(69216, 9)

In [12]:
new_table.count()

OBS                       69216
DATAMARKER                57527
Geography                 69216
Year                      69216
Measure Type              69216
Unit                      69216
CI                        69216
label                     69120
Last or Next Residence    69168
dtype: int64

In [13]:
new_table = new_table[new_table['Year'].isnull() == False]

In [14]:
new_table.dtypes

OBS                       object
DATAMARKER                object
Geography                 object
Year                      object
Measure Type              object
Unit                      object
CI                        object
label                     object
Last or Next Residence    object
dtype: object

In [15]:
new_table = new_table[new_table['OBS'] != '']

In [16]:
new_table.count()

OBS                       11689
DATAMARKER                    0
Geography                 11689
Year                      11689
Measure Type              11689
Unit                      11689
CI                        11689
label                     11689
Last or Next Residence    11689
dtype: int64

In [17]:
# new_table.to_csv('abc.csv')

In [18]:
new_table.head()

Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence
96,526.6,,K02000001,2016,Count,People (thousands),34.4,"INFLOW, CIT All, Persons, Age All",RESC All EST
97,15.0,,K02000001,2016,Count,People (thousands),6.0,"INFLOW, CIT All, Persons, Age All",RESC Germany EST
98,0.7,,K02000001,2016,Count,People (thousands),0.6,"INFLOW, CIT All, Persons, Age All",RESC Ghana EST
100,4.2,,K02000001,2016,Count,People (thousands),2.7,"INFLOW, CIT All, Persons, Age All",RESC Greece EST
101,0.1,,K02000001,2016,Count,People (thousands),0.2,"INFLOW, CIT All, Persons, Age All",RESC Guinea EST


In [19]:
# new_table['label'] = new_table['label'].astype(str)

In [20]:
new_table

Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence
96,526.6,,K02000001,2016,Count,People (thousands),34.4,"INFLOW, CIT All, Persons, Age All",RESC All EST
97,15,,K02000001,2016,Count,People (thousands),6.0,"INFLOW, CIT All, Persons, Age All",RESC Germany EST
98,0.7,,K02000001,2016,Count,People (thousands),0.6,"INFLOW, CIT All, Persons, Age All",RESC Ghana EST
100,4.2,,K02000001,2016,Count,People (thousands),2.7,"INFLOW, CIT All, Persons, Age All",RESC Greece EST
101,0.1,,K02000001,2016,Count,People (thousands),0.2,"INFLOW, CIT All, Persons, Age All",RESC Guinea EST
102,0.5,,K02000001,2016,Count,People (thousands),1.0,"INFLOW, CIT All, Persons, Age All",RESC Haiti EST
103,6.4,,K02000001,2016,Count,People (thousands),3.4,"INFLOW, CIT All, Persons, Age All",RESC Hong Kong EST
104,3.7,,K02000001,2016,Count,People (thousands),2.3,"INFLOW, CIT All, Persons, Age All",RESC Hungary EST
105,33.4,,K02000001,2016,Count,People (thousands),6.3,"INFLOW, CIT All, Persons, Age All",RESC India EST
106,1,,K02000001,2016,Count,People (thousands),0.8,"INFLOW, CIT All, Persons, Age All",RESC Indonesia EST


In [21]:
new_table['Flow'] = new_table.label.str.split(',').str[0]

In [22]:
new_table['Citizenship group'] = new_table.label.str.split(',').str[1]

In [23]:
new_table['Sex'] = new_table.label.str.split(',').str[2]

In [24]:
new_table['Age'] = new_table.label.str.split(',').str[3]

In [25]:
new_table

Unnamed: 0,OBS,DATAMARKER,Geography,Year,Measure Type,Unit,CI,label,Last or Next Residence,Flow,Citizenship group,Sex,Age
96,526.6,,K02000001,2016,Count,People (thousands),34.4,"INFLOW, CIT All, Persons, Age All",RESC All EST,INFLOW,CIT All,Persons,Age All
97,15,,K02000001,2016,Count,People (thousands),6.0,"INFLOW, CIT All, Persons, Age All",RESC Germany EST,INFLOW,CIT All,Persons,Age All
98,0.7,,K02000001,2016,Count,People (thousands),0.6,"INFLOW, CIT All, Persons, Age All",RESC Ghana EST,INFLOW,CIT All,Persons,Age All
100,4.2,,K02000001,2016,Count,People (thousands),2.7,"INFLOW, CIT All, Persons, Age All",RESC Greece EST,INFLOW,CIT All,Persons,Age All
101,0.1,,K02000001,2016,Count,People (thousands),0.2,"INFLOW, CIT All, Persons, Age All",RESC Guinea EST,INFLOW,CIT All,Persons,Age All
102,0.5,,K02000001,2016,Count,People (thousands),1.0,"INFLOW, CIT All, Persons, Age All",RESC Haiti EST,INFLOW,CIT All,Persons,Age All
103,6.4,,K02000001,2016,Count,People (thousands),3.4,"INFLOW, CIT All, Persons, Age All",RESC Hong Kong EST,INFLOW,CIT All,Persons,Age All
104,3.7,,K02000001,2016,Count,People (thousands),2.3,"INFLOW, CIT All, Persons, Age All",RESC Hungary EST,INFLOW,CIT All,Persons,Age All
105,33.4,,K02000001,2016,Count,People (thousands),6.3,"INFLOW, CIT All, Persons, Age All",RESC India EST,INFLOW,CIT All,Persons,Age All
106,1,,K02000001,2016,Count,People (thousands),0.8,"INFLOW, CIT All, Persons, Age All",RESC Indonesia EST,INFLOW,CIT All,Persons,Age All


In [26]:
new_table.count()

OBS                       11689
DATAMARKER                    0
Geography                 11689
Year                      11689
Measure Type              11689
Unit                      11689
CI                        11689
label                     11689
Last or Next Residence    11689
Flow                      11689
Citizenship group         11689
Sex                       11689
Age                       11689
dtype: int64

In [27]:
new_table.dtypes

OBS                       object
DATAMARKER                object
Geography                 object
Year                      object
Measure Type              object
Unit                      object
CI                        object
label                     object
Last or Next Residence    object
Flow                      object
Citizenship group         object
Sex                       object
Age                       object
dtype: object

In [28]:
new_table['Value'] = new_table['OBS'].astype(int)

In [29]:
new_table['Year'] = new_table['Year'].astype(int)

In [30]:
new_table['CI'] = new_table['CI'].astype(str)

In [31]:
new_table['CI'] = pd.to_numeric(new_table['CI'])

In [32]:
new_table['CI'] = new_table['CI'].astype(int)

In [33]:
# new_table['CI'] = pd.to_numeric(new_table['CI'], errors='coerce').fillna(0)

In [34]:
new_table.count()

OBS                       11689
DATAMARKER                    0
Geography                 11689
Year                      11689
Measure Type              11689
Unit                      11689
CI                        11689
label                     11689
Last or Next Residence    11689
Flow                      11689
Citizenship group         11689
Sex                       11689
Age                       11689
Value                     11689
dtype: int64

In [35]:
new_table.dtypes

OBS                       object
DATAMARKER                object
Geography                 object
Year                       int32
Measure Type              object
Unit                      object
CI                         int32
label                     object
Last or Next Residence    object
Flow                      object
Citizenship group         object
Sex                       object
Age                       object
Value                      int32
dtype: object

In [36]:
new_table['Last or Next Residence'] = new_table['Last or Next Residence'].str.rstrip('EST')

In [37]:
new_table = new_table[['Geography','Year','Flow','Citizenship group','Sex','Age','Last or Next Residence','Measure Type','Value','CI','Unit']]

In [38]:
new_table.dtypes

Geography                 object
Year                       int32
Flow                      object
Citizenship group         object
Sex                       object
Age                       object
Last or Next Residence    object
Measure Type              object
Value                      int32
CI                         int32
Unit                      object
dtype: object

In [39]:
new_table.head()

Unnamed: 0,Geography,Year,Flow,Citizenship group,Sex,Age,Last or Next Residence,Measure Type,Value,CI,Unit
96,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC All,Count,526,34,People (thousands)
97,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Germany,Count,15,6,People (thousands)
98,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Ghana,Count,0,0,People (thousands)
100,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Greece,Count,4,2,People (thousands)
101,K02000001,2016,INFLOW,CIT All,Persons,Age All,RESC Guinea,Count,0,0,People (thousands)


In [40]:
destinationFolder = Path('out')
destinationFolder.mkdir(exist_ok=True, parents=True)

new_table.to_csv(destinationFolder / ('tidydata4.01C.csv'), index = False)

In [41]:
writeMetadata(metadata, 'ONS-LTIM-Passenger-survey-4.01C', 'Migration')

In [42]:
new_table.count()

Geography                 11689
Year                      11689
Flow                      11689
Citizenship group         11689
Sex                       11689
Age                       11689
Last or Next Residence    11689
Measure Type              11689
Value                     11689
CI                        11689
Unit                      11689
dtype: int64