Table 1.2: Estimated Net International Migration, by Local Government District (year ending mid-2007 to year ending mid-2017)

In [1]:
from databaker.framework import *
import pandas as pd 

In [2]:
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from pathlib import Path

# HTTP session wrapped by CacheControl, using a file cache in the local
# '.cache' directory and the LastModified heuristic.
session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

# Local folder that receives the downloaded source workbook.
sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

inputURL = 'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/Mig1617-Official.xlsx'
inputFile = sourceFolder / 'Mig1617-Official.xlsx'
response = session.get(inputURL)
# Stop here on a 4xx/5xx answer: otherwise the body of an error page would be
# saved as the .xlsx and only fail later, obscurely, when the workbook is parsed.
response.raise_for_status()
with open(inputFile, 'wb') as f:
    f.write(response.content)

In [3]:
# Load the workbook restricted to the 'Table 1.2' sheet; the loader returns a
# sequence of tabs, of which the first is the one worked on below.
matching_tabs = loadxlstabs(inputFile, sheetids='Table 1.2')
tab = matching_tabs[0]

Loading in\Mig1617-Official.xlsx which has size 194382 bytes
Table names: ['Table 1.2']


In [4]:
# Display the loaded tab to eyeball its cells (cell reference / value pairs).
tab

{<L5 -106.0>, <M41 108.0>, <D14 495.0>, <E39 41.0>, <K5 -74.0>, <K19 ''>, <H6 555.0>, <N44 ''>, <N14 'Mid Ulster'>, <F27 -1247.0>, <E36 216.0>, <M39 437.0>, <C43 -84.0>, <F5 -226.0>, <G45 -105.0>, <C44 -56.0>, <D12 293.0>, <A31 'Cookstown'>, <H18 ''>, <C1 ''>, <E6 665.0>, <N6 'Causeway Coast & Glens'>, <C13 1336.0>, <H23 141.0>, <A9 'Derry City & Strabane'>, <J27 568.0>, <O43 ''>, <F11 -56.0>, <I19 ''>, <M4 'Total'>, <J6 1086.0>, <O25 ''>, <M8 -832.0>, <J32 863.0>, <N39 ''>, <F43 -252.0>, <L16 583.0>, <F29 -84.0>, <O35 ''>, <O9 527.0>, <O17 ''>, <N31 ''>, <I30 -92.0>, <F4 '                     Jul 2010 - Jun 2011'>, <G39 -45.0>, <F8 -67.0>, <K7 -580.0>, <A7 'Belfast'>, <J14 428.0>, <N4 'LGD'>, <D41 -3.0>, <G27 -502.0>, <I11 13.0>, <E18 ''>, <H31 -26.0>, <L37 15.0>, <L26 71.0>, <I3 ''>, <J8 -68.0>, <J25 41.0>, <N5 'Derry City & Strabane'>, <J44 -105.0>, <H8 -136.0>, <N23 ''>, <B41 17.0>, <E23 285.0>, <C23 261.0>, <O16 ''>, <E26 0.0>, <K16 1458.0>, <G24 129.0>, <B13 2195.0>, <L14 491.0>,

In [5]:
# Observations: every non-blank cell from B5 downwards and rightwards, minus
# everything from column N rightwards (a separate side table headed 'LGD').
data_region = tab.excel_ref('B5').expand(DOWN).expand(RIGHT).is_not_blank()
side_table = tab.excel_ref('N5').expand(DOWN).expand(RIGHT)
observations = data_region - side_table

In [6]:
# Remove row 20 (from column B rightwards) from the observations.
# NOTE(review): A20 reads 'Area (Former LGD)', so this row is presumably the
# repeated header of the second sub-table — confirm against the workbook.
row_20_cells = tab.excel_ref('B20').expand(RIGHT)
observations = observations - row_20_cells

In [7]:
# Area labels: the non-blank cells of column A from row 5 downwards.
first_area_cell = tab.excel_ref('A5')
Area = first_area_cell.expand(DOWN).is_not_blank()
Area

{<A26 'Banbridge'>, <A10 'Fermanagh & Omagh'>, <A25 'Ballymoney'>, <A11 'Lisburn & Castlereagh'>, <A13 'Mid Ulster'>, <A31 'Cookstown'>, <A35 'Dungannon'>, <A29 'Castlereagh'>, <A9 'Derry City & Strabane'>, <A28 'Carrickfergus'>, <A32 'Craigavon'>, <A37 'Larne'>, <A43 'Newtownabbey'>, <A21 'Antrim'>, <A23 'Armagh'>, <A14 'Newry, Mourne & Down'>, <A15 'Ards & North Down'>, <A44 'North Down'>, <A46 'Strabane'>, <A34 'Down'>, <A36 'Fermanagh'>, <A42 'Newry and Mourne'>, <A8 'Causeway Coast & Glens'>, <A20 'Area (Former LGD)'>, <A38 'Limavady'>, <A47 'Northern Ireland'>, <A33 'Derry'>, <A39 'Lisburn'>, <A22 'Ards'>, <A45 'Omagh'>, <A24 'Ballymena'>, <A27 'Belfast'>, <A5 'Antrim & Newtownabbey'>, <A41 'Moyle'>, <A12 'Mid & East Antrim'>, <A6 'Armagh City, Banbridge & Craigavon'>, <A40 'Magherafelt'>, <A16 'Northern Ireland'>, <A30 'Coleraine'>, <A7 'Belfast'>}

In [8]:
# Period labels: row 4 from column B rightwards, excluding column N onwards
# (the header of the separate 'LGD' side table). The 'Total' column is kept.
period_header = tab.excel_ref('B4').expand(RIGHT)
side_table_header = tab.excel_ref('N4').expand(RIGHT)
MidYear = period_header - side_table_header
MidYear

{<L4 '                     Jul 2016 - Jun 2017'>, <M4 'Total'>, <K4 '                     Jul 2015 - Jun 2016'>, <D4 '                     Jul 2008 - Jun 2009'>, <E4 '                     Jul 2009 - Jun 2010'>, <H4 '                     Jul 2012 - Jun 2013'>, <I4 '                     Jul 2013 - Jun 2014'>, <F4 '                     Jul 2010 - Jun 2011'>, <G4 '                     Jul 2011 - Jun 2012'>, <J4 '                     Jul 2014 - Jun 2015'>, <C4 '                     Jul 2007 - Jun 2008'>, <B4 '                     Jul 2006 - Jun 2007'>}

In [9]:
# Dimensions attached to each observation, listed in output column order.
# The two looked-up dimensions come from the period header (ABOVE) and the
# area label (LEFT); the other two are constants.
mid_year_dim = HDim(MidYear, 'Mid Year', DIRECTLY, ABOVE)
area_dim = HDim(Area, 'Area', DIRECTLY, LEFT)
measure_type_dim = HDimConst('Measure Type', 'Count')
unit_dim = HDimConst('Unit', 'People')

Dimensions = [mid_year_dim, area_dim, measure_type_dim, unit_dim]

In [10]:
# Combine the observations with their dimensions into one conversion segment.
# NOTE(review): the exact effect of processTIMEUNIT=True depends on the
# installed databaker version — confirm.
c1 = ConversionSegment(
    observations,
    Dimensions,
    processTIMEUNIT=True,
)
# Optional visual check, currently disabled:
# savepreviewhtml(c1)

In [11]:
# Convert the conversion segment to a pandas DataFrame (one row per
# observation, with an OBS column plus one column per dimension) and show it.
new_table = c1.topandas()
new_table




Unnamed: 0,OBS,Mid Year,Area,Measure Type,Unit
0,259.0,Jul 2006 - Jun 2007,Antrim & Newtownabbey,Count,People
1,230.0,Jul 2007 - Jun 2008,Antrim & Newtownabbey,Count,People
2,24.0,Jul 2008 - Jun 2009,Antrim & Newtownabbey,Count,People
3,-333.0,Jul 2009 - Jun 2010,Antrim & Newtownabbey,Count,People
4,-226.0,Jul 2010 - Jun 2011,Antrim & Newtownabbey,Count,People
5,-80.0,Jul 2011 - Jun 2012,Antrim & Newtownabbey,Count,People
6,-321.0,Jul 2012 - Jun 2013,Antrim & Newtownabbey,Count,People
7,-126.0,Jul 2013 - Jun 2014,Antrim & Newtownabbey,Count,People
8,13.0,Jul 2014 - Jun 2015,Antrim & Newtownabbey,Count,People
9,-74.0,Jul 2015 - Jun 2016,Antrim & Newtownabbey,Count,People


In [12]:
# Store the observations as integers instead of floats (259.0 -> 259).
obs_as_int = new_table['OBS'].astype(int)
new_table['OBS'] = obs_as_int

In [13]:
# Check the column dtypes after the integer cast of OBS.
new_table.dtypes

OBS              int32
Mid Year        object
Area            object
Measure Type    object
Unit            object
dtype: object

In [14]:
def remove_whitespace(x):
    """Return ``x`` with all whitespace removed.

    A string is split on runs of whitespace and re-joined with nothing in
    between, so ``'   Jul 2006 - Jun 2007'`` becomes ``'Jul2006-Jun2007'``.
    Values that cannot be handled that way (numbers, ``None``, NaN, ...) are
    returned unchanged.
    """
    try:
        return "".join(x.split())
    except (AttributeError, TypeError):
        # Only "this is not a string" failures are tolerated. A bare `except:`
        # would also swallow KeyboardInterrupt / SystemExit and hide real bugs.
        return x

# Strip all whitespace from the period labels (the source header cells carry
# long leading padding), e.g. 'Jul 2006 - Jun 2007' -> 'Jul2006-Jun2007'.
mid_year_compact = new_table['Mid Year'].map(remove_whitespace)
new_table['Mid Year'] = mid_year_compact

In [15]:
# Relabel the 'Total' column as 'all years'. The vectorised string accessor
# does a literal (non-regex) substring replacement and, unlike a per-cell
# lambda, passes missing values through instead of raising.
new_table['Mid Year'] = new_table['Mid Year'].str.replace('Total', 'all years', regex=False)

In [16]:
# NOTE(review): leftover experiment, commented out and unused — candidate for deletion.
# new_table['Mid Year'].str.ljust

In [17]:
# The observation column is published under the name 'Value'.
new_table = new_table.rename(columns={'OBS': 'Value'})

In [18]:
# Put the columns into the order used for the tidy output file.
column_order = ['Mid Year', 'Area', 'Measure Type', 'Value', 'Unit']
new_table = new_table[column_order]

In [19]:
# Preview the first five rows of the tidied table.
new_table.head()

Unnamed: 0,Mid Year,Area,Measure Type,Value,Unit
0,Jul2006-Jun2007,Antrim & Newtownabbey,Count,259,People
1,Jul2007-Jun2008,Antrim & Newtownabbey,Count,230,People
2,Jul2008-Jun2009,Antrim & Newtownabbey,Count,24,People
3,Jul2009-Jun2010,Antrim & Newtownabbey,Count,-333,People
4,Jul2010-Jun2011,Antrim & Newtownabbey,Count,-226,People


In [20]:
# Write the tidied table to out/tidydata1.2.csv, creating the folder if
# needed; the DataFrame index is not written.
destinationFolder = Path('out')
destinationFolder.mkdir(parents=True, exist_ok=True)

outputFile = destinationFolder / 'tidydata1.2.csv'
new_table.to_csv(outputFile, index=False)