In [1]:
import pandas as pd

In [2]:
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from pathlib import Path

# HTTP session backed by an on-disk cache in '.cache', configured with the
# LastModified heuristic from cachecontrol.
session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

# Local folder that holds the downloaded source spreadsheet.
sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

inputURL = 'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/MYE17_NETMIG_AGE_BANDS.xlsx'
inputFile = sourceFolder / 'MYE17_NETMIG_AGE_BANDS.xlsx'
response = session.get(inputURL)
# Fail loudly on a 4xx/5xx reply instead of saving an error page as an .xlsx file.
response.raise_for_status()
inputFile.write_bytes(response.content)

In [3]:
# Parse the workbook saved to `inputFile` rather than fetching the URL a second
# time, so the notebook analyses exactly the bytes that were downloaded.
# sheet_name=1 selects the second sheet (sheet positions are zero-based).
oldFile = pd.read_excel(inputFile, sheet_name = 1)
# Preview only the first rows instead of rendering the whole frame.
oldFile.head(10)

Unnamed: 0,area,area_code,area_name,year,type,gender,age,NETMIG
0,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,00-17,-90
1,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,18-24,-1070
2,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,25-34,1274
3,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,35-44,362
4,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,45-54,181
5,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,55-64,173
6,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,65+,91
7,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,Females,00-17,36
8,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,Females,18-24,-549
9,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,Females,25-34,609


In [4]:
# Start from an empty frame; its columns are filled in from oldFile afterwards.
newFile = pd.DataFrame()


In [5]:
# Output column name -> source column in oldFile, in the order the columns
# are created.
column_sources = {
    "Value": "NETMIG",
    'Mid Year': "year",
    'Age': "age",
    'Area': "area_code",
    'Sex': "gender",
    'Population Change Component': "type",
}
for target_column, source_column in column_sources.items():
    newFile[target_column] = oldFile[source_column]

# Constant columns: the same value on every row.
newFile['Measure Type'] = "Count"
newFile['Unit'] = "People"

In [6]:
# Show the first five rows of the source sheet.
oldFile.head(5)

Unnamed: 0,area,area_code,area_name,year,type,gender,age,NETMIG
0,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,00-17,-90
1,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,18-24,-1070
2,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,25-34,1274
3,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,35-44,362
4,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,1. United Kingdom Net,All persons,45-54,181


In [7]:
# Show the first five rows of the frame built from the source columns.
newFile.head(5)

Unnamed: 0,Value,Mid Year,Age,Area,Sex,Population Change Component,Measure Type,Unit
0,-90,2001/2002,00-17,N92000002,All persons,1. United Kingdom Net,Count,People
1,-1070,2001/2002,18-24,N92000002,All persons,1. United Kingdom Net,Count,People
2,1274,2001/2002,25-34,N92000002,All persons,1. United Kingdom Net,Count,People
3,362,2001/2002,35-44,N92000002,All persons,1. United Kingdom Net,Count,People
4,181,2001/2002,45-54,N92000002,All persons,1. United Kingdom Net,Count,People


In [8]:
# List the distinct 'Mid Year' values.
newFile['Mid Year'].unique()

array(['2001/2002', '2002/2003', '2003/2004', '2004/2005', '2005/2006',
       '2006/2007', '2007/2008', '2008/2009', '2009/2010', '2010/2011',
       '2011/2012', '2012/2013', '2013/2014', '2014/2015', '2015/2016',
       '2016/2017'], dtype=object)

In [9]:
# Keep only the first four characters of each label, i.e. the starting year
# of a "YYYY/YYYY" period.
newFile['Mid Year'] = newFile['Mid Year'].map(str).str.slice(0, 4)

In [10]:
# Append a fixed suffix to every year string; the result looks like
# '2001-06-30T00:00:00/P1Y'.
mid_year_suffix = '-06-30T00:00:00/P1Y'
newFile['Mid Year'] = newFile['Mid Year'] + mid_year_suffix

In [11]:
# List the distinct 'Mid Year' values again to check the reformatted strings.
newFile['Mid Year'].unique()

array(['2001-06-30T00:00:00/P1Y', '2002-06-30T00:00:00/P1Y',
       '2003-06-30T00:00:00/P1Y', '2004-06-30T00:00:00/P1Y',
       '2005-06-30T00:00:00/P1Y', '2006-06-30T00:00:00/P1Y',
       '2007-06-30T00:00:00/P1Y', '2008-06-30T00:00:00/P1Y',
       '2009-06-30T00:00:00/P1Y', '2010-06-30T00:00:00/P1Y',
       '2011-06-30T00:00:00/P1Y', '2012-06-30T00:00:00/P1Y',
       '2013-06-30T00:00:00/P1Y', '2014-06-30T00:00:00/P1Y',
       '2015-06-30T00:00:00/P1Y', '2016-06-30T00:00:00/P1Y'], dtype=object)

In [12]:
# Inspect the dtype of every column.
newFile.dtypes

Value                           int64
Mid Year                       object
Age                            object
Area                           object
Sex                            object
Population Change Component    object
Measure Type                   object
Unit                           object
dtype: object

In [13]:
# Rewrite the open-ended band '65+' as '65-plus'. The vectorised string
# accessor is used instead of a per-element lambda, so a missing value is
# passed through as NaN rather than raising AttributeError. regex=False is
# required: '+' must be matched literally, not as a regex quantifier.
newFile['Age'] = newFile['Age'].str.replace('65+', '65-plus', regex=False)

In [14]:
# Prefix every age band with 'nisra5/' (each value is rendered as text first).
newFile['Age'] = newFile['Age'].map('nisra5/{}'.format)

In [15]:
# Source label -> single-letter code; labels not listed here are kept as-is.
sex_codes = {
    'All persons': 'T',
    'Females': 'F',
    'Males': 'M',
}
newFile['Sex'] = newFile['Sex'].map(lambda label: sex_codes.get(label, label))

In [16]:
# Drop the leading ordering prefix ("1. ", "2. ", ...) from each component
# label. An anchored regex is used instead of str.lstrip('123.'), which treats
# its argument as a set of characters: it left the space after the dot in
# place (' United Kingdom Net') and only recognised the digits 1-3.
newFile['Population Change Component'] = (
    newFile['Population Change Component']
    .str.replace(r'^\d+\.\s*', '', regex=True)
)

In [17]:
# Final column order of the output table.
output_columns = [
    'Mid Year',
    'Area',
    'Age',
    'Sex',
    'Population Change Component',
    'Measure Type',
    'Value',
    'Unit',
]
newFile = newFile[output_columns]

In [18]:
# Show the first rows of the transformed frame.
newFile.head()

Unnamed: 0,Mid Year,Area,Age,Sex,Population Change Component,Measure Type,Value,Unit
0,2001-06-30T00:00:00/P1Y,N92000002,nisra5/00-17,T,United Kingdom Net,Count,-90,People
1,2001-06-30T00:00:00/P1Y,N92000002,nisra5/18-24,T,United Kingdom Net,Count,-1070,People
2,2001-06-30T00:00:00/P1Y,N92000002,nisra5/25-34,T,United Kingdom Net,Count,1274,People
3,2001-06-30T00:00:00/P1Y,N92000002,nisra5/35-44,T,United Kingdom Net,Count,362,People
4,2001-06-30T00:00:00/P1Y,N92000002,nisra5/45-54,T,United Kingdom Net,Count,181,People


In [19]:
# Show the last rows of the transformed frame.
newFile.tail()

Unnamed: 0,Mid Year,Area,Age,Sex,Population Change Component,Measure Type,Value,Unit
1003,2016-06-30T00:00:00/P1Y,N92000002,nisra5/25-34,M,Total Net,Count,395,People
1004,2016-06-30T00:00:00/P1Y,N92000002,nisra5/35-44,M,Total Net,Count,256,People
1005,2016-06-30T00:00:00/P1Y,N92000002,nisra5/45-54,M,Total Net,Count,239,People
1006,2016-06-30T00:00:00/P1Y,N92000002,nisra5/55-64,M,Total Net,Count,116,People
1007,2016-06-30T00:00:00/P1Y,N92000002,nisra5/65-plus,M,Total Net,Count,78,People


In [20]:
# Cast the observation column to the platform's default integer type.
newFile = newFile.astype({'Value': int})

In [21]:
# Non-null count per column, before the zero-valued rows are removed.
newFile.count()

Mid Year                       1008
Area                           1008
Age                            1008
Sex                            1008
Population Change Component    1008
Measure Type                   1008
Value                          1008
Unit                           1008
dtype: int64

In [22]:
# Keep only the rows whose Value is non-zero.
has_nonzero_value = newFile['Value'].ne(0)
newFile = newFile[has_nonzero_value]

In [23]:
# Non-null count per column after the zero-valued rows were removed.
newFile.count()

Mid Year                       1003
Area                           1003
Age                            1003
Sex                            1003
Population Change Component    1003
Measure Type                   1003
Value                          1003
Unit                           1003
dtype: int64

In [24]:
# Inspect the column dtypes of the final frame.
newFile.dtypes

Mid Year                       object
Area                           object
Age                            object
Sex                            object
Population Change Component    object
Measure Type                   object
Value                           int32
Unit                           object
dtype: object

In [25]:
# destinationFolder = Path('out')
# destinationFolder.mkdir(exist_ok=True, parents=True)

# newFile.to_csv(destinationFolder / ('MYE17_NETMIG_AGE.csv'), index = False)