In [1]:
import pandas as pd

In [2]:
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from pathlib import Path

# HTTP session whose responses are cached on disk under '.cache', using the
# LastModified caching heuristic from cachecontrol.
session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

# Local folder that holds the downloaded source workbook.
sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

inputURL = 'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/MYE17_NETMIG_AGE.xlsx'
inputFile = sourceFolder / 'MYE17_NETMIG_AGE.xlsx'

response = session.get(inputURL)
# Fail loudly on a 4xx/5xx reply instead of saving an error page as .xlsx.
response.raise_for_status()
with open(inputFile, 'wb') as f:
    f.write(response.content)

In [3]:
# Read the second sheet (sheet_name is a 0-based position) from the copy that
# was saved to disk by the download cell, rather than fetching the workbook
# over the network a second time.
oldFile = pd.read_excel(inputFile, sheet_name=1)
oldFile

Unnamed: 0,area,area_code,area_name,year,gender,age,NETMIG
0,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,0,-52
1,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,1,-48
2,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,2,124
3,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,3,122
4,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,4,86
5,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,5,10
6,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,6,10
7,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,7,16
8,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,8,4
9,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,All persons,9,28


In [4]:
# Start from an empty frame; its columns are filled in by the next cell.
newFile = pd.DataFrame()


In [5]:
# Populate the output frame: five columns are copied from the source sheet
# under new names, and three constant-valued columns are attached.
newFile = newFile.assign(**{
    'Value': oldFile['NETMIG'],
    'Mid Year': oldFile['year'],
    'Age': oldFile['age'],
    'Area': oldFile['area_code'],
    'Sex': oldFile['gender'],
    'Population Change Component': 'Total Net',
    'Measure Type': 'Count',
    'Unit': 'People',
})

In [6]:
# Display the frame as assembled so far.
newFile

Unnamed: 0,Value,Mid Year,Age,Area,Sex,Population Change Component,Measure Type,Unit
0,-52,2001/2002,0,N92000002,All persons,Total Net,Count,People
1,-48,2001/2002,1,N92000002,All persons,Total Net,Count,People
2,124,2001/2002,2,N92000002,All persons,Total Net,Count,People
3,122,2001/2002,3,N92000002,All persons,Total Net,Count,People
4,86,2001/2002,4,N92000002,All persons,Total Net,Count,People
5,10,2001/2002,5,N92000002,All persons,Total Net,Count,People
6,10,2001/2002,6,N92000002,All persons,Total Net,Count,People
7,16,2001/2002,7,N92000002,All persons,Total Net,Count,People
8,4,2001/2002,8,N92000002,All persons,Total Net,Count,People
9,28,2001/2002,9,N92000002,All persons,Total Net,Count,People


In [7]:
# List the distinct 'Mid Year' labels as copied from oldFile['year'].
newFile['Mid Year'].unique()

array(['2001/2002', '2002/2003', '2003/2004', '2004/2005', '2005/2006',
       '2006/2007', '2007/2008', '2008/2009', '2009/2010', '2010/2011',
       '2011/2012', '2012/2013', '2013/2014', '2014/2015', '2015/2016',
       '2016/2017'], dtype=object)

In [8]:
# Keep only the first four characters of each label (the leading year of a
# 'YYYY/YYYY' value).
newFile['Mid Year'] = newFile['Mid Year'].astype(str).str[:4]

In [9]:
# Append a fixed 30 June timestamp and a 'P1Y' duration to each year string.
period_suffix = '-06-30T00:00:00/P1Y'
newFile['Mid Year'] = newFile['Mid Year'] + period_suffix

In [10]:
# List the distinct 'Mid Year' values again to check the reformatting.
newFile['Mid Year'].unique()

array(['2001-06-30T00:00:00/P1Y', '2002-06-30T00:00:00/P1Y',
       '2003-06-30T00:00:00/P1Y', '2004-06-30T00:00:00/P1Y',
       '2005-06-30T00:00:00/P1Y', '2006-06-30T00:00:00/P1Y',
       '2007-06-30T00:00:00/P1Y', '2008-06-30T00:00:00/P1Y',
       '2009-06-30T00:00:00/P1Y', '2010-06-30T00:00:00/P1Y',
       '2011-06-30T00:00:00/P1Y', '2012-06-30T00:00:00/P1Y',
       '2013-06-30T00:00:00/P1Y', '2014-06-30T00:00:00/P1Y',
       '2015-06-30T00:00:00/P1Y', '2016-06-30T00:00:00/P1Y'], dtype=object)

In [11]:
# Show the dtype of every column.
newFile.dtypes

Value                           int64
Mid Year                       object
Age                             int64
Area                           object
Sex                            object
Population Change Component    object
Measure Type                   object
Unit                           object
dtype: object

In [12]:
# Turn each age into a string of the form 'year/<age>'.
newFile['Age'] = 'year/' + newFile['Age'].astype(str)

In [13]:
# Recode the sex labels to single-letter codes; any label that is not listed
# is passed through unchanged.
sex_codes = {
    'All persons': 'T',
    'Females': 'F',
    'Males': 'M',
}
newFile['Sex'] = newFile['Sex'].map(lambda label: sex_codes.get(label, label))

In [14]:
# Put the columns into their output order. The explicit .copy() makes the
# result an independent frame, so the later assignment into 'Value' does not
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
newFile = newFile[['Mid Year','Area','Age','Sex','Population Change Component','Measure Type','Value','Unit']].copy()

In [15]:
# Show the first rows of the frame.
newFile.head()

Unnamed: 0,Mid Year,Area,Age,Sex,Population Change Component,Measure Type,Value,Unit
0,2001-06-30T00:00:00/P1Y,N92000002,year/0,T,Total Net,Count,-52,People
1,2001-06-30T00:00:00/P1Y,N92000002,year/1,T,Total Net,Count,-48,People
2,2001-06-30T00:00:00/P1Y,N92000002,year/2,T,Total Net,Count,124,People
3,2001-06-30T00:00:00/P1Y,N92000002,year/3,T,Total Net,Count,122,People
4,2001-06-30T00:00:00/P1Y,N92000002,year/4,T,Total Net,Count,86,People


In [16]:
# Show the last rows of the frame.
newFile.tail()

Unnamed: 0,Mid Year,Area,Age,Sex,Population Change Component,Measure Type,Value,Unit
4363,2016-06-30T00:00:00/P1Y,N92000002,year/86,M,Total Net,Count,-1,People
4364,2016-06-30T00:00:00/P1Y,N92000002,year/87,M,Total Net,Count,-1,People
4365,2016-06-30T00:00:00/P1Y,N92000002,year/88,M,Total Net,Count,0,People
4366,2016-06-30T00:00:00/P1Y,N92000002,year/89,M,Total Net,Count,0,People
4367,2016-06-30T00:00:00/P1Y,N92000002,year/90,M,Total Net,Count,-5,People


In [17]:
# Cast the 'Value' column to the platform's default integer type.
newFile = newFile.astype({'Value': int})

In [18]:
# Count the non-null entries in each column before zero-valued rows are dropped.
newFile.count()

Mid Year                       4368
Area                           4368
Age                            4368
Sex                            4368
Population Change Component    4368
Measure Type                   4368
Value                          4368
Unit                           4368
dtype: int64

In [19]:
# Keep only the rows whose 'Value' is non-zero.
newFile = newFile.loc[newFile['Value'].ne(0)]

In [20]:
# Count the non-null entries in each column after zero-valued rows were dropped.
newFile.count()

Mid Year                       4264
Area                           4264
Age                            4264
Sex                            4264
Population Change Component    4264
Measure Type                   4264
Value                          4264
Unit                           4264
dtype: int64

In [21]:
# Show the dtype of every column once more.
newFile.dtypes

Mid Year                       object
Area                           object
Age                            object
Sex                            object
Population Change Component    object
Measure Type                   object
Value                           int32
Unit                           object
dtype: object

In [22]:
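# Export step (currently disabled): uncomment the lines below to write the
# tidied table to out/MYE17_NETMIG.csv.
# destinationFolder = Path('out')
# destinationFolder.mkdir(exist_ok=True, parents=True)

# newFile.to_csv(destinationFolder / ('MYE17_NETMIG.csv'), index = False)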