In [1]:
import pandas as pd

In [2]:
import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import LastModified
from pathlib import Path

session = CacheControl(requests.Session(),
                       cache=FileCache('.cache'),
                       heuristic=LastModified())

sourceFolder = Path('in')
sourceFolder.mkdir(exist_ok=True)

inputURL = 'https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/MYE17_MIG_FLOWS.xlsx'
inputFile = sourceFolder / 'MYE17_MIG_FLOWS.xlsx'
response = session.get(inputURL)
with open(inputFile, 'wb') as f:
  f.write(response.content)

In [3]:
oldFile = pd.read_excel(inputURL, sheet_name = 1)
oldFile

Unnamed: 0,area,area_code,area_name,year,category,sort,MYE
0,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,United Kingdom Inflows,1,12510
1,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,Rest of World Inflows,2,6488
2,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,Total Inflows,3,18998
3,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,United Kingdom Outflows,4,11589
4,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,Rest of World Outflows,5,6393
5,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,Total Outflows,6,17982
6,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,United Kingdom Net,7,921
7,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,Rest of World Net,8,95
8,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,Total Net,9,1016
9,1. Northern Ireland,N92000002,NORTHERN IRELAND,2002/2003,United Kingdom Inflows,1,11107


In [4]:
newFile = pd.DataFrame()


In [5]:
newFile["Value"] = oldFile["MYE"]
newFile['Mid Year'] = oldFile["year"]
newFile['Area'] = oldFile["area_code"]
newFile['Population Change Component'] = oldFile["category"]
newFile['Measure Type'] = "Count"
newFile['Unit'] = "People"
newFile['Age'] = 'all'
newFile['Sex'] = 'T'

In [6]:
oldFile.head(5)

Unnamed: 0,area,area_code,area_name,year,category,sort,MYE
0,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,United Kingdom Inflows,1,12510
1,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,Rest of World Inflows,2,6488
2,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,Total Inflows,3,18998
3,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,United Kingdom Outflows,4,11589
4,1. Northern Ireland,N92000002,NORTHERN IRELAND,2001/2002,Rest of World Outflows,5,6393


In [7]:
newFile.head(5)

Unnamed: 0,Value,Mid Year,Area,Population Change Component,Measure Type,Unit,Age,Sex
0,12510,2001/2002,N92000002,United Kingdom Inflows,Count,People,all,T
1,6488,2001/2002,N92000002,Rest of World Inflows,Count,People,all,T
2,18998,2001/2002,N92000002,Total Inflows,Count,People,all,T
3,11589,2001/2002,N92000002,United Kingdom Outflows,Count,People,all,T
4,6393,2001/2002,N92000002,Rest of World Outflows,Count,People,all,T


In [8]:
newFile['Mid Year'].unique()

array(['2001/2002', '2002/2003', '2003/2004', '2004/2005', '2005/2006',
       '2006/2007', '2007/2008', '2008/2009', '2009/2010', '2010/2011',
       '2011/2012', '2012/2013', '2013/2014', '2014/2015', '2015/2016',
       '2016/2017'], dtype=object)

In [9]:
newFile['Mid Year'] = newFile['Mid Year'].map(lambda x: str(x)[0:4])

In [10]:
newFile['Mid Year'] = newFile['Mid Year'] + '-06-30T00:00:00/P1Y'

In [11]:
newFile['Mid Year'].unique()

array(['2001-06-30T00:00:00/P1Y', '2002-06-30T00:00:00/P1Y',
       '2003-06-30T00:00:00/P1Y', '2004-06-30T00:00:00/P1Y',
       '2005-06-30T00:00:00/P1Y', '2006-06-30T00:00:00/P1Y',
       '2007-06-30T00:00:00/P1Y', '2008-06-30T00:00:00/P1Y',
       '2009-06-30T00:00:00/P1Y', '2010-06-30T00:00:00/P1Y',
       '2011-06-30T00:00:00/P1Y', '2012-06-30T00:00:00/P1Y',
       '2013-06-30T00:00:00/P1Y', '2014-06-30T00:00:00/P1Y',
       '2015-06-30T00:00:00/P1Y', '2016-06-30T00:00:00/P1Y'], dtype=object)

In [12]:
newFile.dtypes

Value                           int64
Mid Year                       object
Area                           object
Population Change Component    object
Measure Type                   object
Unit                           object
Age                            object
Sex                            object
dtype: object

In [13]:
newFile = newFile[['Mid Year','Area','Age','Sex','Population Change Component','Measure Type','Value','Unit']]

In [14]:
newFile.head()

Unnamed: 0,Mid Year,Area,Age,Sex,Population Change Component,Measure Type,Value,Unit
0,2001-06-30T00:00:00/P1Y,N92000002,all,T,United Kingdom Inflows,Count,12510,People
1,2001-06-30T00:00:00/P1Y,N92000002,all,T,Rest of World Inflows,Count,6488,People
2,2001-06-30T00:00:00/P1Y,N92000002,all,T,Total Inflows,Count,18998,People
3,2001-06-30T00:00:00/P1Y,N92000002,all,T,United Kingdom Outflows,Count,11589,People
4,2001-06-30T00:00:00/P1Y,N92000002,all,T,Rest of World Outflows,Count,6393,People


In [15]:
newFile.tail()

Unnamed: 0,Mid Year,Area,Age,Sex,Population Change Component,Measure Type,Value,Unit
139,2016-06-30T00:00:00/P1Y,N92000002,all,T,Rest of World Outflows,Count,10727,People
140,2016-06-30T00:00:00/P1Y,N92000002,all,T,Total Outflows,Count,20894,People
141,2016-06-30T00:00:00/P1Y,N92000002,all,T,United Kingdom Net,Count,592,People
142,2016-06-30T00:00:00/P1Y,N92000002,all,T,Rest of World Net,Count,583,People
143,2016-06-30T00:00:00/P1Y,N92000002,all,T,Total Net,Count,1175,People


In [16]:
newFile['Value'] = newFile['Value'].astype(int)

In [17]:
newFile.count()

Mid Year                       144
Area                           144
Age                            144
Sex                            144
Population Change Component    144
Measure Type                   144
Value                          144
Unit                           144
dtype: int64

In [18]:
newFile = newFile[newFile['Value']!= 0]

In [19]:
newFile.count()

Mid Year                       143
Area                           143
Age                            143
Sex                            143
Population Change Component    143
Measure Type                   143
Value                          143
Unit                           143
dtype: int64

In [20]:
newFile.dtypes

Mid Year                       object
Area                           object
Age                            object
Sex                            object
Population Change Component    object
Measure Type                   object
Value                           int32
Unit                           object
dtype: object

In [21]:
# destinationFolder = Path('out')
# destinationFolder.mkdir(exist_ok=True, parents=True)

# newFile.to_csv(destinationFolder / ('MYE17_NETMIG_FLOW.csv'), index = False)