Table as_19_q: Refugees (and others) resettled, including dependants, by country of nationality 

In [1]:
def is_interactive():
    """Return True when the ``__main__`` module has no ``__file__`` attribute.

    A missing ``__file__`` on ``__main__`` is taken here as the sign that the
    code is being driven from an interactive session rather than a script.
    """
    import __main__ as entry_module
    launched_from_file = hasattr(entry_module, '__file__')
    return not launched_from_file

if is_interactive():
    %run lib/scrape_govuk.ipynb
    metadata = scrape('https://www.gov.uk/government/statistics/immigration-statistics-october-to-december-2017-data-tables')
    import pyexcel
    from io import BytesIO

    ods_files = [f for f in metadata['files']
                 if f['type'] == 'ODS' and
                 f['title'] == 'Asylum data tables immigration statistics October to December 2017 volume 4']
    assert len(ods_files) == 1, 'Should be exactly one matching ODS file'

    ods_url = ods_files[0]['url']
    ods_title = ods_files[0]['title']

    ods_file = BytesIO(session.get(ods_files[0]['url']).content)

    data = pyexcel.get_sheet(file_content=ods_file, file_type='ods', sheet_name='as_19_q')

In [2]:
# Load the sheet's rows (as returned by get_array()) into a DataFrame; the
# columns get default integer labels because no header is supplied here.
df = pd.DataFrame(data.get_array())
# Display the raw frame to inspect the sheet layout (title, blank and header rows).
df

Unnamed: 0,0,1,2,3,4,5
0,Table as_19_q: Refugees (and others) resettled...,,,,,
1,Back to contents,,,,,
2,,,,,,
3,Year,Country of nationality,Gateway Protection Programme,Mandate Scheme,Vulnerable Persons Resettlement Scheme,Vulnerable Children Resettlement Scheme
4,2004,*Total,150,z,z,z
5,2005,*Total,71,z,z,z
6,2006,*Total,353,z,z,z
7,2007,*Total,463,z,z,z
8,2008,*Total,642,75,z,z
9,2009,*Total,857,90,z,z


In [3]:
# Keep everything from the blank spacer row (row 2) downwards and label the
# columns with the sheet's header row, which is the second row of that slice.
# rename() is used without inplace=True so that it returns a new frame instead
# of mutating a slice of `df`; the in-place form raised SettingWithCopyWarning.
sheet_body = df.iloc[2:, :]
observations = sheet_body.rename(columns=sheet_body.iloc[1])
# Preview only: the result of drop() is not assigned, so `observations` itself
# still starts with the blank row followed by the header row.
observations.drop(observations.index[0])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,Year,Country of nationality,Gateway Protection Programme,Mandate Scheme,Vulnerable Persons Resettlement Scheme,Vulnerable Children Resettlement Scheme
3,Year,Country of nationality,Gateway Protection Programme,Mandate Scheme,Vulnerable Persons Resettlement Scheme,Vulnerable Children Resettlement Scheme
4,2004,*Total,150,z,z,z
5,2005,*Total,71,z,z,z
6,2006,*Total,353,z,z,z
7,2007,*Total,463,z,z,z
8,2008,*Total,642,75,z,z
9,2009,*Total,857,90,z,z
10,2010,*Total,669,49,z,z
11,2011,*Total,432,22,z,z
12,2012,*Total,985,54,z,z


In [4]:
# Reshape from wide to long: 'Year' and 'Country of nationality' identify each
# row, the remaining column headers go into a column named
# "Gateway Protection Programme", and the cell contents go into "Value".
new_table = pd.melt(
    frame=observations,
    id_vars=['Year', 'Country of nationality'],
    var_name="Gateway Protection Programme",
    value_name="Value",
)

In [5]:
# Drop the first row of the melted table. `observations` begins with the
# sheet's blank spacer row, so this is presumably that blank row -- TODO confirm.
# NOTE: not idempotent; re-running this cell removes one further row each time.
new_table = new_table.drop(new_table.index[0])

In [6]:
# Show the number of non-null entries per column before any filtering.
new_table.count()

Year                            719
Country of nationality          719
Gateway Protection Programme    719
Value                           719
dtype: int64

In [7]:
# Discard every row that has a missing value in any column.
new_table = new_table.dropna()

In [8]:
# Remove rows whose Value is the '.' marker
# (meaning of the marker not shown here -- confirm against the table notes).
value_is_dot = new_table['Value'] == '.'
new_table = new_table[~value_is_dot]

In [9]:
# Remove rows whose Value equals 0.
value_is_zero = new_table['Value'] == 0
new_table = new_table[~value_is_zero]

In [10]:
# Remove rows whose Value is the 'z' marker
# (presumably "not applicable" -- confirm against the table notes).
value_is_z = new_table['Value'] == 'z'
new_table = new_table[~value_is_z]

In [11]:
# Show the per-column non-null counts again, now that the marker rows are gone.
new_table.count()

Year                            277
Country of nationality          277
Gateway Protection Programme    277
Value                           277
dtype: int64

In [12]:
# Add the constant descriptor columns, in this order: Unit, then Measure Type.
for column_label, constant_value in (('Unit', 'People'), ('Measure Type', 'Count')):
    new_table[column_label] = constant_value

In [13]:
# Inspect the column dtypes before Value is converted to a number.
new_table.dtypes

Year                            object
Country of nationality          object
Gateway Protection Programme    object
Value                           object
Unit                            object
Measure Type                    object
dtype: object

In [14]:
# Preview the first rows; this is where the header row carried through the
# melt becomes visible as the first entry.
new_table.head()

Unnamed: 0,Year,Country of nationality,Gateway Protection Programme,Value,Unit,Measure Type
1,Year,Country of nationality,Gateway Protection Programme,Gateway Protection Programme,People,Count
2,2004,*Total,Gateway Protection Programme,150,People,Count
3,2005,*Total,Gateway Protection Programme,71,People,Count
4,2006,*Total,Gateway Protection Programme,353,People,Count
5,2007,*Total,Gateway Protection Programme,463,People,Count


In [15]:
# Drop the first remaining row, which the preceding head() output shows to be
# the sheet's header row carried through the melt (Year == 'Year').
# NOTE: not idempotent; re-running this cell removes one further row each time.
new_table = new_table.drop(new_table.index[0])

In [16]:
# List the distinct values of the melted variable column (the scheme names).
new_table['Gateway Protection Programme'].unique()

array(['Gateway Protection Programme', 'Mandate Scheme',
       'Vulnerable Persons Resettlement Scheme',
       'Vulnerable Children Resettlement Scheme'], dtype=object)

In [17]:
# Convert Value to numbers: anything that cannot be parsed becomes NaN
# (errors='coerce') and is then replaced by 0.
parsed_values = pd.to_numeric(new_table['Value'], errors='coerce')
new_table['Value'] = parsed_values.fillna(0)

In [18]:
# Cast the Value column to integers, leaving the other columns untouched.
new_table = new_table.astype({'Value': int})

In [19]:
# Derive the Period column as the string form of Year.
year_as_text = new_table['Year'].astype(str)
new_table['Period'] = year_as_text

In [20]:
# Preview the table with the new Period column in place.
new_table.head()

Unnamed: 0,Year,Country of nationality,Gateway Protection Programme,Value,Unit,Measure Type,Period
2,2004,*Total,Gateway Protection Programme,150,People,Count,2004
3,2005,*Total,Gateway Protection Programme,71,People,Count,2005
4,2006,*Total,Gateway Protection Programme,353,People,Count,2006
5,2007,*Total,Gateway Protection Programme,463,People,Count,2007
6,2008,*Total,Gateway Protection Programme,642,People,Count,2008


In [21]:
# Citizenship is the country of nationality with any leading '*' removed
# (e.g. '*Total' becomes 'Total').
nationality_without_star = new_table['Country of nationality'].str.lstrip('*')
new_table['Citizenship'] = nationality_without_star

In [22]:
# Copy the melted variable column to 'Resettlement scheme', stripping any
# leading '*' characters from the scheme names.
scheme_names = new_table['Gateway Protection Programme'].str.lstrip('*')
new_table['Resettlement scheme'] = scheme_names

In [23]:
# Replace every space inside a Period value with a hyphen.
period_text = new_table['Period']
new_table['Period'] = period_text.apply(lambda text: text.replace(' ', '-'))

In [24]:
# Strip any trailing hyphens from the Period values.
period_trimmed = new_table['Period'].str.rstrip('-')
new_table['Period'] = period_trimmed

In [25]:
# Keep only the output columns, in their final order.
output_columns = [
    'Period',
    'Citizenship',
    'Resettlement scheme',
    'Measure Type',
    'Value',
    'Unit',
]
new_table = new_table[output_columns]

In [26]:
# Preview the table after the column selection and reordering.
new_table.head()

Unnamed: 0,Period,Citizenship,Resettlement scheme,Measure Type,Value,Unit
2,2004,Total,Gateway Protection Programme,Count,150,People
3,2005,Total,Gateway Protection Programme,Count,71,People
4,2006,Total,Gateway Protection Programme,Count,353,People
5,2007,Total,Gateway Protection Programme,Count,463,People
6,2008,Total,Gateway Protection Programme,Count,642,People


In [27]:
# Remove rows whose numeric Value is 0; this includes entries that the earlier
# to_numeric(...).fillna(0) step turned into 0 because they were not numbers.
zero_valued = new_table['Value'] == 0
new_table = new_table[~zero_valued]

In [28]:
# Make sure the 'out' directory exists (creating parents as needed, no error
# if it is already there), then write the table as CSV without the index.
# `Path` is not imported in this notebook; presumably pathlib.Path supplied by
# the helper notebook -- TODO confirm.
destinationFolder = Path('out')
destinationFolder.mkdir(parents=True, exist_ok=True)

output_csv = destinationFolder / 'as_19_q.csv'
new_table.to_csv(output_csv, index=False)

In [29]:
# Show the per-column non-null counts of the table that was written out.
new_table.count()

Period                 266
Citizenship            266
Resettlement scheme    266
Measure Type           266
Value                  266
Unit                   266
dtype: int64

In [30]:
# Confirm the final column dtypes (Value should now be an integer type).
new_table.dtypes

Period                 object
Citizenship            object
Resettlement scheme    object
Measure Type           object
Value                   int64
Unit                   object
dtype: object