# OSM Data Availability

In [1]:
import os
import requests
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess
from bs4 import BeautifulSoup
%matplotlib inline

## Database growth in bytes

We scrape the web page `http://download.geofabrik.de/africa.html` to get a list of all the country names.

In [2]:
# Fetch the Africa index page. A timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the download listing.
response = requests.get('http://download.geofabrik.de/africa.html', timeout=30)
response.raise_for_status()
html_body = response.text
soup = BeautifulSoup(html_body, 'html.parser')

# Keep the href of every anchor pointing at a "latest" .osm.pbf extract.
# Anchors without an href attribute are skipped instead of raising KeyError.
links = soup.find_all(name='a')
country_links = [a['href'] for a in links if 'latest.osm.pbf' in a.get('href', '')]

def _clean_link(href):
    """Clean URL to return only the country name."""
    href = href.replace('africa/', '')
    href = href.replace('-latest.osm.pbf', '')
    href = href.replace('.md5', '')
    return href

# Turn every link into a country name, leaving out the continent-wide
# 'africa' extract.
countries = [
    name
    for name in (_clean_link(link) for link in country_links)
    if name != 'africa'
]
print(', '.join(countries))

algeria, angola, benin, botswana, burkina-faso, burundi, cameroon, canary-islands, cape-verde, central-african-republic, chad, comores, congo-brazzaville, congo-democratic-republic, djibouti, egypt, equatorial-guinea, eritrea, ethiopia, gabon, ghana, guinea, guinea-bissau, ivory-coast, kenya, lesotho, liberia, libya, madagascar, malawi, mali, mauritania, mauritius, morocco, mozambique, namibia, niger, nigeria, rwanda, saint-helena-ascension-and-tristan-da-cunha, sao-tome-and-principe, senegal-and-gambia, seychelles, sierra-leone, somalia, south-africa, south-sudan, sudan, swaziland, tanzania, togo, tunisia, uganda, zambia, zimbabwe, south-africa-and-lesotho


Geofabrik keeps an archive of data files for each country since 2014 (see [here](http://download.geofabrik.de/africa/congo-democratic-republic.html#) for DR Congo). We can estimate the quantity of information in the OSM database by referring to the data file size (in bytes). To do so, we request the `Content-Length` HTTP header for each file.

In [3]:
def _build_url(continent, country, year):
    """Build URL to OSM data file."""
    year = str(year)[-2:]
    return 'http://download.geofabrik.de/{}/{}-{}0101.osm.pbf'.format(
        continent, country, year)

def osm_datasize(continent, country, year, timeout=30):
    """Get size in MB of the OSM data file for a given continent,
    country and year. The file size is fetched from the HTTP headers.

    Parameters
    ----------
    continent, country : str
        Path components of the file URL (e.g. 'africa', 'kenya').
    year : int or str
        Year of the archived file; only its last two digits are used.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot block the notebook indefinitely.

    Returns
    -------
    float
        File size in MB rounded to one decimal, or NaN when the server does
        not answer with status 200 or does not report a Content-Length.
    """
    url = _build_url(continent, country, year)
    r = requests.head(url, timeout=timeout)
    if r.status_code != 200:
        return np.nan
    content_length = r.headers.get('Content-Length')
    if content_length is None:
        # Without the header there is no size to report.
        return np.nan
    # Bytes -> megabytes (10**6 bytes).
    return round(int(content_length) * 1e-6, 1)

In [39]:
# One row per country, one column per yearly file. Declaring a float dtype
# keeps the columns numeric (NaN marks a missing file) instead of letting
# pandas fall back to generic object columns.
dbsizes = pd.DataFrame(
    index=countries,
    columns=[2014, 2015, 2016, 2017, 2018, 2019],
    dtype=float,
)

# One HEAD request per (country, year) pair.
for country in countries:
    for year in dbsizes.columns:
        dbsizes.at[country, year] = osm_datasize('africa', country, year)

In [40]:
# The five countries with the largest 2019 file.
dbsizes.sort_values(by=2019, ascending=False).iloc[:5]

Unnamed: 0,2014,2015,2016,2017,2018,2019
tanzania,8.6,10.8,37.3,87.3,180.3,345.3
south-africa-and-lesotho,57.1,82.7,122.4,171.3,211.0,230.1
nigeria,11.0,26.4,38.3,56.1,99.7,171.9
congo-democratic-republic,31.4,44.6,64.4,75.5,92.6,143.1
cameroon,23.8,53.1,78.3,111.9,117.1,132.7


In [41]:
# Show the row labels of dbsizes (the country names scraped from Geofabrik).
dbsizes.index

Index(['algeria', 'angola', 'benin', 'botswana', 'burkina-faso', 'burundi',
       'cameroon', 'canary-islands', 'cape-verde', 'central-african-republic',
       'chad', 'comores', 'congo-brazzaville', 'congo-democratic-republic',
       'djibouti', 'egypt', 'equatorial-guinea', 'eritrea', 'ethiopia',
       'gabon', 'ghana', 'guinea', 'guinea-bissau', 'ivory-coast', 'kenya',
       'lesotho', 'liberia', 'libya', 'madagascar', 'malawi', 'mali',
       'mauritania', 'mauritius', 'morocco', 'mozambique', 'namibia', 'niger',
       'nigeria', 'rwanda', 'saint-helena-ascension-and-tristan-da-cunha',
       'sao-tome-and-principe', 'senegal-and-gambia', 'seychelles',
       'sierra-leone', 'somalia', 'south-africa', 'south-sudan', 'sudan',
       'swaziland', 'tanzania', 'togo', 'tunisia', 'uganda', 'zambia',
       'zimbabwe', 'south-africa-and-lesotho'],
      dtype='object')

### Compared to population

We can compare DB sizes with population estimates to have a better idea of data coverage. Here we use population estimates from the [World Population Prospects](https://population.un.org/wpp/Download/Standard/Population/).

In [42]:
# Population table, indexed by its 'Country' column.
wpp = pd.read_csv('data/WPP2015.csv').set_index('Country')

In [44]:
# Harmonize country names: map each Geofabrik name (the dbsizes index) to
# the spelling used in the index of the population table `wpp`.

COUNTRIES = {}

for country in dbsizes.index:
    # Try with title case (e.g. 'guinea-bissau' -> 'Guinea-Bissau')
    country_formatted = country.title()
    if country_formatted in wpp.index:
        COUNTRIES[country] = country_formatted
        continue
    # Try without hyphens (e.g. 'south-africa' -> 'South Africa')
    country_formatted = country_formatted.replace('-', ' ')
    if country_formatted in wpp.index:
        COUNTRIES[country] = country_formatted

# Names that cannot be derived mechanically. 'senegal-and-gambia' is a
# combined Geofabrik extract and gets a combined label.
COUNTRIES.update({
    'cape-verde': 'Cabo Verde',
    'comores': 'Comoros',
    'ivory-coast': "Côte d'Ivoire",
    'congo-brazzaville': 'Congo',
    'congo-democratic-republic': 'Democratic Republic of the Congo',
    'sao-tome-and-principe': 'Sao Tome and Principe',
    'tanzania': 'United Republic of Tanzania',
    'senegal-and-gambia': 'Senegal and Gambia'
})

In [64]:
# Keep only the countries that have an entry in COUNTRIES and only the
# 2017-2019 columns; POP2015 starts out empty.
unmatched = [country for country in dbsizes.index if country not in COUNTRIES]
dbsizes_pop = (
    dbsizes
    .drop(index=unmatched, columns=[2014, 2015, 2016])
    .rename(columns={2017: 'DB2017', 2018: 'DB2018', 2019: 'DB2019'})
    .assign(POP2015=None)
)

In [69]:
for country, country_un in COUNTRIES.items():
    # The "senegal and gambia" group has no single row to read from, so
    # the two countries are looked up separately and summed.
    if country != 'senegal-and-gambia':
        pop = wpp.at[country_un, 'Population']
    else:
        pop = wpp.at['Senegal', 'Population'] + wpp.at['Gambia', 'Population']
    dbsizes_pop.at[country, 'POP2015'] = pop

# Relabel the rows with the harmonized country names
dbsizes_pop.index = [COUNTRIES[name] for name in dbsizes_pop.index]

Megabytes per million people:

In [76]:
# Same ratio for each yearly column: DB size over POP2015 scaled by
# 10e-4 (i.e. 1e-3).
# NOTE(review): presumably POP2015 is expressed in thousands, which would
# make the scaled value millions of people — confirm against the CSV.
for snapshot in ('2017', '2018', '2019'):
    dbsizes_pop['DBPOP' + snapshot] = dbsizes_pop['DB' + snapshot] / (dbsizes_pop['POP2015'] * 10e-4)

In [79]:
# 2019 ratio per country, highest first.
dbsizes_pop.sort_values('DBPOP2019', ascending=False)['DBPOP2019']

Lesotho                              49.3414
Seychelles                           21.3352
Botswana                             21.0484
Swaziland                            16.6792
Cabo Verde                           11.2589
Namibia                              9.85339
Zimbabwe                             8.15087
United Republic of Tanzania          6.40869
Zambia                                 6.124
Cameroon                             5.81138
Central African Republic             5.74118
Liberia                              5.71159
South Sudan                          5.42832
Guinea                               4.77193
Sierra Leone                         4.40789
Mali                                 4.22489
Mozambique                           4.08772
Malawi                               3.93772
Togo                                 3.91004
Mauritius                            3.89057
Guinea-Bissau                        3.61474
Gabon                                3.57481
Uganda    

### Compared to country size

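Country boundaries, used to compute each country's surface area, are taken from the `geo-countries` dataset: https://github.com/datasets/geo-countries/raw/master/data/countries.geojson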

In [80]:
# Download the country boundaries; -nc (no-clobber) skips the download when
# countries.geojson is already present.
!wget -nc https://github.com/datasets/geo-countries/raw/master/data/countries.geojson

--2019-05-16 17:36:49--  https://github.com/datasets/geo-countries/raw/master/data/countries.geojson
Resolving github.com (github.com)... 140.82.118.4
Connecting to github.com (github.com)|140.82.118.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson [following]
--2019-05-16 17:36:50--  https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.36.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.36.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24090863 (23M) [text/plain]
Saving to: ‘countries.geojson’


2019-05-16 17:36:54 (8.84 MB/s) - ‘countries.geojson’ saved [24090863/24090863]



In [152]:
# Load the boundaries, index them by the ADMIN column and drop the ISO_A3
# column.
countries_geom = (
    gpd.read_file('countries.geojson')
    .set_index('ADMIN')
    .drop(columns=['ISO_A3'])
)

In [164]:
# Copy of dbsizes_pop with an empty 'geometry' column to be filled in below.
dbsizes_geom = dbsizes_pop.assign(geometry=None)

In [165]:
# Names that are spelled differently in the boundaries dataset.
geom_aliases = {
    'Cabo Verde': 'Cape Verde',
    'Congo': 'Republic of Congo',
    "Côte d'Ivoire": 'Ivory Coast',
    'Guinea-Bissau': 'Guinea Bissau',
}

for country in dbsizes_pop.index:
    if country in countries_geom.index:
        shape = countries_geom.at[country, 'geometry']
    elif country in geom_aliases:
        # Edge cases: same country, different spelling
        shape = countries_geom.at[geom_aliases[country], 'geometry']
    elif country == 'Senegal and Gambia':
        # Combined entry: merge the two national boundaries
        senegal = countries_geom.at['Senegal', 'geometry']
        gambia = countries_geom.at['Gambia', 'geometry']
        shape = senegal.union(gambia)
    else:
        # No match: the geometry stays None
        continue
    dbsizes_geom.at[country, 'geometry'] = shape

In [166]:
from fiona import crs

In [167]:
# Albers equal-area conic projection (standard parallels 20 and -23,
# central meridian 25), in metres. Every parameter, including the bare
# `+no_defs` flag, carries the leading '+' that PROJ strings require.
africa_equal_area = crs.from_string(
    "+proj=aea +lat_1=20 +lat_2=-23 +lat_0=0 +lon_0=25 "
    "+x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"
)

In [168]:
# NOTE(review): this cell rebinds dbsizes_geom and tags it as EPSG:4326
# unconditionally; running it a second time on the already reprojected
# frame would presumably reproject projected coordinates again — confirm
# and re-run the geometry cells first if needed.
dbsizes_geom = gpd.GeoDataFrame(dbsizes_geom)
dbsizes_geom.crs = crs.from_epsg(4326)
dbsizes_geom = dbsizes_geom.to_crs(crs=africa_equal_area)

# Areas come out in squared CRS units; the 1e-6 factor converts m2 to km2.
dbsizes_geom['surface'] = dbsizes_geom.geometry.apply(lambda shape: shape.area) * 1e-6

# DB size divided by surface, scaled by 1e6, for each yearly column.
for snapshot in ('2017', '2018', '2019'):
    dbsizes_geom['DENSITY' + snapshot] = dbsizes_geom['DB' + snapshot] / dbsizes_geom['surface'] * 1e6

Bytes per sq. km:

In [173]:
# 2019 density per country, highest first.
dbsizes_geom['DENSITY2019'].sort_values(ascending=False)

Seychelles                          4590.69
Lesotho                             3564.02
Mauritius                           2432.45
Cabo Verde                          1545.14
Swaziland                           1285.65
Comoros                             1255.81
Malawi                              579.575
Sao Tome and Principe               578.505
Rwanda                               533.49
Uganda                              520.977
Togo                                509.994
Sierra Leone                        445.459
Burundi                             421.578
United Republic of Tanzania         366.753
Zimbabwe                            330.304
Cameroon                            285.795
Liberia                              269.68
Benin                               240.282
Guinea                              236.183
Senegal and Gambia                  213.326
Guinea-Bissau                       194.949
Nigeria                             189.422
Tunisia                         

## Road network

### Importing OSM data into PostGIS

To analyze OSM road networks, OSM data must be downloaded and imported into a PostGIS database. We download 6 different snapshots (~8GB) from geofabrik in order to conduct a multi-temporal analysis.