In [1]:
import os
import requests
import pandas as pd
import numpy as np
import geopandas as gpd
%matplotlib inline

#### Download

Download all four datasets from here: https://crtm.maps.arcgis.com/apps/MinimalGallery/index.html?appid=a60bb2f0142b440eadee1a69a11693fc
and store in *data\raw\*

In [2]:
# working directory
path = os.getcwd()
print("Current working directory:", path)

# create sub-folders in 'data' folder
for x in ['raw', 'interim', 'processed']:
    temp = os.path.join('../data/', x)
    try:
        os.mkdir(temp)
    except:
        print('Folder', temp, 'already exists.')
    
# download and save raw datasets
    # HOGARES
url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/d9e8c48ae6a1474faa34083239007307/data'
r = requests.get(url, allow_redirects=True)
output = open('../data/raw/EDM2018HOGARES.xlsx', 'wb')
output.write(r.content)
output.close()

    # INDIVIDUOS
url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/07dad41b543641d3964a68851fc9ad11/data'
r = requests.get(url, allow_redirects=True)
output = open('../data/raw/EDM2018INDIVIDUOS.xlsx', 'wb')
output.write(r.content)
output.close()

    # VIAJES
url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/6afd4db8175d4902ada0803f08ccf50e/data'
r = requests.get(url, allow_redirects=True)
output = open('../data/raw/EDM2018VIAJES.xlsx', 'wb')
output.write(r.content)
output.close()

    # XETAPAS
url = 'https://crtm.maps.arcgis.com/sharing/rest/content/items/81919e30e674422d93203a3190eafcdc/data'
r = requests.get(url, allow_redirects=True)
output = open('../data/raw/EDM2018XETAPAS.xlsx', 'wb')
output.write(r.content)
output.close()

# display content of 'raw' folder
print('\n"../data/raw" folder contains:')
print(os.listdir('../data/raw'))


Current working directory: /Users/cassini/Documents/GitHub/gender-equality-and-mobility/notebooks
Folder ../data/raw already exists.
Folder ../data/interim already exists.
Folder ../data/processed already exists.
"../data/raw" folder contains:


['EDM2018VIAJES.xlsx',
 'EDM2018XETAPAS.xlsx',
 'EDM2018HOGARES.xlsx',
 'EDM2018INDIVIDUOS.xlsx']

**Data**

### Join dataset

In [3]:
ind = pd.read_excel('../data/raw/EDM2018INDIVIDUOS.xlsx')
hog = pd.read_excel('../data/raw/EDM2018HOGARES.xlsx')
via = pd.read_excel('../data/raw/EDM2018VIAJES.xlsx', dtype = {'VORIHORAINI':str, 'VDESHORAFIN':str})
etap = pd.read_excel('../data/raw/EDM2018XETAPAS.xlsx')

via.set_index(["ID_HOGAR", "ID_IND", "ID_VIAJE"], inplace = True)
ind.set_index(["ID_HOGAR", "ID_IND"], inplace = True)
hog.set_index("ID_HOGAR", inplace = True)
etap.set_index(["ID_HOGAR", "ID_IND", "ID_VIAJE"], inplace =True)

In [4]:
people = hog.join(ind, lsuffix = "_hog", rsuffix = "_ind")
trips = people.join(via, rsuffix = "_via")
legs = trips.join(etap, rsuffix = "_etap")

people.to_csv('../data/interim/people.csv')
trips.to_csv('../data/interim/trips.csv')
legs.to_csv('../data/interim/legs.csv')

**Codes**

In [5]:
codes_hog = pd.read_excel ('../data/raw/EDM2018HOGARES.xlsx', sheet_name = 1)
codes_ind = pd.read_excel ('../data/raw/EDM2018INDIVIDUOS.xlsx', sheet_name = 1)
codes_via = pd.read_excel ('../data/raw/EDM2018VIAJES.xlsx', sheet_name = 1)
codes_eta = pd.read_excel ('../data/raw/EDM2018XETAPAS.xlsx', sheet_name = 1)

codes = codes_hog.append(codes_ind, ignore_index = True, sort = False
                        ).append(codes_via, ignore_index = True, sort = False
                                ).append(codes_eta, ignore_index = True, sort = False
                                        ).drop(columns = ["Unnamed: 0", "Unnamed: 1"])

codes["VARIABLE"] = codes.VARIABLE.fillna(method = "ffill")
codes["ESPECIFICACIÓN"] = codes["ESPECIFICACIÓN"].fillna(method = "ffill")

codes['CODE'], codes['VALUE'] = codes['VALORES'].str.split(".", 1).str
codes['CODE'], unused = codes.CODE.str.split(" '", 1).str

codes.drop(index = codes[codes.VARIABLE.isna()].index, inplace = True)

codes.to_csv('../data/interim/codes.csv', index = False)

  codes['CODE'], codes['VALUE'] = codes['VALORES'].str.split(".", 1).str
  codes['CODE'], unused = codes.CODE.str.split(" '", 1).str


translated codes not programmatically, but manually via google Sheets.

- Translation is stored in data\processed\codes_translated.csv