## Data Preparation for Regression Analysis

In [4]:
import pandas as pd

In [5]:
# average number of OSM mappers per months since 2022
start_year = 2022
data_path = '/data/processing/ohsome-stats-by-year-full/*/*.parquet'

sql = f"""
COPY
(
    SELECT country_iso_a3, year, month, count(distinct user_id) as n_users
    FROM (
    	SELECT user_id, unnest(country_iso_a3) as country_iso_a3, year, month
    	FROM read_parquet("{data_path}", hive_partitioning=1)
    	WHERE year >= {start_year}
    ) foo
    GROUP BY year, month, country_iso_a3
    ORDER BY country_iso_a3, year, month
    )
TO 'mappers_per_country_per_month.csv' (HEADER, DELIMITER ',');
"""
print(sql)


COPY
(
    SELECT country_iso_a3, year, month, count(distinct user_id) as n_users
    FROM (
    	SELECT user_id, unnest(country_iso_a3) as country_iso_a3, year, month
    	FROM read_parquet("/data/processing/ohsome-stats-by-year-full/*/*.parquet", hive_partitioning=1)
    	WHERE year >= 2022
    ) foo
    GROUP BY year, month, country_iso_a3
    ORDER BY country_iso_a3, year, month
    )
TO 'mappers_per_country_per_month.csv' (HEADER, DELIMITER ',');



In [44]:
osm_df = pd.read_csv("../data/mappers_per_country_per_month.csv")
osm_df_avg = osm_df.groupby(["country_iso_a3"]).agg(
    avg_OSM_users_month=pd.NamedAgg(column="n_users", aggfunc="mean"),
    total_OSM_users=pd.NamedAgg(column="n_users", aggfunc="sum")
)
display(osm_df_avg.sort_values(by="avg_OSM_users_month", ascending=False)[0:15])

Unnamed: 0_level_0,avg_OSM_users_month,total_OSM_users
country_iso_a3,Unnamed: 1_level_1,Unnamed: 2_level_1
DEU,7300.05,146001
USA,4973.95,99479
FRA,3782.0,75640
GBR,2012.95,40259
ITA,2001.2,40024
RUS,1785.55,35711
POL,1591.75,31835
ESP,1306.75,26135
BRA,1172.0,23440
GTM,1022.85,20457


In [20]:
# load HDI data and get ISO A3 code
hdi_df = pd.read_json("../data/hdi_2021.json")
hdi_df["iso_a3_code"] = hdi_df["country"].str[:3]
hdi_df.set_index("iso_a3_code", inplace=True)
hdi_df.rename(columns={"value": "hdi_2021"}, inplace=True)
display(hdi_df)

Unnamed: 0_level_0,country,index,indicator,year,hdi_2021
iso_a3_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AFG,AFG - Afghanistan,HDI - Human Development Index,hdi - Human Development Index (value),2021,0.478
AGO,AGO - Angola,HDI - Human Development Index,hdi - Human Development Index (value),2021,0.586
ALB,ALB - Albania,HDI - Human Development Index,hdi - Human Development Index (value),2021,0.796
AND,AND - Andorra,HDI - Human Development Index,hdi - Human Development Index (value),2021,0.858
ARE,ARE - United Arab Emirates,HDI - Human Development Index,hdi - Human Development Index (value),2021,0.911
...,...,...,...,...,...
ZZG,ZZG.ECA - Europe And Central Asia,HDI - Human Development Index,hdi - Human Development Index (value),2021,0.796
ZZH,ZZH.LAC - Latin America And The Caribbean,HDI - Human Development Index,hdi - Human Development Index (value),2021,0.754
ZZI,ZZI.SA - South Asia,HDI - Human Development Index,hdi - Human Development Index (value),2021,0.632
ZZJ,ZZJ.SSA - Sub-Saharan Africa,HDI - Human Development Index,hdi - Human Development Index (value),2021,0.547


In [48]:
pop_df = pd.read_csv("../data/population.csv", delimiter=';')
pop_df.set_index("Country Code", inplace=True)
pop_df.rename(columns={"2022": "pop_2022"}, inplace=True)
pop_df.drop(columns=["Unnamed: 3", "Unnamed: 4", "Country Name"], inplace=True)
display(pop_df)

Unnamed: 0_level_0,pop_2022
Country Code,Unnamed: 1_level_1
ABW,106445.0
AFE,720859132.0
AFG,41128771.0
AFW,490330870.0
AGO,35588987.0
...,...
XKX,1761985.0
YEM,33696614.0
ZAF,59893885.0
ZMB,20017675.0


In [26]:
disaster_df = pd.read_csv("../data/deaths_per_100000.csv", delimiter=';')
disaster_df.set_index("ID_2", inplace=True)
disaster_df.rename(columns={"Deaths per 100000": "disaster_death_per_100k"}, inplace=True)
disaster_df.rename(columns={"Total Deaths": "disaster_death"}, inplace=True)
display(disaster_df)

Unnamed: 0_level_0,Country Name,disaster_death,World Bank population 2022,disaster_death_per_100k
ID_2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AFG,Afghanistan,5106,41128771,124146671
AGO,Angola,220,35588987,0618168761
ALB,Albania,403,2777689,1450846369
ARG,Argentina,49,46234830,0105980708
ARM,Armenia,0,2780469,0
...,...,...,...,...
WSM,Samoa,83,222382,3732316464
YEM,"Yemen, Rep.",1499,33696614,444851818
ZAF,South Africa,877,59893885,1464256326
ZMB,Zambia,12,20017675,0059947022


In [41]:
cellphone_df = pd.read_csv("../data/cell_phones_per_100.csv", delimiter=';', decimal=",")
cellphone_df.set_index("ID", inplace=True)
cellphone_df.rename(columns={"2004": "2004_cellphone"}, inplace=True)
cellphone_df.rename(columns={"2021": "2021_cellphone"}, inplace=True)
cellphone_df.rename(columns={"2022": "2022_cellphone"}, inplace=True)
cellphone_df["cellphone_per_100"] =  cellphone_df[["2004_cellphone", "2021_cellphone", "2022_cellphone"]].max(axis=1)

display(cellphone_df)

Unnamed: 0_level_0,Country Name,2004_cellphone,2021_cellphone,2022_cellphone,cellphone_per_100
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ABW,Aruba,,132.349628,,132.349628
AFE,Africa Eastern and Southern,,75.858421,78.380607,78.380607
AFG,Afghanistan,,56.554435,,56.554435
AFW,Africa Western and Central,,96.504644,106.852197,106.852197
AGO,Angola,,44.423732,67.373474,67.373474
...,...,...,...,...,...
XKX,Kosovo,32.302099,,,32.302099
YEM,"Yemen, Rep.",,46.019542,,46.019542
ZAF,South Africa,,168.924391,167.395870,168.924391
ZMB,Zambia,,103.974637,99.102418,103.974637


In [56]:
# merge all dataframes
df = pop_df.merge(
    osm_df_avg[["total_OSM_users", "avg_OSM_users_month"]],
    how="left",
    left_index=True,
    right_index=True
).merge(
    hdi_df["hdi_2021"],
    how="left",
    left_index=True,
    right_index=True
).merge(
    disaster_df[["disaster_death_per_100k", "disaster_death"]],
    how="left",
    left_index=True,
    right_index=True
).merge(
    cellphone_df["cellphone_per_100"],
    how="left",
    left_index=True,
    right_index=True
)
display(df)
df.to_csv("../data/regression_analysis_data.csv")

Unnamed: 0_level_0,pop_2022,total_OSM_users,avg_OSM_users_month,hdi_2021,disaster_death_per_100k,disaster_death,cellphone_per_100
Country Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ABW,106445.0,,,,,,132.349628
AFE,720859132.0,,,,,,78.380607
AFG,41128771.0,1425.0,71.25,0.478,124146671,5106.0,56.554435
AFW,490330870.0,,,,,,106.852197
AGO,35588987.0,1444.0,72.20,0.586,0618168761,220.0,67.373474
...,...,...,...,...,...,...,...
XKX,1761985.0,1356.0,67.80,,,,32.302099
YEM,33696614.0,1365.0,68.25,0.455,444851818,1499.0,46.019542
ZAF,59893885.0,8010.0,400.50,0.713,1464256326,877.0,168.924391
ZMB,20017675.0,5865.0,293.25,0.565,0059947022,12.0,103.974637
