In [None]:
# load packages
import mlflow
import yaml
import numpy as np
import pandas as pd
import pickle
# Use pickle5 as pickle when it is installed, in case the model cannot be
# loaded with the standard module (it was exported under Python 3.8).
# If pickle5 is not available, keep the standard-library pickle imported
# above instead of failing the whole imports cell.
try:
    import pickle5 as pickle
except ImportError:
    pass
import os
import geopandas as gpd
import matplotlib.pyplot as plt

In [None]:
# Load the prediction settings from the YAML configuration file.
with open('../conf/predict_config.yaml', 'r') as conf:
    model_config = yaml.safe_load(conf)

# Read the dataset; its location is the configured folder plus file name.
dataset_path = model_config['model']['loc'] + model_config['model']['file']
dataset = pd.read_csv(dataset_path)

# Rename the satellite variables (old column name -> new column name).
satellite_column_names = {
    "mean_GHM": "mean_ghm",
    "viirs_slope_yr": "slope_avg_rad",
    "cf_cvg_viirs_slope_yr": "slope_cf_cvg",
    "viirs_slope_month": "slope_month_avg_rad",
    "viirs_slope_month_cf_cvg": "slope_monthcf_cvg",
}
dataset = dataset.rename(columns=satellite_column_names)

# Predictor and target variable names are taken from the config as well.
target = model_config['features']['target']
predictor = model_config['features']['predictors']

In [None]:
# Select the target (y) and the predictor columns (X) from the dataset.
y = dataset[target]
X = dataset[predictor]

# Print both shapes as a quick check of the selection.
for label, selection in (('X Shape:', X), ('y Shape:', y)):
    print(label, selection.shape)

In [None]:
# Load the champion model from the path given in the config.
# NOTE(review): unpickling executes arbitrary code from the file, so only
# load model files that come from a trusted source.
filename = model_config['model']['champion']
# The context manager closes the file handle once the model is loaded.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

# Apply the model to the predictors and add the predictions as a column.
y_pred = model.predict(X)
dataset["y_pred"] = y_pred

In [None]:
# Remove the initial pixel population data.
dataset = dataset.drop(columns="population")

# Load the population data with information on absolute school area
# population. The file name is built from the configured country code.
# The context manager closes the file handle after reading.
popfile = "../data/worldpop/school_agg_pop_" + str(model_config["country_3digit"]) + ".csv"
with open(popfile) as popread:
    popdata = pd.read_csv(popread)

# Inner merge without an explicit key: pandas joins on all column names
# the two frames have in common.
# NOTE(review): presumably this brings back a `population` column that
# later cells rely on - confirm against the population file.
dataset = dataset.merge(popdata, how='inner')

In [None]:
# Load the school data with additional information about schools
# (name, internet & computer availability).
# The context manager closes the file handle after reading.
with open("../data/school_loc/brazil_school_geolocation_master.csv") as schoolread:
    schooldata = pd.read_csv(schoolread)

# Keep only the school identifier and the additional school variables.
schooldata = schooldata[["source_school_id", "school_name", "internet_availability", "computer_availability"]]

# Inner merge without an explicit key: pandas joins on all column names
# the two frames have in common.
dataset = dataset.merge(schooldata, how="inner")

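Please adjust the file name above so that it points to the file containing the school geolocations and any further school information. The same applies to the additional school variable names (`school_name`, `internet_availability`, `computer_availability`): they can be excluded or altered, depending on the dataset at hand. Note that later cells select these columns and filter on `internet_availability`, so they need to be adjusted accordingly.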

In [None]:
# Limit predictions to the interval [0, 1]: values above 1 become 1 and
# values below 0 become 0. A single vectorised clip replaces two passes
# over the column with Python-level list comprehensions.
dataset["y_pred"] = dataset["y_pred"].clip(lower=0, upper=1)

# Summary statistics of the adjusted predictions.
dataset["y_pred"].describe()

In [None]:
# Store the ground truth under the fixed column name "target".
dataset["target"] = dataset[target]

# Online population per row: population weighted by the predicted share
# and by the ground-truth share.
onlinepop_pred = dataset["population"] * dataset["y_pred"]
onlinepop_gt = dataset["population"] * dataset["target"]

# Relative offline share (complement of the online share) ...
dataset["offline_p"] = 1 - dataset["y_pred"]
dataset["offline_g"] = 1 - dataset[target]

# ... and the corresponding absolute offline population.
dataset["offlinepop_pred"] = dataset["population"] * dataset["offline_p"]
dataset["offlinepop_gt"] = dataset["population"] * dataset["offline_g"]

onlinepop_gt.head()

In [None]:
# Columns that go into the exported prioritization list.
prio_columns = [
    "source_school_id",
    "school_name",
    "offlinepop_pred",
    "offline_p",
    "offline_g",
    "internet_availability",
    "computer_availability",
]
prio = dataset[prio_columns]
prio.head()

In [None]:
# Order the list by absolute predicted offline population, largest first.
prio = prio.sort_values(by="offlinepop_pred", ascending=False)
print(prio.head())

# Export the prioritization list to Excel without the index column.
total_list_path = r"../data/predictions/Total_Priorization_List.xlsx"
prio.to_excel(total_list_path, index=False)

In [None]:
# Build a GeoDataFrame whose geometry is the school location.
from shapely import wkt

# Parse the WKT strings in the 'School' column into geometries and store
# them back in the same column of the dataset.
school_geometry = gpd.GeoSeries.from_wkt(dataset['School'])
dataset['School'] = school_geometry

# Wrap the dataset as a GeoDataFrame using 'School' as geometry column.
gdf = gpd.GeoDataFrame(dataset, geometry='School')

In [None]:
# Create the figure and a single axis with a fixed size.
fig, ax = plt.subplots(1, figsize=(15, 10))

# Reversed 'magma' colormap, reused by the later map cells.
# plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated and
# has been removed from recent matplotlib releases.
cmap_reversed = plt.get_cmap('magma_r')

# Plot each school point, coloured by its predicted absolute offline
# population, with a colour legend.
gdf.plot(column=gdf.offlinepop_pred, cmap=cmap_reversed, legend=True, ax=ax)

# Add a title (country name from the config) and the number of schools.
plt.suptitle('Absolute offline Population for ' + model_config['country'], fontsize=18, fontweight=3)
plt.title(str(gdf.shape[0]) + ' schools', fontsize=13)

# Show the plot.
plt.show()

In [None]:
# Repeat the subsetting for schools whose internet availability is "No".
no_internet = dataset["internet_availability"] == "No"
dataset_off = dataset[no_internet]

# Same columns as in the full prioritization list.
prio_off_columns = [
    "source_school_id",
    "school_name",
    "offlinepop_pred",
    "offline_p",
    "offline_g",
    "internet_availability",
    "computer_availability",
]
prio_off = dataset_off[prio_off_columns]
prio_off.head()

In [None]:
# Order the offline schools by absolute predicted offline population,
# largest first, and print the top rows.
prio_off = prio_off.sort_values(by="offlinepop_pred", ascending=False)
print(prio_off.head())

# Export the offline prioritization list to Excel without the index column.
offline_list_path = r"../data/predictions/Offline_Priorization_List.xlsx"
prio_off.to_excel(offline_list_path, index=False)

In [None]:
# Keep only the rows of the geodataframe whose internet availability is "No".
is_offline_school = gdf['internet_availability'] == "No"
offline_schools = gdf.loc[is_offline_school]
offline_schools.shape

In [None]:
# Create the figure and a single axis with a fixed size.
fig, ax = plt.subplots(1, figsize=(15, 10))

# Plot the offline schools, coloured by predicted absolute offline
# population, with a colour legend.
offline_schools.plot(
    column=offline_schools.offlinepop_pred,
    cmap=cmap_reversed,
    legend=True,
    ax=ax,
)

# Add a title and the number of plotted schools as annotation.
offline_school_count = offline_schools.shape[0]
plt.suptitle('Predictions for offline schools', fontsize=18, fontweight=3)
plt.title(str(offline_school_count) + ' schools', fontsize=13)

plt.show()

In [None]:
# Repeat the subsetting while excluding high population outliers:
# keep offline schools whose population is below the 90th percentile.
population_p90 = np.percentile(dataset_off.population, 90)
dataset_low = dataset_off[dataset_off.population < population_p90]

# Same columns as in the other prioritization lists.
prio_low_columns = [
    "source_school_id",
    "school_name",
    "offlinepop_pred",
    "offline_p",
    "offline_g",
    "internet_availability",
    "computer_availability",
]
prio_low = dataset_low[prio_low_columns]
prio_low.head()

In [None]:
# Order the "low" population schools by absolute predicted offline
# population, largest first, and print the top rows.
prio_low = prio_low.sort_values(by="offlinepop_pred", ascending=False)
print(prio_low.head())

# Export this prioritization list to Excel without the index column.
no_outlier_list_path = r"../data/predictions/Offline_Priorization_List_no_outliers.xlsx"
prio_low.to_excel(no_outlier_list_path, index=False)

In [None]:
# Subset the offline-school geodata by excluding the 10th decile:
# keep schools whose population is below the 90th percentile.
# Only the new variable is assigned. `offline_schools` itself is left
# untouched so that re-running this cell does not remove a further decile
# each time.
population_cutoff = np.percentile(offline_schools.population, 90)
offline_schools_no_outliers = offline_schools[offline_schools.population < population_cutoff]
offline_schools_no_outliers.shape

In [None]:
# Create the figure and a single axis with a fixed size.
fig, ax = plt.subplots(1, figsize=(15, 10))

# Plot the offline schools without population outliers, coloured by
# predicted absolute offline population, with a colour legend.
offline_schools_no_outliers.plot(
    column=offline_schools_no_outliers.offlinepop_pred,
    cmap=cmap_reversed,
    legend=True,
    ax=ax,
)

# Add a title and the number of plotted schools as annotation.
no_outlier_school_count = offline_schools_no_outliers.shape[0]
plt.suptitle('Predictions for offline schools with outliers excluded', fontsize=18, fontweight=3)
plt.title(str(no_outlier_school_count) + ' schools', fontsize=13)

plt.show()

In [None]:
# Aggregate the predicted online population and the population over the
# full sample.
totalonlinepop = onlinepop_pred.sum()
totalpop = dataset.population.sum()

# Country-level online share (predicted).
brazil_online = totalonlinepop / totalpop
print("Relative predicted online population:")
# Round after scaling to percent: multiplying an already rounded fraction
# by 100 can print floating-point noise such as 56.989999999999995.
print(str(np.round(100 * brazil_online, 2)) + ' %')

# Multiply the share with the configured country population to estimate
# the total online population.
# Official population data taken from:
# https://data.worldbank.org/indicator/SP.POP.TOTL?locations=BR
brazil_online_abs = model_config["country_pop"] * brazil_online
print("Absolute predicted online population:")
print(int(brazil_online_abs))

In [None]:
# Aggregate the ground-truth online population and the population over
# the full sample.
totalonlinepop_gt = onlinepop_gt.sum()
totalpop = dataset.population.sum()

# Country-level online share (ground truth).
brazil_online_gt = totalonlinepop_gt / totalpop
print("Relative ground truth online population:")
# Round after scaling to percent: multiplying an already rounded fraction
# by 100 can print floating-point noise such as 56.989999999999995.
print(str(np.round(100 * brazil_online_gt, 2)) + ' %')

# Multiply the share with the configured country population to estimate
# the total online population.
# Official population data taken from:
# https://data.worldbank.org/indicator/SP.POP.TOTL?locations=BR
brazil_online_abs_gt = model_config["country_pop"] * brazil_online_gt
print("Absolute ground truth online population:")
print(int(brazil_online_abs_gt))