In [1]:
%load_ext autoreload 
%autoreload 2

NB: to beat, or at least be comparable with, the fb results, we need R^2 values of around 0.56 (spatial CV), 0.59 (leave-country-out CV) and 0.7 (conventional CV). They construct the spatial CV as follows: 

    In each country, we select a random cell as the training centroid, then define the  training dataset as the nearest (k-1)/k percent of cells to that centroid. The remaining 1/k cells from that country form the test dataset. This procedure is repeated k times in each country.

Importantly, they construct the ground truth in the first place carefully, to account for ~2km location jitter in urban areas, and ~5km jitter in rural areas: 

    To ensure that the input data associated with each village cover the village’s true location, we include a 2x2 grid of 2.4km cells around the centroid in urban areas, and a 4x4 grid in rural areas. For each village, we then take the population-weighted average of the 112-dimensional feature vectors across the 2x2 or 4x4 set of cells, using existing estimates of the population of 2.4km grid cells

In [2]:
import mlflow
from flaml import AutoML
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import (
    classification_report as class_rep,
    confusion_matrix as conf_mat,
)
from sklearn.model_selection import train_test_split
import pandas as pd

# Do not truncate columns when displaying wide DataFrames.
pd.set_option("display.max_columns", None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from stc_unicef_cpi.models import lgbm_baseline as baseline

from pathlib import Path

import os

# Root directory of the input data. The original machine-specific location is kept
# as the default, but it can be overridden without editing the notebook by setting
# the STC_UNICEF_DATA_DIR environment variable.
base_dir = Path(
    os.environ.get("STC_UNICEF_DATA_DIR", "/Users/johnf/Downloads/higher_res_dssg/")
)
# Input CSVs read further down the notebook.
all_data = base_dir / "clean_nga_w_autov1.csv"
thr_data = base_dir / "nga_clean_v2_thr30.csv"

# Fraction of rows held out for testing in every train/test split below.
test_size = 0.2


  from pandas import MultiIndex, Int64Index


In [3]:
# Read the file at `all_data`, storing the commuting-zone name as a categorical.
all_df = pd.read_csv(all_data).astype({"name_commuting_zone": "category"})
# Read the file at `thr_data`.
thr_df = pd.read_csv(thr_data)


In [6]:
# List every column of all_df with more than 5 missing values, one (name, count) per line.
na_counts = all_df.isna().sum(axis=0)
print(
    *[
        (name, val)
        for name, val in zip(na_counts.index, na_counts.values)
        if val > 5
    ],
    sep="\n",
)

('dep_nutrition_sev', 7)
('discrete_classification-proba_mean', 84)
('GDP_PPP_1990', 458)
('GDP_PPP_2000', 458)
('GDP_PPP_2015', 282)
('Nigeria_EC2019', 8)
('Nigeria_2019GDP', 8)
('avg_signal', 890)
('avg_d_kbps', 1098)
('avg_u_kbps', 1098)
('avg_lat_ms', 1098)
('nga_cis', 8)
('estimate_dau', 14)


In [16]:
def _read_autoencodings(csv_name, prefix):
    """Read an autoencoding CSV from base_dir, indexed by its first column.

    The index is named 'hex_code' and the columns are renamed to
    '<prefix>_0', '<prefix>_1', ... in their existing order.
    """
    encodings = pd.read_csv(base_dir / csv_name, index_col=0)
    encodings.index.name = 'hex_code'
    encodings.columns = [f"{prefix}_{i}" for i in range(encodings.shape[1])]
    return encodings


auto_norm = _read_autoencodings("autoencodings_norm.csv", "auto_norm")
auto_unnorm = _read_autoencodings("autoencodings_unnorm.csv", "auto_unnorm")

In [17]:
# Preview the first rows of the renamed autoencoder features.
auto_norm.head()

Unnamed: 0_level_0,auto_norm_0,auto_norm_1,auto_norm_2,auto_norm_3,auto_norm_4,auto_norm_5,auto_norm_6,auto_norm_7,auto_norm_8,auto_norm_9,auto_norm_10,auto_norm_11,auto_norm_12,auto_norm_13,auto_norm_14,auto_norm_15,auto_norm_16,auto_norm_17,auto_norm_18,auto_norm_19,auto_norm_20,auto_norm_21,auto_norm_22,auto_norm_23,auto_norm_24,auto_norm_25,auto_norm_26,auto_norm_27,auto_norm_28,auto_norm_29,auto_norm_30,auto_norm_31
hex_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
609534208263585791,0.0,0.367998,0.313854,2.346897,0.642107,0.133117,0.925322,0.052055,0.0,1.142332,0.46155,0.959708,0.072248,0.0,0.284353,0.0,0.717812,0.29883,0.0,0.0,1.028097,1.645104,0.590119,1.65936,0.0,1.762253,0.0,0.220387,1.152896,0.0,0.0,0.0
609534208330694655,0.0,0.195767,0.468772,1.507258,1.039365,0.255427,0.618564,0.029398,0.0,1.436025,0.011524,1.277509,0.0,0.0,0.118833,0.0,1.274537,0.0,0.285193,0.0,1.104283,1.343859,0.013607,0.876219,0.131755,0.686347,0.0,0.056465,0.810911,0.194073,0.0,0.397429
609534209186332671,0.0,0.744297,0.079329,2.39985,0.436857,0.490792,0.818108,0.773995,0.0,1.504221,0.443195,0.789816,0.072595,0.0,0.0,0.0,0.856033,0.249483,0.0,0.0,1.039699,1.421422,0.169287,1.537413,0.017722,0.951224,0.0,0.0,1.502426,0.064002,0.0,0.080853
609534209219887103,0.0,0.589383,0.197756,1.503982,1.046422,0.386447,0.326182,0.0,0.457376,1.507247,1.15698,0.384519,0.0,0.0,0.437944,0.0,0.974909,0.539859,0.239785,0.0,0.732402,0.742182,0.0,1.56654,0.0,1.435353,0.0,0.0,1.725444,0.080453,0.0,0.10639
609534209253441535,0.0,0.162741,0.313771,0.805157,1.299468,0.369703,0.139935,0.0,0.574669,0.723929,1.138089,0.928137,0.0,0.001886,0.364537,0.0,0.667914,0.912832,0.797436,0.0,0.799375,0.929868,0.0,0.860507,0.0,1.417223,0.3907,0.263386,1.145957,0.213083,0.0,0.167257


In [18]:
# Swap any existing autoencoder columns (names containing 'auto_') for the freshly
# loaded encodings, matched on hex_code; left joins keep every row of all_df.
stale_auto_cols = [col for col in all_df.columns if 'auto_' in col]
all_df = (
    all_df.drop(columns=stale_auto_cols)
    .join(auto_norm, on='hex_code', how='left')
    .join(auto_unnorm, on='hex_code', how='left')
)
all_df.head()

Unnamed: 0,hex_code,year,round,cluster,hhid,indid,location,sex,wealth,wealthscore,age,hhweight,sumpoor_sev,dep_housing_sev,dep_water_sev,dep_sanitation_sev,dep_nutrition_sev,dep_health_sev,dep_education_sev,deprived_sev,region,region2,LATNUM,LONGNUM,Elevation,Slope,NDWI,NDVI,Road_density,n_conflicts,n_education,n_health,OSM_hospital,OSM_school,health_gv_osm,school_gv_osm,Optical_Depth_047,Optical_Depth_055,F_0,F_1,F_10,F_15,F_20,F_25,F_30,F_35,F_40,F_45,F_5,F_50,F_55,F_60,F_65,F_70,F_75,F_80,M_0,M_1,M_10,M_15,M_20,M_25,M_30,M_35,M_40,M_45,M_5,M_50,M_55,M_60,M_65,M_70,M_75,M_80,population,PDSI,Evapotrans,built,cnfd,PrecipiS,PrecipiAcc,discrete_classification-proba_mean,discrete_classification_mean,PrecipiMean,avg_rad,cf_cvg,GDP_PPP_1990,GDP_PPP_2000,GDP_PPP_2015,Nigeria_EC2019,Nigeria_2019GDP,GSM,LTE,NR,UMTS,avg_signal,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices,nga_cis,accessibility,accessibility_walking_only,name_commuting_zone,population_commuting,road_len_commuting,area_commuting,estimate_dau,auto_norm_0,auto_norm_1,auto_norm_2,auto_norm_3,auto_norm_4,auto_norm_5,auto_norm_6,auto_norm_7,auto_norm_8,auto_norm_9,auto_norm_10,auto_norm_11,auto_norm_12,auto_norm_13,auto_norm_14,auto_norm_15,auto_norm_16,auto_norm_17,auto_norm_18,auto_norm_19,auto_norm_20,auto_norm_21,auto_norm_22,auto_norm_23,auto_norm_24,auto_norm_25,auto_norm_26,auto_norm_27,auto_norm_28,auto_norm_29,auto_norm_30,auto_norm_31,auto_unnorm_0,auto_unnorm_1,auto_unnorm_2,auto_unnorm_3,auto_unnorm_4,auto_unnorm_5,auto_unnorm_6,auto_unnorm_7,auto_unnorm_8,auto_unnorm_9,auto_unnorm_10,auto_unnorm_11,auto_unnorm_12,auto_unnorm_13,auto_unnorm_14,auto_unnorm_15,auto_unnorm_16,auto_unnorm_17,auto_unnorm_18,auto_unnorm_19,auto_unnorm_20,auto_unnorm_21,auto_unnorm_22,auto_unnorm_23,auto_unnorm_24,auto_unnorm_25,auto_unnorm_26,auto_unnorm_27,auto_unnorm_28,auto_unnorm_29,auto_unnorm_30,auto_unnorm_31
0,609534210041970687,2018.0,72.0,739.0,103.803738,5.523364,2.0,1.439252,1.140187,-124478.626168,7.831776,2.361982,1.682243,0.17757,0.0,0.943925,0.5,0.4,0.636364,1.0,3.0,20.0,13.082262,6.417623,1656.0,0.789573,-0.329164,1.449629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,500.766197,429.430539,0.057965,0.219383,0.16305,0.124726,0.120279,0.108163,0.102888,0.066634,0.071879,0.034429,0.2441,0.047802,0.021363,0.029736,0.012698,0.016655,0.006594,0.006487,0.060364,0.228133,0.157959,0.126658,0.085037,0.082813,0.08999,0.069195,0.073443,0.039689,0.234607,0.050317,0.029,0.031377,0.012422,0.017375,0.005782,0.005214,2.854207,672.428571,456.819109,2.047619,7.238095,0.091393,52.336184,46.09,40.0,0.068561,0.109068,7.723545,,,891627.1,5571.969494,0.198119,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.005211,5.470185,32.47619,gusau,4358806.0,115446.278384,36113.350645,0.0,0.0,0.147444,0.563954,1.81107,1.148793,0.0,1.12941,0.0,0.524787,0.997408,0.883352,1.087622,0.0,0.0,0.494786,0.0,1.316703,0.213678,0.0,0.0,0.880798,0.957972,0.650046,0.852535,0.0,1.30528,0.19514,1.550367,1.292052,0.281059,0.0,0.0,7.692298,9.820053,8.337338,0.0,6.289094,9.95261,0.0,2.154445,27.086336,20.439949,7.698069,6.012801,7.721428,20.310978,16.217228,0.0,36.46504,0.0,28.993528,5.757328,1.243242,11.930851,0.0,0.0,28.642044,32.903984,3.393516,26.631174,18.096096,19.70921,13.132363,0.0
1,609534214102056959,2018.0,72.0,701.0,21.901099,5.21978,2.0,1.505495,1.461538,-108688.43956,7.076923,0.860969,1.615385,0.340659,0.0,0.692308,0.538462,0.294118,0.694915,0.846154,3.0,10.0,13.28261,6.368814,1931.0,0.588017,-0.370748,1.165454,0.468752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,481.278762,415.266877,0.069766,0.264044,0.19335,0.163181,0.151428,0.13146,0.118004,0.082325,0.084496,0.040219,0.29561,0.052223,0.024926,0.030402,0.013406,0.019066,0.006094,0.008298,0.074648,0.282117,0.200479,0.160844,0.11039,0.100373,0.099605,0.079704,0.076833,0.044777,0.288097,0.053609,0.033257,0.034409,0.015043,0.017363,0.007647,0.006628,3.434124,649.818182,411.574998,2.0,0.409091,0.085988,46.67755,46.254545,39.090909,0.063376,0.115169,9.136364,1461548.0,1308916.0,2496201.0,3908.346591,0.138967,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.005214,6.090442,24.318182,sokoto,4358806.0,304245.55336,95731.143731,0.0,0.0,0.0,0.0,1.370239,1.669951,0.0,1.017327,0.0,0.134136,0.701598,0.85469,1.419614,0.0,0.0,0.417175,0.0,0.937024,0.0,0.0,0.0,1.101776,1.076802,0.234371,0.338749,0.109042,1.43555,0.0,0.863628,0.957627,0.351711,0.0,0.0,16.822483,3.912396,8.756618,0.0,0.0,14.721613,0.963247,9.112961,29.20986,27.457403,7.171832,6.90194,4.76115,23.733084,17.084122,0.0,41.542286,3.232432,33.313026,2.583391,4.233995,11.295981,0.0,0.0,37.343365,26.82644,3.604886,20.104885,10.463251,10.478625,17.173752,0.0
2,609534229268660223,2018.0,72.0,738.0,26.513889,5.125,2.0,1.444444,1.027778,-135450.652778,8.138889,0.998599,1.75,0.0,0.0,0.902778,0.333333,0.466667,0.980769,0.986111,3.0,20.0,12.72734,6.168603,1768.0,2.429234,-0.466368,1.37806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,466.735531,398.169291,0.023059,0.087273,0.064536,0.056903,0.048162,0.041406,0.034876,0.025445,0.022495,0.014278,0.085833,0.014973,0.007872,0.009626,0.004287,0.005597,0.002362,0.001921,0.024971,0.094372,0.075746,0.05961,0.041783,0.036472,0.034805,0.028318,0.024543,0.016724,0.098341,0.016037,0.008459,0.009888,0.00364,0.005264,0.002013,0.002064,1.133955,784.181818,539.503348,2.0,0.0,0.100014,61.793128,44.990909,24.545455,0.077517,0.106217,8.364899,,,,3899.12571,0.138639,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.006386,41.586681,47.818182,gusau,4358806.0,115446.278384,36113.350645,0.0,0.0,0.033158,1.343115,1.031196,0.596767,0.066596,0.325692,0.0,0.312576,1.31482,0.148956,1.054774,0.0,0.0,0.226803,0.0,0.984997,0.463267,1.111346,0.0,0.880264,1.156572,0.035421,1.039866,0.23788,0.645213,0.0,0.129056,0.710759,0.504811,0.0,1.012544,3.202079,4.60522,0.0,1.926636,3.224059,1.990959,0.10718,3.253919,1.351533,1.387562,2.669946,2.634491,4.441424,3.168879,1.505771,0.0,2.370025,0.0,0.0,1.793624,2.114812,0.720897,0.0,1.103797,4.115558,2.339253,0.0,3.969554,2.744229,2.390452,3.18694,0.0
3,609534232070455295,2018.0,72.0,733.0,16.504202,6.647059,2.0,1.504202,1.067227,-136376.226891,7.252101,0.455639,2.042017,0.151261,0.378151,0.907563,0.25,0.393939,0.753425,0.94958,3.0,20.0,12.770098,5.826144,1412.0,0.764033,-0.056507,1.29241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,478.371359,410.822028,0.01855,0.070208,0.050023,0.045589,0.039306,0.03354,0.031369,0.020068,0.021948,0.010839,0.073908,0.014138,0.006679,0.008894,0.003942,0.005422,0.00164,0.002367,0.019742,0.074612,0.057938,0.045215,0.029393,0.027305,0.028288,0.023206,0.022122,0.01278,0.078894,0.014973,0.009342,0.010008,0.004639,0.005,0.002002,0.00178,0.925672,771.818182,535.258851,2.045455,4.045455,0.098161,60.340572,50.09,39.545455,0.076378,0.107208,8.429925,715936.6,641170.3,2164586.0,3899.12571,0.138639,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.015024,21.89837,25.818182,gusau,4358806.0,115446.278384,36113.350645,0.0,0.0,0.657511,0.0,2.038318,0.352512,0.0,1.547103,0.140703,0.0,0.491926,0.961484,0.949003,0.105905,0.126629,0.15085,0.0,0.915048,0.570503,0.0,0.078324,1.031455,1.761939,1.073125,1.783032,0.0,1.583902,0.0,0.56332,1.63251,0.0,0.0,0.0,10.564234,13.920404,17.442434,11.630425,5.980145,14.087487,0.0,4.78765,30.767946,15.751625,7.598676,11.931562,2.564843,36.707886,16.885864,0.0,28.03569,14.604324,20.84916,11.263785,9.965304,10.296106,0.0,0.0,39.840813,25.035883,13.593884,21.989025,26.001284,24.9184,3.971871,0.0
4,609534235375566847,2018.0,72.0,699.0,21.633333,6.3,2.0,1.477778,1.555556,-97301.866667,6.911111,0.919946,1.722222,0.322222,0.4,0.544444,0.111111,0.3,0.607143,0.833333,3.0,10.0,13.539368,5.990493,1175.0,2.185193,1.085046,0.50101,0.322869,0.0,0.0,0.0,0.0,0.0,0.0,0.0,624.037073,534.691149,0.015669,0.059303,0.0376,0.036601,0.03733,0.03115,0.02549,0.017058,0.018232,0.008447,0.062215,0.011594,0.005655,0.008131,0.003342,0.004944,0.001621,0.001742,0.015994,0.060447,0.03634,0.030932,0.021899,0.024324,0.024825,0.019328,0.016548,0.009573,0.060233,0.011036,0.007388,0.007309,0.004386,0.004386,0.001751,0.001618,0.744443,403.157895,375.891581,1.473684,0.684211,0.081186,41.878642,33.2,62.947368,0.059719,0.104768,9.010965,,,866412.2,3899.125822,0.138639,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.092303,14.808872,48.0,sokoto,4358806.0,304245.55336,95731.143731,0.0,0.0,0.0,0.0,0.0,18.076458,0.0,19.630417,7.552211,34.852978,3.356856,12.968057,0.0,0.0,12.346037,12.414999,0.0,23.25457,0.0,0.0,0.0,8.496155,1.043452,0.0,2.423465,12.959326,15.144411,5.713549,0.0,7.344479,19.1246,6.129739,0.0,315.02005,106.399895,0.0,163.68814,603.98267,0.0,0.0,0.0,79.52208,0.0,54.844173,41.63699,374.37036,0.0,83.822266,0.0,78.84152,129.738,0.0,0.0,230.43362,52.685577,229.27153,35.860577,138.4359,254.76172,0.0,473.60168,448.08328,182.20883,186.41273,0.0


In [19]:
# Pick the rows of all_df whose hex_code appears in thr_df (in thr_df's order),
# then turn hex_code back into an ordinary column.
# NOTE: .loc raises a KeyError if thr_df contains a hex_code missing from all_df.
thr_all = all_df.set_index('hex_code').loc[thr_df.hex_code].reset_index()

In [20]:
# Preview the first two rows of the combined frame.
thr_all.head(2)

Unnamed: 0,hex_code,year,round,cluster,hhid,indid,location,sex,wealth,wealthscore,age,hhweight,sumpoor_sev,dep_housing_sev,dep_water_sev,dep_sanitation_sev,dep_nutrition_sev,dep_health_sev,dep_education_sev,deprived_sev,region,region2,LATNUM,LONGNUM,Elevation,Slope,NDWI,NDVI,Road_density,n_conflicts,n_education,n_health,OSM_hospital,OSM_school,health_gv_osm,school_gv_osm,Optical_Depth_047,Optical_Depth_055,F_0,F_1,F_10,F_15,F_20,F_25,F_30,F_35,F_40,F_45,F_5,F_50,F_55,F_60,F_65,F_70,F_75,F_80,M_0,M_1,M_10,M_15,M_20,M_25,M_30,M_35,M_40,M_45,M_5,M_50,M_55,M_60,M_65,M_70,M_75,M_80,population,PDSI,Evapotrans,built,cnfd,PrecipiS,PrecipiAcc,discrete_classification-proba_mean,discrete_classification_mean,PrecipiMean,avg_rad,cf_cvg,GDP_PPP_1990,GDP_PPP_2000,GDP_PPP_2015,Nigeria_EC2019,Nigeria_2019GDP,GSM,LTE,NR,UMTS,avg_signal,avg_d_kbps,avg_u_kbps,avg_lat_ms,tests,devices,nga_cis,accessibility,accessibility_walking_only,name_commuting_zone,population_commuting,road_len_commuting,area_commuting,estimate_dau,auto_norm_0,auto_norm_1,auto_norm_2,auto_norm_3,auto_norm_4,auto_norm_5,auto_norm_6,auto_norm_7,auto_norm_8,auto_norm_9,auto_norm_10,auto_norm_11,auto_norm_12,auto_norm_13,auto_norm_14,auto_norm_15,auto_norm_16,auto_norm_17,auto_norm_18,auto_norm_19,auto_norm_20,auto_norm_21,auto_norm_22,auto_norm_23,auto_norm_24,auto_norm_25,auto_norm_26,auto_norm_27,auto_norm_28,auto_norm_29,auto_norm_30,auto_norm_31,auto_unnorm_0,auto_unnorm_1,auto_unnorm_2,auto_unnorm_3,auto_unnorm_4,auto_unnorm_5,auto_unnorm_6,auto_unnorm_7,auto_unnorm_8,auto_unnorm_9,auto_unnorm_10,auto_unnorm_11,auto_unnorm_12,auto_unnorm_13,auto_unnorm_14,auto_unnorm_15,auto_unnorm_16,auto_unnorm_17,auto_unnorm_18,auto_unnorm_19,auto_unnorm_20,auto_unnorm_21,auto_unnorm_22,auto_unnorm_23,auto_unnorm_24,auto_unnorm_25,auto_unnorm_26,auto_unnorm_27,auto_unnorm_28,auto_unnorm_29,auto_unnorm_30,auto_unnorm_31
0,609534210041970687,2018.0,72.0,739.0,103.803738,5.523364,2.0,1.439252,1.140187,-124478.626168,7.831776,2.361982,1.682243,0.17757,0.0,0.943925,0.5,0.4,0.636364,1.0,3.0,20.0,13.082262,6.417623,1656.0,0.789573,-0.329164,1.449629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,500.766197,429.430539,0.057965,0.219383,0.16305,0.124726,0.120279,0.108163,0.102888,0.066634,0.071879,0.034429,0.2441,0.047802,0.021363,0.029736,0.012698,0.016655,0.006594,0.006487,0.060364,0.228133,0.157959,0.126658,0.085037,0.082813,0.08999,0.069195,0.073443,0.039689,0.234607,0.050317,0.029,0.031377,0.012422,0.017375,0.005782,0.005214,2.854207,672.428571,456.819109,2.047619,7.238095,0.091393,52.336184,46.09,40.0,0.068561,0.109068,7.723545,,,891627.1,5571.969494,0.198119,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.005211,5.470185,32.47619,gusau,4358806.0,115446.278384,36113.350645,0.0,0.0,0.147444,0.563954,1.81107,1.148793,0.0,1.12941,0.0,0.524787,0.997408,0.883352,1.087622,0.0,0.0,0.494786,0.0,1.316703,0.213678,0.0,0.0,0.880798,0.957972,0.650046,0.852535,0.0,1.30528,0.19514,1.550367,1.292052,0.281059,0.0,0.0,7.692298,9.820053,8.337338,0.0,6.289094,9.95261,0.0,2.154445,27.086336,20.439949,7.698069,6.012801,7.721428,20.310978,16.217228,0.0,36.46504,0.0,28.993528,5.757328,1.243242,11.930851,0.0,0.0,28.642044,32.903984,3.393516,26.631174,18.096096,19.70921,13.132363,0.0
1,609534214102056959,2018.0,72.0,701.0,21.901099,5.21978,2.0,1.505495,1.461538,-108688.43956,7.076923,0.860969,1.615385,0.340659,0.0,0.692308,0.538462,0.294118,0.694915,0.846154,3.0,10.0,13.28261,6.368814,1931.0,0.588017,-0.370748,1.165454,0.468752,0.0,0.0,0.0,0.0,0.0,0.0,0.0,481.278762,415.266877,0.069766,0.264044,0.19335,0.163181,0.151428,0.13146,0.118004,0.082325,0.084496,0.040219,0.29561,0.052223,0.024926,0.030402,0.013406,0.019066,0.006094,0.008298,0.074648,0.282117,0.200479,0.160844,0.11039,0.100373,0.099605,0.079704,0.076833,0.044777,0.288097,0.053609,0.033257,0.034409,0.015043,0.017363,0.007647,0.006628,3.434124,649.818182,411.574998,2.0,0.409091,0.085988,46.67755,46.254545,39.090909,0.063376,0.115169,9.136364,1461547.9,1308916.3,2496201.0,3908.346591,0.138967,0.0,0.0,0.0,0.0,,,,,0.0,0.0,0.005214,6.090442,24.318182,sokoto,4358806.0,304245.55336,95731.143731,0.0,0.0,0.0,0.0,1.370239,1.669951,0.0,1.017327,0.0,0.134136,0.701598,0.85469,1.419614,0.0,0.0,0.417175,0.0,0.937024,0.0,0.0,0.0,1.101776,1.076802,0.234371,0.338749,0.109042,1.43555,0.0,0.863628,0.957627,0.351711,0.0,0.0,16.822483,3.912396,8.756618,0.0,0.0,14.721613,0.963247,9.112961,29.20986,27.457403,7.171832,6.90194,4.76115,23.733084,17.084122,0.0,41.542286,3.232432,33.313026,2.583391,4.233995,11.295981,0.0,0.0,37.343365,26.82644,3.604886,20.104885,10.463251,10.478625,17.173752,0.0


In [22]:
# Persist the combined frame twice: next to the raw inputs and in the repo's processed-data folder.
# NOTE(review): the row index is written as well (no index=False), so re-reading these files
# without index_col=0 yields an extra "Unnamed: 0" column — confirm this is intended.
thr_all.to_csv(base_dir / "new_auto_thr_clean_nga.csv")
thr_all.to_csv("../data/processed/new_auto_thr_clean_nga.csv")

In [None]:
# List every column of thr_all with fewer than 200 distinct values, one (name, count) per line.
distinct_counts = thr_all.nunique(axis=0)
print(
    *[
        (name, val)
        for name, val in zip(distinct_counts.index, distinct_counts.values)
        if val < 200
    ],
    sep="\n",
)

In [None]:
# NB: the population data are reduced in resolution from ~100m x 100m squares to
# ~500m x 500m squares by averaging, then averaged again over the pixels whose centroids
# fall within each hex boundary.
# Hence, to get absolute population estimates, multiply by (500/100)^2 = 25 for each 500m pixel,
# then multiply again by the average number of 500m pixels within a hex, which is very roughly
# 5.16km^2 / 0.25km^2 ~ 20.64 (the code below uses the rounded factor 20.6).
# abs pop of Nigeria is ~220M so should be decently less than this
thr_all['abs_pop']=(thr_all.population*25*20.6)

In [None]:
# For ten thresholds between 50 and 500, print the percentage of rows whose
# estimated absolute population lies below the threshold.
for pop_thr in np.linspace(50, 500, 10):
    pct_below = (thr_all.abs_pop < pop_thr).mean() * 100
    print(f"{pop_thr:.0f}: {pct_below:.2f}")

In [None]:
# Preview the first two rows of the frame read from `thr_data`.
thr_df.head(2)

In [None]:
# Features: every column from LATNUM onwards. Take an explicit copy so the edit below
# acts on X itself rather than on a view of thr_df (in-place fillna on a column of a
# slice is chained assignment and is not guaranteed to take effect).
start_idx = thr_df.columns.tolist().index("LATNUM")
X = thr_df.iloc[:, start_idx:].copy()
# Missing conflict counts are replaced by zero.
X["n_conflicts"] = X["n_conflicts"].fillna(0)
# Targets: every column whose name contains "sev"; copied so that columns can be
# added to Y later without a SettingWithCopyWarning.
sev_cols = [col for col in thr_df.columns if "sev" in col]
Y = thr_df[sev_cols].copy()


In [None]:
# Bin every target (except the "sum" column) into n_quants equal-width bins on [0, 1];
# the bins are labelled 0 .. n_quants-1 and stored as categoricals.
n_quants = 5
bin_edges = np.linspace(0, 1, n_quants + 1)
binned_targets = []
for col in Y.columns:
    if "sum" in col:
        continue
    binned = pd.cut(
        Y[col], bin_edges, labels=range(n_quants), include_lowest=True
    )
    binned_targets.append(binned.astype("category"))
quant_Y = pd.concat(binned_targets, axis=1)


In [None]:
# Show the rows of Y where at least one target could not be assigned a bin.
Y[quant_Y.isna().sum(axis=1) > 0]


In [None]:
# Histogram of the bin labels per target, using only rows where every target was binned.
quant_Y.dropna().astype(int).hist()
plt.show()


In [None]:
# Summarise dtypes and non-null counts of the binned targets.
quant_Y.info()


In [None]:
# Deprivation dimensions modelled below; each maps to a target column "dep_<name>_sev".
good_idxs = ["housing", "water", "sanitation", "education"]


In [None]:
test_size = 0.2

# FLAML goal and constraints, shared by every target below.
automl_settings = {
    # "time_budget": 120,  # in seconds
    "metric": "micro_f1",
    "task": "classification",
    "log_file_name": "quint_v1.log",
    "max_iter": 500,
    # "ensemble": {
    #     "final_estimator": LogisticRegressionCV(),
    #     "passthrough": False,
    # },
}
mlflow.set_tracking_uri("../models/mlruns")

# Fit one quantile classifier per deprivation dimension and log its held-out micro-F1.
for chosen_idx in good_idxs:
    target = quant_Y[f"dep_{chosen_idx}_sev"]
    qX_train, qX_test, qy_train, qy_test = train_test_split(
        X,
        target,
        test_size=test_size,
        random_state=42,
        stratify=target,
    )
    automl = AutoML()

    # Use a single name for both creating and looking up the experiment. Previously the
    # experiment was set as "flaml-automl-quint-<idx>" but looked up as
    # "flaml-automl-quint", so runs went to the wrong experiment (or raised IndexError).
    experiment_name = f"flaml-automl-quint-{chosen_idx}"
    mlflow.set_experiment(experiment_name)
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

    with mlflow.start_run(experiment_id=exp_id):
        # Train with labeled input data
        automl.fit(X_train=qX_train, y_train=qy_train, **automl_settings)
        mlflow.log_metric(
            key="f1_score",
            value=f1_score(qy_test, automl.predict(qX_test), average="micro"),
        )


# Cast as (quantile) classification

In [None]:
# Initialize an AutoML instance
automl = AutoML()
# Specify automl goal and constraint
automl_settings = {
    # "time_budget": 120,  # in seconds
    "metric": "micro_f1",
    "task": "classification",
    "log_file_name": "quint_v1.log",
    "max_iter": 500,
    # "ensemble": {
    #     "final_estimator": LogisticRegressionCV(),
    #     "passthrough": False,
    # },
}
mlflow.set_tracking_uri("../models/mlruns")
experiment_name = "flaml-automl-quint"
mlflow.set_experiment(experiment_name)
exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

# NOTE: qX_train / qy_train / qX_test / qy_test are whatever split an earlier cell
# left in the kernel.
with mlflow.start_run(experiment_id=exp_id):
    # Train with labeled input data
    automl.fit(X_train=qX_train, y_train=qy_train, **automl_settings)
    # log_metric takes (key, value); the two were previously passed in the wrong order.
    mlflow.log_metric(
        key="f1_score",
        value=f1_score(qy_test, automl.predict(qX_test), average="micro"),
    )


In [None]:
from sklearn.metrics import (
    classification_report as class_rep,
    confusion_matrix as conf_mat,
)
import seaborn as sns

# Evaluate the most recently fitted classifier on the held-out split.
preds = automl.predict(qX_test)
print(class_rep(qy_test, preds))

# Confusion matrix as an annotated heatmap (rows: true class, columns: predicted class).
fig, ax = plt.subplots(dpi=150)
hmap = sns.heatmap(conf_mat(qy_test, preds), annot=True, fmt="d", ax=ax)
hmap.set_xlabel("Predicted")
hmap.set_ylabel("True")

plt.show()


In [None]:
# Try with focal loss? See https://github.com/jrzaurin/LightGBM-with-Focal-Loss


# Cast as ordinal classification / regression

# Cast as regression problem

## AutoML (flaml)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Initialize an AutoML instance (the same instance is fitted once per target below)
automl = AutoML()
# Specify automl goal and constraint
automl_settings = {
    # "time_budget": 120,  # in seconds
    "metric": "r2", #"rmse",
    "task": "regression",
    "log_file_name": "reg_v1.log",
    "max_iter": 500,
    # "ensemble": {
    #     "final_estimator": LogisticRegressionCV(),
    #     "passthrough": False,
    # },
}
mlflow.set_tracking_uri("../models/mlruns")

# Fit one regressor per deprivation dimension; log and plot held-out performance.
for chosen_idx in good_idxs:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y[f"dep_{chosen_idx}_sev"], test_size=test_size, random_state=42
    )
    # One name for creating and looking up the experiment; the lookup by name replaces
    # scanning client.list_experiments(), which is not available in MLflow 2.x.
    experiment_name = f"flaml-automl-{chosen_idx}-reg"
    mlflow.set_experiment(experiment_name)
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

    with mlflow.start_run(experiment_id=exp_id):
        # Train with labeled input data
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
        # Predict once and reuse for both metrics and the plot.
        preds = automl.predict(X_test)
        mlflow.log_param(key="best_model", value=automl.best_estimator)
        mlflow.log_params(automl.best_config)
        mlflow.log_metric(
            key="rmse", value=np.sqrt(mean_squared_error(y_test, preds))
        )
        mlflow.log_metric(key="r2_score", value=r2_score(y_test, preds))

    fig, ax = plt.subplots(dpi=150)
    scplot = sns.scatterplot(x=preds, y=y_test, ax=ax)
    scplot.set_xlabel("Predicted")
    scplot.set_ylabel("True")
    scplot.set_title(chosen_idx)
    plt.show()


## LightGBM model + tuning

In [None]:
# Tune a LightGBM regressor per deprivation dimension via the project's Optuna CV helper.
for chosen_idx in good_idxs:
    target_col = f"dep_{chosen_idx}_sev"
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y[target_col], test_size=test_size, random_state=42
    )
    model, loss = baseline.lgbmreg_optunaCV(
        X_train,
        X_test,
        y_train,
        y_test,
        target_name=chosen_idx,
        experiment_name=f"lgbm-opt-{chosen_idx}",
    )


In [None]:
# Y.hist(bins=20,density=True)
plt.show()
# Plot a 20-bin histogram (no KDE) of every target column, one figure per column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; sns.histplot is the
# suggested replacement — confirm against the installed seaborn version before switching.
for col in Y.columns:
    # sns.distplot(np.log(Y[col]+1),bins=20,kde=False)
    sns.distplot(Y[col], bins=20, kde=False)
    plt.show()


In [None]:
from imblearn.combine import SMOTEENN

# Resample the quantile-classification training split with SMOTEENN (seeded for repeatability).
# NOTE: qX_train / qy_train are whatever split an earlier cell left in the kernel.
smote_enn = SMOTEENN(random_state=42)
qX_resamp, qy_resamp = smote_enn.fit_resample(qX_train, qy_train)


# FLAML reg on all data

In [None]:
# Target columns: everything containing "sev", minus the health and nutrition ones.
sev_cols = [col for col in thr_all.columns if "sev" in col]
excluded_tags = ('health', 'nutrition')
good_cols = [
    col for col in sev_cols if not any(tag in col for tag in excluded_tags)
]
# Short display names: strip the 'dep_' prefix and '_sev' suffix text.
good_names = [col.replace('dep_','').replace('_sev','') for col in good_cols]

In [None]:
# Features: every column of thr_all from LATNUM onwards; targets: every column whose
# name contains "sev". Both are explicit copies so that later cells can add or modify
# columns (e.g. the "<name>_stg_cls" targets) without writing into a slice of thr_all
# and triggering SettingWithCopyWarning.
start_idx = thr_all.columns.tolist().index("LATNUM")
X = thr_all.iloc[:, start_idx:].copy()
sev_cols = [col for col in thr_all.columns if "sev" in col]
Y = thr_all[sev_cols].copy()

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Initialize an AutoML instance (the same instance is fitted once per target below)
automl = AutoML()
# Specify automl goal and constraint
automl_settings = {
    # "time_budget": 120,  # in seconds
    "metric": "rmse", #"r2",
    "task": "regression",
    "log_file_name": "reg_v1.log",
    "max_iter": 500,
    # "ensemble": {
    #     "final_estimator": LogisticRegressionCV(),
    #     "passthrough": False,
    # },
}
mlflow.set_tracking_uri("../models/mlruns")

# Fit one regressor per target column; `name` is the short label used in the experiment
# name, `chosen_idx` is the full target column name.
for name, chosen_idx in zip(good_names, good_cols):
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y[chosen_idx], test_size=test_size, random_state=42
    )
    # One name for creating and looking up the experiment; the lookup by name replaces
    # scanning client.list_experiments(), which is not available in MLflow 2.x.
    experiment_name = f"flaml-automl-{name}-full-reg"
    mlflow.set_experiment(experiment_name)
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

    with mlflow.start_run(experiment_id=exp_id):
        # Train with labeled input data
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
        # Predict once and reuse for both metrics and the plot.
        preds = automl.predict(X_test)
        mlflow.log_param(key="best_model", value=automl.best_estimator)
        mlflow.log_params(automl.best_config)
        mlflow.log_metric(
            key="rmse", value=np.sqrt(mean_squared_error(y_test, preds))
        )
        mlflow.log_metric(key="r2_score", value=r2_score(y_test, preds))

    fig, ax = plt.subplots(dpi=150)
    scplot = sns.scatterplot(x=preds, y=y_test, ax=ax)
    scplot.set_xlabel("Predicted")
    scplot.set_ylabel("True")
    scplot.set_title(chosen_idx)
    plt.show()


# Full NGA survey dataset

In [None]:
# Read the cleaned Nigeria DHS survey file.
# NOTE(review): absolute, machine-specific path — will not resolve on another machine.
full_nga_data = pd.read_csv(
    "/Users/johnf/Downloads/raw_low_res_dssg/dhs/clean_nga_dhs.csv"
)


In [None]:
# Display the list of target ("sev") column names currently in the kernel.
sev_cols


In [None]:
# Severity index: the summed deprivation score divided by the number of deprivation
# dimensions actually observed for that row.
# Only the per-dimension "dep_*" columns are counted. Previously every "sev" column except
# sumpoor_sev was checked for NaNs (so deprived_sev was included too) while the total was
# hard-coded as 6.
dep_dim_cols = [col for col in sev_cols if col.startswith("dep_")]
n_observed_dims = full_nga_data[dep_dim_cols].notna().sum(axis=1)
# Rows with no observed dimension get NaN rather than a division by zero (inf).
full_nga_data["dep_sev_idx"] = full_nga_data["sumpoor_sev"] / n_observed_dims.where(
    n_observed_dims > 0
)


In [None]:
# sns.distplot(np.log(full_nga_data.groupby('hex_code').dep_sev_idx.mean()+1),bins=20,kde=False)
# 20-bin histogram (no KDE) of the mean severity index per hex_code.
fig, ax = plt.subplots(dpi=150)
sns.distplot(full_nga_data.groupby("hex_code").dep_sev_idx.mean(), bins=20, kde=False)
plt.show()


# Feature selection

In [None]:
from stc_unicef_cpi.features.build_features import boruta_shap_ftr_select


In [None]:
# Run the project's Boruta-SHAP feature selection on the current training split.
# NOTE: X_train / y_train are whatever split the most recently executed cell left behind.
# presumably returns X_train restricted to the selected columns — later cells read
# subX_train.columns; verify against boruta_shap_ftr_select.
subX_train = boruta_shap_ftr_select(
    X_train,
    y_train,
    plot=True,
    n_trials=100,
    sample=False,
    train_or_test="test",
    normalize=True,
    verbose=True,
)


In [None]:
# Print the column names of subX_train, one per line.
print(*subX_train.columns, sep="\n")


In [None]:
# Restrict the test split to the columns of subX_train, then tune a LightGBM
# regressor on that subset with the project's Optuna CV helper.
selected_cols = list(subX_train.columns)
subX_test = X_test[selected_cols]
submodel, subloss = baseline.lgbmreg_optunaCV(
    subX_train,
    subX_test,
    y_train,
    y_test,
    target_name=chosen_idx,
    experiment_name=f"lgbm-opt-{chosen_idx}-sub",
)


In [None]:
# Predicted vs. true values on the test split for the model trained on the column subset.
scplot = sns.scatterplot(x=submodel.predict(subX_test), y=y_test)
scplot.set_xlabel(f"Predicted (subset): {chosen_idx}")
scplot.set_ylabel("True")
plt.show()


In [None]:
# Predicted vs. true values on the test split for the model trained on all features.
# The axes returned by scatterplot must be captured here: previously `scplot` still
# pointed at the previous cell's figure, so the labels never reached this plot.
# NOTE(review): `model` comes from an earlier cell — confirm it was fitted on the same
# feature columns as the current X_test.
scplot = sns.scatterplot(x=model.predict(X_test), y=y_test)
scplot.set_xlabel(f"Predicted (full): {chosen_idx}")
scplot.set_ylabel("True")
plt.show()


# Full feature selection vis 

In [None]:
from stc_unicef_cpi.features.build_features import boruta_shap_ftr_select


In [None]:
# Run Boruta-SHAP feature selection (with plots) for every target column in good_cols.
# NOTE: subX_train is overwritten on each iteration, so only the last target's result
# is still available after the loop; `name` is not used inside the loop body.
for name,chosen_idx in zip(good_names,good_cols):
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y[chosen_idx], test_size=test_size, random_state=42
    )

    subX_train = boruta_shap_ftr_select(
        X_train,
        y_train,
        plot=True,
        n_trials=100,
        sample=False,
        train_or_test="test",
        normalize=True,
        verbose=True,
    )

# Two-stage modelling approach

In [None]:
# Display the deprivation dimensions used in the loops below.
good_idxs


In [None]:
# Target values treated as "inflated" for each deprivation dimension:
# only 0 for housing, both 0 and 1 for the others.
inflated_vals = {
    dim: ([0] if dim == "housing" else [0, 1])
    for dim in ("housing", "water", "sanitation", "education")
}


In [None]:
# Build the stage-1 classification target for each dimension: class i for the i-th
# inflated value, and one extra class (index len(inflated values)) for everything else.
# Work on a copy so the new columns are not written into a slice of another frame.
Y = Y.copy()
for chosen_idx in good_idxs:
    infl = inflated_vals[chosen_idx]
    # Map inflated value -> class index. (The mapping used to go index -> value, which
    # only gave the right classes because the inflated values equalled their positions.)
    map_dict = {val: cls_idx for cls_idx, val in enumerate(infl)}
    other_cls = len(infl)
    Y[f"{chosen_idx}_stg_cls"] = (
        Y[f"dep_{chosen_idx}_sev"]
        .apply(lambda val: map_dict.get(val, other_cls))
        .astype("category")
    )


In [None]:
# Initialize an AutoML instance (the same instance is fitted once per target below)
automl = AutoML()
# Specify automl goal and constraint
automl_settings = {
    # "time_budget": 120,  # in seconds
    "metric": "micro_f1",
    "task": "classification",
    "log_file_name": "stg1_v1.log",
    "max_iter": 500,
    # "ensemble": {
    #     "final_estimator": LogisticRegressionCV(),
    #     "passthrough": False,
    # },
}
mlflow.set_tracking_uri("../models/mlruns")

# Fit the stage-1 (inflated value vs. other) classifier for each dimension, then log and
# display its held-out performance.
for chosen_idx in good_idxs:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y[f"{chosen_idx}_stg_cls"], test_size=test_size, random_state=42
    )
    # One name for creating and looking up the experiment; the lookup by name replaces
    # scanning client.list_experiments(), which is not available in MLflow 2.x.
    experiment_name = f"flaml-automl-{chosen_idx}-stg1"
    mlflow.set_experiment(experiment_name)
    exp_id = mlflow.get_experiment_by_name(experiment_name).experiment_id

    with mlflow.start_run(experiment_id=exp_id):
        # Train with labeled input data
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
        # Predict once and reuse for the metric, the report and the confusion matrix.
        preds = automl.predict(X_test)
        mlflow.log_metric(
            key="f1_score",
            value=f1_score(y_test, preds, average="micro"),
        )

    print(class_rep(y_test, preds))
    fig, ax = plt.subplots(dpi=150)
    hmap = sns.heatmap(conf_mat(y_test, preds), annot=True, fmt="d", ax=ax)
    hmap.set_xlabel("Predicted")
    hmap.set_ylabel("True")

    plt.show()


In [None]:
# Check the shape of the class-probability output on the test split.
automl.predict_proba(X_test).shape


In [None]:
from stc_unicef_cpi.models.inflated_vals_2stg import InflatedValsRegressor

# from sklearn.naive_bayes import GaussianNB
# from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

# One FLAML searcher per stage of the inflated-values model.
automl_cls = AutoML()
automl_reg = AutoML()

# Settings for the classification stage.
# To try estimators other than these two, the NaNs must be imputed first.
automl_cls_settings = dict(
    metric="micro_f1",
    task="classification",
    log_file_name="stg1_cls.log",
    max_iter=500,
    estimator_list=["lgbm", "xgboost"],
)

# Settings for the regression stage.
automl_reg_settings = dict(
    metric="rmse",
    task="regression",
    log_file_name="stg2_reg.log",
    max_iter=500,
    estimator_list=["lgbm", "xgboost"],
)

# Alternative with plain LightGBM learners instead of AutoML:
# infl_vals_reg = InflatedValsRegressor(LGBMClassifier(), LGBMRegressor())
infl_vals_reg = InflatedValsRegressor(automl_cls, automl_reg)


In [None]:
# Split X and the `dep_<idx>_sev` target column into train/test parts, then fit
# the two-stage model on the training part. This rebinds X_train / X_test /
# y_train / y_test, which the following cells read.
# NOTE(review): `chosen_idx` is not set in this cell; it appears to be whatever
# an earlier `for chosen_idx in ...` loop left in the kernel — confirm that this
# is the index intended here.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y[f"dep_{chosen_idx}_sev"], test_size=test_size, random_state=42
)
# cls_fit_kwargs / reg_fit_kwargs are presumably forwarded to the fit call of
# each stage — verify against InflatedValsRegressor.
infl_vals_reg.fit(
    X_train,
    y_train,
    inflated_vals=inflated_vals[chosen_idx],
    cls_fit_kwargs=automl_cls_settings,
    reg_fit_kwargs=automl_reg_settings,
)


In [None]:
# Display the shape of the two-stage model's output on X_test when called with
# weighted=True (quick sanity check on the prediction array).
infl_vals_reg.predict(X_test, weighted=True).shape


In [None]:
from sklearn.metrics import r2_score

# Held-out R^2 of the two-stage model, one line per mode:
# first weighted=True, then weighted=False.
print(
    *(
        r2_score(y_test, infl_vals_reg.predict(X_test, weighted=use_weights))
        for use_weights in (True, False)
    ),
    sep="\n",
)


In [None]:
# Predicted vs. true values for the two-stage model, overlaying the
# weighted=True predictions and the predictions from the default call.
# Labels and a legend are needed to tell the two point clouds apart.
fig, ax = plt.subplots()
ax.scatter(
    infl_vals_reg.predict(X_test, weighted=True),
    y_test,
    alpha=0.5,
    label="predict(weighted=True)",
)
ax.scatter(
    infl_vals_reg.predict(X_test),
    y_test,
    alpha=0.5,
    label="predict() default",
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Two-stage model: predicted vs. true")
ax.legend()

plt.show()

In [None]:
# Baseline for comparison: a LightGBM regressor with default parameters, fitted
# on the same train split and scored on the same test split.
base_lgbm = LGBMRegressor().fit(X_train, y_train)
# Predict once and reuse for both the score and the plot.
base_preds = base_lgbm.predict(X_test)
print(r2_score(y_test, base_preds))

fig, ax = plt.subplots()
ax.scatter(base_preds, y_test)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Baseline LGBMRegressor: predicted vs. true")
plt.show()


# Set up as pipelines for different combinations
- With / without expanded data, possibly with data extrapolated in different ways
- With / without GDP imputation of different kinds (simple / KNN / RF etc.)
- With / without standardisation (standard / robust etc.)
- With / without target transformation (e.g. log / Box-Cox)

In [None]:
# Baseline pipeline: impute -> standardise -> FLAML AutoML regression, followed
# by the best-model summary, feature importances and held-out test metrics.
# TODO: try KNN imputer, speak to Arpita about more sophisticated imputers later
# TODO: resave w n_conflicts and ... (note left unfinished)

# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from flaml import AutoML
from flaml.ml import sklearn_metric_loss_score

# Candidate model-based imputer; created here but not used in the pipeline below.
imp = IterativeImputer(max_iter=10, random_state=42)

set_config(display='diagram')

imputer = SimpleImputer()
standardiser = StandardScaler()
automl = AutoML()

automl_pipeline = Pipeline([
    ("imputer", imputer),
    ("standardiser", standardiser),
    ("automl", automl),
])
# automl_pipeline
# NOTE(review): `comb_name` is not defined anywhere in the visible part of the
# notebook — confirm it is set before this cell runs.
automl_settings = {
    "time_budget": 60,  # total running time in seconds
    "metric": "mse",  # primary metrics for regression can be chosen from: ['mae','mse','r2']
    "task": "regression",  # task type
    "estimator_list": ["xgboost", "catboost", "lgbm"],
    "log_file_name": f"{comb_name}.log",  # flaml log file
    "seed": 42,  # random seed
}
# Pipeline.fit routes `<step>__<param>` keyword arguments to that step's fit,
# so prefix every AutoML setting with the step name.
pipeline_settings = {
    f"automl__{key}": value for key, value in automl_settings.items()
}
automl_pipeline.fit(X_train, y_train, **pipeline_settings)

# get automl object back
automl = automl_pipeline.named_steps["automl"]
# Get the best config and best learner
print('Best ML learner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
# best_loss is the value of the optimised metric (mse here), not an accuracy,
# so report it directly under the metric's own name.
print('Best {0} on validation data: {1:.4g}'.format(automl_settings["metric"], automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

# plot basic feature importances
fig, ax = plt.subplots()
ax.barh(automl.feature_names_in_, automl.feature_importances_)
ax.set_xlabel("Feature importance")
ax.set_ylabel("Feature")
ax.set_title(f"Feature importances ({automl.best_estimator})")
plt.show()

# compute different metrics on test set
# Predictions must come from this fitted pipeline so that the imputer and
# scaler are applied to X_test exactly as they were to the training data.
y_pred = automl_pipeline.predict(X_test)

print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))
print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))