In [None]:
%load_ext autoreload 
%autoreload 2

NB: to match or beat the Facebook (fb) results, we need $R^2$ values of roughly 0.56 (spatial CV), 0.59 (leave-one-country-out CV) and 0.70 (conventional CV). They construct spatial CV as follows:

    In each country, we select a random cell as the training centroid, then define the  training dataset as the nearest (k-1)/k percent of cells to that centroid. The remaining 1/k cells from that country form the test dataset. This procedure is repeated k times in each country.

Importantly, they construct the ground truth in the first place carefully, to account for ~2km location jitter in urban areas, and ~5km jitter in rural areas: 

    To ensure that the input data associated with each village cover the village’s true location, we include a 2x2 grid of 2.4km cells around the centroid in urban areas, and a 4x4 grid in rural areas. For each village, we then take the population-weighted average of the 112-dimensional feature vectors across the 2x2 or 4x4 set of cells, using existing estimates of the population of 2.4km grid cells.

In [None]:
import mlflow
from flaml import AutoML
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import (
    classification_report as class_rep,
    confusion_matrix as conf_mat,
)
from sklearn.model_selection import train_test_split
import pandas as pd

pd.set_option("display.max_columns", None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from stc_unicef_cpi.models import lgbm_baseline as baseline

from pathlib import Path

import os

# Root folder holding the input CSVs. Set the STC_DATA_DIR environment variable
# to run the notebook on another machine; when it is unset the original
# location is used, so existing behaviour is unchanged.
base_dir = Path(
    os.environ.get("STC_DATA_DIR", "/Users/johnf/Downloads/higher_res_dssg/")
)
all_data = base_dir / "clean_nga_w_autov1.csv"  # read into all_df
thr_data = base_dir / "nga_clean_v2_thr30.csv"  # read into thr_df

# Fraction of rows held out by the train_test_split calls in this notebook.
test_size = 0.2


In [None]:
# Load the full table and the thresholded table from the configured CSV paths.
# The commuting-zone name column is stored as a pandas categorical.
all_df = pd.read_csv(all_data).astype({"name_commuting_zone": "category"})
thr_df = pd.read_csv(thr_data)


In [None]:
# Print (column, missing count) for every all_df column with more than 5 NaNs,
# one pair per line.
na_counts = all_df.isna().sum(axis=0)
na_report = [
    (col_name, n_missing)
    for col_name, n_missing in zip(na_counts.index, na_counts.values)
    if n_missing > 5
]
print("\n".join(str(pair) for pair in na_report))

In [None]:
# Read both autoencoding tables (first CSV column becomes the index), name the
# index 'hex_code', and relabel the columns by position:
# auto_norm_0, auto_norm_1, ... and auto_unnorm_0, auto_unnorm_1, ...
auto_norm = pd.read_csv(
    base_dir / "autoencodings_norm.csv", index_col=0
).rename_axis("hex_code")
auto_norm.columns = [f"auto_norm_{pos}" for pos, _ in enumerate(auto_norm.columns)]

auto_unnorm = pd.read_csv(
    base_dir / "autoencodings_unnorm.csv", index_col=0
).rename_axis("hex_code")
auto_unnorm.columns = [
    f"auto_unnorm_{pos}" for pos, _ in enumerate(auto_unnorm.columns)
]

In [None]:
# Preview the first rows of auto_norm after the column relabelling.
auto_norm.head()

In [None]:
# Swap any existing 'auto_' columns in all_df for the freshly loaded encodings,
# matched on hex_code. Left joins keep every all_df row.
stale_auto_cols = [name for name in all_df.columns if 'auto_' in name]
all_df = (
    all_df.drop(columns=stale_auto_cols)
    .join(auto_norm, on='hex_code', how='left')
    .join(auto_unnorm, on='hex_code', how='left')
)
all_df.head()

In [None]:
# Pick out the all_df rows for the hex codes listed in thr_df, in thr_df's order.
all_by_hex = all_df.set_index('hex_code')
thr_all = all_by_hex.loc[thr_df.hex_code].reset_index()

In [None]:
# Preview the first two rows of the merged thresholded table.
thr_all.head(2)

In [None]:
# Write the merged table to the working data folder and to ../data/processed.
for out_path in (
    base_dir / "new_auto_thr_clean_nga.csv",
    "../data/processed/new_auto_thr_clean_nga.csv",
):
    thr_all.to_csv(out_path)

In [None]:
# Print (column, distinct-value count) for every thr_all column with fewer
# than 200 distinct values, one pair per line.
distinct_counts = thr_all.nunique(axis=0)
low_cardinality = [
    (col_name, n_distinct)
    for col_name, n_distinct in zip(distinct_counts.index, distinct_counts.values)
    if n_distinct < 200
]
print("\n".join(str(pair) for pair in low_cardinality))

In [None]:
# NB: population was reduced from ~100m x 100m to ~500m x 500m pixels by
# averaging, then averaged again over the pixels whose centroids fall inside
# each hex boundary.
# To get absolute population estimates, scale each 500m pixel by (500/100)^2 = 25,
# then by the rough number of 500m pixels per hex: ~5.16km^2 / 0.25 ~ 20.64
# (the code below uses 20.6).
# The absolute population of Nigeria is ~220M, so the total should be decently less than this.
thr_all['abs_pop'] = thr_all['population'] * 25 * 20.6

In [None]:
# For ten evenly spaced thresholds from 50 to 500, print the percentage of rows
# whose abs_pop is below the threshold.
for pop_thr in np.linspace(50, 500, 10):
    pct_below = (thr_all.abs_pop < pop_thr).mean() * 100
    print(f"{pop_thr:.0f}: {pct_below:.2f}")

In [None]:
# Preview the first two rows of the thresholded table as loaded from CSV.
thr_df.head(2)

In [None]:
# Features are every thr_df column from "LATNUM" onwards; targets are the
# columns whose name contains "sev".
start_idx = thr_df.columns.tolist().index("LATNUM")
# Take an explicit copy so the fill below changes X itself. The previous
# in-place fillna on a column of an iloc slice was chained assignment, which
# may act on a temporary and does nothing under pandas copy-on-write.
X = thr_df.iloc[:, start_idx:].copy()
# Missing conflict counts are treated as zero conflicts.
X["n_conflicts"] = X["n_conflicts"].fillna(0)
sev_cols = [col for col in thr_df.columns if "sev" in col]
Y = thr_df[sev_cols]


In [None]:
# Bin every Y column except the "sum" ones into n_quants equal-width classes
# on [0, 1], labelled 0 .. n_quants-1, and collect them side by side.
n_quants = 5
quant_edges = np.linspace(0, 1, n_quants + 1)
binned_cols = [
    pd.cut(
        Y[name],
        quant_edges,
        labels=range(n_quants),
        include_lowest=True,
    ).astype("category")
    for name in Y.columns
    if "sum" not in name
]
quant_Y = pd.concat(binned_cols, axis=1)


In [None]:
# Show the raw target rows for which at least one binned class is missing.
Y[quant_Y.isna().any(axis=1)]


In [None]:
# Histogram of class labels per target, using only rows with no missing class.
complete_quants = quant_Y.dropna().astype(int)
complete_quants.hist()
plt.show()


In [None]:
# Summarise dtypes and non-null counts of the binned targets.
quant_Y.info()


In [None]:
# Deprivation dimensions modelled below; each maps to a target column
# named dep_<dimension>_sev.
good_idxs = ["housing", "water", "sanitation", "education"]


In [None]:
# For each deprivation dimension: stratified train/test split on the binned
# target, fit a FLAML classifier, and log the held-out micro-F1 to that
# dimension's own MLflow experiment.
test_size = 0.2
for chosen_idx in good_idxs:
    target_col = f"dep_{chosen_idx}_sev"
    qX_train, qX_test, qy_train, qy_test = train_test_split(
        X,
        quant_Y[target_col],
        test_size=test_size,
        random_state=42,
        stratify=quant_Y[target_col],
    )
    # Initialize an AutoML instance
    automl = AutoML()
    # Specify automl goal and constraint
    automl_settings = {
        # "time_budget": 120,  # in seconds
        "metric": "micro_f1",
        "task": "classification",
        "log_file_name": "quint_v1.log",
        "max_iter": 500,
    }
    # Train with labeled input data
    experiment_name = f"flaml-automl-quint-{chosen_idx}"
    mlflow.set_tracking_uri("../models/mlruns")
    mlflow.set_experiment(experiment_name)
    client = mlflow.tracking.MlflowClient()
    # Resolve the id of the experiment that was just set. The earlier lookup
    # filtered on the un-suffixed name "flaml-automl-quint", which either
    # raised IndexError or logged every dimension to the wrong experiment.
    exp_id = client.get_experiment_by_name(experiment_name).experiment_id

    with mlflow.start_run(experiment_id=exp_id) as run:
        automl.fit(X_train=qX_train, y_train=qy_train, **automl_settings)
        mlflow.log_metric(
            key="f1_score",
            value=f1_score(qy_test, automl.predict(qX_test), average="micro"),
        )


# Cast as (quantile) classification

In [None]:
# Single classification run logged to the shared "flaml-automl-quint" experiment.
# NOTE(review): qX_train / qX_test / qy_train / qy_test are whatever the last
# iteration of the per-dimension loop left in the kernel — confirm that is the
# intended target.
# Initialize an AutoML instance
automl = AutoML()
# Specify automl goal and constraint
automl_settings = {
    # "time_budget": 120,  # in seconds
    "metric": "micro_f1",
    "task": "classification",
    "log_file_name": "quint_v1.log",
    "max_iter": 500,
}
# Train with labeled input data
experiment_name = "flaml-automl-quint"
mlflow.set_tracking_uri("../models/mlruns")
mlflow.set_experiment(experiment_name)
client = mlflow.tracking.MlflowClient()
exp_id = client.get_experiment_by_name(experiment_name).experiment_id

with mlflow.start_run(experiment_id=exp_id) as run:
    automl.fit(X_train=qX_train, y_train=qy_train, **automl_settings)
    # log_metric takes (key, value); the arguments were previously swapped,
    # passing the score as the metric name.
    mlflow.log_metric(
        key="f1_score",
        value=f1_score(qy_test, automl.predict(qX_test), average="micro"),
    )


In [None]:
from sklearn.metrics import (
    classification_report as class_rep,
    confusion_matrix as conf_mat,
)
import seaborn as sns

# Per-class report and confusion-matrix heatmap for the held-out split.
preds = automl.predict(qX_test)
print(class_rep(qy_test, preds))

fig, ax = plt.subplots(dpi=150)
cm_counts = conf_mat(qy_test, preds)
hmap = sns.heatmap(cm_counts, annot=True, fmt="d", ax=ax)
hmap.set(xlabel="Predicted", ylabel="True")

plt.show()


In [None]:
# Try with focal loss? See https://github.com/jrzaurin/LightGBM-with-Focal-Loss


# Cast as ordinal classification / regression

# Cast as regression problem

## AutoML (flaml)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# One FLAML regression per deprivation dimension. Each run logs the chosen
# estimator, its config, and held-out RMSE / R^2 to a per-dimension MLflow
# experiment, then plots predicted vs. true values.
automl = AutoML()
automl_settings = dict(
    # time_budget=120,  # in seconds
    metric="r2",  # alternative: "rmse"
    task="regression",
    log_file_name="reg_v1.log",
    max_iter=500,
)
for chosen_idx in good_idxs:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y[f"dep_{chosen_idx}_sev"], test_size=test_size, random_state=42
    )
    exp_name = f"flaml-automl-{chosen_idx}-reg"
    mlflow.set_tracking_uri("../models/mlruns")
    mlflow.set_experiment(exp_name)
    client = mlflow.tracking.MlflowClient()
    experiments = client.list_experiments()
    exp_id = [exp.experiment_id for exp in experiments if exp.name == exp_name][0]

    with mlflow.start_run(experiment_id=exp_id) as run:
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
        mlflow.log_param(key="best_model", value=automl.best_estimator)
        mlflow.log_params(automl.best_config)
        test_rmse = np.sqrt(mean_squared_error(y_test, automl.predict(X_test)))
        mlflow.log_metric(key="rmse", value=test_rmse)
        test_r2 = r2_score(y_test, automl.predict(X_test))
        mlflow.log_metric(key="r2_score", value=test_r2)

    # Predicted vs. true scatter for this dimension.
    preds = automl.predict(X_test)
    fig, ax = plt.subplots(dpi=150)
    scplot = sns.scatterplot(x=preds, y=y_test, ax=ax)
    scplot.set(xlabel="Predicted", ylabel="True", title=chosen_idx)
    plt.show()


## LightGBM model + tuning

In [None]:
# Tune a LightGBM regressor per deprivation dimension with the project's
# optuna helper; model and loss hold the result of the last dimension afterwards.
for chosen_idx in good_idxs:
    dim_target = Y[f"dep_{chosen_idx}_sev"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, dim_target, test_size=test_size, random_state=42
    )
    model, loss = baseline.lgbmreg_optunaCV(
        X_train,
        X_test,
        y_train,
        y_test,
        target_name=chosen_idx,
        experiment_name=f"lgbm-opt-{chosen_idx}",
    )


In [None]:
# Alternative single call: Y.hist(bins=20,density=True)
plt.show()
# One 20-bin histogram (no KDE) per target column.
for col, col_values in Y.items():
    # log-scale alternative: sns.distplot(np.log(Y[col]+1),bins=20,kde=False)
    sns.distplot(col_values, bins=20, kde=False)
    plt.show()


In [None]:
from imblearn.combine import SMOTEENN

# Resample the classification training split with SMOTEENN (fixed seed).
# NOTE(review): qX_train / qy_train are whatever the most recent classification
# split left in the kernel — confirm this is the intended target.
smote_enn = SMOTEENN(random_state=42)
qX_resamp, qy_resamp = smote_enn.fit_resample(qX_train, qy_train)


# FLAML reg on all data

In [None]:
# Target columns of thr_all (name contains "sev"), minus the health and
# nutrition ones, plus a short display name with "dep_" / "_sev" stripped.
sev_cols = [name for name in thr_all.columns if "sev" in name]
good_cols = [
    name for name in sev_cols if not ("health" in name or "nutrition" in name)
]
good_names = [name.replace("dep_", "").replace("_sev", "") for name in good_cols]

In [None]:
# Rebuild features and targets from thr_all: features are every column from
# "LATNUM" onwards, targets are the columns whose name contains "sev".
all_columns = list(thr_all.columns)
start_idx = all_columns.index("LATNUM")
sev_cols = [name for name in all_columns if "sev" in name]
X = thr_all.iloc[:, start_idx:]
Y = thr_all[sev_cols]

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# One FLAML regression per selected target column of the full feature table.
# Each run logs the chosen estimator, its config, and held-out RMSE / R^2 to
# its own MLflow experiment, then plots predicted vs. true values.
automl = AutoML()
automl_settings = dict(
    # time_budget=120,  # in seconds
    metric="rmse",  # alternative: "r2"
    task="regression",
    log_file_name="reg_v1.log",
    max_iter=500,
)

for name, chosen_idx in zip(good_names, good_cols):
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y[chosen_idx], test_size=test_size, random_state=42
    )
    exp_name = f"flaml-automl-{name}-full-reg"
    mlflow.set_tracking_uri("../models/mlruns")
    mlflow.set_experiment(exp_name)
    client = mlflow.tracking.MlflowClient()
    experiments = client.list_experiments()
    exp_id = [exp.experiment_id for exp in experiments if exp.name == exp_name][0]

    with mlflow.start_run(experiment_id=exp_id) as run:
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
        mlflow.log_param(key="best_model", value=automl.best_estimator)
        mlflow.log_params(automl.best_config)
        test_rmse = np.sqrt(mean_squared_error(y_test, automl.predict(X_test)))
        mlflow.log_metric(key="rmse", value=test_rmse)
        test_r2 = r2_score(y_test, automl.predict(X_test))
        mlflow.log_metric(key="r2_score", value=test_r2)

    # Predicted vs. true scatter, titled with the target column name.
    preds = automl.predict(X_test)
    fig, ax = plt.subplots(dpi=150)
    scplot = sns.scatterplot(x=preds, y=y_test, ax=ax)
    scplot.set(xlabel="Predicted", ylabel="True", title=chosen_idx)
    plt.show()


# Full NGA survey dataset

In [None]:
# Load the cleaned NGA DHS survey table.
full_nga_path = "/Users/johnf/Downloads/raw_low_res_dssg/dhs/clean_nga_dhs.csv"
full_nga_data = pd.read_csv(full_nga_path)


In [None]:
# Display the current list of target ("sev") column names.
sev_cols


In [None]:
# Severity index per row: sumpoor_sev divided by (6 - number of missing
# dimension values).
# NOTE(review): 6 is presumably the number of dimension columns in sev_cols
# besides sumpoor_sev — confirm.
dim_values = full_nga_data[sev_cols].drop(columns=["sumpoor_sev"])
n_missing_dims = dim_values.isna().sum(axis=1)
full_nga_data["dep_sev_idx"] = full_nga_data["sumpoor_sev"] / (6 - n_missing_dims)


In [None]:
# 20-bin histogram (no KDE) of the mean severity index per hex.
# log-scale alternative:
# sns.distplot(np.log(full_nga_data.groupby('hex_code').dep_sev_idx.mean()+1),bins=20,kde=False)
fig, ax = plt.subplots(dpi=150)
hex_mean_idx = full_nga_data.groupby("hex_code").dep_sev_idx.mean()
sns.distplot(hex_mean_idx, bins=20, kde=False)
plt.show()


# Feature selection

In [None]:
from stc_unicef_cpi.features.build_features import boruta_shap_ftr_select


In [None]:
# Run the project's Boruta-SHAP feature selection on the current training split.
# NOTE(review): X_train / y_train come from whichever split ran last in the
# kernel — confirm the intended target.
boruta_options = dict(
    plot=True,
    n_trials=100,
    sample=False,
    train_or_test="test",
    normalize=True,
    verbose=True,
)
subX_train = boruta_shap_ftr_select(X_train, y_train, **boruta_options)


In [None]:
# List the selected feature names, one per line.
print("\n".join(str(col_name) for col_name in subX_train.columns))


In [None]:
# Restrict the test features to the selected columns and tune LightGBM on the
# reduced feature set.
selected_cols = list(subX_train.columns)
subX_test = X_test[selected_cols]
submodel, subloss = baseline.lgbmreg_optunaCV(
    subX_train,
    subX_test,
    y_train,
    y_test,
    target_name=chosen_idx,
    experiment_name=f"lgbm-opt-{chosen_idx}-sub",
)


In [None]:
# Predicted vs. true values for the model trained on the selected features.
sub_preds = submodel.predict(subX_test)
scplot = sns.scatterplot(x=sub_preds, y=y_test)
scplot.set(xlabel=f"Predicted (subset): {chosen_idx}", ylabel="True")
plt.show()


In [None]:
# Predicted vs. true values for the model trained on all features.
# Bind the new axes to scplot: previously the return value was discarded, so
# the labels below were applied to the axes of the preceding (subset) plot.
# NOTE(review): `model` comes from the LightGBM tuning loop, which ran before
# X was rebuilt from thr_all — confirm its features match the current X_test.
scplot = sns.scatterplot(x=model.predict(X_test), y=y_test)
scplot.set_xlabel(f"Predicted (full): {chosen_idx}")
scplot.set_ylabel("True")
plt.show()


# Full feature selection vis 

In [None]:
from stc_unicef_cpi.features.build_features import boruta_shap_ftr_select


In [None]:
# Run Boruta-SHAP feature selection (with plots) for every selected target.
# subX_train holds the selection for the last target after the loop.
boruta_options = dict(
    plot=True,
    n_trials=100,
    sample=False,
    train_or_test="test",
    normalize=True,
    verbose=True,
)
for name, chosen_idx in zip(good_names, good_cols):
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y[chosen_idx], test_size=test_size, random_state=42
    )
    subX_train = boruta_shap_ftr_select(X_train, y_train, **boruta_options)

# Two-stage modelling approach

In [None]:
# Display the deprivation dimensions used by the two-stage models below.
good_idxs


In [None]:
# Target values treated as "inflated" for each deprivation dimension.
inflated_vals = dict(
    housing=[0],
    water=[0, 1],
    sanitation=[0, 1],
    education=[0, 1],
)


In [None]:
# Build the stage-1 classification target for each dimension: class i for rows
# equal to the i-th inflated value, and one extra class (index = number of
# inflated values) for every other row, including missing values.
# Work on an owned copy so adding columns does not write into a frame derived
# from thr_all (avoids SettingWithCopy problems).
Y = Y.copy()
for chosen_idx in good_idxs:
    dim_inflated = inflated_vals[chosen_idx]
    # Map inflated value -> class index. The previous index -> value mapping
    # gave the right classes only because the inflated values ([0] and [0, 1])
    # happen to equal their own positions.
    map_dict = {val: i for i, val in enumerate(dim_inflated)}
    other_cls = len(dim_inflated)
    Y[f"{chosen_idx}_stg_cls"] = (
        Y[f"dep_{chosen_idx}_sev"]
        # Defaults bind this iteration's lookup instead of closing over loop variables.
        .apply(lambda x, lookup=map_dict, default=other_cls: lookup.get(x, default))
        .astype("category")
    )

In [None]:
# Stage 1: one FLAML classifier per dimension on the inflated-value class
# target. Logs held-out micro-F1 to a per-dimension MLflow experiment, then
# prints a class report and draws the confusion matrix.
automl = AutoML()
automl_settings = dict(
    # time_budget=120,  # in seconds
    metric="micro_f1",
    task="classification",
    log_file_name="stg1_v1.log",
    max_iter=500,
)
for chosen_idx in good_idxs:
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y[f"{chosen_idx}_stg_cls"], test_size=test_size, random_state=42
    )
    exp_name = f"flaml-automl-{chosen_idx}-stg1"
    mlflow.set_tracking_uri("../models/mlruns")
    mlflow.set_experiment(exp_name)
    client = mlflow.tracking.MlflowClient()
    experiments = client.list_experiments()
    exp_id = [exp.experiment_id for exp in experiments if exp.name == exp_name][0]

    with mlflow.start_run(experiment_id=exp_id) as run:
        automl.fit(X_train=X_train, y_train=y_train, **automl_settings)
        test_f1 = f1_score(y_test, automl.predict(X_test), average="micro")
        mlflow.log_metric(key="f1_score", value=test_f1)

    preds = automl.predict(X_test)
    print(class_rep(y_test, preds))
    fig, ax = plt.subplots(dpi=150)
    hmap = sns.heatmap(conf_mat(y_test, preds), annot=True, fmt="d", ax=ax)
    hmap.set(xlabel="Predicted", ylabel="True")

    plt.show()


In [None]:
# Check the shape of the class-probability output on the current test split.
automl.predict_proba(X_test).shape


In [None]:
from stc_unicef_cpi.models.inflated_vals_2stg import InflatedValsRegressor

# from sklearn.naive_bayes import GaussianNB
# from sklearn.ensemble import HistGradientBoostingClassifier
from lightgbm import LGBMRegressor, LGBMClassifier

# Two-stage model: a FLAML classifier and a FLAML regressor wrapped by
# InflatedValsRegressor. Both searches are limited to lgbm / xgboost;
# to try other estimators the NaNs would need imputing first.
automl_cls = AutoML()
automl_reg = AutoML()
automl_cls_settings = dict(
    metric="micro_f1",
    task="classification",
    log_file_name="stg1_cls.log",
    max_iter=500,
    estimator_list=["lgbm", "xgboost"],
)
automl_reg_settings = dict(
    metric="rmse",
    task="regression",
    log_file_name="stg2_reg.log",
    max_iter=500,
    estimator_list=["lgbm", "xgboost"],
)
# Plain-LightGBM alternative:
# infl_vals_reg = InflatedValsRegressor(LGBMClassifier(), LGBMRegressor())
infl_vals_reg = InflatedValsRegressor(automl_cls, automl_reg)


In [None]:
# Fit the two-stage regressor on the continuous target of the dimension left
# in chosen_idx by the previous loop.
stage_target = Y[f"dep_{chosen_idx}_sev"]
X_train, X_test, y_train, y_test = train_test_split(
    X, stage_target, test_size=test_size, random_state=42
)
infl_vals_reg.fit(
    X_train,
    y_train,
    inflated_vals=inflated_vals[chosen_idx],
    cls_fit_kwargs=automl_cls_settings,
    reg_fit_kwargs=automl_reg_settings,
)


In [None]:
# Check the shape of the weighted two-stage predictions.
infl_vals_reg.predict(X_test, weighted=True).shape


In [None]:
from sklearn.metrics import r2_score

# Held-out R^2 of the two-stage model: weighted prediction first, then unweighted.
for use_weighting in (True, False):
    print(r2_score(y_test, infl_vals_reg.predict(X_test, weighted=use_weighting)))


In [None]:
# Overlay predicted vs. true values for the weighted and default predictions.
weighted_preds = infl_vals_reg.predict(X_test, weighted=True)
default_preds = infl_vals_reg.predict(X_test)
plt.scatter(weighted_preds, y_test)
plt.scatter(default_preds, y_test)

plt.show()

In [None]:
# Baseline for comparison: default LightGBM regressor on the same split.
base_lgbm = LGBMRegressor()
base_lgbm.fit(X_train, y_train)
base_preds = base_lgbm.predict(X_test)
print(r2_score(y_test, base_preds))
plt.scatter(base_preds, y_test)


# Set up as pipelines for different combinations
- With / without the expanded data, possibly with data extrapolated in different ways
- With / without GDP imputation of different kinds (simple / KNN / random forest etc.)
- With / without standardisation (standard / robust etc.)
- With / without target transformation (e.g. log / Box-Cox)

In [None]:
# try KNN imputer, speak to Arpita about more sophisticated imputers later
# resave w n_conflicts and 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=42)  
from sklearn import set_config
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from flaml import AutoML
from flaml.ml import sklearn_metric_loss_score

# Impute -> standardise -> FLAML AutoML regression, wrapped in one sklearn
# Pipeline, fitted on the current training split and scored on the test split.
set_config(display='diagram')

# Label for this preprocessing combination; it names the FLAML log file.
# (It was previously referenced without being defined.)
comb_name = "simple_impute_std_scale"

imputer = SimpleImputer()
standardiser = StandardScaler()
automl = AutoML()

automl_pipeline = Pipeline([
    ("imputer", imputer),
    ("standardiser", standardiser),
    ("automl", automl)
])
automl_settings = {
    "time_budget": 60,  # total running time in seconds
    "metric": "mse",  # primary metrics for regression can be chosen from: ['mae','mse','r2']
    "task": "regression",  # task type
    "estimator_list": ["xgboost", "catboost", "lgbm"],
    "log_file_name": f"{comb_name}.log",  # flaml log file
    "seed": 42,  # random seed
}
# Prefix each setting with "automl__" so Pipeline.fit routes it to the AutoML step.
pipeline_settings = {
    f"automl__{key}": value for key, value in automl_settings.items()
}
automl_pipeline.fit(X_train, y_train, **pipeline_settings)

# get automl object back
automl = automl_pipeline.steps[2][1]
# Get the best config and best learner
print('Best ML learner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
# The optimised metric is mse, so best_loss is itself the validation mse;
# reporting 1 - best_loss as "accuracy" was wrong for this task.
print('Best mse on validation data: {0:.4g}'.format(automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))

# plot basic feature importances
# NOTE(review): the AutoML step receives arrays from the scaler, not a
# DataFrame — confirm feature_names_in_ is populated as expected.
plt.barh(automl.feature_names_in_, automl.feature_importances_)

# compute different metrics on test set
# Predict through the whole pipeline so the test data gets the same imputation
# and scaling as the training data (y_pred was previously undefined).
y_pred = automl_pipeline.predict(X_test)
print('r2', '=', 1 - sklearn_metric_loss_score('r2', y_pred, y_test))
print('mse', '=', sklearn_metric_loss_score('mse', y_pred, y_test))
print('mae', '=', sklearn_metric_loss_score('mae', y_pred, y_test))