This file grabs everything from the LSMS survey that I think an image could possibly recognize and uses those features to predict consumption. This serves as a "gold standard" for any image-based model. It turns out that the CNN model performs almost as well as this gold standard!

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
BASE_DIR = '..'
NIGHTLIGHTS_DIR = os.path.join(BASE_DIR, 'data/Nightlights/2013/F182013.v4c_web.stable_lights.avg_vis.tif')
COUNTRY = 'malawi_2016'

LSMS_DIR = os.path.join(BASE_DIR, 'countries', COUNTRY, 'LSMS')
PROCESSED_DIR = os.path.join(BASE_DIR, 'countries', COUNTRY, 'processed')

In [3]:
# these vary from one LSMS survey to another
CONSUMPTION_FILE = 'IHS4 Consumption Aggregate.dta'
CONSUMPTION_PH_COL = 'rexpagg' # per household
CONSUMPTION_PC_COL = 'rexpaggpc' # per capita

GEOLOCATION_FILE = 'HouseholdGeovariables_stata11/HouseholdGeovariablesIHS4.dta'
LATITUDE_COL = 'lat_modified'
LONGITUDE_COL = 'lon_modified'

# purchasing power parity for malawi in 2016 (https://data.worldbank.org/indicator/PA.NUS.PRVT.PP?locations=MW)
PPP = 207.238

In [4]:
for file in [CONSUMPTION_FILE, GEOLOCATION_FILE]:
    assert os.path.isfile(os.path.join(LSMS_DIR, file)), print(f'Could not find {file}')

In [5]:
df_geo = pd.read_stata(os.path.join(LSMS_DIR, GEOLOCATION_FILE))
df_hhf = pd.read_stata(os.path.join(LSMS_DIR, 'HH_MOD_F.dta'))
df_plot = pd.read_stata(os.path.join(LSMS_DIR, 'PlotGeovariablesIHS4.dta'))

In [6]:
df_com = pd.read_stata(os.path.join(LSMS_DIR, 'COM_CD.dta'))
df_com2 = pd.read_stata(os.path.join(LSMS_DIR, 'COM_CF1.dta'))

In [7]:
# rooms = df_hhf['hh_f10']
# roof = df_hhf['hh_f08']

# # all distance infrasturcture metrics
# road_type = df_com['com_cd01']
# dist_daily_market = df_com['com_cd16']
# dist_larger_weekly = df_com['com_cd18a']
# dist_perm_admarc = df_com['com_cd20a']
# dist_post_office = df_com['com_cd22a']
# dist_telephone = df_com['com_cd24a']
# dist_gov_prim_school = df_com['com_cd27a']
# dist_gov_sec_school = df_com['com_cd36a']
# dist_comm_sec_school = df_com['com_cd40a']
# dist_medicines = df_com['com_cd49a']
# dist_health_clinic = df_com['com_cd51a']
# dist_doctor = df_com['com_cd60a']
# dist_bank = df_com['com_cd67a']
# dist_microfinance = df_com['com_cd69a']

# dist_agric_exten_officer = df_com2['com_cf08a']

# dist_admarc_outlet = df_geo['dist_admarc']
# dist_agric_market = df_geo['dist_agmrkt']
# dist_tobacco_auction = df_geo['dist_auction']
# dist_boma = df_geo['dist_boma']
# dist_border = df_geo['dist_borderpost']
# dist_popcenter = df_geo['dist_popcenter']
# dist_road = df_geo['dist_road']

# dist_hh = df_plot['dist_hh']

# # temp
# mean_temp = df_geo['af_bio_1']
# mean_temp_wet_q = df_geo['af_bio_8']

# # rain
# mean_rain = df_geo['af_bio_12']
# mean_rain_wet_month = df_geo['af_bio_13']
# mean_rain_wet_q = df_geo['af_bio_16']

In [8]:
# for the purpose of merging dfs with case_ids and ea_ids together
df_tie = pd.read_stata(os.path.join(LSMS_DIR, CONSUMPTION_FILE))[['case_id', 'ea_id']]

hhf_input = df_hhf[['case_id', 'hh_f10', 'hh_f08']]
com_input = df_com[['ea_id', 'com_cd01', 'com_cd16', 'com_cd18a', 'com_cd20a', 'com_cd22a', 'com_cd24a',
                   'com_cd27a', 'com_cd36a', 'com_cd40a', 'com_cd49a', 'com_cd51a', 'com_cd60a', 'com_cd67a',
                   'com_cd69a']]

com2_input = df_com2[['ea_id', 'com_cf08a']]

geo_input = df_geo[['case_id', 'dist_admarc', 'dist_agmrkt', 'dist_auction', 'dist_boma', 'dist_borderpost',
                  'dist_popcenter', 'dist_road', 'af_bio_1', 'af_bio_8', 'af_bio_12', 'af_bio_13', 'af_bio_16', 
                   'lat_modified', 'lon_modified']]
geo_input.rename(columns={'lat_modified': 'cluster_lat', 'lon_modified': 'cluster_lon'}, inplace=True)
geo_input.dropna(inplace=True)

plot_input = df_plot[['case_id', 'dist_hh']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [9]:
df_cons = pd.read_csv(os.path.join(PROCESSED_DIR, 'clusters.csv'))
df_cons.rename(columns={'lat': 'cluster_lat', 'lon': 'cluster_lon'}, inplace=True)
df_cons.head()

Unnamed: 0,cluster_lat,cluster_lon,cons_pc,nightlights
0,-17.09515,35.217213,1.477796,0.0
1,-17.092351,35.114643,1.314741,0.0
2,-17.016698,35.079629,1.626932,0.0
3,-16.977243,35.205706,1.733232,0.14876
4,-16.956385,35.168967,1.131669,0.0


In [10]:
def merge_on_lat_lon(df1, df2, keys=['cluster_lat', 'cluster_lon']):
    """
        Allows two dataframes to be merged on lat/lon
        Necessary because pandas has trouble merging on floats
    """
    df1 = df1.copy()
    df2 = df2.copy()
    
    # must use ints for merging, as floats induce errors
    df1['merge_lat'] = (10000 * df1[keys[0]]).astype(int)
    df1['merge_lon'] = (10000 * df1[keys[1]]).astype(int)
    
    df2['merge_lat'] = (10000 * df2[keys[0]]).astype(int)
    df2['merge_lon'] = (10000 * df2[keys[1]]).astype(int)
    
    df2.drop(keys, axis=1, inplace=True)
    merged = pd.merge(df1, df2, on=['merge_lat', 'merge_lon'])
    merged.drop(['merge_lat', 'merge_lon'], axis=1, inplace=True)
    return merged

In [11]:
df_merge = merge_on_lat_lon(df_cons, geo_input)
df_merge = pd.merge(df_merge, hhf_input, on='case_id', how='left')
print(df_merge.shape)
df_merge = pd.merge(df_merge, df_tie, on='case_id', how='left')
df_merge = pd.merge(df_merge, com_input, on='ea_id', how='left')
print(df_merge.shape)
df_merge = pd.merge(df_merge, com2_input, on='ea_id', how='left')
print(df_merge.shape)
df_merge = pd.merge(df_merge, plot_input, on='case_id', how='left')
print(df_merge.shape)

(12444, 19)
(12444, 34)
(12444, 35)
(19865, 36)


In [12]:
df_merge.head()

Unnamed: 0,cluster_lat,cluster_lon,cons_pc,nightlights,case_id,dist_admarc,dist_agmrkt,dist_auction,dist_boma,dist_borderpost,...,com_cd27a,com_cd36a,com_cd40a,com_cd49a,com_cd51a,com_cd60a,com_cd67a,com_cd69a,com_cf08a,dist_hh
0,-17.09515,35.217213,1.477796,0.0,311017590042,1.0,21.0,145.0,21.0,4.0,...,,,,,,,,,,1.2
1,-17.09515,35.217213,1.477796,0.0,311017590010,2.0,20.0,145.0,20.0,4.0,...,,,,,,,,,,1.0
2,-17.09515,35.217213,1.477796,0.0,311017590064,2.0,20.0,145.0,20.0,4.0,...,,,,,,,,,,1.7
3,-17.09515,35.217213,1.477796,0.0,311017590064,2.0,20.0,145.0,20.0,4.0,...,,,,,,,,,,1.7
4,-17.09515,35.217213,1.477796,0.0,311017590146,2.0,20.0,145.0,20.0,5.0,...,,,,,,,,,,0.9


In [13]:
df_merge.columns

Index(['cluster_lat', 'cluster_lon', 'cons_pc', 'nightlights', 'case_id',
       'dist_admarc', 'dist_agmrkt', 'dist_auction', 'dist_boma',
       'dist_borderpost', 'dist_popcenter', 'dist_road', 'af_bio_1',
       'af_bio_8', 'af_bio_12', 'af_bio_13', 'af_bio_16', 'hh_f10', 'hh_f08',
       'ea_id', 'com_cd01', 'com_cd16', 'com_cd18a', 'com_cd20a', 'com_cd22a',
       'com_cd24a', 'com_cd27a', 'com_cd36a', 'com_cd40a', 'com_cd49a',
       'com_cd51a', 'com_cd60a', 'com_cd67a', 'com_cd69a', 'com_cf08a',
       'dist_hh'],
      dtype='object')

In [14]:
df_final = df_merge

In [15]:
df_use = df_final.drop(['case_id', 'ea_id'], axis=1)

In [16]:
df_use = pd.get_dummies(df_use)

In [17]:
clusters = df_use.groupby(['cluster_lat', 'cluster_lon'])

In [18]:
cluster_df = clusters.mean().reset_index()

In [19]:
cluster_df

Unnamed: 0,cluster_lat,cluster_lon,cons_pc,nightlights,dist_admarc,dist_agmrkt,dist_auction,dist_boma,dist_borderpost,dist_popcenter,...,hh_f08_GRASS,hh_f08_IRON SHEETS,hh_f08_CLAY TILES,hh_f08_CONCRETE,hh_f08_PLASTIC SHEETING,hh_f08_OTHER (SPECIFY),com_cd01_TAR/ASPHALT,com_cd01_GRADED GRAVELED,com_cd01_DIRT ROAD (MAINTAINED),com_cd01_DIRT TRACK
0,-17.095150,35.217213,1.477796,0.00000,1.500000,20.125000,145.000000,20.125000,4.125000,20.125000,...,0.666667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-17.092351,35.114643,1.314741,0.00000,8.105263,25.578947,146.368421,25.578947,10.105263,25.578947,...,0.789474,0.210526,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-17.016698,35.079629,1.626932,0.00000,15.761905,23.047619,134.857143,23.047619,21.523810,23.047619,...,0.571429,0.428571,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-16.977243,35.205706,1.733232,0.14876,6.970588,11.764706,135.764706,11.764706,13.500000,11.764706,...,0.705882,0.294118,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-16.956385,35.168967,1.131669,0.00000,13.000000,13.681818,130.181818,13.681818,20.636364,13.681818,...,0.454545,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,-9.591378,33.057450,1.463979,0.00000,7.666667,26.222222,235.277778,26.222222,5.944444,103.666667,...,0.444444,0.555556,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
776,-9.550397,33.291558,1.290441,0.00000,10.185185,18.370370,228.740741,18.370370,17.481481,82.481481,...,0.407407,0.592593,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
777,-9.519230,33.139193,1.873278,0.00000,5.057143,24.971429,238.400000,24.971429,17.428571,98.828571,...,0.371429,0.628571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
778,-9.507538,33.259649,1.860406,0.00000,4.465116,21.604651,234.209302,21.604651,18.441860,90.000000,...,0.255814,0.744186,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [20]:
# a few columns have a high percentage of NA
nas = cluster_df.isna().sum() / len(cluster_df)
nas[nas > 0]

com_cd16     0.379487
com_cd18a    0.350000
com_cd20a    0.257692
com_cd22a    0.151282
com_cd24a    0.191026
com_cd27a    0.032051
com_cd36a    0.032051
com_cd40a    0.032051
com_cd49a    0.603846
com_cd51a    0.288462
com_cd60a    0.032051
com_cd67a    0.078205
com_cd69a    0.214103
com_cf08a    0.434615
dist_hh      0.042308
dtype: float64

# Modeling

In [26]:
# This is a bunch of code from the Jean et al Github that is modified to work with Python3 and our data

import numpy as np
import pandas as pd
import random
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
import sklearn.linear_model as linear_model
import matplotlib.pyplot as plt
from matplotlib.collections import EllipseCollection
import seaborn as sns


def predict_consumption(
    X, y, k=10, k_inner=5, points=10,
        alpha_low=1, alpha_high=5, margin=0.25):
    """
    Plots predicted consumption
    """
    y_hat, r2 = run_cv(X, y, k, k_inner, points, alpha_low, alpha_high)
    return X, y, y_hat, r2


def run_cv(X, y, k, k_inner, points, alpha_low, alpha_high, randomize=False):
    """
    Runs nested cross-validation to make predictions and compute r-squared.
    """
    alphas = np.logspace(alpha_low, alpha_high, points)
    r2s = np.zeros((k,))
    y_hat = np.zeros_like(y)
    kf = KFold(n_splits=k, shuffle=True)
    fold = 0
    for train_idx, test_idx in kf.split(X):
        r2s, y_hat, fold = evaluate_fold(
            X, y, train_idx, test_idx, k_inner, alphas, r2s, y_hat, fold,
            randomize)
    return y_hat, r2s.mean()


def scale_features(X_train, X_test):
    """
    Scales features using StandardScaler.
    """
    X_scaler = StandardScaler(with_mean=True, with_std=False)
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)
    return X_train, X_test


def train_and_predict_ridge(alpha, X_train, y_train, X_test):
    """
    Trains ridge model and predicts test set.
    """
    ridge = linear_model.Ridge(alpha)
    ridge.fit(X_train, y_train)
    y_hat = ridge.predict(X_test)
    return y_hat


def predict_inner_test_fold(X, y, y_hat, train_idx, test_idx, alpha):
    """
    Predicts inner test fold.
    """
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    X_train, X_test = scale_features(X_train, X_test)
    y_hat[test_idx] = train_and_predict_ridge(alpha, X_train, y_train, X_test)
    return y_hat


def find_best_alpha(X, y, k_inner, alphas):
    """
    Finds the best alpha in an inner CV loop.
    """
    kf = KFold(n_splits=k_inner, shuffle=True)
    best_alpha = 0
    best_r2 = 0
    for idx, alpha in enumerate(alphas):
        y_hat = np.zeros_like(y)
        for train_idx, test_idx in kf.split(X):
            y_hat = predict_inner_test_fold(
                X, y, y_hat, train_idx, test_idx, alpha)
        r2 = stats.pearsonr(y, y_hat)[0] ** 2
        if r2 > best_r2:
            best_alpha = alpha
            best_r2 = r2
    print('best alpha', best_alpha)
    return best_alpha


def evaluate_fold(
    X, y, train_idx, test_idx, k_inner, alphas, r2s, y_hat, fold,
        randomize):
    """
    Evaluates one fold of outer CV.
    """
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    if randomize:
        random.shuffle(y_train)
    best_alpha = find_best_alpha(X_train, y_train, k_inner, alphas)
    X_train, X_test = scale_features(X_train, X_test)
    y_test_hat = train_and_predict_ridge(best_alpha, X_train, y_train, X_test)
    r2 = stats.pearsonr(y_test, y_test_hat)[0] ** 2
    r2s[fold] = r2
    y_hat[test_idx] = y_test_hat
    return r2s, y_hat, fold + 1


In [27]:
def nan_handler(df):
    nas = df.isna().sum()
    for c in df:
        if nas[c] > 0:
            df[c] = df[c].fillna(df[c].median())
    return df

In [28]:
cleaned_df = nan_handler(cluster_df)

In [29]:
cleaned_df.head()

Unnamed: 0,cluster_lat,cluster_lon,cons_pc,nightlights,dist_admarc,dist_agmrkt,dist_auction,dist_boma,dist_borderpost,dist_popcenter,...,hh_f08_GRASS,hh_f08_IRON SHEETS,hh_f08_CLAY TILES,hh_f08_CONCRETE,hh_f08_PLASTIC SHEETING,hh_f08_OTHER (SPECIFY),com_cd01_TAR/ASPHALT,com_cd01_GRADED GRAVELED,com_cd01_DIRT ROAD (MAINTAINED),com_cd01_DIRT TRACK
0,-17.09515,35.217213,1.477796,0.0,1.5,20.125,145.0,20.125,4.125,20.125,...,0.666667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-17.092351,35.114643,1.314741,0.0,8.105263,25.578947,146.368421,25.578947,10.105263,25.578947,...,0.789474,0.210526,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-17.016698,35.079629,1.626932,0.0,15.761905,23.047619,134.857143,23.047619,21.52381,23.047619,...,0.571429,0.428571,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-16.977243,35.205706,1.733232,0.14876,6.970588,11.764706,135.764706,11.764706,13.5,11.764706,...,0.705882,0.294118,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-16.956385,35.168967,1.131669,0.0,13.0,13.681818,130.181818,13.681818,20.636364,13.681818,...,0.454545,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [30]:
cleaned_df

Unnamed: 0,cluster_lat,cluster_lon,cons_pc,nightlights,dist_admarc,dist_agmrkt,dist_auction,dist_boma,dist_borderpost,dist_popcenter,...,hh_f08_GRASS,hh_f08_IRON SHEETS,hh_f08_CLAY TILES,hh_f08_CONCRETE,hh_f08_PLASTIC SHEETING,hh_f08_OTHER (SPECIFY),com_cd01_TAR/ASPHALT,com_cd01_GRADED GRAVELED,com_cd01_DIRT ROAD (MAINTAINED),com_cd01_DIRT TRACK
0,-17.095150,35.217213,1.477796,0.00000,1.500000,20.125000,145.000000,20.125000,4.125000,20.125000,...,0.666667,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-17.092351,35.114643,1.314741,0.00000,8.105263,25.578947,146.368421,25.578947,10.105263,25.578947,...,0.789474,0.210526,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-17.016698,35.079629,1.626932,0.00000,15.761905,23.047619,134.857143,23.047619,21.523810,23.047619,...,0.571429,0.428571,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-16.977243,35.205706,1.733232,0.14876,6.970588,11.764706,135.764706,11.764706,13.500000,11.764706,...,0.705882,0.294118,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-16.956385,35.168967,1.131669,0.00000,13.000000,13.681818,130.181818,13.681818,20.636364,13.681818,...,0.454545,0.545455,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,-9.591378,33.057450,1.463979,0.00000,7.666667,26.222222,235.277778,26.222222,5.944444,103.666667,...,0.444444,0.555556,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
776,-9.550397,33.291558,1.290441,0.00000,10.185185,18.370370,228.740741,18.370370,17.481481,82.481481,...,0.407407,0.592593,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
777,-9.519230,33.139193,1.873278,0.00000,5.057143,24.971429,238.400000,24.971429,17.428571,98.828571,...,0.371429,0.628571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
778,-9.507538,33.259649,1.860406,0.00000,4.465116,21.604651,234.209302,21.604651,18.441860,90.000000,...,0.255814,0.744186,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [31]:
y = cleaned_df['cons_pc'].values
y_log = np.log(y)


to_drop = ['cluster_lat', 'cluster_lon', 'cons_pc', 'nightlights'] # if excluding nightlights
# to_drop = ['cluster_lat', 'cluster_lon', 'cons_pc'] # if not excluding nightlights

x = cleaned_df.drop(to_drop, axis=1).values


In [32]:
_, _, _, r2 = predict_consumption(x, y)
r2

best alpha 10.0
best alpha 27.825594022071243
best alpha 10.0
best alpha 10.0
best alpha 27.825594022071243
best alpha 27.825594022071243
best alpha 10.0
best alpha 77.4263682681127
best alpha 27.825594022071243
best alpha 27.825594022071243


0.3584062988048845

In [33]:
# better than the CNN, but not by that much
_, _, _, r2 = predict_consumption(x, y_log)
r2

best alpha 10.0
best alpha 10.0
best alpha 27.825594022071243
best alpha 10.0
best alpha 10.0
best alpha 10.0
best alpha 10.0
best alpha 10.0
best alpha 10.0
best alpha 10.0


0.5424482816709538

In [34]:
def create_train_valid(x, y, frac=0.7):
    n_train = int(0.7*(len(x)))
    inds = np.arange(len(x))
    train_ind = np.random.choice(inds, n_train, replace=False)
    valid_ind = np.delete(inds, train_ind)

    train_x = x[train_ind]
    valid_x = x[valid_ind]


    train_y = y[train_ind]
    valid_y = y[valid_ind]

    ss = StandardScaler() # standardize features
    train_x = ss.fit_transform(train_x)
    valid_x = ss.transform(valid_x)
    
    return train_x, train_y, valid_x, valid_y

# Train Regression on consumption /w given alphas

In [35]:
train_x, train_y, valid_x, valid_y = create_train_valid(x, y, frac=0.7)

In [36]:
ridge = linear_model.Ridge(alpha=10)
ridge.fit(train_x, train_y)
ridge.score(train_x, train_y)

0.21085390173359875

In [37]:
ridge.score(valid_x, valid_y)

-0.014583835628850572

In [38]:
ridge.intercept_

2.5494785754578766

In [39]:
ridge.coef_

array([ 0.06714898, -0.20467236, -0.27661411,  0.00295354, -0.04218658,
       -0.13483196, -0.04912576,  0.30061663, -0.59166226, -0.73040053,
        0.43987801,  0.28802057, -0.23732649, -0.10606957, -0.05424974,
        0.06558353,  0.18894527,  0.1216902 , -0.05260409,  0.16773298,
        0.04294534,  0.04875405, -0.04593401, -0.05353147,  0.12636595,
       -0.16009769, -0.08138382, -0.08971814, -0.46755835,  0.42122859,
        0.71927555, -0.2856704 , -0.06061381,  0.10525692, -1.28908697,
       -1.32844936, -1.75309966, -1.06274681])

In [40]:
df_imps = pd.DataFrame.from_dict({'columns': cleaned_df.drop(to_drop,axis=1).columns,
                                 'imps': ridge.coef_})
df_imps.sort_values('imps', ascending=False, inplace=True)
df_imps.columns = ['feature_name', 'coefficient']
df_imps

Unnamed: 0,feature_name,coefficient
30,hh_f08_CLAY TILES,0.719276
10,af_bio_13,0.439878
29,hh_f08_IRON SHEETS,0.421229
7,af_bio_1,0.300617
11,af_bio_16,0.288021
16,com_cd22a,0.188945
19,com_cd36a,0.167733
24,com_cd67a,0.126366
17,com_cd24a,0.12169
33,hh_f08_OTHER (SPECIFY),0.105257


# Run regression on log consumption /w given alphas

In [41]:
train_x, train_y, valid_x, valid_y = create_train_valid(x, y_log, frac=0.7)

In [42]:
ridge = linear_model.Ridge(alpha=10)
ridge.fit(train_x, train_y)
ridge.score(train_x, train_y)

0.6428993747898549

In [43]:
ridge.score(valid_x, valid_y)

0.3169874231602804

In [44]:
ridge.intercept_

0.7819475721122713

In [45]:
ridge.coef_

array([-0.02096464,  0.01144844, -0.04195929, -0.0139568 ,  0.03559089,
       -0.01777803, -0.0130366 ,  0.06188971, -0.09656146, -0.20368383,
       -0.02804847,  0.1624083 , -0.04065891,  0.04212039,  0.01185866,
       -0.0229822 ,  0.00034202,  0.03454834, -0.00700139,  0.10402421,
        0.02225692, -0.04242328, -0.02794972, -0.00589722, -0.01877859,
       -0.00447187, -0.00717503,  0.01844509, -0.14434042,  0.14683696,
        0.01814244, -0.01354907, -0.0223502 , -0.00731913,  0.01409451,
       -0.01025753, -0.05899235, -0.01999429])

In [46]:
df_imps = pd.DataFrame.from_dict({'columns': cleaned_df.drop(to_drop,axis=1).columns,
                                 'imps': ridge.coef_})
df_imps.sort_values('imps', ascending=False, inplace=True)
df_imps.columns = ['feature_name', 'coefficient']
df_imps

Unnamed: 0,feature_name,coefficient
11,af_bio_16,0.162408
29,hh_f08_IRON SHEETS,0.146837
19,com_cd36a,0.104024
7,af_bio_1,0.06189
13,com_cd16,0.04212
4,dist_borderpost,0.035591
17,com_cd24a,0.034548
20,com_cd40a,0.022257
27,dist_hh,0.018445
30,hh_f08_CLAY TILES,0.018142


In [48]:
df_imps.feature_name

11                          af_bio_16
29                 hh_f08_IRON SHEETS
19                          com_cd36a
7                            af_bio_1
13                           com_cd16
4                     dist_borderpost
17                          com_cd24a
20                          com_cd40a
27                            dist_hh
30                  hh_f08_CLAY TILES
34               com_cd01_TAR/ASPHALT
14                          com_cd18a
1                         dist_agmrkt
16                          com_cd22a
25                          com_cd69a
23                          com_cd60a
18                          com_cd27a
26                          com_cf08a
33             hh_f08_OTHER (SPECIFY)
35           com_cd01_GRADED GRAVELED
6                           dist_road
31                    hh_f08_CONCRETE
3                           dist_boma
5                      dist_popcenter
24                          com_cd67a
37                com_cd01_DIRT TRACK
0           