# Import libraries

In [148]:
import pandas as pd
import numpy as np

# Download time series from link in GitHub repo

In [149]:
# Adjust filepath
# NOTE: nrows=30 loads only the first 30 rows of the CSV; remove the argument
# to process the complete time series.
time_series = pd.read_csv('./data/time_series_with_causes_zscore_full.csv', nrows=30)

In [150]:
time_series.columns.values

array(['Unnamed: 0', 'index', 'country', 'admin_code', 'admin_name',
       'centx', 'centy', 'year_month', 'year', 'month', 'fews_ipc',
       'fews_ha', 'fews_proj_near', 'fews_proj_near_ha', 'fews_proj_med',
       'fews_proj_med_ha', 'ndvi_mean', 'ndvi_anom', 'rain_mean',
       'rain_anom', 'et_mean', 'et_anom', 'acled_count',
       'acled_fatalities', 'p_staple_food', 'area', 'cropland_pct', 'pop',
       'ruggedness_mean', 'pasture_pct', 'change_fews', 'land seizures_0',
       'land seizures_1', 'land seizures_2', 'slashed export_0',
       'slashed export_1', 'slashed export_2', 'price rise_0',
       'price rise_1', 'price rise_2', 'mass hunger_0', 'mass hunger_1',
       'mass hunger_2', 'cyclone_0', 'cyclone_1', 'cyclone_2',
       'failed crops_0', 'failed crops_1', 'failed crops_2',
       'disruption to farming_0', 'disruption to farming_1',
       'disruption to farming_2', 'massive starvation_0',
       'massive starvation_1', 'massive starvation_2',
       'abnormall

In [151]:
# Column names of the time-varying traditional predictors.
t_variant_traditional_factors = ['ndvi_mean', 'ndvi_anom', 'rain_mean', 'rain_anom', 'et_mean', 'et_anom', 
                                    'acled_count', 'acled_fatalities', 'p_staple_food']
# Column names of the predictors treated as constant over time.
t_invariant_traditional_factors = ['area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct']
# News columns are selected by the substring '_0' (e.g. 'land seizures_0').
# NOTE(review): this is a substring test, not a suffix test, so any column that
# merely contains '_0' is selected as well — confirm that is intended.
news_factors = [name for name in time_series.columns.values if '_0' in name]

In [152]:
news_factors[0]

'land seizures_0'

In [153]:
# Drop bookkeeping/unused columns plus every column whose name ends in '_1', '_2' or '_3'.
# NOTE(review): the suffix test is applied to all columns, not only news columns.
cols_to_drop = ["Unnamed: 0", "centx", "centy", 'change_fews', 'fews_ha', 'fews_proj_med', 'fews_proj_med_ha', 'fews_proj_near_ha'] + [col for col in time_series.columns if col.endswith(('_1', '_2', '_3'))]
# Mutates time_series in place, so re-running this cell without reloading the
# data raises a KeyError for the already-removed columns.
time_series.drop(columns=cols_to_drop, inplace=True)

In [154]:
print(f"Shape of time_series: {time_series.shape}")

Shape of time_series: (30, 190)


In [155]:
def get_lagged(x, f, t):
    """Return the value of column ``f`` for the same admin unit ``t`` months earlier.

    Parameters
    ----------
    x : row of ``time_series`` (must provide 'admin_code', 'year', 'month' and ``f``)
    f : name of the column to look up
    t : lag in months; lags longer than 12 months are supported

    Returns
    -------
    The value of ``f`` in the row of the global ``time_series`` that has the
    same 'admin_code' and whose 'year_month' equals '<year>_<month>' of the
    lagged date. When no such row exists the current value ``x[f]`` is returned.
    """
    admin_code = x['admin_code']
    # Work in "months since year 0" so the year rolls back once per 12 months
    # of lag; subtracting a single year is wrong for any t > 12.
    l_year, month_index = divmod(x['year'] * 12 + (x['month'] - 1) - t, 12)
    l_month = month_index + 1
    lagged_year_month = '{}_{}'.format(l_year, l_month)

    ts = time_series[
        (time_series['admin_code'] == admin_code)
        & (time_series['year_month'] == lagged_year_month)
    ]
    if len(ts) > 0:
        return ts[f].values[0]
    # No observation for the lagged month: fall back to the current value,
    # as the original implementation did.
    return x[f]
    

In [156]:

def add_time_lagged(features, start=3, end=9, diff=1, agg=True):
    """Add lagged copies of ``features`` to the global ``time_series`` frame.

    For every lag ``t`` in ``range(start, end, diff)`` a column named
    '<feature><suffix>_<t>' is filled through ``get_lagged``. With ``agg`` set
    the '_province' and '_country' variants of each feature are lagged as well.
    Columns that already exist are left untouched.
    """
    levels = ['', '_province', '_country'] if agg else ['']
    for suffix in levels:
        for base in features:
            source_col = base + suffix
            for lag in range(start, end, diff):
                target_col = '{}_{}'.format(source_col, lag)
                if target_col not in time_series:
                    time_series[target_col] = time_series.apply(
                        lambda row: get_lagged(row, source_col, lag), axis=1
                    )

# Get Admin level mapping

In [157]:
# Adjust filepath (file also in GitHub repository)
admins = pd.read_csv('./data/famine-country-province-district-years-CS.csv')

In [158]:
len(admins.country.unique())

39

In [159]:
admin_names = time_series['admin_name'].unique()
districts = admins['district'].unique()
provinces = admins['province'].unique()
countries = admins['country'].unique()

In [160]:
print("districts in time_series: ", len([d for d in districts if d in admin_names]))
print("provinces in time_series: ", len([p for p in provinces if p in admin_names]))
print("countries in time_series: ", len([c for c in countries if c in admin_names]))

districts in time_series:  1
provinces in time_series:  1
countries in time_series:  0


In [161]:
# Turn the unique district values into a list ordered by their string form.
# key=str lets non-string entries be compared — presumably the column holds
# missing values; confirm. (Original author's note: this has sari pul.)
districts = sorted(districts, key=str) # this has sari pul
# districts

In [162]:
# Sizes of the four name collections.
print (len(admin_names), len(districts), len(provinces), len(countries))
# Number of admin names that are not district names.
print (len(set(admin_names).difference(districts)))
missing_admin_names = set(admin_names).difference(districts)
# Number of admin names that are neither district nor province names.
print (len(missing_admin_names.difference(provinces)))
# Keep only the names that could not be resolved at either level.
missing_admin_names = missing_admin_names.difference(provinces)

1 4113 474 39
0
0


In [163]:
import editdistance
from fuzzywuzzy import fuzz
def find_matching(missing, names):
    """Map every entry of ``missing`` to its best fuzzy match among ``names``.

    Candidates are converted with ``str`` and scored with
    ``fuzz.partial_ratio``. The first candidate with the strictly highest
    score wins; when no candidate scores above 0 the entry maps to ``None``.
    """
    best_match = {}
    for query in missing:
        top_score, top_name = 0, None
        for raw in names:
            candidate = str(raw)
            score = fuzz.partial_ratio(query, candidate)
            if not (score > top_score):
                continue
            top_score, top_name = score, candidate
        best_match[query] = top_name
    return best_match


matching = find_matching(missing_admin_names, districts)
matching_p = find_matching(missing_admin_names, provinces)
#manually verify matching and update
for k in matching.keys():
    print (k, matching[k], matching_p[k])


In [164]:
# Adjust filepath (file also in GitHub repository)
# After validating the matches, the names are logged in this csv file
valid_matching = pd.read_csv('./data/matching_districts.csv')

In [165]:
# Distinct values of the 'missing' column of the validated-matches table.
matched = valid_matching['missing'].unique()
# Interpret backslash escape sequences contained in those names as characters.
matched = [m.encode('utf-8').decode("unicode_escape") for m in matched]
# Escape non-ASCII characters as backslash sequences, then interpret the sequences again.
# NOTE(review): this side is encoded with ascii/backslashreplace while the line
# above uses utf-8 — confirm both normalise a non-ASCII name to the same string.
missing_admin_names = [m.encode('ascii', 'backslashreplace').decode("unicode_escape") for m in missing_admin_names]
print(len(missing_admin_names), len(matched))
# Names that still have no validated match (an empty set means none are left).
set(missing_admin_names).difference(matched)

0 230


set()

In [166]:
print("matched", matched)

print("...........")
print("missing", missing_admin_names)

matched ['Port-Au-Prince', 'Teso', 'Tanganyka', 'Tayeeglow', 'Kadoma', "Ad Dali'", 'MPongwe', 'Saint-Raphael', 'Butembo', 'Um Kadada', 'Shabelle', 'Lughaye', 'Beitbridge', 'Bulo Burto', 'Trou Du Nord', 'Addabah', 'Muranga', 'Guji', 'Awi/Agew', 'Amran', 'Chipinge', 'Djourouf Al Ahmar', 'Port-Salut', 'Chiengi', 'Gweru', "Bura'", 'Agnuak', 'Bandarbeyla', 'Mbuji-Mayi', 'Sud-Kivu', 'Sheikh', 'Addis Adaba', 'Baydhaba', 'Lubumbashi', 'La PendÃ©', 'Adan', 'Acul Du Nord', 'Kananga', 'Bale.1', 'Lac-LÃ©rÃ©', 'Kelem Wellega', 'Kibale', 'North Shewa(R4)', 'Ceca La Source', 'Adan Yabaal', 'South Gonder', 'Gwanda', 'Gedio', 'East al Gazera', 'Damagaram Takaya', 'Abu Hamad', "Shar'ab Ar Rawnah", 'Gucha', 'Kabia', 'Ad Dinder', 'Maragua', 'Al Faw', 'Iriba', 'Eastern Tigray', 'Gonave', 'Ndjamena', 'Al Gadaref', 'North Shewa(R3)', 'Abu Jubaiyah', 'Nandi North', 'Koibatek', 'Banadir', 'En Nuhud', 'Chegutu', 'Nyala.1', 'Buret', "At Ta'izziyah", 'Kas', 'Sheikan', 'GothÃ¨ye', 'Hirat', 'Galdogob', "Mawza'", 'M

In [167]:
def _legacy_escaped_name(name):
    """Return ``name`` as backslash-escaped ASCII text.

    This is the Python 3 equivalent of the former
    ``name.decode("unicode_escape").encode('ascii', 'backslashreplace')``,
    which only worked on Python 2 byte strings.
    NOTE(review): a ``str`` is assumed to correspond to UTF-8 bytes — confirm
    against the contents of the 'missing' column.
    """
    raw = name if isinstance(name, bytes) else name.encode('utf-8')
    return raw.decode('unicode_escape').encode('ascii', 'backslashreplace').decode('ascii')


def find_province(x):
    """Resolve the admin name ``x`` to a province name.

    Lookup order:
    1. ``x`` is a district in ``admins``      -> that district's province
    2. ``x`` is itself a province             -> ``x``
    3. ``x`` has a row in ``valid_matching``  -> the province of the matched
       district, or the matched province, depending on the 'match' column

    Returns ``None`` when no rule applies. Any error raised during the lookup
    is re-raised as ``Exception("Province not found for: ...")``.
    """
    try:
        if x in districts:
            return admins[admins['district'] == x]['province'].values[0]
        if x in provinces:
            return x

        # str has no .decode() in Python 3, so build the escaped key explicitly
        # and look it up in the raw 'missing' column. For plain ASCII names
        # this is the same as testing membership in `matched`.
        key = _legacy_escaped_name(x)
        v = valid_matching[valid_matching['missing'] == key]
        if len(v) == 0:
            return None

        match_level = v['match'].values[0]
        if match_level == 'district':
            district = v['district'].values[0]
            return admins[admins['district'] == district]['province'].values[0]
        if match_level == 'province':
            return v['province'].values[0]
        return None
    except Exception as err:
        raise Exception("Province not found for: {}".format(x)) from err

In [168]:
# Map every admin name to its province; names that cannot be resolved are
# printed and left out of the mapping.
admin_to_province = {}
for a in admin_names:
    try:
        admin_to_province[a] = find_province(a)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        print (a)

In [169]:
# Attach the resolved province to every row; raises KeyError for any admin
# name that is absent from admin_to_province.
time_series['province'] = time_series['admin_name'].apply(lambda x: admin_to_province[x])

# Add province and country aggregate values

In [170]:
def add_agg_factors(features, level='province'):
    """Add per-(year_month, level) means of ``features`` to ``time_series``.

    For every feature ``f`` a column '<f>_<level>' is written that holds the
    mean of ``f`` over all rows sharing the row's 'year_month' and ``level``
    value. Rows whose key has no group (e.g. a missing ``level`` value) get NaN.

    Parameters
    ----------
    features : list of column names to aggregate (must be numeric)
    level : name of the grouping column, e.g. 'province' or 'country'
    """
    features = list(features)
    keys = ['year_month', level]
    # Restrict to `features` so non-numeric columns are never averaged.
    group_means = time_series.groupby(keys)[features].mean().reset_index()
    # One left merge replaces the per-row .loc lookups; group keys are unique,
    # so the merge keeps the row order and row count of time_series.
    aligned = time_series[keys].merge(group_means, how='left', on=keys)
    for f in features:
        # Assign by position: the merge result carries a fresh RangeIndex.
        time_series['{}_{}'.format(f, level)] = aligned[f].to_numpy()

In [171]:
add_agg_factors(news_factors)

  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.fo

In [172]:
add_agg_factors(news_factors, level='country')
add_agg_factors(t_variant_traditional_factors, level='province')
add_agg_factors(t_variant_traditional_factors, level='country')
add_agg_factors(t_invariant_traditional_factors, level='province')
add_agg_factors(t_invariant_traditional_factors, level='country')

  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.format(f, level)] = time_series.apply(
  time_series['{}_{}'.fo

In [173]:
# time_series.to_csv('theirs_agg_province_features_full.csv')

# Add time lagged features

In [174]:
add_time_lagged(t_variant_traditional_factors)

  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}'.format(f_s,t)] = time_series.apply(lambda x: get_lagged(x, f_s, t), axis=1)
  time_series['{}_{}

In [175]:
time_series.to_csv('./their_modified_time_series_only_tvariant_30.csv')

In [176]:
# add_time_lagged(news_factors)

In [177]:
# add_time_lagged(['fews_ipc'], end=21, diff=3, agg=False)

In [178]:
# add_time_lagged(['fews_proj_near'], start=3, end=4, diff=1, agg=False)

In [179]:
import math
def diebold_mariano(preds, labels):
    """Autocorrelation-robust standard error of the mean squared error.

    The squared errors ``(pred - label)**2`` are treated as a time series; their
    autocovariances up to a truncation lag are combined into a long-run
    variance, and the square root of ``variance / n`` is returned.

    Parameters
    ----------
    preds, labels : equally long sequences of numbers, paired by position

    Returns
    -------
    float; NaN when the estimated long-run variance is negative.

    Raises
    ------
    ValueError if there are no observations.
    """
    sq_error = np.asarray([(p - l) ** 2 for p, l in zip(preds, labels)], dtype=float)
    n = len(sq_error)
    if n == 0:
        raise ValueError("diebold_mariano needs at least one prediction")
    centered = sq_error - sq_error.mean()
    # Truncation lag: roughly n^(1/3), never more than n. The previous max()
    # always selected n, which made the cube-root rule ineffective.
    m = min(n, int(math.ceil(np.cbrt(n))) + 2)
    # gamma_k = (1/n) * sum_{i=k}^{n-1} c_i * c_{i-k}; the slice starts at k
    # (not k+1) so the first term of every autocovariance is included.
    gammas = [float(np.dot(centered[k:], centered[:n - k])) / n for k in range(m)]
    sum_gamma = gammas[0] + 2 * sum(gammas[1:])
    return np.sqrt(sum_gamma / n)

# Generate and save data for Fig 3A, B, C

In [180]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

from sklearn.metrics import mean_squared_error
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc

# Each split is ((start_year, start_month), (end_year, end_month)).
# The i-th entries of train_splits, dev_splits and test_splits belong together.
test_splits = [
    ((2010, 7), (2011, 7)),
    ((2011, 7), (2012, 7)),
    ((2012, 7), (2013, 7)),
    ((2013, 7), (2014, 7)),
    ((2014, 7), (2015, 7)),
    ((2015, 7), (2016, 7)),
    ((2016, 7), (2017, 7)),
    ((2017, 7), (2018, 7)),
    ((2018, 7), (2019, 7)),
    ((2019, 2), (2020, 2)),
]
train_splits = [
    ((2009, 7), (2010, 4)),
    ((2009, 7), (2011, 1)),
    ((2009, 7), (2011, 10)),
    ((2009, 7), (2012, 7)),
    ((2009, 7), (2013, 7)),
    ((2009, 7), (2014, 1)),
    ((2009, 7), (2015, 1)),
    ((2009, 7), (2015, 10)),
    ((2009, 7), (2016, 10)),
    ((2009, 7), (2017, 2)),
]
# NOTE(review): several dev windows do not start where the matching train
# window ends (e.g. index 4: train ends 2013-07, dev starts 2013-04) and some
# overlap the test windows — values kept as found, confirm before use.
dev_splits = [
    ((2010, 4), (2010, 7)),
    ((2011, 1), (2011, 7)),
    ((2011, 10), (2012, 7)),
    ((2012, 7), (2013, 7)),
    ((2013, 4), (2014, 7)),
    ((2014, 1), (2015, 7)),
    ((2015, 1), (2016, 7)),
    ((2015, 10), (2017, 7)),
    ((2016, 10), (2018, 7)),
    ((2017, 2), (2019, 2)),
]

# max_features=1.0 (use all features) is what 'auto' meant for regressors;
# the 'auto' alias has been removed from scikit-learn.
rf = RandomForestRegressor(max_features=1.0, n_estimators=100,
                           min_samples_split=0.5, min_impurity_decrease=0.001, random_state=0)
ols = LinearRegression()

lasso = linear_model.Lasso(alpha=0.1)

def get_agg_lagged_features(factors, lags=range(3, 9)):
    """List the lagged column names of ``factors`` at all aggregation levels.

    For every level suffix ('', '_province', '_country'), every factor and
    every lag the name '<factor><suffix>_<lag>' is produced, matching the
    columns created by ``add_time_lagged`` with its default arguments.

    Parameters
    ----------
    factors : iterable of base column names
    lags : iterable of lags in months (default 3..8)
    """
    factors = list(factors)
    lags = list(lags)
    # Full cross product: zip() would pair factor i with one single lag only.
    return [
        '{}{}_{}'.format(f, suffix, t)
        for suffix in ('', '_province', '_country')
        for f in factors
        for t in lags
    ]
        

# Column groups shared by the feature sets below.
# NOTE: the 'fews_ipc_<t>' and 'fews_proj_near_3' columns only exist after the
# corresponding add_time_lagged(...) calls (currently commented out above) have
# been run; without them this cell fails with a KeyError.
_ipc_lag_cols = ['{}_{}'.format('fews_ipc', t) for t in range(3, 21, 3)]
_traditional_cols = get_agg_lagged_features(t_variant_traditional_factors) + t_invariant_traditional_factors
_news_cols = get_agg_lagged_features(news_factors)
_expert_cols = ['fews_proj_near_3']

# One design matrix per feature set; every value is a 2-D DataFrame so it can
# be passed to scikit-learn's fit/predict directly (a bare Series is 1-D).
features = {
    'traditional': time_series[_ipc_lag_cols + _traditional_cols],
    'news': time_series[_ipc_lag_cols + _news_cols],
    'traditional+news': time_series[_ipc_lag_cols + _traditional_cols + _news_cols],
    'expert': time_series[_expert_cols],
    'expert+traditional': time_series[_expert_cols + _ipc_lag_cols + _traditional_cols],
    'expert+news': time_series[_expert_cols + _ipc_lag_cols + _news_cols],
    'expert+traditional+news': time_series[_expert_cols + _ipc_lag_cols + _traditional_cols + _news_cols],
}

# Prediction target.
labels_df = time_series['fews_ipc']

def get_time_split(df, start, end):
    """Return the rows of ``df`` dated between ``start`` and ``end`` (inclusive).

    Parameters
    ----------
    df : DataFrame or Series. When it has no 'year' and 'month' columns of its
        own, the dates are taken from the rows of the global ``time_series``
        that carry the same index labels.
    start, end : (year, month) tuples

    Returns
    -------
    The matching rows of ``df``, in their original order.
    """
    if isinstance(df, pd.DataFrame) and {'year', 'month'}.issubset(df.columns):
        calendar = df[['year', 'month']]
    else:
        calendar = time_series.loc[df.index, ['year', 'month']]
    # Compare on a single month counter so that e.g. 2010-07..2011-07 is a
    # continuous range, not "year in range AND month in range".
    month_no = calendar['year'] * 12 + calendar['month']
    first = start[0] * 12 + start[1]
    last = end[0] * 12 + end[1]
    # Parenthesise each comparison: `&` binds tighter than `>=` / `<=`.
    in_range = (month_no >= first) & (month_no <= last)
    return df[in_range.to_numpy()]


# Result tables, one per figure panel; rows are appended by the evaluation loop below.
fig_3a = pd.DataFrame(columns=['method', 'split', 'features', 'country', 'rmse', 'lower_bound', 'upper_bound'])
fig_3b = pd.DataFrame(columns=['method', 'split', 'features', 'aucpr'])
fig_3c = pd.DataFrame(columns=['method', 'split', 'features', 'recall_at_80p'])

# Pair of prediction thresholds per feature set.
# NOTE(review): presumably ordered (lower_bound, upper_bound); the evaluation
# loop looks them up under those names — confirm the order.
thresholds = {'traditional': (2.236, 3.125), 
              'news': (1.907, 2.712), 
              'traditional+news': (2.105, 3.314),
              'expert': (2, 3),
              'expert+news': (1.912, 2.813),
              'expert+traditional': (2.241, 3.132),
              'expert+traditional+news': (2.172, 3.321)
             }

# Evaluate every model on every feature set for every train/test split and
# collect the rows for Fig 3A (RMSE), 3B (AUC-PR) and 3C (threshold crossings).
# Rows are gathered in lists and appended once, instead of growing the frames
# with one concat per iteration.
fig_3a_rows, fig_3b_rows, fig_3c_rows = [], [], []

for train, dev, test in zip(train_splits, dev_splits, test_splits):
    # NOTE(review): `dev` is not used anywhere in this loop.
    for f, D in features.items():
        X = get_time_split(D, train[0], train[1])
        # Training labels must come from the training window (they were taken
        # from the test window before, which leaks the target).
        y = get_time_split(labels_df, train[0], train[1])
        X_test = get_time_split(D, test[0], test[1])
        labels = get_time_split(labels_df, test[0], test[1])
        # thresholds[f] is a (lower, upper) tuple, not a dict.
        l_b, u_b = thresholds[f]
        for name, regr in zip(['RF', 'OLS', 'Lasso'], [rf, ols, lasso]):
            regr.fit(X, y)
            preds = regr.predict(X_test)
            # sqrt(MSE) works on every scikit-learn version; the `squared`
            # keyword is no longer accepted by recent releases.
            rmse = np.sqrt(mean_squared_error(labels, preds))
            stderr = diebold_mariano(preds, labels)
            upper_bound = np.sqrt(rmse**2 + 1.96*stderr)
            lower_bound = np.sqrt(rmse**2 - 1.96*stderr)
            # NOTE(review): precision_recall_curve expects binary labels; if
            # 'fews_ipc' holds more than two distinct values it must be
            # binarised first — the cut-off is not defined in this notebook.
            # The third return value is discarded so that the `thresholds`
            # dict above is not overwritten.
            precision, recall, _ = precision_recall_curve(labels, preds)
            auc_precision_recall = auc(recall, precision)
            fig_3a_rows.append({'method': name, 'split': test, 'features': f, 'country': 'all',
                                'rmse': rmse, 'lower_bound': lower_bound, 'upper_bound': upper_bound})
            fig_3b_rows.append({'method': name, 'split': test, 'features': f,
                                'aucpr': auc_precision_recall})
            print ("Method: {}, Split: {}, Features: {}, AUCPR: {}".format(name, test, f, auc_precision_recall))
            print ("Method: {}, Split: {}, Features: {}, RMSE: {} [{}, {}]".format(name, test, f, rmse, lower_bound, upper_bound))

            # Prediction 3 steps ahead (padded with 1 at the end) and 3 steps
            # back (padded with 5 at the start); the padding values can never
            # satisfy the comparisons below. `preds` is a NumPy array, so the
            # pieces are concatenated explicitly ("+" would add element-wise).
            preds_plus_3 = np.concatenate([preds[3:], [1, 1, 1]])
            preds_minus_3 = np.concatenate([[5, 5, 5], preds[:-3]])
            recall_at_80p = sum(
                1
                for p_t, p_t_add_3, p_t_min_3 in zip(preds, preds_plus_3, preds_minus_3)
                if p_t >= u_b and p_t_add_3 >= u_b and p_t_min_3 <= l_b
            )
            fig_3c_rows.append({'method': name, 'split': test, 'features': f,
                                'recall_at_80p': recall_at_80p})

            # for country in time_series['country'].unique():
            #     c_id = X_test[X_test['country']==country]
            #     labels_c = labels[c_id]
            #     preds_c = preds[c_id]
            #     rmse = mean_squared_error(labels_c, preds_c, squared=False)
            #     stderr = diebold_mariano(preds_c, labels_c)
            #     upper_bound = np.sqrt(rmse**2 + 1.96*stderr)
            #     lower_bound = np.sqrt(rmse**2 - 1.96*stderr)
            #     _row = pd.DataFrame.from_dict({'method': [name], 'split': [test], 'features': [f], 'country': [country],
            #                                'rmse': [rmse], 'lower_bound': [lower_bound], 'upper_bound': [upper_bound]},
            #                               orient='columns')
            #     fig_3a = pd.concat([fig_3a, _row], axis=0)
            #     print ("Country: {}, Method: {}, Split: {}, Features: {}, RMSE: {} [{}, {}]".format(country, name, test, f, rmse, lower_bound, upper_bound))

fig_3a = pd.concat([fig_3a, pd.DataFrame(fig_3a_rows)], axis=0, ignore_index=True)
fig_3b = pd.concat([fig_3b, pd.DataFrame(fig_3b_rows)], axis=0, ignore_index=True)
fig_3c = pd.concat([fig_3c, pd.DataFrame(fig_3c_rows)], axis=0, ignore_index=True)

# fig_3a.to_csv('fig_3a.csv')
fig_3b.to_csv('fig_3b.csv')
fig_3c.to_csv('fig_3c.csv')

KeyError: "['fews_ipc_3', 'fews_ipc_6', 'fews_ipc_9', 'fews_ipc_12', 'fews_ipc_15', 'fews_ipc_18'] not in index"