# Import libraries

In [20]:
%pip install pandas numpy gdown matplotlib seaborn scikit-learn editdistance fuzzywuzzy --quiet

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import gdown
import zipfile
import os

In [2]:
def pretty_print_list(list_to_print):
    """
    Print the items of an iterable as a sorted, dash-bulleted list.

    Parameters:
    list_to_print (iterable): Items to display (list, numpy array, ...). The items
        are sorted first and then converted with str(), so non-string items no
        longer break the join.

    Returns:
    None. The bullet list is written to stdout, preceded by a blank line.
    """
    sorted_items = sorted(list_to_print)
    print("\n- " + "\n- ".join(str(item) for item in sorted_items))

# Download data from Google Drive

In [4]:
# Google Drive location of the zipped dataset and the local archive name.
url = "https://drive.google.com/uc?id=1YoQ1hz9RlaLr2xW3KoKCfJPyyO2PErym"
output = "data.zip"

# Only download when no ./data directory exists yet, so re-running the cell is cheap.
if not os.path.exists("./data"):
    gdown.download(url, output, quiet=False)
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile(output, 'r') as archive:
        archive.extractall()
else:
    print("You already have the data downloaded and extracted")

You already have the data downloaded and extracted


In [3]:
# Load the three input tables in one pass: the admin hierarchy, the district-level
# time series, and the manually validated name matches.
admins, time_series, valid_matching = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/famine-country-province-district-years-CS.csv',
        './data/time_series_with_causes_zscore_full.csv',
        './data/matching_districts.csv',
    )
)

In [5]:
# Preview the first rows (head() defaults to 5).
time_series.head()

Unnamed: 0.1,Unnamed: 0,index,country,admin_code,admin_name,centx,centy,year_month,year,month,...,carbon_2,mayhem_0,mayhem_1,mayhem_2,dehydrated_0,dehydrated_1,dehydrated_2,mismanagement_0,mismanagement_1,mismanagement_2
0,0,30,Afghanistan,202,Kandahar,65.709343,31.043618,2009_07,2009,7,...,1.053,0.667,-0.171,-0.833,0.173667,0.168,1.284667,-0.073,-0.427667,0.668333
1,1,33,Afghanistan,202,Kandahar,65.709343,31.043618,2009_10,2009,10,...,-0.660812,-0.63658,-0.520247,-0.782913,-0.671587,-0.612254,-0.926921,-0.510467,-0.625133,-0.452467
2,2,36,Afghanistan,202,Kandahar,65.709343,31.043618,2010_01,2010,1,...,-0.134333,1.447667,-0.844333,0.778667,-0.676,-0.689667,0.293333,0.530333,-0.471333,0.955333
3,3,39,Afghanistan,202,Kandahar,65.709343,31.043618,2010_04,2010,4,...,-0.326927,-0.594877,0.16479,-0.90521,-0.62054,0.165794,0.045794,-1.0116,-0.8106,-0.2056
4,4,42,Afghanistan,202,Kandahar,65.709343,31.043618,2010_07,2010,7,...,-1.085146,-0.709913,-0.867913,-0.770247,-0.787921,-0.974587,-0.946921,-0.611133,-0.7098,-0.6228


In [None]:
# Remove exact duplicate rows, then the leftover index column and the centroid coordinates.
# Re-assigning (instead of inplace=True) and errors="ignore" make the cell safe to re-run:
# the original raised KeyError on a second run because the columns were already gone.
# NOTE(review): de-duplication runs while "Unnamed: 0" is still present; if that column is
# a unique row id, no row can ever be identified as a duplicate — confirm the intended order.
time_series = time_series.drop_duplicates()
time_series = time_series.drop(columns=["Unnamed: 0", "centx", "centy"], errors="ignore")

In [7]:
# Report the size of the cleaned frame.
print("No. of rows : ", len(time_series))
print("No. of columns : ", len(time_series.columns))

No. of rows :  40952
No. of columns :  530


In [8]:
# List every column name of the time series, alphabetically.
columns_list = time_series.columns.to_list()

# The heading's emoji had been mis-decoded into mojibake; restored to the intended character.
print("\n🗂️ The column names in the dataset are as follows:\n")
pretty_print_list(columns_list)


üóÇÔ∏è The column names in the dataset are as follows:


- abnormally low rainfall_0
- abnormally low rainfall_1
- abnormally low rainfall_2
- acled_count
- acled_fatalities
- acute hunger_0
- acute hunger_1
- acute hunger_2
- admin_code
- admin_name
- aid appeal_0
- aid appeal_1
- aid appeal_2
- aid workers died_0
- aid workers died_1
- aid workers died_2
- air attack_0
- air attack_1
- air attack_2
- alarming level_0
- alarming level_1
- alarming level_2
- anti-western policies_0
- anti-western policies_1
- anti-western policies_2
- apathy_0
- apathy_1
- apathy_2
- area
- asylum seekers_0
- asylum seekers_1
- asylum seekers_2
- authoritarian_0
- authoritarian_1
- authoritarian_2
- bad harvests_0
- bad harvests_1
- bad harvests_2
- blockade_0
- blockade_1
- blockade_2
- bombing campaign_0
- bombing campaign_1
- bombing campaign_2
- brain drain_0
- brain drain_1
- brain drain_2
- brutal government_0
- brutal government_1
- brutal government_2
- burning houses_0
- burning houses_1
- b

The **traditional risk factors** used in the study are categorized into **time-variant** (changing over time) and **time-invariant** (fixed for a given district). Below is the mapping between these risk factors and their corresponding **columns in the time series dataset**, taken directly from the paper:

---

##### **📌 Time-Variant Factors (Change Over Time)**
| **Traditional Risk Factor** | **Time Series Column** | **Description** |
|----------------------------|-----------------------|----------------|
| **Violent Conflict Events** | `acled_count` | Monthly count of conflict events. |
| **Conflict Fatalities per Event** | `acled_fatalities` | Average number of fatalities per conflict event. |
| **Food Prices Index (Log Nominal)** | `p_staple_food` | Monthly log nominal food price index. |
| **Food Prices Year-on-Year Difference** | `p_staple_food_diff` | Change in food price index compared to the previous year. |
| **Evapotranspiration Index (Mean)** | `et_mean` | Monthly mean of evapotranspiration (water loss from soil and plants). |
| **Rainfall Index (Mean)** | `rain_mean` | Monthly mean rainfall in the district. |
| **Rainfall Deviation from Average** | `rain_anom` | Difference between actual rainfall and seasonal average. |
| **Normalized Difference Vegetation Index (Mean)** | `ndvi_mean` | Satellite-derived measure of vegetation health. |
| **Vegetation Deviation from Average** | `ndvi_anom` | Difference between actual NDVI and historical average. |

---

##### **📌 Time-Invariant Factors (Fixed for a District)**
| **Traditional Risk Factor** | **Time Series Column** | **Description** |
|----------------------------|-----------------------|----------------|
| **Population Count** | `pop` | Estimated population in the district. |
| **Terrain Ruggedness Index** | `ruggedness_mean` | Measures how rough the terrain is. |
| **District Size** | `area` | Total land area of the district. |
| **Share of Cropland Use** | `cropland_pct` | Percentage of district area used for cropland. |
| **Share of Pasture Use** | `pasture_pct` | Percentage of district area used for pasture. |

---

In [None]:
# Column groups used to build the model inputs.
# NOTE(review): the markdown table in this notebook lists `p_staple_food_diff` and no `et_anom`,
# while this list contains `et_anom` and no `p_staple_food_diff` — confirm which set is intended.
t_variant_traditional_factors = ['ndvi_mean', 'ndvi_anom', 'rain_mean', 'rain_anom', 'et_mean', 'et_anom',
                                    'acled_count', 'acled_fatalities', 'p_staple_food'] # 9 traditional variant factors
t_invariant_traditional_factors = ['area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct'] # 5 invariant factors
# News factors are the columns whose name ends in '_0'. A suffix test is used because the
# previous substring test ('_0' in name) would also pick up any column with '_0' in the middle.
news_factors = [name for name in time_series.columns.values if str(name).endswith('_0')]

In [None]:
# Show how many news factors were detected, followed by their sorted names.
print(f"The {len(news_factors)} news factors are as follows:\n")
pretty_print_list(list_to_print=news_factors)

There 167 news factors are as follows:


- abnormally low rainfall_0
- acute hunger_0
- aid appeal_0
- aid workers died_0
- air attack_0
- alarming level_0
- anti-western policies_0
- apathy_0
- asylum seekers_0
- authoritarian_0
- bad harvests_0
- blockade_0
- bombing campaign_0
- brain drain_0
- brutal government_0
- burning houses_0
- call for donations_0
- carbon_0
- catastrophe_0
- cattle death_0
- cattle plague_0
- cholera outbreak_0
- civil strife_0
- civilians uprooted_0
- clan battle_0
- clan warfare_0
- clans_0
- climate change_0
- climatic hazards_0
- collapse of government_0
- collapsing economy_0
- conflict_0
- continued deterioration_0
- continued strife_0
- convoys_0
- corrupt government_0
- corruption_0
- coup_0
- cycle of poverty_0
- cyclone_0
- d'etat_0
- dehydrated_0
- destructive pattern_0
- devastated the economy_0
- dictators_0
- displaced_0
- disrupted trade_0
- disruption to farming_0
- drought_0
- dysfunction_0
- ecological crisis_0
- economic crisis_0
- econom

In [None]:
def get_lagged(x, f, t):
    """
    Retrieve the value a feature had t months before the month of row x.

    Parameters:
    x (pd.Series): A row from the time_series DataFrame (needs 'admin_code', 'year', 'month' and f).
    f (str): The feature/column name for which the lagged value is to be retrieved.
    t (int): The time lag in months (may be longer than 12).

    Returns:
    The value of f for the same admin_code t months earlier (first matching row).
    If no row exists for that month, returns the current value x[f].
    """
    # Work in absolute months so lags longer than a year roll the year back correctly
    # (the previous code subtracted at most one year).
    l_year, l_month0 = divmod(int(x['year']) * 12 + int(x['month']) - 1 - t, 12)
    l_month = l_month0 + 1
    # Match on the numeric 'year'/'month' columns. The previous code built a key such as
    # '2009_7', which never equals the zero-padded '2009_07' labels in 'year_month', so the
    # lookup always fell through to the current value.
    hit = time_series[
        (time_series['admin_code'] == x['admin_code'])
        & (time_series['year'] == l_year)
        & (time_series['month'] == l_month)
    ]
    if len(hit) > 0:
        return hit[f].values[0]
    return x[f]
    

In [None]:

# Rebuild 'year_month' as a datetime from the numeric 'year' and 'month' columns.
# NOTE(review): this overwrites the original 'year_month' values in place (the head() output
# shows strings such as '2009_07'); any code that still compares 'year_month' against string
# keys will stop matching after this cell runs — confirm that downstream cells expect datetimes.
time_series['year_month'] = pd.to_datetime(time_series['year'].astype(str) + '-' + time_series['month'].astype(str))


def create_lagged_features(df, feature, lag):
    """
    Build a frame that carries `feature` forward by `lag` months.

    Parameters:
    df (pd.DataFrame): Frame with 'admin_code', a datetime 'year_month' and `feature`.
    feature (str): Column to lag.
    lag (int): Number of months added to 'year_month'.

    Returns:
    pd.DataFrame: Copy with columns 'admin_code', the shifted 'year_month', and
    '<feature>_lag<lag>'. The input frame is not modified.
    """
    shifted = df.loc[:, ['admin_code', 'year_month', feature]].copy()
    # Moving the timestamp forward makes each value line up with the row `lag` months later.
    shifted['year_month'] = shifted['year_month'] + pd.DateOffset(months=lag)
    return shifted.rename(columns={feature: f'{feature}_lag{lag}'})

# Build a 3-month lag of one feature and join it back on (admin_code, year_month).
# NOTE(review): 'rainfall' is not one of the columns named in this notebook's risk-factor
# table (which lists `rain_mean` / `rain_anom`) — confirm the column exists, otherwise this
# raises KeyError.
lagged_features = create_lagged_features(time_series, 'rainfall', 3)

# Left join keeps every original row; rows with no matching shifted key get NaN in the lag column.
# NOTE(review): re-running this cell merges the lag column a second time — guard against that
# or rebuild from a clean frame.
time_series = time_series.merge(lagged_features, on=['admin_code', 'year_month'], how='left')



In [None]:
def add_time_lagged(features, start=3, end=9, diff=1, agg=True):
    """
    Add lagged copies of the given features to the global time_series DataFrame.

    For every feature (and, when agg is True, its '_province' and '_country'
    aggregates) and every lag t in range(start, end, diff), a column named
    '<feature>_<t>' is added. It holds the value the same admin_code had t months
    earlier; rows with no observation t months earlier keep their current value.
    Columns that already exist are left untouched.

    Parameters:
    features (list of str): Base column names to lag.
    start (int): First lag in months (inclusive).
    end (int): Last lag in months (exclusive).
    diff (int): Step between lags.
    agg (bool): Also lag the '<feature>_province' and '<feature>_country' columns.

    Returns:
    None. time_series is modified in place.
    """
    levels = ['', '_province', '_country'] if agg else ['']
    # Absolute month number, so a lag is a plain subtraction even across year boundaries.
    month_no = (time_series['year'].astype(int) * 12 + time_series['month'].astype(int) - 1).to_numpy()
    admin = time_series['admin_code'].to_numpy()
    base_index = pd.MultiIndex.from_arrays([admin, month_no])
    # Keep the first row per (admin_code, month), mirroring a "first match" row lookup.
    first_rows = ~base_index.duplicated(keep='first')
    unique_index = base_index[first_rows]

    for suffix in levels:
        for f in features:
            f_s = f + suffix
            pending = [t for t in range(start, end, diff)
                       if '{}_{}'.format(f_s, t) not in time_series]
            if not pending:
                continue
            current = time_series[f_s].to_numpy()
            unique_values = current[first_rows]
            for t in pending:
                wanted = pd.MultiIndex.from_arrays([admin, month_no - t])
                # Vectorised lookup replaces one full-frame filter per row; -1 marks "no such month".
                pos = unique_index.get_indexer(wanted)
                found = pos >= 0
                # Where pos is -1 the indexed value is a placeholder that np.where discards.
                lagged = np.where(found, unique_values[pos], current)
                time_series['{}_{}'.format(f_s, t)] = lagged

# Get Admin level mapping

In [13]:
# Distinct country names present in the admin hierarchy table.
unique_countries_admin_dataset = admins.country.unique()

# The heading's emoji had been mis-decoded into mojibake; restored. The string has no
# placeholders, so the f-prefix was dropped.
print("\n🌍 The unique countries in the admin dataset are as follows:\n")
pretty_print_list(unique_countries_admin_dataset)


üåç The unique countries in the admin dataset are as follows:


- Abyei
- Afghanistan
- Angola
- Burkina Faso
- Burundi
- Cameroon
- Central African Republic
- Chad
- Congo
- Democratic Republic of the Congo
- Djibouti
- El Salvador
- Ethiopia
- Guatemala
- Guinea
- Haiti
- Honduras
- Iilemi triangle
- Kenya
- Liberia
- Madagascar
- Malawi
- Mali
- Mauritania
- Mozambique
- Niger
- Nigeria
- Rwanda
- Senegal
- Sierra Leone
- Somalia
- South Sudan
- Sudan
- Tajikistan
- Tanzania
- Uganda
- Yemen
- Zambia
- Zimbabwe


In [None]:
# Distinct names on each side: units used by the time series vs. the admin hierarchy levels.
admin_names = time_series['admin_name'].unique()
districts, provinces, countries = (
    admins[level_column].unique() for level_column in ('district', 'province', 'country')
)

In [None]:
# Sizes of each name set.
print(len(admin_names), len(districts), len(provinces), len(countries))
# Time-series names that are not known districts ...
missing_admin_names = set(admin_names) - set(districts)
print(len(missing_admin_names))
# ... and are not known provinces either.
missing_admin_names = missing_admin_names - set(provinces)
print(len(missing_admin_names))

In [None]:
import editdistance
from fuzzywuzzy import fuzz
def find_matching(missing, names):
    """
    Find, for every unmatched name, the candidate with the highest fuzzy score.

    Parameters:
    missing (iterable of str): Names that have no exact match.
    names (iterable): Candidate names. Null entries (NaN/None) are skipped; previously
        they were turned into the literal string 'nan' and could win a match.

    Returns:
    dict: Maps each missing name to the first candidate with the highest
    fuzz.partial_ratio score, or None when no candidate scores above 0.
    """
    candidates = [str(d) for d in names if not pd.isna(d)]
    matching_districts = {}
    for m in missing:
        best_score = 0
        nearest_d = None
        for d in candidates:
            score = fuzz.partial_ratio(m, d)
            # Strict '>' keeps the earliest candidate on ties.
            if score > best_score:
                best_score = score
                nearest_d = d
        matching_districts[m] = nearest_d
    return matching_districts


# Best fuzzy candidate for every unmatched name, first among districts, then among provinces.
matching, matching_p = (
    find_matching(missing_admin_names, pool) for pool in (districts, provinces)
)
# Manually verify the proposed matches and update them where needed.
for name, district_guess in matching.items():
    print (name, district_guess, matching_p[name])


In [None]:
# After validating the matches, the names are logged in this csv file.
# It is read from the same ./data directory as the first load of this file earlier in the
# notebook, so a fresh "Run All" does not depend on a copy in the working directory.
valid_matching = pd.read_csv('./data/matching_districts.csv')

In [None]:
matched = valid_matching['missing'].unique()
# Python 3 strings have no .decode(); the equivalent normalisation is to encode to ASCII with
# backslash escapes and decode back to str.
# NOTE(review): assumes the 'missing' column stores names in this backslash-escaped ASCII
# form — confirm against the csv.
missing_admin_names = [str(m).encode('ascii', 'backslashreplace').decode('ascii') for m in missing_admin_names]
print (len(missing_admin_names), len(matched))
# Names that still have no validated match (displayed as the cell output).
set(missing_admin_names).difference(matched)

In [None]:
def find_province(x):
    """
    Resolve an admin name from the time series to its province.

    Lookup order: exact district name, exact province name, then the manually
    validated matches in valid_matching (keyed by the ASCII backslash-escaped name).

    Parameters:
    x (str): Admin name to resolve.

    Returns:
    The province name.

    Raises:
    Exception: "Province not found for: <x>" when no rule resolves the name or a
    lookup fails; the underlying error, if any, is chained as the cause.
    """
    try:
        if x in districts:
            return admins[admins['district'] == x]['province'].values[0]
        if x in provinces:
            return x
        # Python 3 replacement for the former str.decode(...).encode(...) round trip.
        key = str(x).encode('ascii', 'backslashreplace').decode('ascii')
        if key in matched:
            v = valid_matching[valid_matching['missing'] == key]
            match_kind = v['match'].values[0]
            if match_kind == 'district':
                district = v['district'].values[0]
                return admins[admins['district'] == district]['province'].values[0]
            if match_kind == 'province':
                return v['province'].values[0]
    except Exception as exc:
        raise Exception("Province not found for: {}".format(x)) from exc
    # No rule applied: fail loudly instead of silently returning None.
    raise Exception("Province not found for: {}".format(x))

In [None]:
# Map every admin name in the time series to its province.
admin_to_province = {}
for a in admin_names:
    try:
        admin_to_province[a] = find_province(a)
    except Exception:
        # Print the names that could not be resolved so they can be fixed by hand.
        # 'except Exception' (not a bare except) lets Ctrl-C / kernel interrupts through.
        print (a)

In [None]:
# Attach the province of each row; a name missing from the mapping raises KeyError.
time_series['province'] = time_series['admin_name'].map(lambda name: admin_to_province[name])

# Add province and country aggregate values

In [None]:
def add_agg_factors(features, level='province'):
    """
    Add per-period group means of the given features to the global time_series.

    For each feature f a column '<f>_<level>' is created holding the mean of f over
    all rows sharing the same 'year_month' and the same value of `level`.

    Parameters:
    features (list of str): Columns to aggregate.
    level (str): Grouping column, e.g. 'province' or 'country'.

    Returns:
    None. time_series is modified in place.
    """
    features = list(features)
    if not features:
        return
    # transform('mean') broadcasts each group mean back to its rows in one pass. It replaces
    # the removed DataFrame.ix accessor and the row-by-row apply, and averages only the
    # requested columns, so non-numeric columns elsewhere in the frame are never touched.
    group_means = time_series.groupby(['year_month', level])[features].transform('mean')
    for f in features:
        time_series['{}_{}'.format(f, level)] = group_means[f]

In [None]:
# Province-level means of the news factors ('province' is the default level).
add_agg_factors(news_factors, level='province')

In [None]:
# Country-level means of the news factors, then province- and country-level means of
# both traditional factor groups (same call order as before).
add_agg_factors(news_factors, level='country')
for factor_group in (t_variant_traditional_factors, t_invariant_traditional_factors):
    for agg_level in ('province', 'country'):
        add_agg_factors(factor_group, level=agg_level)

In [None]:
# Persist the frame with the aggregate columns. index=False avoids writing the row index,
# which otherwise comes back as an extra 'Unnamed: 0' column when the file is read again.
time_series.to_csv('agg_province_features.csv', index=False)

# Add time lagged features

In [None]:
# Lags of 3..8 months for the time-variant traditional factors and their aggregates.
add_time_lagged(features=t_variant_traditional_factors)

In [None]:
# Lags of 3..8 months for the news factors and their aggregates.
add_time_lagged(features=news_factors)

In [None]:
# Lags of 3, 6, ..., 18 months for fews_ipc, without province/country aggregates.
add_time_lagged(['fews_ipc'], start=3, end=21, diff=3, agg=False)

In [None]:
# A single 3-month lag of fews_proj_near, without aggregates (start=3 and diff=1 are the defaults).
add_time_lagged(['fews_proj_near'], end=4, agg=False)

In [None]:
import math
def diebold_mariano(preds, labels):
    """
    Estimate the standard error of the mean squared error of a forecast.

    The squared errors are treated as a serially correlated series: their long-run
    variance is gamma_0 + 2 * sum(gamma_k) for k = 1 .. m-1, where gamma_k is the lag-k
    autocovariance and m = min(n, ceil(n ** (1/3)) + 2).

    Parameters:
    preds (array-like): Predicted values.
    labels (array-like): Observed values, same length as preds.

    Returns:
    float: sqrt(long_run_variance / n); NaN for empty input or when the estimated
    long-run variance is negative.
    """
    sq_error = (np.asarray(preds, dtype=float) - np.asarray(labels, dtype=float)) ** 2
    n = sq_error.size
    if n == 0:
        return np.nan
    centered = sq_error - sq_error.mean()
    # Truncation lag must be the smaller value: summing all n autocovariances cancels to
    # zero by construction, which is what max(n, ...) effectively requested.
    m = min(n, int(math.ceil(np.cbrt(n)) + 2))
    # gamma_k pairs element i with element i-k for i = k .. n-1; the previous loop started
    # at k+1 and dropped the first term of every sum.
    gammas = [np.dot(centered[k:], centered[:n - k]) / n for k in range(m)]
    sum_gamma = gammas[0] + 2 * sum(gammas[1:])
    return np.sqrt(sum_gamma / n)

# Generate and save data for Fig 3A, B, C

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

from sklearn.metrics import mean_squared_error
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc

# Each split is ((start_year, start_month), (end_year, end_month)); the three lists are
# consumed in parallel, one entry per evaluation round.

# Test windows: July-to-July for 2010..2018, plus a final February-to-February window.
test_splits = [((year, 7), (year + 1, 7)) for year in range(2010, 2019)]
test_splits.append(((2019, 2), (2020, 2)))

# Training windows always start in July 2009 and grow over time.
_train_ends = [(2010, 4), (2011, 1), (2011, 10), (2012, 7), (2013, 7),
               (2014, 1), (2015, 1), (2015, 10), (2016, 10), (2017, 2)]
train_splits = [((2009, 7), train_end) for train_end in _train_ends]

# Development windows end where the matching test window starts.
# NOTE(review): the fifth dev window starts at (2013, 4) while the fifth training window
# ends at (2013, 7); every other dev start equals its training end — confirm.
_dev_starts = [(2010, 4), (2011, 1), (2011, 10), (2012, 7), (2013, 4),
               (2014, 1), (2015, 1), (2015, 10), (2016, 10), (2017, 2)]
dev_splits = [(dev_start, test_window[0]) for dev_start, test_window in zip(_dev_starts, test_splits)]
# The three regressors compared in the evaluation.
# max_features=1.0 (use all features) is what 'auto' meant for regressors; the string
# 'auto' is no longer accepted by current scikit-learn releases.
rf = RandomForestRegressor(max_features=1.0, n_estimators=100,
                             min_samples_split=0.5, min_impurity_decrease=0.001, random_state=0)
ols = LinearRegression()

lasso = linear_model.Lasso(alpha=0.1)

def get_agg_lagged_features(factors):
    """
    List the lagged column names for the given factors at every aggregation level.

    Parameters:
    factors (iterable of str): Base factor names.

    Returns:
    list of str: '<factor>_<t>', then '<factor>_province_<t>', then
    '<factor>_country_<t>', for every factor and every lag t in 3..8.
    """
    lags = range(3, 9)
    # Every factor is combined with every lag. The previous zip(factors, range(3, 9)) paired
    # the i-th factor with the i-th lag only and silently dropped all factors after the sixth.
    return [
        '{}{}_{}'.format(f, suffix, t)
        for suffix in ('', '_province', '_country')
        for f in factors
        for t in lags
    ]
        

# Column groups shared by the feature sets below.
_ipc_lag_cols = ['{}_{}'.format('fews_ipc', t) for t in range(3, 21, 3)]
_traditional_cols = get_agg_lagged_features(t_variant_traditional_factors) + t_invariant_traditional_factors
_news_cols = get_agg_lagged_features(news_factors)
_expert_cols = ['fews_proj_near_3']

# Model inputs per feature set; every entry is a 2-D DataFrame. 'expert' used to be a
# 1-D Series, which scikit-learn estimators reject as X.
features = {
    'traditional': time_series[_ipc_lag_cols + _traditional_cols],
    'news': time_series[_ipc_lag_cols + _news_cols],
    'traditional+news': time_series[_ipc_lag_cols + _traditional_cols + _news_cols],
    'expert': time_series[_expert_cols],
    'expert+traditional': time_series[_expert_cols + _ipc_lag_cols + _traditional_cols],
    'expert+news': time_series[_expert_cols + _ipc_lag_cols + _news_cols],
    'expert+traditional+news': time_series[_expert_cols + _ipc_lag_cols + _traditional_cols + _news_cols],
}

labels_df = time_series['fews_ipc']


def get_time_split(df, start, end):
    """
    Select the rows of df whose (year, month) lies between start and end, inclusive.

    Parameters:
    df (pd.DataFrame): Frame with numeric 'year' and 'month' columns.
    start (tuple): (year, month) of the first month to keep.
    end (tuple): (year, month) of the last month to keep.

    Returns:
    pd.DataFrame: The matching rows of df (original index preserved).
    """
    # Compare absolute month numbers. The previous expression mixed '&' with comparisons
    # without parentheses and tested year and month independently, which cannot describe a
    # window that crosses a year boundary.
    month_no = df['year'] * 12 + df['month']
    first = start[0] * 12 + start[1]
    last = end[0] * 12 + end[1]
    return df[(month_no >= first) & (month_no <= last)]


def _rmse_with_bounds(y_true, y_pred):
    """Return (rmse, lower_bound, upper_bound) using the diebold_mariano standard error of the MSE."""
    mse = mean_squared_error(y_true, y_pred)
    stderr = diebold_mariano(y_pred, y_true)
    upper = np.sqrt(mse + 1.96 * stderr)
    # Clamp at zero so a wide interval does not produce the square root of a negative number.
    lower = np.sqrt(max(mse - 1.96 * stderr, 0.0))
    return np.sqrt(mse), lower, upper


# (lower_bound, upper_bound) prediction thresholds per feature set.
thresholds = {'traditional': (2.236, 3.125),
              'news': (1.907, 2.712),
              'traditional+news': (2.105, 3.314),
              'expert': (2, 3),
              'expert+news': (1.912, 2.813),
              'expert+traditional': (2.241, 3.132),
              'expert+traditional+news': (2.172, 3.321)
             }

# NOTE(review): precision_recall_curve needs binary targets; labels are binarised as
# "fews_ipc >= CRISIS_PHASE". The value 3 is an assumption — confirm against the paper.
CRISIS_PHASE = 3

# Result rows are collected in lists and turned into DataFrames once at the end
# (repeated pd.concat inside the loop is quadratic).
fig_3a_rows, fig_3b_rows, fig_3c_rows = [], [], []

for train, dev, test in zip(train_splits, dev_splits, test_splits):
    # dev is unpacked only to keep the three split lists aligned; it is not used below.
    # Row selection is done on time_series (which has 'year'/'month') and applied by index
    # to the feature frames and labels, which do not carry those columns.
    train_idx = get_time_split(time_series, train[0], train[1]).index
    test_idx = get_time_split(time_series, test[0], test[1]).index
    y = labels_df.loc[train_idx]  # training target (previously taken from the test window)
    labels = labels_df.loc[test_idx]
    test_countries = time_series.loc[test_idx, 'country'].to_numpy()
    # NOTE(review): rows with missing feature or label values are not handled here; some of
    # these estimators may reject NaN — confirm whether rows need dropping or imputing first.
    for f, D in features.items():
        if isinstance(D, pd.Series):
            D = D.to_frame()  # estimators need a 2-D X
        X = D.loc[train_idx]
        X_test = D.loc[test_idx]
        l_b, u_b = thresholds[f]
        for name, regr in zip(['RF', 'OLS', 'Lasso'], [rf, ols, lasso]):
            regr.fit(X, y)
            preds = regr.predict(X_test)
            rmse, lower_bound, upper_bound = _rmse_with_bounds(labels, preds)
            # The curve's own thresholds are discarded so the `thresholds` dict is not overwritten.
            precision, recall, _ = precision_recall_curve(labels >= CRISIS_PHASE, preds)
            auc_precision_recall = auc(recall, precision)
            fig_3a_rows.append({'method': name, 'split': test, 'features': f, 'country': 'all',
                                'rmse': rmse, 'lower_bound': lower_bound, 'upper_bound': upper_bound})
            fig_3b_rows.append({'method': name, 'split': test, 'features': f,
                                'aucpr': auc_precision_recall})
            print ("Method: {}, Split: {}, Features: {}, AUCPR: {}".format(name, test, f, auc_precision_recall))
            print ("Method: {}, Split: {}, Features: {}, RMSE: {} [{}, {}]".format(name, test, f, rmse, lower_bound, upper_bound))

            # Count predictions at/above the upper threshold that stay there 3 positions later
            # and were at/below the lower threshold 3 positions earlier. Padding values (1 at
            # the end, 5 at the start) can never satisfy the respective condition.
            # NOTE(review): the "t-3" series is padded at the front here; the original padded
            # it at the end, which compared each prediction with itself. Positions follow the
            # row order of the test set, not time within one district — confirm the intent.
            n_preds = len(preds)
            ahead = np.concatenate([preds[3:], [1, 1, 1]])[:n_preds]
            behind = np.concatenate([[5, 5, 5], preds[:-3]])[:n_preds]
            recall_at_80p = int(np.sum((preds >= u_b) & (ahead >= u_b) & (behind <= l_b)))
            fig_3c_rows.append({'method': name, 'split': test, 'features': f,
                                'recall_at_80p': recall_at_80p})

            for country in time_series['country'].unique():
                in_country = test_countries == country
                if not in_country.any():
                    continue  # no test rows for this country in this window
                labels_c = labels[in_country]
                preds_c = preds[in_country]
                rmse, lower_bound, upper_bound = _rmse_with_bounds(labels_c, preds_c)
                fig_3a_rows.append({'method': name, 'split': test, 'features': f, 'country': country,
                                    'rmse': rmse, 'lower_bound': lower_bound, 'upper_bound': upper_bound})
                print ("Country: {}, Method: {}, Split: {}, Features: {}, RMSE: {} [{}, {}]".format(country, name, test, f, rmse, lower_bound, upper_bound))

fig_3a = pd.DataFrame(fig_3a_rows, columns=['method', 'split', 'features', 'country', 'rmse', 'lower_bound', 'upper_bound'])
fig_3b = pd.DataFrame(fig_3b_rows, columns=['method', 'split', 'features', 'aucpr'])
fig_3c = pd.DataFrame(fig_3c_rows, columns=['method', 'split', 'features', 'recall_at_80p'])

fig_3a.to_csv('fig_3a.csv')
fig_3b.to_csv('fig_3b.csv')
fig_3c.to_csv('fig_3c.csv')