In [6]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os

# helper functions
from helper_functions import get_relevant_topics
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score


# Import models
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm

# Helper functions
from helper_functions import create_lag_df
from helper_functions import plot_ConfusionMatrix

In [2]:
# Load the food-crisis panel from the cleaned CSV. No parse_dates argument is passed,
# so the "date" column is not parsed here; a later cell converts it with pd.to_datetime.
# Earlier variant of this load, kept for reference:
# df_southsudan = pd.read_csv("data/food_crises_cleaned.csv", parse_dates=["date"])
df_southsudan = pd.read_csv("data/cleaned_food_crises.csv")

In [3]:
# Number of distinct values in the "district" column (a missing value, if present, counts as one).
df_southsudan["district"].nunique(dropna=False)

78

In [28]:
# Keep only the observations dated 2008-01-01 or later.
# The "date" column is still raw text at this point (the CSV is read without parse_dates),
# so it is parsed for the comparison: comparing the strings themselves against "2008" is
# lexicographic and only gives the right answer when every value is ISO-8601 formatted.
# The column itself is left untouched here; the next cell converts it to datetime.
is_from_2008 = pd.to_datetime(df_southsudan["date"]) >= pd.Timestamp("2008-01-01")
df_cleaned = df_southsudan[is_from_2008].copy()

In [29]:
# Swap the "date" column for its parsed datetime version (safe to re-run).
parsed_dates = pd.to_datetime(df_cleaned["date"])
df_cleaned = df_cleaned.assign(date=parsed_dates)

In [30]:
# Per calendar year 2009-2020, count the rows that carry a non-null "ipc" value.
# NOTE(review): the printed label says "articles", but the quantity counted is rows of
# df_cleaned with an IPC value - confirm the intended wording.
has_ipc = df_cleaned['ipc'].notnull()
for year in range(2009, 2021):
    year_start = f"{year}-01-01"
    next_year_start = f"{year+1}-01-01"
    in_year = (df_cleaned['date'] >= year_start) & (df_cleaned['date'] < next_year_start)
    num_of_articles = int((has_ipc & in_year).sum())
    print(f"The number of articles in {year}: {num_of_articles}")

The number of articles in 2009: 156
The number of articles in 2010: 312
The number of articles in 2011: 312
The number of articles in 2012: 312
The number of articles in 2013: 312
The number of articles in 2014: 312
The number of articles in 2015: 312
The number of articles in 2016: 234
The number of articles in 2017: 234
The number of articles in 2018: 234
The number of articles in 2019: 234
The number of articles in 2020: 78


In [None]:
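# Disabled on purpose: running this overwrites data/cleaned_food_crises.csv, the same file
# this notebook loads at the top.
# NOTE(review): without index=False the row index is written as an extra column - presumably
# the origin of the "Unnamed: 0" column dropped further down; confirm before re-enabling.
# df_cleaned.to_csv("data/cleaned_food_crises.csv")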

In [None]:
# selection = ["ipc", "ha", "ndvi_mean", "ndvi_anom", "rain_mean", "rain_anom", "et_mean", "et_anom",
#              "count_violence", "sum_fatalities", "food_price_idx", "area", "cropland_pct", "pop",
#              "ruggedness_mean", "pasture_pct"]
# sns.pairplot(df_cleaned[selection], hue='ipc')

In [2]:
# Print the column / dtype / non-null summary of the filtered frame.
# NOTE(review): df_cleaned is created by the filtering cell above; the NameError recorded
# below indicates this cell was last executed on a kernel where that cell had not been run.
df_cleaned.info()

NameError: name 'df_cleaned' is not defined

NameError: name 'df_cleaned' is not defined

## Data preparation for prediction

In [32]:
# Modelling frame: a fresh copy of the cleaned data indexed by (date, district).
df_prediction = df_cleaned.set_index(["date", "district"])

# Lagged exogenous predictors. Per the original author's notes these are
#   - a 3-month-lagged rolling mean (window 6) of count_violence and ndvi_anom, and
#   - the difference of the 3-month-lagged rolling mean (window 6) of food_price_idx.
# NOTE(review): create_lag_df lives in helper_functions; its exact semantics are not visible here.
df_prediction = create_lag_df(df_prediction, ['count_violence', 'ndvi_anom'], 3, rolling=6)
df_prediction = create_lag_df(df_prediction, ['food_price_idx'], 3, difference=True, rolling=6)

# Lagged target: only the 3-, 6- and 12-month IPC lags are generated
# (1- and 2-month lags are not).
for ipc_lag in (3, 6, 12):
    df_prediction = create_lag_df(df_prediction, ['ipc'], ipc_lag, dropna=True)

# Discard every row that still has a missing value in any column.
df_prediction = df_prediction.dropna()

In [33]:
len(df_prediction)

2106

In [34]:
# Target is the "ipc" column; the candidate predictors are every other column except
# centx, centy and the "Unnamed: 0" column.
drop = ["centx", "centy", "ipc", "Unnamed: 0"]
Y = df_prediction.loc[:, 'ipc']
X_all_available = df_prediction.drop(columns=drop)  # drop() already returns a new frame

In [35]:
# Inspect the candidate predictor columns left after removing centx, centy, ipc and "Unnamed: 0".
X_all_available.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2106 entries, (Timestamp('2012-07-01 00:00:00'), 'Bor') to (Timestamp('2020-02-01 00:00:00'), 'Malakal')
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   ha                    2106 non-null   float64
 1   ndvi_mean             2106 non-null   float64
 2   ndvi_anom             2106 non-null   float64
 3   rain_mean             2106 non-null   float64
 4   rain_anom             2106 non-null   float64
 5   et_mean               2106 non-null   float64
 6   et_anom               2106 non-null   float64
 7   count_violence        2106 non-null   int64  
 8   sum_fatalities        2106 non-null   int64  
 9   food_price_idx        2106 non-null   float64
 10  area                  2106 non-null   float64
 11  cropland_pct          2106 non-null   float64
 12  pop                   2106 non-null   float64
 13  ruggedness_mean       2106 non-null   floa

In [36]:
# Select the model features by name instead of by position: `iloc[:, 15:]` silently picks
# different columns as soon as an upstream column is added, removed or reordered.
# These are the six lagged predictors that the positional slice selected, in the same order.
feature_columns = [
    "count_violence_lag_3",
    "ndvi_anom_lag_3",
    "food_price_idx_lag_3",
    "ipc_lag_3",
    "ipc_lag_6",
    "ipc_lag_12",
]
# Raises KeyError if an expected feature is missing, rather than training on the wrong columns.
X = X_all_available[feature_columns].copy()
X.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2106 entries, (Timestamp('2012-07-01 00:00:00'), 'Bor') to (Timestamp('2020-02-01 00:00:00'), 'Malakal')
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   count_violence_lag_3  2106 non-null   float64
 1   ndvi_anom_lag_3       2106 non-null   float64
 2   food_price_idx_lag_3  2106 non-null   float64
 3   ipc_lag_3             2106 non-null   float64
 4   ipc_lag_6             2106 non-null   float64
 5   ipc_lag_12            2106 non-null   float64
dtypes: float64(6)
memory usage: 112.9+ KB


In [37]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Hold out 30% of the rows for the final evaluation (fixed seed for reproducibility).
# NOTE(review): train_test_split shuffles by default, and the rows are indexed by date with
# lagged IPC among the features, so later months can land in the training set while earlier
# ones are tested. Consider a chronological split (TimeSeriesSplit is already imported).
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=101)

# Hyper-parameter grid for the SVC.
# The previous literal listed 'gamma' twice; Python keeps only the last duplicate key, so the
# numeric gamma list was never searched. It is removed here, leaving the effective grid
# (4 x 2 x 1 = 8 candidates) unchanged.
# NOTE(review): per the scikit-learn docs gamma is the coefficient for the 'rbf', 'poly' and
# 'sigmoid' kernels, so it presumably has no effect with kernel='linear' - confirm, then drop
# it or add the kernels it applies to.
param_grid = {'C': [0.1, 1, 10, 100],
              'gamma': ['scale', 'auto'],
              'kernel': ['linear']}

# Default 5-fold cross-validated search on all cores; refit=True retrains the best
# candidate on the full training split so `grid` can predict directly.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3, n_jobs=-1)

# fitting the model for grid search
grid.fit(X_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
grid_predictions = grid.predict(X_test)

# print classification report
print(classification_report(y_test, grid_predictions))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[CV 2/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.610 total time=   0.3s
[CV 3/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.583 total time=   0.4s
[CV 4/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.644 total time=   0.4s
[CV 1/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.580 total time=   0.3s
[CV 2/5] END ..C=0.1, gamma=auto, kernel=linear;, score=0.610 total time=   0.3s
[CV 3/5] END ..C=0.1, gamma=auto, kernel=linear;, score=0.583 total time=   0.4s
[CV 5/5] END ..C=0.1, gamma=auto, kernel=linear;, score=0.612 total time=   0.4s
[CV 5/5] END .C=0.1, gamma=scale, kernel=linear;, score=0.612 total time=   0.4s
[CV 4/5] END ..C=0.1, gamma=auto, kernel=linear;, score=0.644 total time=   0.4s
[CV 1/5] END ..C=0.1, gamma=auto, kernel=linear;, score=0.580 total time=   0.4s
[CV 2/5] END ...C=1, gamma=scale, kernel=linear;, score=0.634 total time=   1.0s
[CV 1/5] END ...C=1, gamma=scale, kernel=linear;, score=0.586 total time=   1.0s
[CV 4/5] END ...C=1, gamma=s

In [39]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random-forest classifier tuned by 5-fold grid search on the train/test split created
# in the SVC cell (which also imports GridSearchCV and classification_report).
rfc = RandomForestClassifier(random_state=42)

# 2 x 2 x 5 x 2 = 40 candidate settings.
param_grid = dict(
    n_estimators=[200, 500],
    max_features=['sqrt', 'log2'],
    max_depth=[4, 5, 6, 7, 8],
    criterion=['gini', 'entropy'],
)

CV_rfc = GridSearchCV(rfc, param_grid, cv=5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_)

# Score the best estimator found by the search on the held-out rows.
rfc_grid_predictions = CV_rfc.predict(X_test)
print(classification_report(y_test, rfc_grid_predictions))

{'criterion': 'entropy', 'max_depth': 8, 'max_features': 'sqrt', 'n_estimators': 200}
              precision    recall  f1-score   support

         1.0       0.75      0.66      0.71       125
         2.0       0.64      0.75      0.69       218
         3.0       0.70      0.76      0.73       235
         4.0       0.45      0.09      0.15        54

    accuracy                           0.68       632
   macro avg       0.64      0.57      0.57       632
weighted avg       0.67      0.68      0.66       632

