In [34]:
import pandas as pd
import geopandas as gpd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the cleaned food-crisis table, keep only the columns used below, and
# discard rows that have no IPC value.
ipc_columns = ["district", "date", "centx", "centy", "ipc", "area", "pop"]
ipc_load = pd.read_csv("data/food_crises_cleaned.csv", parse_dates=["date"])
ipc_df = ipc_load.loc[:, ipc_columns].dropna(subset=["ipc"])

In [3]:
# Read the ACLED conflict events (already tagged with a closest district),
# keep the columns used below, and order the events chronologically.
conflict_columns = ["event_date", "longitude", "latitude", "event_type",
                    "fatalities", "closest_district"]
conflict_load = pd.read_csv(
    "data/external_data/4_acleddata_2011-05-19-2023-09-28-South_Sudan_with_district.csv",
    parse_dates=["event_date"],
)
conflict_df = conflict_load.loc[:, conflict_columns].sort_values(by="event_date")

In [4]:
# Preview the IPC table after rows with a missing `ipc` value were dropped.
ipc_df

Unnamed: 0,district,date,centx,centy,ipc,area,pop
30,Bor,2009-07-01,32.00486,6.465644,2.0,14008.3300,256618.0
33,Bor,2009-10-01,32.00486,6.465644,2.0,14008.3300,256618.0
36,Bor,2010-01-01,32.00486,6.465644,1.0,14008.3300,265263.0
39,Bor,2010-04-01,32.00486,6.465644,2.0,14008.3300,265263.0
42,Bor,2010-07-01,32.00486,6.465644,2.0,14008.3300,265263.0
...,...,...,...,...,...,...,...
12307,Malakal,2018-10-01,31.64280,9.658457,3.0,757.7855,118402.0
12311,Malakal,2019-02-01,31.64280,9.658457,3.0,757.7855,102228.0
12315,Malakal,2019-06-01,31.64280,9.658457,3.0,757.7855,102228.0
12319,Malakal,2019-10-01,31.64280,9.658457,3.0,757.7855,102228.0


In [5]:
# Preview the conflict events, ordered by `event_date`.
conflict_df

Unnamed: 0,event_date,longitude,latitude,event_type,fatalities,closest_district
9216,2011-05-19,33.0701,8.0642,Violence against civilians,0,Akobo
9215,2011-07-14,26.1949,7.7150,Violence against civilians,0,Aweil Center
9213,2011-07-15,33.1308,6.7984,Battles,0,Pibor
9214,2011-07-15,32.3906,8.0924,Violence against civilians,0,Uror
9212,2011-07-16,26.0431,8.0262,Violence against civilians,0,Raja
...,...,...,...,...,...,...
4,2023-09-14,31.8134,5.3403,Violence against civilians,2,Terekeka
3,2023-09-16,33.0051,7.7913,Violence against civilians,4,Akobo
2,2023-09-17,34.0993,7.1881,Battles,1,Pochalla
1,2023-09-18,34.0993,7.1881,Battles,13,Pochalla


In [6]:
# Distinct IPC assessment dates. `unique()` returns values in order of first
# appearance in `ipc_df`, not in sorted order.
ipc_dates = ipc_df["date"].unique()
ipc_dates

array(['2009-07-01T00:00:00.000000000', '2009-10-01T00:00:00.000000000',
       '2010-01-01T00:00:00.000000000', '2010-04-01T00:00:00.000000000',
       '2010-07-01T00:00:00.000000000', '2010-10-01T00:00:00.000000000',
       '2011-01-01T00:00:00.000000000', '2011-04-01T00:00:00.000000000',
       '2011-07-01T00:00:00.000000000', '2011-10-01T00:00:00.000000000',
       '2012-01-01T00:00:00.000000000', '2012-04-01T00:00:00.000000000',
       '2012-07-01T00:00:00.000000000', '2012-10-01T00:00:00.000000000',
       '2013-01-01T00:00:00.000000000', '2013-04-01T00:00:00.000000000',
       '2013-07-01T00:00:00.000000000', '2013-10-01T00:00:00.000000000',
       '2014-01-01T00:00:00.000000000', '2014-04-01T00:00:00.000000000',
       '2014-07-01T00:00:00.000000000', '2014-10-01T00:00:00.000000000',
       '2015-01-01T00:00:00.000000000', '2015-04-01T00:00:00.000000000',
       '2015-07-01T00:00:00.000000000', '2015-10-01T00:00:00.000000000',
       '2016-02-01T00:00:00.000000000', '2016-06-01

In [7]:
# For every conflict event, find the first IPC assessment date that lies at
# least three months after the event. Events with no such assessment get NaT.
#
# `ipc_dates` comes from `Series.unique()`, which preserves order of appearance,
# so it is sorted here explicitly: "first element that is >= target" is only the
# earliest qualifying date when the candidates are in ascending order.
sorted_ipc_dates = np.sort(np.asarray(ipc_dates, dtype="datetime64[ns]"))

# Event date shifted three calendar months forward (vectorised over the column).
target_dates = (conflict_df["event_date"] + pd.DateOffset(months=3)).to_numpy(
    dtype="datetime64[ns]"
)

# side="left" gives the index of the first IPC date >= target; an index equal
# to len(sorted_ipc_dates) means no IPC date is late enough.
positions = np.searchsorted(sorted_ipc_dates, target_dates, side="left")
has_future_date = positions < len(sorted_ipc_dates)

correct_dates = np.full(target_dates.shape, np.datetime64("NaT"), dtype="datetime64[ns]")
correct_dates[has_future_date] = sorted_ipc_dates[positions[has_future_date]]

# Positional assignment: correct_dates is in the same row order as conflict_df.
conflict_df["date_future"] = correct_dates

In [11]:
# Inspect conflict_df again now that the `date_future` column has been added.
conflict_df

Unnamed: 0,event_date,longitude,latitude,event_type,fatalities,closest_district,date_future
9216,2011-05-19,33.0701,8.0642,Violence against civilians,0,Akobo,2011-10-01
9215,2011-07-14,26.1949,7.7150,Violence against civilians,0,Aweil Center,2012-01-01
9213,2011-07-15,33.1308,6.7984,Battles,0,Pibor,2012-01-01
9214,2011-07-15,32.3906,8.0924,Violence against civilians,0,Uror,2012-01-01
9212,2011-07-16,26.0431,8.0262,Violence against civilians,0,Raja,2012-01-01
...,...,...,...,...,...,...,...
4,2023-09-14,31.8134,5.3403,Violence against civilians,2,Terekeka,NaT
3,2023-09-16,33.0051,7.7913,Violence against civilians,4,Akobo,NaT
2,2023-09-17,34.0993,7.1881,Battles,1,Pochalla,NaT
1,2023-09-18,34.0993,7.1881,Battles,13,Pochalla,NaT


In [18]:
# Aggregate the events per (district, target IPC date): the number of events
# (non-null `event_type` entries) and the mean fatalities per event.
grouped = (
    conflict_df
    .groupby(["closest_district", "date_future"])
    .agg(event_type=("event_type", "count"), fatalities=("fatalities", "mean"))
    .reset_index()
)

# Rename the count column, replace missing mean fatalities with 0, and round
# the mean to two decimals.
grouped_conflict = (
    grouped
    .rename(columns={"event_type": "conflict_count"})
    .fillna({"fatalities": 0})
    .round({"fatalities": 2})
)

# NOTE(review): round-trips the column through a Python list; presumably kept
# to normalise the dtype before the merge -- confirm whether it is still needed.
grouped_conflict["date_future"] = grouped_conflict["date_future"].tolist()

In [19]:
# Preview the conflict aggregates per (closest_district, date_future).
grouped_conflict

Unnamed: 0,closest_district,date_future,conflict_count,fatalities
0,Abiemnhom,2012-10-01,2,3.50
1,Abiemnhom,2013-10-01,1,8.00
2,Abiemnhom,2014-04-01,2,4.00
3,Abiemnhom,2015-07-01,1,0.00
4,Abiemnhom,2015-10-01,3,13.67
...,...,...,...,...
1126,Yirol West,2018-02-01,3,7.00
1127,Yirol West,2019-02-01,1,19.00
1128,Yirol West,2019-06-01,8,3.88
1129,Yirol West,2019-10-01,3,0.67


In [20]:
# Show ipc_df again ahead of merging it with grouped_conflict.
ipc_df

Unnamed: 0,district,date,centx,centy,ipc,area,pop
30,Bor,2009-07-01,32.00486,6.465644,2.0,14008.3300,256618.0
33,Bor,2009-10-01,32.00486,6.465644,2.0,14008.3300,256618.0
36,Bor,2010-01-01,32.00486,6.465644,1.0,14008.3300,265263.0
39,Bor,2010-04-01,32.00486,6.465644,2.0,14008.3300,265263.0
42,Bor,2010-07-01,32.00486,6.465644,2.0,14008.3300,265263.0
...,...,...,...,...,...,...,...
12307,Malakal,2018-10-01,31.64280,9.658457,3.0,757.7855,118402.0
12311,Malakal,2019-02-01,31.64280,9.658457,3.0,757.7855,102228.0
12315,Malakal,2019-06-01,31.64280,9.658457,3.0,757.7855,102228.0
12319,Malakal,2019-10-01,31.64280,9.658457,3.0,757.7855,102228.0


In [21]:
# Attach the conflict aggregates to the IPC rows by matching district name and
# IPC date against the aggregate's district and target date.
# NOTE(review): this is an inner join, so IPC rows without a matching conflict
# aggregate (no recorded events, or a district-name mismatch) are dropped
# rather than kept with a zero count -- confirm this is intended.
merged_data = pd.merge(
    ipc_df,
    grouped_conflict,
    how="inner",
    left_on=["district", "date"],
    right_on=["closest_district", "date_future"],
)
merged_data

Unnamed: 0,district,date,centx,centy,ipc,area,pop,closest_district,date_future,conflict_count,fatalities
0,Bor,2012-07-01,32.00486,6.465644,2.0,14008.3300,277671.0,Bor,2012-07-01,2,11.00
1,Bor,2012-10-01,32.00486,6.465644,2.0,14008.3300,277671.0,Bor,2012-10-01,1,0.00
2,Bor,2013-04-01,32.00486,6.465644,2.0,14008.3300,283876.0,Bor,2013-04-01,2,3.00
3,Bor,2013-07-01,32.00486,6.465644,2.0,14008.3300,283876.0,Bor,2013-07-01,3,1.67
4,Bor,2013-10-01,32.00486,6.465644,1.0,14008.3300,283876.0,Bor,2013-10-01,3,5.67
...,...,...,...,...,...,...,...,...,...,...,...
1126,Malakal,2018-02-01,31.64280,9.658457,4.0,757.7855,118402.0,Malakal,2018-02-01,1,2.00
1127,Malakal,2018-06-01,31.64280,9.658457,3.0,757.7855,118402.0,Malakal,2018-06-01,1,0.00
1128,Malakal,2018-10-01,31.64280,9.658457,3.0,757.7855,118402.0,Malakal,2018-10-01,1,0.00
1129,Malakal,2019-02-01,31.64280,9.658457,3.0,757.7855,102228.0,Malakal,2019-02-01,2,0.00


In [69]:
# Derived features: mean fatalities and conflict counts normalised by
# population, by area, and by population density (pop / area).
pop_density = merged_data["pop"].div(merged_data["area"])

merged_data["fatalities_per_pop"] = merged_data["fatalities"].div(merged_data["pop"])
merged_data["fatalities_per_area"] = merged_data["fatalities"].div(merged_data["area"])
merged_data["fatalities_per_poparea"] = merged_data["fatalities"].div(pop_density)

merged_data["conflicts_per_pop"] = merged_data["conflict_count"].div(merged_data["pop"])
merged_data["conflicts_per_area"] = merged_data["conflict_count"].div(merged_data["area"])
merged_data["conflicts_per_poparea"] = merged_data["conflict_count"].div(pop_density)

In [70]:
# Persist the merged table next to the input data. A path relative to the
# notebook's `data/` directory (the same one the inputs are read from) keeps the
# notebook runnable on other machines, unlike an absolute user-profile path.
merged_data.to_csv("data/merged_data.csv", index=False)

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [72]:
# Predictors: raw conflict/population/area columns plus the derived ratios.
feature_columns = [
    "conflict_count",
    "fatalities",
    "pop",
    "area",
    "fatalities_per_pop",
    "fatalities_per_area",
    "fatalities_per_poparea",
    "conflicts_per_pop",
    "conflicts_per_area",
    "conflicts_per_poparea",
]
X = merged_data[feature_columns]

# Target: the IPC value of the district at the assessment date.
y = merged_data["ipc"]

In [73]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [74]:
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [75]:
# Predict IPC values for the held-out rows and show the first five predictions.
y_pred = model.predict(X_test)
y_pred[:5]

array([2.66753446, 2.46959034, 2.60103474, 2.62860234, 2.55846481])

In [76]:
# Evaluate on the held-out rows: root-mean-squared error and R^2.
# RMSE is computed as sqrt(MSE) instead of `squared=False`, because that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
rmse, r2

(0.8803052676862726, 0.014517585245570896)

In [77]:
# Baseline for comparison: RMSE of a constant prediction equal to the mean of
# the test target (the reference against which R^2 is defined).
# sqrt(MSE) is used instead of the `squared=False` keyword, which was removed
# in scikit-learn 1.6.
# NOTE(review): a baseline that uses only training information would predict
# y_train.mean() instead -- confirm which baseline is intended.
baseline_pred = np.full(len(y_test), y_test.mean())
np.sqrt(mean_squared_error(y_test, baseline_pred))

0.8867656486404322