In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline

In [2]:
# Conflict Data - admin1 (State)/admin2 (County)
# Health Facilies - state/county
# Migration - admin1(State)/admin2 (County) - Population
# Demographics - Just Demographics (State/County)

### Cleaning Conflict Data
Source: </br>
https://en.wikipedia.org/wiki/Counties_of_South_Sudan

In [3]:
# Event date, year, admin1, admin2, fatalities 
# Plan to slice down to year county/fatalities
conflict = pd.read_csv('conflict_data.csv')

# Slicing Conflict Data
conflict_sliced = conflict[['event_date', 'year', 'admin1', 'admin2', 'fatalities']]

In [4]:
conflict_sliced['year'].unique()

array([2015, 2014, 2013, 2012, 2011], dtype=int64)

In [4]:
conflict_sliced.head()

Unnamed: 0,event_date,year,admin1,admin2,fatalities
0,31 December 2015,2015,Western Bahr el Ghazal,Wau,0
1,30 December 2015,2015,Western Bahr el Ghazal,Jur River,5
2,30 December 2015,2015,Western Equatoria,Yambio,0
3,28 December 2015,2015,Western Equatoria,Yambio,0
4,28 December 2015,2015,Western Bahr el Ghazal,Wau,1


In [5]:
# Validating representation (76 of 79 Counties.)
len(conflict_sliced['admin2'].unique())

76

In [6]:
# Validating this Dataset has information in the correct years 
conflict_sliced['year'].unique()

array([2015, 2014, 2013, 2012, 2011], dtype=int64)

In [7]:
# Grouping by year/region
conflict_sliced_grouped = conflict_sliced.groupby(['admin2', 'year']).sum()

#conflict_sliced_grouped['year'].unique()
conflict_sliced_grouped.to_csv('aggregated_conflict_data.csv')

# Reading out to CSV then back in fixes index
merged_conflict_data = pd.read_csv('aggregated_conflict_data.csv')

# Reordering columns
merged_conflict_data = merged_conflict_data[['year', 'admin2', 'fatalities']]

# Printing the DF for Merging
merged_conflict_data.head()

Unnamed: 0,year,admin2,fatalities
0,2012,Abiemnhom,7
1,2013,Abiemnhom,8
2,2014,Abiemnhom,0
3,2011,Akobo,1
4,2012,Akobo,98


### Hospital Data 
Source: </br>
http://www.southsudanmedicaljournal.com/assets/files/Journals/vol_12_iss_2_mar_19/SSMJ%2012%202%20Issue%20Final.pdf

In [8]:
# Note nothing on hospital size or construction date in the DF
hospitals = pd.read_excel('health_facilities.xls')

In [9]:
# PHCU - Primary Health Care Unit
# PHCC - Primary Health Care Centers
hospitals['HF_TYPE'].value_counts()

PHCU                    1079
PHCC                     306
County Hospital           28
Private Clinic            15
Specialized Hospital      13
Other                     10
State Hospital             8
Teaching Hospital          3
Hospital                   1
Name: HF_TYPE, dtype: int64

In [10]:
print("There are {} medical facilities in South Sudan in {} of the 79 counties.".format(hospitals.shape[0],len(hospitals['COUNTY'].unique())))
print("This dataset does not contain information on size or construction date of the hospitals.")

There are 1463 medical facilities in South Sudan in 77 of the 79 counties.
This dataset does not contain information on size or construction date of the hospitals.


In [11]:
# Filtering hospitals
# 1463 Total Entries, 784 NaN, 648 Functional, 31 Non Fuctional. Assumed NAs were functional   
hospitals['FUNCTIONAL Status'].value_counts()

Functional        648
Non Functional     31
Name: FUNCTIONAL Status, dtype: int64

In [12]:
# Removing Non Functional Hospitals
hospitals_filtered = hospitals.loc[~(hospitals['FUNCTIONAL Status'] == 'Non Functional')]

# To Avoid Copy/View Warning.
hospitals_filtered = hospitals_filtered.copy() 

# Replacing all of the NaNs with Functional
hospitals_filtered['FUNCTIONAL Status'] = hospitals_filtered['FUNCTIONAL Status'].fillna(value='Functional')

# Dropping ID and source
del hospitals_filtered['ID']
del hospitals_filtered['Source']

hospitals_filtered.head(3)

Unnamed: 0,STATE,COUNTY,PAYAM,FACIL_NAME,FACIL_NAME2,LATITUDE,LONGITUDE,Q_CODE,HF_CODE,HF_TYPE,FUNCTIONAL Status
0,Central Equatoria,Juba,Juba Town,AIC BULUK,Aic Buluk,4.84718,31.59355,92093,15,PHCU,Functional
1,Central Equatoria,Juba,Lokiliri,ARU HEALTH UNIT,Aru Health Unit,4.36354,31.9846,92235,15,PHCU,Functional
2,Central Equatoria,Juba,Juba Town,BELPHAM MEDICAL HQ,Belpham Medical Hq,4.8189,31.57712,92206,14,PHCC,Functional


In [13]:
# Slicing the hospital
hospitals_filtered = hospitals_filtered[['STATE', 'COUNTY', 'FACIL_NAME2', 'HF_CODE', 'HF_TYPE']]

# Getting a count of hospitals by region. 
count_total = hospitals_filtered.groupby('COUNTY')['FACIL_NAME2'].count().rename("county_hospitals").reset_index()

# Combining back together
hospitals_filtered_added =  hospitals_filtered.merge(count_total)

# Dropping duplicates 
hospitals_county_level = hospitals_filtered_added.drop_duplicates('COUNTY')

# Slicing to just state county and hospital number
hospitals_county_level = hospitals_county_level[['STATE', 'COUNTY','county_hospitals']]
hospitals_county_level.head()

Unnamed: 0,STATE,COUNTY,county_hospitals
0,Central Equatoria,Juba,76
76,Central Equatoria,Kajo-Keji,46
122,Central Equatoria,Lainya,33
155,Central Equatoria,Morobo,12
167,Central Equatoria,Terekeka,45


In [14]:
print("Even after these operations the dataframe still retained the original {} counties.".format(len(hospitals_county_level['COUNTY'].unique())))

Even after these operations the dataframe still retained the original 77 counties.


### Merging Conflict and Hospital Data

In [15]:
# Hospitals and Conflict Merge
hos_conf_merged = pd.merge(merged_conflict_data, hospitals_county_level, left_on='admin2', right_on='COUNTY',how='left')

# Renaming 
hos_conf_merged = hos_conf_merged.rename(columns={'STATE':'state','COUNTY':'county'})

# Reordering Dataframe
hos_conf_merged = hos_conf_merged[['year', 'state', 'county','fatalities', 'county_hospitals']]

# Checking the DF
hos_conf_merged.head()

Unnamed: 0,year,state,county,fatalities,county_hospitals
0,2012,Unity,Abiemnhom,7,5.0
1,2013,Unity,Abiemnhom,8,5.0
2,2014,Unity,Abiemnhom,0,5.0
3,2011,Jonglei,Akobo,1,14.0
4,2012,Jonglei,Akobo,98,14.0


### Demographics
Population: </br>
https://www.worldometers.info/world-population/south-sudan-population/

In [25]:
# Reading in Data 
demographics = pd.read_excel('demographics_2020.xlsx')

# Printing the head
demographics.head(3)

Unnamed: 0,State,Pcode1,County,Pcode2,2019 Baseline Population,Children <18yrs,No. male children under 5,No. female children under 5,No. male children aged 5 - 17 years,No. female children aged 5 - 17 years,No. male adults aged 18 - 60,No. female adults aged 18 - 60,No. male adults aged over 60,No. female adults aged over 60
0,Central Equatoria,92.0,Juba,9202.0,499537.632115,241976.028996,36965.784777,48654.965368,76928.795346,79426.483506,115892.730651,120388.56934,11739.134355,9541.168773
1,Central Equatoria,92.0,Kajo-keji,9206.0,221902.149132,35948.148159,3617.005031,6679.254689,12759.373575,12892.514865,72783.904915,71008.687722,28181.57294,13979.835395
2,Central Equatoria,92.0,Lainya,9203.0,110281.690307,46318.309929,11579.577482,11138.450721,12130.985934,11469.295792,31540.563428,25805.915532,2977.605638,3639.29578


In [27]:
print("This DF has one extra county {} instead of 79.".format(len(demographics['County'].unique())))

This DF has one extra county 80 instead of 79.


### Migration - Is not annual?
Current Dataframe: hos_conf_merged

In [19]:
# Reading in the Correct Sheet
migration = pd.read_excel('migration.xlsx', sheet_name='Displaced by county')

# Dropping to the correct location
migration = migration.iloc[4:,]

# Setting the top row to column names
migration.columns = migration.iloc[0]

# Removing top row
migration = migration.iloc[1:,]

# Only the left section was people leaving by county, slicing appropriately.
migration_sliced = migration.iloc[:,0:4]

# Printing the Head
migration_sliced.head()

4,State,County,Sum of 30 Jan 2015 IDPs,Change from 15 Nov
5,Central Equatoria,Juba,45578,1595
6,Central Equatoria,Lainya,0,0
7,Central Equatoria,Terekeka,18303,0
8,Central Equatoria,Yei,77,0
9,Central Equatoria,Kajo-Keji,0,0


In [22]:
print("This dataset represents {} of the 79 counties".format(len(migration_sliced['County'].unique())))

This dataset represents 73 of the 79 counties


### Population

In [24]:
population = pd.read_excel('population.xlsx')

# Printing the head
population.head()

Unnamed: 0,admin2Pcod,Admin_1,Admin_2,SS2008,WP2010,LS2011,LS2012,LS2013,LS2014,UN2014,WP2015,ETHIOPIA,UGANDA,"IDP - Dec. 3, 2015",Internal Out-Migration,Refugees,Net Migration,Estimated 2015 Population,PopChange
0,SS1302,Unity,Abiemnhom,17012.0,22025,19393,21754.0,23203.0,24080,23369,31683,0,0,14217,11735.715071,0,2481.284929,25850.284929,0.106178
1,SS1207,Jonglei,Akobo,136210.0,132141,157474,176578.0,184131.0,191996,170211,188273,22885,0,34387,45815.7821,22885,-11428.7821,135897.2179,-0.201596
2,SS2205,Northern Bahr el Ghazal,Aweil Centre,41827.0,58488,49174,55547.0,56193.0,57442,106523,87394,0,0,0,0.0,0,0.0,106523.0,0.0
3,SS2202,Northern Bahr el Ghazal,Aweil East,309921.0,299846,353806,398384.0,417754.0,436824,529098,431148,0,0,0,0.0,0,0.0,529098.0,0.0
4,SS2201,Northern Bahr el Ghazal,Aweil North,129127.0,142807,146303,164515.0,174104.0,184654,267215,206227,0,0,0,0.0,0,0.0,267215.0,0.0


In [17]:
len(population['Admin_2'].unique())

79

In [117]:
population['Admin_2'].unique()

array(['Abiemnhom', 'Akobo', 'Aweil Centre', 'Aweil East', 'Aweil North',
       'Aweil South', 'Aweil West', 'Awerial', 'Ayod', 'Baliet',
       'Bor South', 'Budi', 'Canal/Pigi', 'Cueibet', 'Duk', 'Ezo',
       'Fangak', 'Fashoda', 'Gogrial East', 'Gogrial West', 'Guit',
       'Ibba', 'Ikotos', 'Juba', 'Jur River', 'Kajo-keji', 'Kapoeta East',
       'Kapoeta North', 'Kapoeta South', 'Koch', 'Lafon', 'Lainya',
       'Leer', 'Longochuk', 'Luakpiny/Nasir', 'Maban', 'Magwi', 'Maiwut',
       'Malakal', 'Manyo', 'Maridi', 'Mayendit', 'Mayom', 'Melut',
       'Morobo', 'Mundri East', 'Mundri West', 'Mvolo', 'Nagero',
       'Nyirol', 'Nzara', 'Panyijiar', 'Panyikang', 'Pariang', 'Pibor',
       'Pochalla', 'Raga', 'Renk', 'Rubkona', 'Rumbek Centre',
       'Rumbek East', 'Rumbek North', 'Tambura', 'Terekeka', 'Tonj East',
       'Tonj North', 'Tonj South', 'Torit', 'Twic', 'Twic East', 'Ulang',
       'Uror', 'Wau', 'Wulu', 'Yambio', 'Yei', 'Yirol East', 'Yirol West',
       nan], dtype

### Swing

In [11]:
# #conflict_sliced_grouped['year'].unique()
# conflict_sliced_grouped.to_csv('aggregated_conflict_data.csv')

# # Reading out to CSV then back in fixes index
# merged_conflict_data = pd.read_csv('aggregated_conflict_data.csv')

# merged_conflict_data['year'].unique()

# # Slicing to just state county and hospital number
# hospitals_county_level = hospitals_county_level[['STATE', 'COUNTY','county_hospitals']]
# hospitals_county_level.head()

# # Getting a count of hospitals by region. 
# # Grouping and counting to get the total registered voters by precinct
# count_total = hospitals_filtered.groupby('COUNTY')['FACIL_NAME2'].count().rename("county_hospitals").reset_index()

# # Combining back together
# hospitals_filtered_added =  hospitals_filtered.merge(count_total)

# # Dropping duplicates 
# hospitals_county_level = hospitals_filtered_added.drop_duplicates('COUNTY')

# # Here I will begin merging the conflict data with the hospital data. 
# merged_conflict_data.head(1)

# len(merged_conflict_data['admin2'].unique())

# hospitals_filtered.head(1)

# len(hospitals_filtered['COUNTY'].unique())

# hospitals_filtered.columns

# # Renaming 
# hos_conf_merged = hos_conf_merged.rename(columns={'STATE':'state','COUNTY':'county'})

# # Reordering Dataframe
# hos_conf_merged = hos_conf_merged[['year', 'state', 'county','fatalities', 'county_hospitals']]

# # Checking the DF
# hos_conf_merged.head()