# Post-processing: Flooding Events

**Tasks**
* Combine Google Earth Engine data
* Difference the before's and after's of the FC dataset
* Perform a point-in-polygon test to match admin boundaries between GEE data & FC data
* Merge and clean final dataset
* Runtime: < 10 seconds

In [1]:
import json
import geodaisy.converters as convert
import geopandas as gpd
import pandas as pd
import timeit

start = timeit.default_timer()

## Combine GEE data
* For all countries of interest, collate event data extracted from GEE
* NOTE: Make sure the file paths match the file names of the saved GEE CSV's

In [2]:
# Load & Combine Burkina Faso events
bf1 = pd.read_csv('Data/GEE/Burkina Faso/FloodedDiff 2016-09-14 BF.csv')
bf2 = pd.read_csv('Data/GEE/Burkina Faso/FloodedDiff 2018-08-20 BF.csv')
bf3 = pd.read_csv('Data/GEE/Burkina Faso/FloodedDiff 2018-09-01 BF.csv')

bf1 = bf1.drop(['system:index', 'ADM0_CODE', 'ADM1_CODE', 'ADM1_NAME', 'ADM2_CODE', 'DISP_AREA', 'EXP2_YEAR', 'STATUS',
                'STR2_YEAR', 'Shape_Area', 'Shape_Leng'], axis=1)
bf2 = bf2.drop(['system:index', 'ADM0_CODE', 'ADM1_CODE', 'ADM1_NAME', 'ADM2_CODE', 'DISP_AREA', 'EXP2_YEAR', 'STATUS',
                'STR2_YEAR', 'Shape_Area', 'Shape_Leng'], axis=1)
bf3 = bf3.drop(['system:index', 'ADM0_CODE', 'ADM1_CODE', 'ADM1_NAME', 'ADM2_CODE', 'DISP_AREA', 'EXP2_YEAR', 'STATUS',
                'STR2_YEAR', 'Shape_Area', 'Shape_Leng'], axis=1)

bf1['flood_date'] = '2016-09-14'
bf2['flood_date'] = '2018-08-20'
bf3['flood_date'] = '2018-09-01'

bf = pd.DataFrame( pd.concat([bf1,bf2,bf3], ignore_index=True) )


In [3]:
# Load Mali events
ml1 = pd.read_csv('Data/GEE/Mali/FloodedDiff 2018-08-20 ML.csv')
ml2 = pd.read_csv('Data/GEE/Mali/FloodedDiff 2018-09-01 ML.csv')

ml1 = ml1.drop(['system:index', 'ADM0_CODE', 'ADM1_CODE', 'ADM1_NAME', 'ADM2_CODE', 'DISP_AREA', 'EXP2_YEAR', 'STATUS',
                'STR2_YEAR', 'Shape_Area', 'Shape_Leng'], axis=1)
ml2 = ml2.drop(['system:index', 'ADM0_CODE', 'ADM1_CODE', 'ADM1_NAME', 'ADM2_CODE', 'DISP_AREA', 'EXP2_YEAR', 'STATUS',
                'STR2_YEAR', 'Shape_Area', 'Shape_Leng'], axis=1)


ml1['flood_date'] = '2018-08-20'
ml2['flood_date'] = '2018-09-01'

ml = pd.DataFrame( pd.concat([ml1,ml2], ignore_index=True) )

In [4]:
# Load Niger events
ne1 = pd.read_csv('Data/GEE/Niger/FloodedDiff 2016-09-14 NE.csv')
ne2 = pd.read_csv('Data/GEE/Niger/FloodedDiff 2018-08-20 NE.csv')
ne3 = pd.read_csv('Data/GEE/Niger/FloodedDiff 2018-09-01 NE.csv')

ne1 = ne1.drop(['system:index', 'ADM0_CODE', 'ADM1_CODE', 'ADM1_NAME', 'ADM2_CODE', 'DISP_AREA', 'EXP2_YEAR', 'STATUS',
                'STR2_YEAR', 'Shape_Area', 'Shape_Leng'], axis=1)
ne2 = ne2.drop(['system:index', 'ADM0_CODE', 'ADM1_CODE', 'ADM1_NAME', 'ADM2_CODE', 'DISP_AREA', 'EXP2_YEAR', 'STATUS',
                'STR2_YEAR', 'Shape_Area', 'Shape_Leng'], axis=1)
ne3 = ne3.drop(['system:index', 'ADM0_CODE', 'ADM1_CODE', 'ADM1_NAME', 'ADM2_CODE', 'DISP_AREA', 'EXP2_YEAR', 'STATUS',
                'STR2_YEAR', 'Shape_Area', 'Shape_Leng'], axis=1)

ne1['flood_date'] = '2016-09-14'
ne2['flood_date'] = '2018-08-20'
ne3['flood_date'] = '2018-09-01'

ne = pd.DataFrame( pd.concat([ne1,ne2,ne3], ignore_index=True) )

In [5]:
# Combine all GEE data
gee = pd.DataFrame( pd.concat([bf,ml,ne], ignore_index=True) )
gee = gee[['ADM0_NAME', 'ADM2_NAME', 'flood_date', 'Diff', '.geo']]
gee.rename(columns = {'ADM0_NAME': 'country', 'ADM2_NAME':'admin_name'}, inplace = True)
gee['flood_date'] = pd.to_datetime(gee['flood_date'])
gee = gee.sort_values(by=['country', 'admin_name'])
gee = gee.reset_index()
gee = gee.drop(['index'], axis=1)


## Difference the FC dataset

* With the before/after FC data, take the difference between covariate data with
the 'State' labels before/after (for each admin, for every event)

In [6]:
# Load FC0 data for differencing
FC0 = pd.read_csv('Data/FC_before_after_data.csv')
FC0 = FC0.drop(['geometry'], axis=1)
# Drop flood_dates that could not be extracted from GEE
index_names = FC0[(FC0['country'] == 'Mali') & (FC0['flood_date'] == '2016-09-14 00:00:00')].index
FC0.drop(index_names, inplace = True)
index_names = FC0[(FC0['country'] == 'Niger') & (FC0['flood_date'] == '2015-08-30 00:00:00')].index
FC0.drop(index_names, inplace = True)
FC0 = FC0.reset_index()
FC0 = FC0.drop(['index'], axis=1)
print(list(FC0.columns))

['country', 'admin_name', 'centx', 'centy', 'state', 'datetime', 'flood_date', 'fews_ipc', 'fews_ha', 'ndvi_mean', 'rain_mean', 'et_mean', 'acled_count', 'acled_fatalities', 'p_staple_food', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct', 'spacelag', 'timelag1', 'timelag2']


In [7]:
col = ['country', 'admin_name', 'centx', 'centy', 'flood_date', 'fews_ipc','fews_ha', 'ndvi_mean', 'rain_mean', 'et_mean',
       'acled_count', 'acled_fatalities', 'p_staple_food', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct',
       'spacelag', 'timelag1', 'timelag2']

FC = pd.DataFrame(columns=col)

x = int(len(FC0)/2)
skip = [n*2 for n in range(0, x)]
counter = 0
for i in skip:
    before = i
    after = i+2
    counter = i+1
    diff = FC0[FC0.columns[7:]][before:after].diff()
    FC = FC.append(diff)
    FC.loc[counter, ['country']] = FC0.loc[counter, ['country']]
    FC.loc[counter, ['admin_name']] = FC0.loc[counter, ['admin_name']]
    FC.loc[counter, ['flood_date']] = FC0.loc[counter, ['flood_date']]
    FC.loc[counter, ['centx']] = FC0.loc[counter, ['centx']]
    FC.loc[counter, ['centy']] = FC0.loc[counter, ['centy']]

In [8]:
FC = FC.dropna(how='all')
FC = FC.reset_index()
FC = FC.drop(['index'], axis=1)

## Point-in-polygon test

Check the dataframe lengths because the FC shapefiles are likely to be different from GEE Admin 2 shapefiles.
Different countries may require different techniques to overcome this issue. For the data at hand, a
point-in-polygon test will be used.



In [9]:
# There are differences in file length - so the admin codes are not matching
print('FC data length: ', len(FC))
print('GEE data length: ', len(gee))
test_FC = FC.drop_duplicates(subset=['country', 'admin_name'])
test_gee = gee.drop_duplicates(subset=['country', 'admin_name'])
print('Number of FC admin codes: ', len(test_FC))
print('Number of GEE admin codes: ', len(test_gee))

FC data length:  436
GEE data length:  349
Number of FC admin codes:  162
Number of GEE admin codes:  133


Create shapefiles from GEE geometry column

In [10]:

for i in test_gee.index:
    x = test_gee.loc[i, ['.geo']]
    x = json.loads(x[0])
    try:
        test_gee.loc[i, ['.geo']] = convert.geojson_to_wkt(x)
    except: # Error handling for geometry collections
        y = list(x.values())
        y = y[1]
        test_gee.loc[i, ['.geo']] = convert.geojson_to_wkt(y[1])
        continue

test_gee['.geo'] = gpd.GeoSeries.from_wkt(test_gee['.geo'])
gee_gdf = gpd.GeoDataFrame(test_gee, geometry='.geo')
gee_gdf.rename(columns = {'.geo':'geometry'}, inplace = True)
gee_gdf = gee_gdf.reset_index()
gee_gdf = gee_gdf.drop(['index'], axis=1)
gee_gdf = gee_gdf.set_crs('epsg:3857')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Create points from the FC data.

In [11]:
# Create points
FC_gdf = gpd.GeoDataFrame(FC, geometry=gpd.points_from_xy(FC.centx, FC.centy))
FC_gdf = FC_gdf.set_crs('epsg:3857')

If the FC admin point is within any of the GEE polygons assign that that point the GEE admin name

In [12]:
for i in gee_gdf.index:
    for j in FC_gdf.index:
        polygon = gee_gdf['geometry'][i]
        point = FC_gdf['geometry'][j]
        if polygon.contains(point) == True:
            FC['admin_name'][j] = gee_gdf['admin_name'][i]
        else:
            continue


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FC['admin_name'][j] = gee_gdf['admin_name'][i]


Check differences between admin names after point-in-polygon test

In [13]:
test_FC = FC.drop_duplicates(subset=['country', 'admin_name'])
print('Number of FC admin codes post processing: ',len(test_FC))

# Check strings and fix differences
gee.drop(gee.index[gee['admin_name'] == 'Commune 2'], inplace=True)
gee.drop(gee.index[gee['admin_name'] == 'Commune 3'], inplace=True)
l = set(FC['admin_name'].tolist())
x = set(gee['admin_name'].tolist())
print(set(list(l - x)))
print(set(list(x - l)))


Number of FC admin codes post processing:  131
set()
set()


In the FC data, take the mean of values with the same country, admin name and
flood date

In [14]:
FC = FC.groupby(['country', 'admin_name', 'flood_date']).mean().reset_index()
print(len(FC))
print(len(gee))

343
343


## Combine GEE & FC Data

In [15]:
# Merge data on admin_name
df = pd.merge(FC, gee, on='admin_name')

# Reorder columns
df = df.drop(['country_y','flood_date_y','.geo'], axis=1)
df.rename(columns = {'flood_date_x':'flood_date', 'country_x':'country', 'Diff':'flooding_diff'}, inplace = True)
df = df[['country', 'admin_name', 'flood_date', 'fews_ipc', 'fews_ha', 'ndvi_mean', 'rain_mean', 'et_mean', 'acled_count',
         'acled_fatalities', 'p_staple_food', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct',
         'flooding_diff', 'spacelag', 'timelag1', 'timelag2']]

# Group duplicates on mean
df = df.groupby(['country', 'admin_name', 'flood_date']).mean().reset_index()


In [16]:
# Download data
df.to_csv('Data/FC_flood_differenced_data.csv', index = False)

stop = timeit.default_timer()
print('Running Time: ', stop - start, 'seconds')

Running Time:  8.9361186 seconds
