Goal:
Clean survey dataset
- unify commata to decimals
- clean columns from typos
- define correct datatype to the columns
- select cols wich could be later needed for model development
- combine targets based on compared predictators


For reasons of reusability and comparability the workflow of the data preprocessing was kept similar to previous preprocessings of the same questionnaire. This was achieved by dealing with missing or erroneous values in a similar way likewise erroneous values with commas or points in the beginning were converted to decimal numbers beginning with zero. 

In [645]:
import numpy as np
import pandas as pd
import geopandas as gpd

import re

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', None)
plt.figure(figsize=(20, 10))
sns.set_style('darkgrid')

<Figure size 2000x1000 with 0 Axes>

In [646]:
# load raw survey data

raw_data = pd.read_excel("../input_survey_data/all-attributes_shophouses.xlsx")#, thousands=',')
raw_data.tail(3)

Unnamed: 0,Q0.1,Q0.2.Ward,Q0.2.District,Q0.3,Q0.4,Q0.5,Q0.6,Q0.7,Pre.Q1,Pre.Q2,P1Q1,P1Q1.specify,P1Q2.1.1,P1Q2.2.1,P1Q2.3.1,P1Q2.4.1,P1Q2.5.1.0,P1Q2.5.1.1,P1Q2.5.1.2,P1Q2.5.1.3,P1Q2.5.1.4,P1Q2.5.1.88,P1Q2.5.1.99,P1Q2.5.1.specify,P1Q2.6.1,P1Q2.7.1.1,P1Q2.7.1.2,P1Q2.7.1.3,P1Q2.7.1.88,P1Q2.7.1.99,P1Q2.7.1.specify,P1Q2.8.1.1,P1Q2.8.1.2,P1Q2.8.1.3,P1Q2.8.1.4,P1Q2.8.1.5,P1Q2.8.1.6,P1Q2.8.1.7,P1Q2.8.1.8,P1Q2.8.1.9,P1Q2.8.1.10,P1Q2.8.1.99,P1Q2.8.1.specify,P1Q2.9.1,P1Q2.10.1.1,P1Q2.10.1.2,P1Q2.10.1.3,P1Q2.10.1.4,P1Q2.10.1.5,P1Q2.10.1.6,P1Q2.10.1.7,P1Q2.10.1.8,P1Q2.10.1.9,P1Q2.10.1.88,P1Q2.10.1.99,P1Q2.10.1.specify,P1Q2.11.1.1,P1Q2.11.1.2,P1Q2.11.1.3,P1Q2.11.1.4,P1Q2.11.1.5,P1Q2.11.1.6,P1Q2.11.1.7,P1Q2.11.1.8,P1Q2.11.1.9,P1Q2.11.1.88,P1Q2.11.1.99,P1Q2.11.1.specify,P1Q3.2.1,P1Q3.3.1,P1Q3.4.1,P1Q3.5.1,P1Q3.6.1,P1Q3.7.1,P1Q3.88.1,P1Q3.88.1.specify,P1Q3.8.1,P1Q3.9.1.man,P1Q3.9.1.days,P1Q3.10.1.1,P1Q3.10.1.2,P1Q3.10.1.3,P1Q3.10.1.4,P1Q3.10.1.5,P1Q3.10.1.6,P1Q3.10.1.88,P1Q3.10.1.99,P1Q3.10.1.specify,P1Q3.11.1,P1Q4.2.1,P1Q4.3.1,P1Q4.4.1,P1Q4.5.1,P1Q4.88.1,P1Q4.88.1.specify,P1Q4.6.1,P1Q5.2.1,P1Q5.3.1,P1Q5.4.1,P1Q5.5.1,P1Q5.88.1,P1Q5.88.1.specify,P1Q5.6.1,P1Q5.7.1,P1Q5.8.1,P1Q5.9.1,P1Q6.2.1.1.relationship,P1Q6.2.1.1.specify,P1Q6.2.1.1.Sex,P1Q6.2.1.1.Age,P1Q6.3.1.1.Rank1,P1Q6.3.1.1.Rank2,P1Q6.3.1.1.Rank3,P1Q6.3.1.1.Rank4,P1Q6.3.1.1.Rank5,P1Q6.3.1.1.Rank88,P1Q6.3.1.1.specify,P1Q6.4.1.1,P1Q6.2.1.2.relationship,P1Q6.2.1.2.specify,P1Q6.2.1.2.Sex,P1Q6.2.1.2.Age,P1Q6.3.1.2.Rank1,P1Q6.3.1.2.Rank2,P1Q6.3.1.2.Rank3,P1Q6.3.1.2.Rank4,P1Q6.3.1.2.Rank5,P1Q6.3.1.2.Rank88,P1Q6.3.1.2.specify,P1Q6.4.1.2,P1Q6.2.1.3.relationship,P1Q6.2.1.3.specify,P1Q6.2.1.3.Sex,P1Q6.2.1.3.Age,P1Q6.3.1.3.Rank1,P1Q6.3.1.3.Rank2,P1Q6.3.1.3.Rank3,P1Q6.3.1.3.Rank4,P1Q6.3.1.3.Rank5,P1Q6.3.1.3.Rank88,P1Q6.3.1.3.specify,P1Q6.4.1.3,P1Q7.2.1.1,P1Q7.2.1.2,P1Q7.2.1.3,P1Q7.2.1.4,P1Q7.2.1.5,P1Q7.2.1.88,P1Q7.2.1.98,P1Q7.2.1.99,P1Q7.2.1.specify,P1Q7.3.1.1,P1Q7.3.1.2,P1Q7.3.1.3,P1Q7.3.1.4,P1Q7.3.1.5,P1Q7.3.1.6,P1Q7.3.1.88,P1Q7.3.1.98,P1Q7.3.1.99,P1Q7.3.1.specify,P1Q2.1.2,P1Q2.2.2,P1Q2.3.2,P1Q2.4.2,P1Q2.5.2.0,P1Q2.5.2.1,P1Q2.5.2.2,P1Q2.5.2.3,P1Q2.5.2.4,P1Q2.5.2.88,P1Q2.5.2.99,P1Q2.5.2.specify,P1Q2.6.2,P1Q2.7.2.1,P1Q2.7.2.2,P1Q2.7.2.3,P1Q2.7.2.88,P1Q2.7.2.99,P1Q2.7.2.specify,P1Q2.8.2.1,P1Q2.8.2.2,P1Q2.8.2.3,P1Q2.8.2.4,P1Q2.8.2.5,P1Q2.8.2.6,P1Q2.8.2.7,P1Q2.8.2.8,P1Q2.8.2.9,P1Q2.8.2.10,P1Q2.8.2.99,P1Q2.8.2.specify,P1Q2.9.2,P1Q2.10.2.1,P1Q2.10.2.2,P1Q2.10.2.3,P1Q2.10.2.4,P1Q2.10.2.5,P1Q2.10.2.6,P1Q2.10.2.7,P1Q2.10.2.8,P1Q2.10.2.9,P1Q2.10.2.88,P1Q2.10.2.99,P1Q2.10.2.specify,P1Q2.11.2.1,P1Q2.11.2.2,P1Q2.11.2.3,P1Q2.11.2.4,P1Q2.11.2.5,P1Q2.11.2.6,P1Q2.11.2.7,P1Q2.11.2.8,P1Q2.11.2.9,P1Q2.11.2.88,P1Q2.11.2.99,P1Q2.11.2.specify,P1Q3.2.2,P1Q3.3.2,P1Q3.4.2,P1Q3.5.2,P1Q3.6.2,P1Q3.7.2,P1Q3.88.2,P1Q3.88.2.specify,P1Q3.8.2,P1Q3.9.2.man,P1Q3.9.2.days,P1Q3.10.2.1,P1Q3.10.2.2,P1Q3.10.2.3,P1Q3.10.2.4,P1Q3.10.2.5,P1Q3.10.2.6,P1Q3.10.2.88,P1Q3.10.2.99,P1Q3.10.2.specify,P1Q3.11.2,P1Q4.2.2,P1Q4.3.2,P1Q4.4.2,P1Q4.5.2,P1Q4.88.2,P1Q4.88.2.specify,P1Q4.6.2,P1Q5.2.2,P1Q5.3.2,P1Q5.4.2,P1Q5.5.2,P1Q5.88.2,P1Q5.88.2.specify,P1Q5.6.2,P1Q5.7.2,P1Q5.8.2,P1Q5.9.2,P1Q6.2.2.1.relationship,P1Q6.2.2.1.specify,P1Q6.2.2.1.Sex,P1Q6.2.2.1.Age,P1Q6.3.2.1.Rank1,P1Q6.3.2.1.Rank2,P1Q6.3.2.1.Rank3,P1Q6.3.2.1.Rank4,P1Q6.3.2.1.Rank5,P1Q6.3.2.1.Rank88,P1Q6.3.2.1.specify,P1Q6.4.2.1,P1Q6.2.2.2.relationship,P1Q6.2.2.2.specify,P1Q6.2.2.2.Sex,P1Q6.2.2.2.Age,P1Q6.3.2.2.Rank1,P1Q6.3.2.2.Rank2,P1Q6.3.2.2.Rank3,P1Q6.3.2.2.Rank4,P1Q6.3.2.2.Rank5,P1Q6.3.2.2.Rank88,P1Q6.3.2.2.specify,P1Q6.4.2.2,P1Q6.2.2.3.relationship,P1Q6.2.2.3.specify,P1Q6.2.2.3.Sex,P1Q6.2.2.3.Age,P1Q6.3.2.3.Rank1,P1Q6.3.2.3.Rank2,P1Q6.3.2.3.Rank3,P1Q6.3.2.3.Rank4,P1Q6.3.2.3.Rank5,P1Q6.3.2.3.Rank88,P1Q6.3.2.3.specify,P1Q6.4.2.3,P1Q7.2.2.1,P1Q7.2.2.2,P1Q7.2.2.3,P1Q7.2.2.4,P1Q7.2.2.5,P1Q7.2.2.88,P1Q7.2.2.98,P1Q7.2.2.99,P1Q7.2.2.specify,P1Q7.3.2.1,P1Q7.3.2.2,P1Q7.3.2.3,P1Q7.3.2.4,P1Q7.3.2.5,P1Q7.3.2.6,P1Q7.3.2.88,P1Q7.3.2.98,P1Q7.3.2.99,P1Q7.3.2.specify,P2Q1.1.implement,P2Q1.1.spend,P2Q1.2.implement,P2Q1.2.spend,P2Q1.3.implement,P2Q1.3.spend,P2Q1.4.implement,P2Q1.4.spend,P2Q1.5.implement,P2Q1.5.spend,P2Q1.6.implement,P2Q1.6.spend,P2Q1.7.implement,P2Q1.7.spend,P2Q2.1,P2Q2.2,P2Q2.3,P2Q2.3.specify,P2Q2.4,P2Q2.5.man,P2Q2.5.days,P2Q2.6.1,P2Q2.6.2,P2Q2.6.3,P2Q2.6.4,P2Q2.6.5,P2Q2.6.6,P2Q2.6.88,P2Q2.6.98,P2Q2.6.99,P2Q2.6.specify,P2Q3.1.1,P2Q3.1.2,P2Q3.2.1,P2Q3.2.2,P3Q1.1,P3Q1.2,P3Q1.3,P3Q1.4,P3Q1.5,P3Q1.6,P3Q1.7,P3Q2.1,P3Q2.2,P3Q2.3.Rank1,P3Q2.3.Rank2,P3Q2.3.Rank3,P3Q2.3.Rank4,P3Q2.3.Rank5,P3Q2.3.Rank6,P3Q2.3.Rank7,P3Q2.3.Rank88,P3Q2.3.Specify,P3Q2.4,P3Q2.4.Specify,P3Q2.5,P3Q2.5.Specify,P3Q2.6,P3Q2.6.Specify,P3Q3.1,P3Q3.2,P3Q3.3,P3Q3.4,P3Q3.5,P3Q3.6,P3Q3.7,P3Q3.8,P3Q3.9,P3Q3.10,P3Q3.11,P4Q1.1,P4Q1.2,P4Q1.3,P4Q1.4,P4Q1.5.0,P4Q1.5.1,P4Q1.5.2,P4Q1.5.3,P4Q1.5.4,P4Q1.5.5,P4Q1.5.6,P4Q1.5.7,P4Q1.5.99,P4Q1.5.Specify,P4Q1.6,P4Q1.7.1,P4Q1.7.2,P4Q1.7.3,P4Q1.7.4,P4Q1.7.5,P4Q1.7.99,P4Q1.7.Specify,P4Q1.8,P4Q1.9,P4Q1.10,P4Q1.11,P4Q2.1,P4Q2.2,P4Q2.3,P4Q2.4,P4Q2.5,P4Q3.1.Rank1,P4Q3.1.Rank2,P4Q3.1.Rank3,P4Q3.1.Rank4,P4Q3.1.Rank5,P4Q3.1.Rank6,P4Q3.1.Rank88,P4Q3.1.Specify,P4Q3.2.Rank1,P4Q3.2.Rank2,P4Q3.2.Rank3,P4Q3.2.Rank4,P4Q3.2.Rank5,P4Q3.2.Rank88,P4Q3.2.Specify,P4Q3.3.Rank1,P4Q3.3.Rank2,P4Q3.3.Rank3,P4Q3.3.Rank4,P4Q3.3.Rank5,P4Q3.3.Rank6,P4Q3.3.Rank7,P4Q3.3.Rank8,P4Q3.3.Rank88,P4Q3.3.Specify,P4Q3.4.Rank1,P4Q3.4.Rank2,P4Q3.4.Rank3,P4Q3.4.Rank4,P4Q3.4.Rank5,P4Q3.4.Rank6,P4Q3.4.Rank7,P4Q3.4.Rank8,P4Q3.4.Rank9,P4Q3.4.Rank10,P4Q3.4.Rank88,P4Q3.4.Specify,P4Q3.5.Rank1,P4Q3.5.Rank2,P4Q3.5.Rank3,P4Q3.5.Rank4,P4Q3.5.Rank5,P4Q3.5.Rank88,P4Q3.5.Specify,P4Q3.6,P4Q3.6.cm,P4Q4.1,P4Q4.2.1,P4Q4.3.1.1,P4Q4.3.1.2,P4Q4.3.1.3,P4Q4.3.1.4,P4Q4.3.1.5,P4Q4.3.1.6,P4Q4.3.1.88,P4Q4.3.1.Specify,P4Q4.4.1.1,P4Q4.4.1.2,P4Q4.4.1.3,P4Q4.4.1.4,P4Q4.4.1.5,P4Q4.4.1.6,P4Q4.4.1.88,P4Q4.4.1.99,P4Q4.4.1.Specify,P4Q4.5.1,P4Q4.2.2,P4Q4.3.2.1,P4Q4.3.2.2,P4Q4.3.2.3,P4Q4.3.2.4,P4Q4.3.2.5,P4Q4.3.2.6,P4Q4.3.2.88,P4Q4.3.2.Specify,P4Q4.4.2.1,P4Q4.4.2.2,P4Q4.4.2.3,P4Q4.4.2.4,P4Q4.4.2.5,P4Q4.4.2.6,P4Q4.4.2.88,P4Q4.4.2.99,P4Q4.4.2.Specify,P4Q4.5.2,P4Q5.1,P4Q5.2,P4Q5.3,P4Q5.4,P4Q5.5,P4Q5.6,P4Q5.7,P4Q5.8,P4Q5.9,P4Q5.10,P4Q5.12,P4Q5.13,P4Q5.14,P4Q5.15,P4Q5.88,P4Q5.Specify,P5Q1.1,P5Q1.1.Specify,P5Q1.2,P5Q1.3,P5Q1.3.Specify,P5Q1.4,P5Q1.5,P5Q1.6.1,P5Q1.6.2,P5Q1.6.3,P5Q1.6.4,P5Q1.6.5,P5Q1.6.88,P5Q1.6.Specify,P5Q1.7,P5Q1.8.1,P5Q1.8.2,P5Q1.8.3,P5Q1.8.88,P5Q1.8.Specify,P5Q1.9.1,P5Q1.9.2,P5Q1.9.3,P5Q1.9.4,P5Q1.9.5,P5Q1.9.6,P5Q1.10,P5Q1.11,P5Q1.12,P5Q1.13,GPS,NoPhotos,places,Q0.10,Q0.11,Q0.12,Q0.13,Q0.14
249,8040020000000,Nh?n ??c,Nhà Bè,8/27 ?p 3,1,Võ Th? Hoa,14:00:00,16,1,1,4,,Ng?p 2017,08/99/2017,5,20,0,1,0,0,1,0,0,,1,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,0,0,1,0,0,0,0,0,0,1,0,Xây kè xung quanh nhà,0,1,1,1,0,0,0,0,0,0,0,,2,2,1,1,1,98,,,2,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,99,1,1,1,1,,,0,1,1,1,1,,,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,1,0,,0,0,0,0,0,0,0,1,0,,Ng?p 2017,08/99/2017,5,20,0,1,0,0,1,0,0,,1,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,0,0,1,0,0,0,0,0,0,1,0,Xây kè xung quanh nhà,0,1,1,1,0,0,0,0,0,0,0,,2,2,1,1,1,98,,,2,0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,99,1,1,1,1,,,2,1,1,1,1,,,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,1,0,,0,0,0,0,0,0,0,1,0,,3,0,5,0,5,0,5,0,5,0,5,0,3,99,2005,120,2,,150,10,30,0,0,0,0,0,0,0,1,0,,2,,2,,1,5,1,1,1,5,1,1,1,3,2,1.0,,,,,,,2,,2,,3,,5,1,5,1,1,5,1,5,1,1,1,7,0,0,2,1,0,0,0,0,0,0,0,0,,2,0,0,0,0,0,0,,5,2,4,2,1984,2005,85,1,400,3,,,,,,,,3,4,,,,,,2,,,,,,,,,,2,,,,,,,,,,,,3,,,,,,,2,30,1,99/2020,0,0,0,0,0,0,1,S?a nhà v? sinh,0,0,0,1,0,0,0,0,,25,99/2019,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Nâng sân tr??c,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,,10.0,1,0,6,0,1,1,1,3,3,1,0,0,4,2,0,,3,,2005,11,,5,3,1,0,0,0,0,0,,300,0,0,1,0,,1,1,1,0,0,0,2,1,2,2,"10.678169,106.699339",0,,14:30:00,30,0,,2020-10-04
250,8040020000000,Nh?n ??c,Nhà Bè,1465/46/17 Lê V?n L??ng,1,Tô V?n D?ng,09:56:00,16,1,1,4,,??t ng?p do tri?u c??ng dâng cao 2019,10/99/2019,3,5,0,1,0,0,1,0,0,,2,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,1,1,1,1,0,0,1,0,0,0,0,,0,1,0,1,0,1,0,0,0,0,0,,3,3,1,1,1,98,,,120000000,5,30,,,,,,,,,,600,2,1,1,1,,,0,1,1,1,4,,,2,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,1,0,,0,0,0,0,0,0,0,1,0,,??t ng?p do tri?u c??ng dâng cao 2019,10/99/2019,3,5,0,1,0,0,1,0,0,,2,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,1,1,1,1,0,0,1,0,0,0,0,,0,1,0,1,0,1,0,0,0,0,0,,3,3,1,1,1,98,,,120000000,5,30,,,,,,,,,,600,2,1,1,1,,,2,1,1,1,4,,,2,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,1,0,,0,0,0,0,0,0,0,1,0,,5,0,3,",3",5,0,4,120,5,0,5,0,5,0,2019,50,1,,60,5,30,1,0,0,0,0,0,0,0,0,,2,,2,,5,5,1,1,1,5,1,2,2,3,2,,,,,,,,1,,3,,2,,5,1,5,1,5,4,5,5,5,1,1,4,0,0,0,1,0,0,0,0,0,0,0,0,,2,0,0,0,0,0,0,,4,2,4,3,2015,2014,60,1,1000,3,5.0,1.0,,,,,,3,4,,,,,,2,1.0,,,,,,,,,2,,,,,,,,,,,,4,,,,,,,3,50,1,2023-11-19 00:00:00,1,0,0,0,0,0,0,,0,0,0,0,1,0,0,0,,120,,,,,,,,,,,,,,,,,,,,2,1,4,0,1,2,2,6,2,1,0,0,2,0,0,,3,,2016,17,Bán t?p hóa,2,2,1,0,0,0,0,0,,100,1,0,1,0,,1,1,0,0,0,0,2,1,1,2,"10.685735,106.703296",0,,10:30:00,34,0,,2020-10-04
251,8040020000000,Nh?n ??c,Nhà Bè,1465/46/6 Nh?n ??c,1,Nguy?n Nh?t Tân,10:34:00,16,1,1,4,,2018,09/99/2018,1,20,0,1,0,0,0,1,0,Bùn sình,3,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,0,0,0,0,0,0,0,0,1,0,0,,0,1,1,0,0,0,0,0,0,0,0,,1,2,1,3,1,98,,,60000000,4,20,,,,,,,,,,100,2,1,1,1,,,0,1,1,1,1,,,0,0,0,0,1.0,,1.0,70.0,4.0,,,,,,,3.0,8.0,,1.0,8.0,4.0,,,,,,,2.0,,,,,,,,,,,,,0,0,0,0,0,0,1,0,,0,0,0,0,0,0,0,1,0,,2018,09/99/2018,1,20,0,1,0,0,0,1,0,Sình,3,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,0,0,0,0,0,0,0,0,1,0,0,,0,1,1,0,0,0,0,0,0,0,0,,1,2,1,3,1,98,,,60000000,4,20,,,,,,,,,,100,2,1,1,1,,,2,1,1,1,1,,,0,0,0,0,1.0,,1.0,70.0,4.0,,,,,,,3.0,8.0,,1.0,8.0,4.0,,,,,,,2.0,,,,,,,,,,,,,0,0,0,0,0,0,1,0,,0,0,0,0,0,0,0,1,0,,5,0,5,0,3,15,4,54,5,0,4,2,5,0,2018,50,1,,30,4,15,0,0,0,0,0,0,0,1,0,,2,,2,,1,3,1,1,3,5,2,4,4,2,3,,,,,,,,3,,3,,3,,5,2,4,3,1,1,4,4,2,1,1,7,1,0,2,0,1,0,1,0,0,0,1,0,Huy?t áp,2,0,0,0,0,0,0,,3,2,3,3,2014,2014,42,1,1000,1,3.0,5.0,6.0,2.0,,,,1,3,4.0,,,,,1,2.0,5.0,,,,,,,,2,,,,,,,,,,,,4,,,,,,,3,40,1,2023-11-18 00:00:00,1,0,1,0,0,0,1,H? th?ng ???ng ?ng,0,0,0,1,1,0,0,0,,60,,,,,,,,,,,,,,,,,,,,4,0,6,0,1,2,1,4,4,1,1,0,4,0,0,,2,,2019,17,"T?p hóa, cafe",1,1,1,0,0,0,0,0,,7,1,0,0,0,,1,0,0,0,0,0,3,1,1,2,"10.685886,106.703268",0,,11:00:00,26,0,,2020-10-04


### rename columns

All variables based on the most recent event are ending with "_r", all variables for the most serious event since 2010 are ending with "_s".



*Targets*

In [647]:
df = raw_data

# target var for direct cost on content loss [VND]
df.insert(0, "Target_directloss_mVND_r", df.pop("P1Q5.6.1")) 
df.insert(1, "Target_directloss_mVND_s", df.pop("P1Q5.6.2"))

# explanatory var: monthly reduction of business [%] 
df.insert(2, "Target_businessreduction_r", df.pop("P1Q5.9.1"))  
df.insert(3, "Target_businessreduction_s", df.pop("P1Q5.9.2"))



  df.insert(0, "Target_directloss_mVND_r", df.pop("P1Q5.6.1"))
  df.insert(1, "Target_directloss_mVND_s", df.pop("P1Q5.6.2"))
  df.insert(2, "Target_businessreduction_r", df.pop("P1Q5.9.1"))
  df.insert(3, "Target_businessreduction_s", df.pop("P1Q5.9.2"))


*candidates predictors and further important columns*

In [648]:
col_names = {        'P1Q1':'flood_experience',  
                     'P1Q2.2.1':'flood_time_r', 
                     'P1Q2.2.2':'flood_time_s',
                     'P1Q2.3.1':'inundation_duration_h_r',
                     'P1Q2.3.2':'inundation_duration_h_s',
                     'P1Q2.4.1':'water_depth_cm_r',
                     'P1Q2.4.2':'water_depth_cm_s',
                     'P1Q2.5.1':'contaminations_r',                 
                     'P1Q2.5.2':'contaminations_s',   
                     'P1Q2.6.1':'flowvelocity_r',  
                     'P1Q2.6.2':'flowvelocity_s', 
                     'P1Q2.8.1':'warning_type_r', 
                     'P1Q2.8.2':'warning_type_s',  
                     'P1Q2.9.1':'warning_time_h_r', 
                     'P1Q2.9.2':'warning_time_h_s',  
                     'P1Q2.10.1':'emergency_measures_r', 
                     'P1Q2.10.2':'emergency_measures_s', 
                     'P1Q2.11.1':'overall_problem_house_r', 
                     'P1Q2.11.2':'overall_problem_house_s',
                     

                    #  'P1Q3.2.1':'damage_level_floor_r',
                    #  'P1Q3.2.2':'damage_level_floor_s',
                    #  'P1Q3.3.1':'damage_level_walls_r',
                    #  'P1Q3.3.2':'damage_level_walls_s',
                    #  'P1Q3.4.1':'damage_level_foundation_r',
                    #  'P1Q3.4.2':'damage_level_foundation_s',
                    #  'P1Q3.5.1':'damage_level_doors_r',
                    #  'P1Q3.5.2':'damage_level_doors_s',
                    #  'P1Q3.6.1':'damage_level_roof_r',
                    #  'P1Q3.6.2':'damage_level_roof_s',
                    #  'P1Q3.7.1':'damage_level_basement_r',
                    #  'P1Q3.7.2':'damage_level_basement_s',
                    #  'P1Q3.88.1':'damage_level_other_r',
                    #  'P1Q3.88.2':'damage_level_other_s',
                     'P1Q3.88.1.specify':'damage_level_specify_r',
                     'P1Q3.88.2.specify':'damage_level_specify_s',
                     'P1Q3.8.1':'repair_costs_b_VND_r',
                     'P1Q3.8.2':'repair_costs_b_VND_s',
                     'P1Q3.10.1':'reason_why_not_repaired_business_r', 
                     'P1Q3.10.2':'reason_why_not_repaired_business_s',
                     'P1Q3.11.1':'repair_costs_b_complete_mVND_r',
                     'P1Q3.11.2':'repair_costs_building_complete_mVND_s',
                     'P1Q5.2.1':'shpdamage_level_furniture_r',
                     'P1Q5.3.1':'shpdamage_level_electronics_r',
                     'P1Q5.4.1':'shpdamage_level_equipment_r',
                     'P1Q5.5.1':'shpdamage_level_products_r',
                     'P1Q5.88.1':'shpdamage_level_others_r',
                     'P1Q5.2.2':'shpdamage_level_furniture_s',
                     'P1Q5.3.2':'shpdamage_level_electronics_s',
                     'P1Q5.4.2':'shpdamage_level_equipment_s',
                     'P1Q5.5.2':'shpdamage_level_products_s',
                     'P1Q5.88.2':'shpdamage_level_others_s',

                     'P1Q5.7.1':'shp_closed_d_r',
                     'P1Q5.7.2':'shp_closed_d_s',
                     'P1Q5.8.1':'shp_duration_back2normal_r',
                     'P1Q5.8.2':'shp_duration_back2normal_s',

                    # 'P1Q6.4.1':'hh_drop_out_from_work_d_r',
                    # 'P1Q6.4.2': 'hh_drop_out_from_work_d_s',

                     'P2Q1.1.implement':'protect_valuables_impl',
                     'P2Q1.1.spend':'protect_valuables_VND_spnd',
                     'P2Q1.2.implement':'water_barries_impl',
                     'P2Q1.2.spend':'water_barries_VND_spnd',
                     'P2Q1.3.implement':'pumping_equipment_impl',
                     'P2Q1.3.spend':'pumping_equipment_VND_spnd',
                     'P2Q1.4.implement':'elevation_building_impl',
                     'P2Q1.4.spend':'elevation_building_VND_spnd',
                     'P2Q1.5.implement':'resistant_material_building_impl',
                     'P2Q1.5.spend':'resistant_material_building_VND_spnd',
                     'P2Q1.6.implement':'electricity_higher_impl',
                     'P2Q1.6.spend':'electricity_higher_VND_spnd',
                     'P2Q1.7.implement':'flood_protections_impl',
                     'P2Q1.7.spend':'flood_protections_VND_spnd',
                     'P2Q2.1.1':'elevation_building_year',
                     'P2Q2.2':'elevation_building_height_cm',
                     'P2Q2.3':'elevation_building_elements',
                     'P2Q2.4':'elevation_building_material_costs_VND',
                     'P2Q3.1.1':'insurance_building_VND',
                     'P2Q3.2.1':'insurance_business_VND',
                     
                     'P3Q1.1':'resilience_city_protection',
                     'P3Q1.2':'resilience_more_future_affected',
                     'P3Q1.3':'resilience_govern_warnings_helpful',
                     'P3Q1.4':'resilience_govern_careing',
                     'P3Q1.5':'resilience_govern_careing_increases',
                     'P3Q1.6':'resilience_left_alone',
                     'P3Q1.7':'resilience_neighbor_management',
                      # currently not impl P3Q2.1-2: flood perception (eg. if flood changed during last 10 years or expected to change)
                     'P3Q2.3':'perception_who_responsible4protection',  
                     'P3Q2.4':'perception_govern_support_past',  
                     'P3Q2.5':'perception_govern_support_future',  
                     'P3Q2.6':'perception_private_economy_future',  
  
                     # curr not impl P3Q3.1-11: if worst flood happens 3 times more every year (eg. how likely income losses, traffic syste collaps, shop house resist in such extreme flood scenario?)
  
                     #'P4Q1.6':'people_com',	
                     'P4Q1.8':'hh_education',
                     #'P4Q1.9':'poverty_cert',                     
                     'P4Q1.10':'hh_monthly_income_cat',
                     'P4Q2.1':'b_movingin',
                     'P4Q2.2':'b_year', 
                     'P4Q2.3':'b_floorsize_sqm',
                     'P4Q2.5':'b_value_mVND',
                     'P4Q2.4':'lu_cert',
                     'P4Q3.1':'b_material_foundation', 
                     'P4Q3.2':'b_material_floor',
                     'P4Q3.3':'b_material_wall',
                     'P4Q3.4':'b_material_roof',
                     'P4Q3.5':'b_material_doors',
                     r'P4Q3.6$':'b_elevation_rel2surrounding_cat', 
                    # 'P4Q3.6.cm':'b_elevation_rel2surrounding_cm',
                    # 'P4Q4.1': 'renovation_since_2010',  # binary: 1=yes, 2=no, 99=dont know
                     'P4Q4.2.1':'ren1',  
                     'P4Q4.2.2':'ren2', 
                     'P4Q4.4.1':'building_renovation_reasons_r',   
                     'P4Q4.4.2':'building_renovation_reasons_s', 
                     'P4Q4.5.1':'building_renovation_cost_mVND_r', 
                     'P4Q4.5.2':'building_renovation_cost_mVND_s',  

                     r'P5Q1.1$':'shp_owner',
                     'P5Q1.2':'shp_established',
                     'P5Q1.3':'shp_sector', 
                     'P5Q1.4':'shp_employees',
                     'P5Q1.5':'shp_avgmonthly_sale_catego',
                     'P5Q1.6':'shp_finance_investments',
                     'P5Q1.7':'shp_capital_mVND',
                     'P5Q1.9':'shp_suppliers_location',
                     'P5Q1.10':'shp_profits_last5years',
                     'P5Q1.11':'shp_risk_tolerance',
                     'P5Q1.12':'shp_monetary_resources4prevention',
}

for k, v in col_names.items():
    df.rename(columns ={ i: re.sub(k, v, i) for i in  df.columns }, inplace=True )


### Flood times 


In [649]:
#d = "99/2015"
d = "99/99/2015"
if (len(str(d))>=7):  # only month or day and year exists
    y = d.split("/")[-1]
    print(y)
    df.flood_time_r[i] = pd.to_datetime(d).strftime('%Y')


2015


DateParseError: Invalid date specified (99/99), at position 0

In [650]:
## clean flood time, convert to uniform date format

for i, d in enumerate(df.flood_time_r):
    try:
        df.flood_time_r[i] = pd.to_datetime(d, format="mixed").strftime('%m/%d/%Y')
    except:
        pass
    try:
        m, y = d.split("/",1)  # only day or month and year exists
        d = f"{m}/01/{y}"
        df.flood_time_r[i] = pd.to_datetime(d).strftime('%m/%d/%Y')        
    except:
        pass
    try: 
        d = d.split("/")[-1] # only year
        df.flood_time_r[i] = pd.to_datetime(d).strftime('%Y')
    except:
        df.flood_time_r[i] = pd.NaT

    # except:
    #     if (len(str(d))==4):  # only year exists
    #         d = f"01/01/{d}"
    #         df.flood_time_r[i] = pd.to_datetime(d, format='mixed').strftime('%m/%d/%Y')

    #     if (len(str(d))==7):  # only month or day and year exists
    #         m, y = d.split("/",1)
    #         d = f"{m}/01/{y}"
    #         df.flood_time_r[i] = pd.to_datetime(d).strftime('%m/%d/%Y')
    #     else:  # catch weird values such as 99/99
    #         print("NAT", i)
    #         df.flood_time_r[i] = pd.NaT

#  for i, d in enumerate(df.flood_time_r):
#     try: 
#         df.flood_time_r[i] = pd.to_datetime(d).strftime('%m/%d/%Y')
#         #df["flood_time_r_modified"][i] = False
#     except:
#         print(df.flood_time_r[i])
#         df.flood_time_r[i] = d.replace("99", "01")
#         df["flood_time_r_modified"][i] = True
   
for i, d in enumerate(df.flood_time_s):
    try:
        df.flood_time_s[i] = pd.to_datetime(d, format="mixed").strftime('%m/%d/%Y')
    except:
        pass
    try:
        d = d.split("/")[-1] # only year
        df.flood_time_s[i] = pd.to_datetime(d).strftime('%Y')
    except:
        df.flood_time_s[i] = pd.NaT
      
# ## make to datetime obj - but this would set all unknown months and days to 01
#df.flood_time_r = pd.to_datetime(df.flood_time_r, format='mixed')
#df.flood_time_s = pd.to_datetime(df.flood_time_s, format='mixed') 

## set flood times with unknown years to NAN, as well as for typos
df.flood_time_r = df.flood_time_r.replace("01/01/1970", pd.NaT)
df.flood_time_s = df.flood_time_s.replace("01/01/1970", pd.NaT)
df.flood_time_s = df.flood_time_s.replace("08/20/2023", pd.NaT)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.flood_time_r[i] = pd.to_datetime(d).strftime('%Y')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.flood_time_r[i] = pd.to_datetime(d).strftime('%Y')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.flood_time_r[i] = pd.to_datetime(d).strftime('%Y')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.flo

In [651]:
df.flood_time_r

0      2015
1      2017
2      2020
3      2020
4      2020
       ... 
247    2019
248    2019
249    2017
250    2019
251    2018
Name: flood_time_r, Length: 252, dtype: object

If the month or day of the flood time is missing only the year was selected. Pandas datetime fills missing information by 01 which would change the original information e.g if only the year is known, datetime would set it to the first January of the year. Due to this reason, the flood times were kept as integer or ojects in two possible formats:as month/day/year and only the year if month or day information is missing.

In [595]:
print("Most recent events for which day or month is missing;", (df.flood_time_r.str.len()==4).sum())
print("Most serious events for which day or month is missing;", (df.flood_time_s.str.len()==4).sum())

Most recent events for which day or month is missing; 0
Most serious events for which day or month is missing; 0


### Damage Variables

Damage variables comprises hydrological variables, emergency measures and variables about damage levels contents


In [596]:
## Caution: compared to the Rscript further variables are included in "vars_dam" such as flood time and damae levels of business contents
vars_dam = pd.concat([df.loc[ :, "flood_time_r":"flowvelocity_r"],
                      df.loc[ :, "warning_time_h_r":"overall_problem_house_r.99"],
                      df.loc[ :, "shpdamage_level_furniture_r":"shpdamage_level_products_r"],
                      df.loc[ :, "flood_time_s":"flowvelocity_s"],
                      df.loc[ :, "warning_time_h_s":"overall_problem_house_s.99"],
                      df.loc[ :, "shpdamage_level_furniture_s":"shpdamage_level_products_s"],
                     ], axis=1)

# drop string columns (all cols ending with 88, 99, specify)
vars_dam = vars_dam.loc[:, ~vars_dam.columns.str.contains(r"(.88)$|(.99)$|(.specify)$")]
vars_dam


vars_dam["contaminations_r.0"] = vars_dam["contaminations_r.0"].replace(np.nan, 0)
vars_dam["contaminations_r.1"] = vars_dam["contaminations_r.1"].replace(np.nan, 0)
vars_dam["contaminations_r.2"] = vars_dam["contaminations_r.2"].replace(np.nan, 0)
vars_dam["contaminations_r.3"] = vars_dam["contaminations_r.3"].replace(np.nan, 0)
vars_dam["contaminations_r.4"] = vars_dam["contaminations_r.4"].replace(np.nan, 0)

vars_dam["contaminations_s.0"] = vars_dam["contaminations_s.0"].replace(np.nan, 0)
vars_dam["contaminations_s.1"] = vars_dam["contaminations_s.1"].replace(np.nan, 0)
vars_dam["contaminations_s.2"] = vars_dam["contaminations_s.2"].replace(np.nan, 0)
vars_dam["contaminations_s.3"] = vars_dam["contaminations_s.3"].replace(np.nan, 0)
vars_dam["contaminations_s.4"] = vars_dam["contaminations_s.4"].replace(np.nan, 0)


vars_dam.warning_time_h_r = vars_dam.warning_time_h_r.replace(np.nan, 99)
vars_dam.warning_time_h_s = vars_dam.warning_time_h_s.replace(np.nan, 99)


  vars_dam = vars_dam.loc[:, ~vars_dam.columns.str.contains(r"(.88)$|(.99)$|(.specify)$")]


Missing information about contamination type or warning time are set to 0 or 99 respectively.

### Identical events

Same matrix indicates 
- 0 for a given damage variable when businesses have different values for most recent & the most serious event or different flood times, 
- 1 when businesses have same value for recent & serious event and identical flood times
In a subsequent step the information about same damage variables in combination with same flood times are used to identify identical events. 

0 = different damage vars
1 = identical damage vars

In [597]:
## iterate over cols -> check if each value in col for recent events is identical with value in the other respective col for serious events

col_len = len(vars_dam.columns)//2
df_same = pd.DataFrame(index=range(len(vars_dam)), columns=range(col_len)) # init binary df - indicating if certain cols are the same

for c in range(col_len):
    for r in range(len(vars_dam)):
        if vars_dam.iloc[r, c] == vars_dam.iloc[r, c + col_len]:  # identical damage variabels and flood times
            df_same.iloc[r, c] = 1
        if vars_dam.iloc[r, c] != vars_dam.iloc[r, c + col_len]:  # different damage variables or flood times
            df_same.iloc[r, c] = 0


df_same.head(3)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
2,0,0,0,1,1,1,1,1,0,0,0,0,0,1,1,0,1,1,0,1,1,0,0,0,0,1,1,1,1,1,1,1


In [598]:
## create indidcator in one col, showing if events are equal
vars_dam["same"] = pd.Series() # init column indicating if both events are the same

for r in range(len(vars_dam)):
    vars_dam.same[r] = (df_same.iloc[r, :] == 0).any()  # if any damage variable is different than set indicator to True
#  vars_dam.same : True= different events, False= identical events

## test code :
# df_same.iloc[r, :] = 31*[1] + 1*[0]   # case with nearly identical values except one --> same indicator should recognize as different events
# (df_same.iloc[r, :] == 0).any() 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vars_dam.same[r] = (df_same.iloc[r, :] == 0).any()  # if any damage variable is different than set indicator to True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vars_dam.same[r] = (df_same.iloc[r, :] == 0).any()  # if any damage variable is different than set indicator to True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vars_dam.same[r] = (df_same.iloc[r, :] == 0).any()  # if any damage variable is different than set indicator to True
A value is trying to be s

In [599]:
print("Cases with different damage variables and/or different flood times :", vars_dam.same.value_counts()[0])
print("Cases with identical damage variables and identical flood times:", vars_dam.same.value_counts()[1])


Cases with different damage variables and/or different flood times : 145
Cases with identical damage variables and identical flood times: 107


Including further variables ( flood time and damage levels of business contents) to the identification which events are identical reduces the number of identified identical events from 118 to 107.


### Precautionary measures 

Variables are transformed to binary values [0,1] 
- 0 when the measure is implemented before the event 
-  options 1 or 3 for the serious event and options 2 or 3 for the recent event. 
- Options 1, 2 or 3 if recent is also the serious event.


In [600]:
# initialize empty columns for precautionary measurment implementations
prec_measures_impl_colnames_r = []
prec_measures_impl_colnames_s = []

for c in  df.filter(regex="_impl$", axis=1).columns:
    prec_measures_impl_colnames_r.append(c + "_r")
    prec_measures_impl_colnames_s.append(c + "_s")

vars_dam[prec_measures_impl_colnames_r] = 0
vars_dam[prec_measures_impl_colnames_s] = 0

## add flood experience 
vars_dam["flood_experience"]  = df.flood_experience


In [601]:
## 1 - before serious, 2 - before recent, 3 - before both, 4 - after both, 5 - did not implement

## Precautionary measures   
pre_vars = df.filter(regex="_impl$", axis=1)


## different events 
## Recent 
vars_dam[prec_measures_impl_colnames_r] = pre_vars.replace( {2:1, 3:1}) # set precautionary measures to 1 before event happend
## Serious
vars_dam[prec_measures_impl_colnames_s] = pre_vars.replace({1:1, 3:1}) # set precautionary measures to 1 before event happend



## identical events [Options 1, 2 or 3]
idx_identical_events = vars_dam.loc[vars_dam.same==True, :].index

vars_dam.loc[idx_identical_events, prec_measures_impl_colnames_r] = pre_vars.loc[idx_identical_events,:].replace( {1:1, 2:1, 4:1}).values
vars_dam.loc[idx_identical_events, prec_measures_impl_colnames_s] = pre_vars.loc[idx_identical_events,:].replace( {1:1, 2:1, 3:1}).values


### Socio-economic variables


In [602]:
part4Q1_cols_list = [r"Target_directloss_*", r"Target_businessreduction_*", r'^household_*', 
                        r"shp_*",  # = e.g "shp_closed_d*", r"shp_sector$", "shp_owner",
                    #'people_com', 'poverty_cert'
                    ]


pattern_part4Q1_cols = re.compile('|'.join(part4Q1_cols_list))
vars_soc = df.filter(regex=pattern_part4Q1_cols, axis=1)

#vars_soc["people_com"] = vars_soc["people_com"].replace(2, 0)
#vars_soc["poverty_cert"] = vars_soc["poverty_cert"].replace(2, 0)

## data cleaning
vars_soc = vars_soc.loc[:, ~vars_soc.columns.str.contains(r"(.88)$|(.99)$|(.specify)$|(.Specify)$|(others)")]  # ## drop object columns
vars_soc = vars_soc.replace(' ', np.nan)
vars_soc = vars_soc.replace(r'^,', '0.', regex=True) # convert e.g ,5 -> 0,5
vars_soc = vars_soc.replace(',', '.', regex=True).astype(float) 
vars_soc = vars_soc.astype("float")


#For building variables - during serious and recent events - building age; how long has the 
#householder lived in the location. LU certificate, building cost.
vars_bui = df.loc[:,['lu_cert','b_value_mVND']]
vars_bui["ba"] = df.b_floorsize_sqm.replace(r'^,', '0.', regex=True) # convert e.g ,5 -> 0,5
vars_bui.ba = vars_bui.ba.replace(',', '.', regex=True).astype(float)  # building_floorsize_sqm = P4Q2.3
flood_year_r = pd.to_datetime(df["flood_time_r"], format='mixed').dt.strftime('%Y')
flood_year_s = pd.to_datetime(df["flood_time_s"], format='mixed').dt.strftime('%Y')
 
vars_bui["flood_year_r"] = flood_year_r
vars_bui["flood_year_s"] = flood_year_s

  vars_soc = vars_soc.loc[:, ~vars_soc.columns.str.contains(r"(.88)$|(.99)$|(.specify)$|(.Specify)$|(others)")]  # ## drop object columns


DateParseError: Invalid date specified (99/99), at position 0

In [None]:
#df["flood_time_s"][15]
#flood_year_s.value_counts()  # check for weird years
df["flood_time_s"]
flood_year_s

0      2015
1      2017
2       NaN
3       NaN
4      2020
       ... 
247    2017
248    2017
249    2017
250    2019
251    2018
Name: flood_time_s, Length: 252, dtype: object

### Perception variables
The individual perception influences the behavior

In [None]:
part3Q1Q2_cols_list = [r"resilience_*", r"^perception_*"] 

pattern_part3Q1Q2_cols = re.compile('|'.join(part3Q1Q2_cols_list))
vars_perception = df.filter(regex=pattern_part3Q1Q2_cols, axis=1)
vars_perception.drop(list(vars_perception.filter(regex = '\.specify$')), axis = 1, inplace = True) 

vars_soc = pd.concat([vars_soc, vars_perception], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vars_perception.drop(list(vars_perception.filter(regex = '\.specify$')), axis = 1, inplace = True)


### building variables

In [None]:
df.b_movingin = df.b_movingin.replace(99, np.nan)  # P4Q2.1  
df.b_year = df.b_year.replace(99, np.nan)  # P4Q2.2

## extract cases where building construction or moving in of the households was after the flood event
vars_bui["occ_yrs_r"] = flood_year_r.astype("Int64") - df.b_movingin.astype("Int64")  
vars_bui["occ_yrs_s"] = flood_year_s.astype("Int64") - df.b_movingin.astype("Int64")

vars_bui["bage_r"] = flood_year_r.astype("Int64") - df.b_year.astype("Int64")  # building age since flood event occured
vars_bui["bage_s"] = flood_year_s.astype("Int64") - df.b_year.astype("Int64")


print((vars_bui.occ_yrs_r < 0.0).sum(), "cases in which moving is after recent flood events")
print((vars_bui.occ_yrs_s < 0.0).sum(), "cases in which moving is after serious flood events")
print((vars_bui.bage_r < 0.0).sum(), "cases in which building was constructed after the recent flood event")
print((vars_bui.bage_s < 0.0).sum(), "cases in which building was constructed after the serious flood event")

#For physical damage, if the householder did not live 
#in this house during the reported flood events, we remove those records from further analysis.
#Valid = 1; not valid = 0
vars_bui['valid_r'] = 1
vars_bui['valid_s'] = 1

vars_bui.loc[vars_bui.occ_yrs_r < 0.0, "valid_r"] = 0  # set all cases which moved in or where building was constructed after event to zero
vars_bui.loc[vars_bui.occ_yrs_s < 0.0, "valid_s"] = 0




0 cases in which moving is after recent flood events
0 cases in which moving is after serious flood events
7 cases in which building was constructed after the recent flood event
6 cases in which building was constructed after the serious flood event


### Renovation

In [None]:
#df["ren1"] 
ren1#.filter("99/1")

0      01/01/2019
1      11/17/2023
2      01/01/2013
3      01/01/2020
4      01/01/2013
          ...    
247    01/01/2019
248    01/01/1970
249    01/01/2020
250    11/19/2023
251    11/18/2023
Name: ren1, Length: 252, dtype: object

In [587]:
## If major renovation -> change bage to that. Otherwise, original bage

ren1= df["ren1"] # first renovation
ren2 = df["ren2"]  # second renovaton


## clean renonvation time, convert into uniform date format
ren1 = ren1.replace({r"^99":"01", np.nan:"01/01/1970", " ": "01/01/1970"}, regex=True)
ren2 = ren2.replace({r"^99":"01", np.nan:"01/01/1970", " ": "01/01/1970"}, regex=True)


## set missing days and months to 01
print(df["ren1"][2])
#print(df["ren1"] [31])
#ren1 = pd.to_datetime(ren1, format='mixed')
#ren2 = pd.to_datetime(ren2, format='mixed')
print(ren1[2])#31])

## fix dates for which only year exists and pass to renovation time
for i, d in enumerate(ren1):
    
    if (type(d) == int) & (len(str(d))==4):  # only year exists
        d = f"01/01/{d}"
        ren1[i] = pd.to_datetime(d, format='mixed').strftime('%m/%d/%Y')

    if (len(str(d))==7):  # only month or day and year exists
        #print(d)
        m, y = d.split("/",1)
        d = f"{m}/01/{y}"
        ren1[i] = pd.to_datetime(d).strftime('%m/%d/%Y')


for i, d  in enumerate(ren2):
    
    if (type(d) == int) & (len(str(d))==4):  # only year exists
        d = f"01/01/{d}"
        ren2[i] = pd.to_datetime(d, format='mixed').strftime('%m/%d/%Y')
    
    if (len(str(d))==7):  # only month or day and year exists
        #print(d)
        m, y = d.split("/",1)
        d = f"{m}/01/{y}"
        ren2[i] = pd.to_datetime(d).strftime('%m/%d/%Y')

print(ren1[2])#31])

ren1 = pd.to_datetime(ren1, format='mixed').dt.strftime('%m/%d/%Y') # convert remaining mix of formats to date
ren2 = pd.to_datetime(ren2, format='mixed').dt.strftime('%m/%d/%Y')

ren1_year = pd.to_datetime(ren1).dt.year
ren2_year =pd.to_datetime(ren2).dt.year

ren1_year = ren1_year.replace({1970:np.nan})
ren2_year = ren2_year.replace({1970:np.nan})


2013
2013
01/01/2013


In [589]:
ren2[ren2== "12/19/2023"]
#ren2.value_counts()
#df["ren2"].value_counts()
df["ren2"][92] 
ren2

0      01/01/1970
1      01/01/1970
2      01/01/2018
3      01/01/2013
4      01/01/1970
          ...    
247    01/01/2017
248    01/01/1970
249    01/01/2019
250    01/01/1970
251    01/01/1970
Name: ren2, Length: 252, dtype: object

In [473]:
## Recent events
ren1_ev_r = flood_year_r.astype("Int64") - (ren1_year).astype("Int64")
ren2_ev_r = flood_year_r.astype("Int64") - (ren2_year).astype("Int64")

ren1_ev_r.loc[ren1_ev_r < 0] = np.nan   # replace when renovations were before flood events
ren2_ev_r.loc[ren2_ev_r < 0] = np.nan

rec_ren_year = pd.concat([ren1_ev_r, ren2_ev_r], axis=1).min(axis=1, skipna=True)


In [477]:
## Serious events 
ren1_ev_s = flood_year_s.astype("Int64") - (ren1_year).astype("Int64")
ren2_ev_s = flood_year_s.astype("Int64") - (ren2_year).astype("Int64")

ren1_ev_s[ren1_ev_s < 0] = np.nan  # replace when renovations were before flood events
ren2_ev_s[ren2_ev_s < 0] = np.nan

ext_ren_year = pd.concat([ren1_ev_s, ren2_ev_s], axis=1).min(axis=1, skipna=True)
#ext_ren_year.loc[ext_ren_year<0] = df.b_year.astype("Int64")[ext_ren_year<0]
ext_ren_year.loc[ext_ren_year == 99] = np.nan

In [499]:
flood_year_s#ren1_ev_s[:20]

0      2015
1      2017
2       NaN
3       NaN
4      2020
       ... 
247    2017
248    2017
249    2017
250    2019
251    2018
Name: flood_time_s, Length: 252, dtype: object

In [483]:
## repalce all negative renov years by renov year
ext_ren_year = pd.concat([ren1_ev_s, ren2_ev_s], axis=1).min(axis=1, skipna=True)
ext_ren_year[:20]

0     <NA>
1     <NA>
2     <NA>
3     <NA>
4        7
5        1
6        0
7        6
8     <NA>
9     <NA>
10    <NA>
11    <NA>
12    <NA>
13    <NA>
14    <NA>
15    <NA>
16    <NA>
17       7
18       0
19    <NA>
dtype: object

In [334]:
vars_bui["bage_ren1"] = vars_bui.bage_r.copy()
vars_bui["bage_ren2"] = vars_bui.bage_s.copy()

## if not nan than replace value in bage_ren
vars_bui.bage_ren1 = np.where( ~rec_ren_year.isna(), rec_ren_year, vars_bui.bage_ren1)
vars_bui.bage_ren2 = np.where( ~ext_ren_year.isna(), ext_ren_year, vars_bui.bage_ren2)

vars_bui.bage_ren1 = pd.to_numeric(vars_bui.bage_ren1).astype("Int64")  # convert while preseving NAN
vars_bui.bage_ren2 = pd.to_numeric(vars_bui.bage_ren2).astype("Int64") 

vars_bui.loc[vars_bui.bage_ren1 < 0, "bage_ren1"] = np.nan
vars_bui.loc[vars_bui.bage_ren2 < 0, "bage_ren2"] = np.nan

vars_bui.bage_r[vars_bui.bage_r < 0] = np.nan
vars_bui.bage_s[vars_bui.bage_s < 0] = np.nan


In [335]:
# pattern_l = [
#     r'^overall_problem_house_r\..$',  # =P1Q2.11.1.1 - P1Q2.11.1.9    # except ending with two or more letters/digits [.88, .99, .specify]
#     'damage_level_floor_r', 'damage_level_walls_r', 'damage_level_foundation_r', 'damage_level_doors_r', 'damage_level_roof_r' # ='P1Q3.2.1','P1Q3.3.1','P1Q3.4.1','P1Q3.5.1','P1Q3.6.1'
# ]
# pattern = re.compile('|'.join(pattern_l))

# df.filter(regex=pattern, axis=1)


### building damage variables


In [336]:
# pattern_l = [
#     r'^overall_problem_house_r\..$',  # = P1Q2.11.1.1 - P1Q2.11.1.9    # except ending with two or more letters/digits [.88, .99, .specify]
#     'damage_level_floor_r', 'damage_level_walls_r', 'damage_level_foundation_r', 'damage_level_doors_r', 'damage_level_roof_r' # ='P1Q3.2.1','P1Q3.3.1','P1Q3.4.1','P1Q3.5.1','P1Q3.6.1'
# ]   ## TODO check why not included in Rscript: *.7 = damage_level_basement_r

# pattern = re.compile('|'.join(pattern_l))
# damage_ev_r = df.filter(regex=pattern, axis=1)
abs_loss_ev1 = df["repair_costs_building_VND_r"]  #:'P1Q3.8.1'

# Id abs_loss is 2, then no repairs are made. One of the plausible reasons for no repairs is no damage or very minor damage. We replace these 2s by 0. 
# Find zero-loss values
# # Damage level (1-5): 98: not applicable; 99: I don’t know, 1: No damage; 2: Minor damages - Usable; 3: Moderate damages; 4: Major damages – needs repair; 5: Complete damage – needs replacement
list_zero_loss_values = [1, 99, 98]
abs_loss_ev1 = np.where(
        (
            (df["reason_why_not_repaired_business_r.4"]==1) | # P1Q3.10.1.4: It’s not necessary because it will be flooded soon again (1: true)
            (df["reason_why_not_repaired_business_r.3"]==1) | # P1Q3.10.1.3: It’s not necessary (minor damages, still usable, livable) (1: true)
            (df["overall_problem_house_r.1"]==1) |   # P1Q2.11.1.1: No problem (1==true)
            (
                (df["damage_level_floor_r"].isin(list_zero_loss_values)) &  # P1Q3.2.1
                (df["damage_level_walls_r"].isin(list_zero_loss_values)) &  # P1Q3.3.1
                (df["damage_level_foundation_r"].isin(list_zero_loss_values)) & # P1Q3.4.1
                (df["damage_level_doors_r"].isin(list_zero_loss_values)) &  # P1Q3.5.1
                (df["damage_level_roof_r"].isin(list_zero_loss_values)) &  # P1Q3.6.1
                (df["damage_level_basement_r"].isin(list_zero_loss_values)) # P1Q3.7.1
            )
            ) & (df["repair_costs_building_VND_r"] == 2),  # P1Q3.8.1 (2: I did not repair anything)
        0, abs_loss_ev1  
    )

#If no evidence of minor damage or residual damage, approximate it to what would have occured if you repaired the house completely.
abs_loss_ev1 =  np.where(
    (abs_loss_ev1==2) & (df["repair_costs_building_complete_mVND_r"] != 99),   # condition
    df["repair_costs_building_complete_mVND_r"] * 1000000,  ## # condition fullfilled, set to costs to VND
    abs_loss_ev1   # condition not fullfilled, set to 99
)

abs_loss_ev1 =  np.where(
    (abs_loss_ev1==99) & (df["repair_costs_building_complete_mVND_r"] != 99),   # condition
    df["repair_costs_building_complete_mVND_r"] * 1000000,  ## condition fullfilled, set to costs to VND
    abs_loss_ev1  ## condition not fullfilled, set to 99
)

abs_loss_ev1 = pd.Series(abs_loss_ev1).astype("Int64")  # Int64 = handles nan
abs_loss_ev1[abs_loss_ev1==99]= np.nan

In [337]:
## most serious

# pattern_l = [
#     r'^overall_problem_house_s\..$',  # = P1Q2.11.2.1 - P1Q2.11.2.9    # except ending with two or more letters/digits [.88, .99, .specify]
#     'damage_level_floor_s', 'damage_level_walls_s', 'damage_level_foundation_s', 'damage_level_doors_s', 'damage_level_roof_s' # ='P1Q3.2.2','P1Q3.3.2','P1Q3.4.2','P1Q3.5.2','P1Q3.6.2'
# ]   ## TODO check why not included in Rscript: *.7 = damage_level_basement_r

# pattern = re.compile('|'.join(pattern_l))
# damage_ev_s = df.filter(regex=pattern, axis=1)
abs_loss_ev2 = df["repair_costs_building_VND_s"]  #:'P1Q3.8.2'

# Id abs_loss is 2, then no repairs are made. One of the plausible reasons for no repairs is no damage or very minor damage. We replace these 2s by 0. 
# Find zero-loss values
# # Damage level (1-5): 98: not applicable; 99: I don’t know, 1: No damage; 2: Minor damages - Usable; 3: Moderate damages; 4: Major damages – needs repair; 5: Complete damage – needs replacement
list_zero_loss_values = [1, 99, 98]
abs_loss_ev2 = np.where(
        (
            (df["reason_why_not_repaired_business_s.4"]==1) | # P1Q3.10.2.4: It’s not necessary because it will be flooded soon again (1: true)
            (df["reason_why_not_repaired_business_s.3"]==1) | # P1Q3.10.2.3: It’s not necessary (minor damages, still usable, livable) (1: true)
            (df["overall_problem_house_s.1"]==1) |   # P1Q2.11.2.1: No problem (1==true)
            (
                (df["damage_level_floor_s"].isin(list_zero_loss_values)) &  # P1Q3.2.2
                (df["damage_level_walls_s"].isin(list_zero_loss_values)) &  # P1Q3.3.2
                (df["damage_level_foundation_s"].isin(list_zero_loss_values)) & # P1Q3.4.2
                (df["damage_level_doors_s"].isin(list_zero_loss_values)) &  # P1Q3.5.2
                (df["damage_level_roof_s"].isin(list_zero_loss_values)) &  # P1Q3.6.2
                (df["damage_level_basement_s"].isin(list_zero_loss_values)) # P1Q3.7.2
            )
        ) & (df["repair_costs_building_VND_s"] == 2),  # P1Q3.8.2 (2: I did not repair anything)
        0,    # if condition fullfilled
        abs_loss_ev2  # else 
    )

## absolute losses in [VND]
#If no evidence of minor damage or residual damage, approximate it to what would have occured if you repaired the house completely.
abs_loss_ev2 =  np.where(
    (abs_loss_ev2 == 2) & (df["repair_costs_building_complete_mVND_s"] != 99),   # condition
    df["repair_costs_building_complete_mVND_s"] * 1000000,  ## # condition fullfilled, set to costs to VND
    abs_loss_ev2   # condition not fullfilled, set to 99
)

abs_loss_ev2 =  np.where(
    (abs_loss_ev2 == 99) & (df["repair_costs_building_complete_mVND_s"] != 99),   # condition
    df["repair_costs_building_complete_mVND_s"] * 1000000,  ## condition fullfilled, set to costs to VND
    abs_loss_ev2  ## condition not fullfilled, set to 99
)

abs_loss_ev2 = pd.Series(abs_loss_ev2).astype("Int64")  # Int64 = can keep nan
abs_loss_ev2[abs_loss_ev2==99]= np.nan


#### Building value


In [338]:
vars_bui["building_value_mVND"] = df["building_value_mVND"].astype("Int64")  # = P4Q2.5: building value if sell or rebuild completely
vars_bui.building_value_mVND[vars_bui.building_value_mVND == 99.0] = np.nan

## abs_loss in VND, but bv in mVND
rloss_ev1 = (abs_loss_ev1 / 1000000) / vars_bui.building_value_mVND
rloss_ev1[ rloss_ev1 > 1 ] = 1

rloss_ev2 = (abs_loss_ev2 / 1000000) / vars_bui.building_value_mVND
rloss_ev2[ rloss_ev2 > 1 ] = 1


In [339]:
rloss_ev1.describe()

count   216.0
mean      0.2
std       0.3
min       0.0
25%       0.0
50%       0.0
75%       0.2
max       1.0
dtype: Float64

#### Content value

Derive content value (cv) from building value (bv)

Registered capital  is based on bv + cv (if interviewee is owner)  
US HAZUS uses bv = 100% cv (trade, serivces), bv =150% cv (light+heavy industry, food and metall) 

similar work regarding cv: Chinh 2015, Paprotny2020

In [340]:
vars_soc.shp_capital_mVND[vars_soc.shp_capital_mVND == 99] = np.nan
vars_soc.shp_capital_mVND = vars_soc.shp_capital_mVND.astype("Float64")

vars_bui["shp_building_value_mVND"] = vars_bui.building_value_mVND / 2
vars_bui["shp_building_value_mVND"] = vars_bui["shp_building_value_mVND"].astype("Int64")  # Int64 = keep nan
# vars_bui.building_value_mVND = vars_bui.building_value_mVND.replace(99,0)
# vars_soc.shp_capital_mVND = vars_soc.shp_capital_mVND.replace(99,0)
print("Number of businesses for which registered capital is less than value for commercial building part:", (vars_soc.shp_capital_mVND < vars_bui.shp_building_value_mVND).sum())
#print("No building value information: ", vars_bui.shp_building_value_mVND.isna().sum())
#print("No registered capital information: ", vars_soc.shp_capital_mVND.isna().sum())

# ## quick check number of business owner
print(vars_soc.shp_owner.value_counts())  # 1- owner , 2- manager 3 - both, 88 - other


Number of businesses for which registered capital is less than value for commercial building part: 185
shp_owner
3.0     125
1.0     108
2.0      14
88.0      5
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vars_soc.shp_capital_mVND[vars_soc.shp_capital_mVND == 99] = np.nan


In [341]:
# Alternative: 
## if first approach doesnt give good result --> than assumption than houses have avg. around 2 levels (ground floor+ one resid. floor above)
## --> this would give 50% to cv if assumed that 100% bv for an entire business house = 100% business cv

vars_bui["shp_content_value_mVND"] = vars_bui.building_value_mVND / 2
vars_bui["shp_content_value_mVND"] = vars_bui["shp_content_value_mVND"].astype("Int64")  # Int64 = keep nan
vars_bui["shp_content_value_mVND"].describe()  # mean ~17 000 €, max= 213 640€
## avg cv for residential part for Can Tho city: 2710,15 euros [87-38746€]

count    216.0
mean     452.8
std      558.7
min       30.0
25%      200.0
50%      350.0
75%      500.0
max     5500.0
Name: shp_content_value_mVND, dtype: Float64

In [342]:
vars_soc.shp_capital_mVND.mean()  # <19000.0, mean 178.660
vars_bui.building_value_mVND.mean()  # < 11000, mean 905.694
vars_bui.shp_building_value_mVND.max()  #  < 5500, mean 452,84
vars_bui.shp_content_value_mVND.mean()     # <5500.0 , mean 452

452.84722222222223

### Extract shop locations

In [344]:
# get coords in readable format for gpd

vars_bui = gpd.GeoDataFrame(vars_bui,  
            geometry=gpd.points_from_xy( 
                    df["GPS"].str.split(",").str[0],
                    df["GPS"].str.split(",").str[1]
            )
        )

## save shp locations to disk, 
# extract elevation based on shop locations in datapoints_vars_bui.shp via QGIS due to loading size and process with gdal
print(vars_bui.crs)
vars_bui = vars_bui.set_crs(4326) 
vars_bui.to_file('../input_survey_data/DEM_LiDAR/datapoints_vars_bui.shp')  

  vars_bui.to_file('../input_survey_data/DEM_LiDAR/datapoints_vars_bui.shp')


None


### Spatial variable

In [424]:

dem__lufi_pts = gpd.read_file("../input_survey_data/DEM_LiDAR/HCMC_Lidar_2020_DEM_4326_LuFi_points.shp")  # shop locations including elevation height based on original and interpolated DEM

## fixed: correct order of lats and lons which resulted from switched lats lons in LuFI-DEM_InternalUseOnly.tif
dem__lufi_pts = gpd.GeoDataFrame(dem__lufi_pts,  
            geometry=gpd.points_from_xy( 
                     dem__lufi_pts["geometry"].y,
                     dem__lufi_pts["geometry"].x
            )
        )

## aligne crs to a geometric crs. here 32648 to extract distance between points in meters
print(vars_bui.crs)
print(dem_pts.crs)
print(dem__lufi_pts.crs)

EPSG:4326
EPSG:4326
EPSG:4326


In [425]:
## assigne DEM point locations to shp location of dataframe, silght differences in coordinates due to conversion

vars_bui = gpd.sjoin_nearest(vars_bui, dem__lufi_pts, how='left')#
vars_bui = vars_bui.set_geometry('geometry')
vars_bui = vars_bui.drop(["index_right"], axis=1)#.HCMC_Lidar.isna().sum()
vars_bui




Unnamed: 0,lu_cert,building_value_mVND,ba,flood_year_r,flood_year_s,occ_yrs_r,occ_yrs_s,bage_r,bage_s,valid_r,valid_s,bage_ren1,bage_ren2,shp_building_value_mVND,shp_content_value_mVND,geometry,HCMC_Lidar,LuFIDEMInt
0,1,800,156.0,2020,2017,50,47,45,42,1,1,45,42,400,400,POINT (10.72441 106.60230),1.4,1.4
1,1,600,118.0,2016,2018,18,20,18,20,1,1,18,20,300,300,POINT (10.73377 106.60899),1.6,0.6
2,1,600,114.0,2013,2013,1,1,1,1,1,1,1,1,300,300,POINT (10.82429 106.73315),1.5,0.8
3,1,4000,27.0,2020,2019,6,5,6,5,1,1,3,2,2000,2000,POINT (10.72696 106.62999),1.1,2.3
4,1,800,350.0,2020,2012,45,37,18,10,1,1,8,0,400,400,POINT (10.72129 106.63284),1.1,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,1,200,70.0,2019,2019,35,35,14,14,1,1,2,2,100,100,POINT (10.70510 106.68803),2.0,1.8
248,1,500,35.0,2020,2020,45,45,45,45,1,1,45,45,250,250,POINT (10.79664 106.69938),1.4,-4.7
249,1,800,53.0,2020,2019,18,17,18,17,1,1,4,3,400,400,POINT (10.73278 106.60842),1.6,0.7
250,2,800,55.0,2020,2020,7,7,20,20,1,1,1,1,400,400,POINT (10.79759 106.70033),1.5,-7.5


In [426]:
## replace locations with missing elevation by interpolated DEM (all locations with missing elevation height have value 0.0 )
print(vars_bui[vars_bui.HCMC_Lidar==0.0])
vars_bui.HCMC_Lidar = np.where(vars_bui.HCMC_Lidar==0.0, vars_bui.LuFIDEMInt, vars_bui.HCMC_Lidar)

     lu_cert  building_value_mVND    ba flood_year_r flood_year_s  occ_yrs_r   
98         1                 3000 850.0         2020         2011         19  \
219        1                  300  80.0         2018         2019         35   

     occ_yrs_s  bage_r  bage_s  valid_r  valid_s  bage_ren1  bage_ren2   
98          10      17       8        1        1          9          0  \
219         36      18      19        1        1         18         19   

     shp_building_value_mVND  shp_content_value_mVND   
98                      1500                    1500  \
219                      150                     150   

                       geometry  HCMC_Lidar  LuFIDEMInt  
98   POINT (10.77247 106.62747)         0.0        -0.1  
219  POINT (10.69663 106.68412)         0.0         0.9  


In [428]:
vars_bui = vars_bui.rename(columns={"HCMC_Lidar":"elevation_m"})
vars_bui = vars_bui.drop("LuFIDEMInt", axis =1)
#vars_bui.elevation_m = np.round(vars_bui.elevation_m, 2)
vars_bui.insert(len(vars_bui.columns)-2, "elevation_m", vars_bui.pop("elevation_m"))
vars_bui


Unnamed: 0,lu_cert,building_value_mVND,ba,flood_year_r,flood_year_s,occ_yrs_r,occ_yrs_s,bage_r,bage_s,valid_r,valid_s,bage_ren1,bage_ren2,shp_building_value_mVND,shp_content_value_mVND,elevation_m,geometry
0,1,800,156.0,2020,2017,50,47,45,42,1,1,45,42,400,400,1.4,POINT (10.72441 106.60230)
1,1,600,118.0,2016,2018,18,20,18,20,1,1,18,20,300,300,1.6,POINT (10.73377 106.60899)
2,1,600,114.0,2013,2013,1,1,1,1,1,1,1,1,300,300,1.5,POINT (10.82429 106.73315)
3,1,4000,27.0,2020,2019,6,5,6,5,1,1,3,2,2000,2000,1.1,POINT (10.72696 106.62999)
4,1,800,350.0,2020,2012,45,37,18,10,1,1,8,0,400,400,1.1,POINT (10.72129 106.63284)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,1,200,70.0,2019,2019,35,35,14,14,1,1,2,2,100,100,2.0,POINT (10.70510 106.68803)
248,1,500,35.0,2020,2020,45,45,45,45,1,1,45,45,250,250,1.4,POINT (10.79664 106.69938)
249,1,800,53.0,2020,2019,18,17,18,17,1,1,4,3,400,400,1.6,POINT (10.73278 106.60842)
250,2,800,55.0,2020,2020,7,7,20,20,1,1,1,1,400,400,1.5,POINT (10.79759 106.70033)


## Select predictors and merge identical events


In [None]:
#print(vars_dam.columns)
vars_bui = vars_bui.drop(['occ_yrs_r', 'occ_yrs_s', 'valid_r', 'valid_s'],axis=1)
#vars_bui = vars_bui[['lu_cert','building_value_mVND', 'shp_building_value_mVND', 'shp_content_value_mVND', 'ba','bage_r','bage_s','bage_ren1','bage_ren2', "lat", "lon", "geometry", "elevation_m"]]
vars_bui.rename(columns={
    "ba":'building_area'}, 
    inplace=True)

## column renameing for vars_soc and vars_dam were previously done 
vars_dam["rloss_1"] = rloss_ev1
vars_dam["rloss_2"] = rloss_ev2
vars_dam["bloss_1"] = abs_loss_ev1
vars_dam["bloss_2"] = abs_loss_ev2


In [None]:
vars_dam["id"] = range(len(df))

vars_dam = vars_dam.replace(" ", np.nan) # fill empty cells, otherwise not append of cols is possible
vars_dam = vars_dam.replace("", np.nan) # fill empty cells, otherwise not append of cols is possible
vars_dam = vars_dam.replace("^,", "0.", regex=True) 
vars_dam = vars_dam.replace(",", ".", regex=True) 


## select cases with recent events and non specific cols
data_ip1 = pd.concat(
  [vars_dam.loc[:, "flood_time_r":"overall_problem_house_r.9"],  # flood vars, damage vars
    vars_dam.loc[:,"same":"flood_protections_impl_r"],  #  same, precaution measures, 
    vars_dam.loc[:,"flood_experience":"rloss_1"], # flood_experience, rloss1,rloss2
    vars_dam.loc[:, "bloss_1"],  # rloss1,rloss2, bloss2, R: 96, 97, 99
 ], axis=1
)

print(f"Non-identical cases are: {(vars_dam.same==0).sum()}")

## drop unique endings to obtian identical column names of both dfs
data_ip1.columns = data_ip1.columns.str.replace('_r', '')
data_ip1.columns = data_ip1.columns.str.replace('_1', '') 


## if not identical event select serious events
for i in range(len(data_ip1.loc[:,:])):  
  if (data_ip1.same[i]==0): 
    sev = pd.concat(
        [vars_dam.loc[i, "flood_time_s" : "same"],  # flood-vars, damage-vars, same
         #vars_dam.loc[i, "protect_valuables_impl_s" : "rloss_1"], # TODO: Q: this line - right or mistake diff col. names from sev+ data_ip1
        #vars_dam.loc[i, ["bloss_1", "id"]]  # # laut R: idx: 98,100 = bloss_1  id
         vars_dam.loc[i, "protect_valuables_impl_s" : "flood_experience"],
         vars_dam.loc[i, ["rloss_2", "bloss_2"]]
        ]
    )

    ## convert to 1-row df, drop unique endings to merge both dfs by cols names
    sev = pd.DataFrame(sev).T
    sev.columns =sev.columns.str.replace('_s', '') 
    sev.columns =sev.columns.str.replace('_2', '') 
    sev[1:] = sev[1:].apply(pd.to_numeric)  # exclude datetime-column: flood_time

    ## append events which are not identical to a recent event
    data_ip1 = pd.concat([data_ip1, sev], ignore_index=True)


# ## convert obj columns to numeric by keeping nan values
# data_ip1_obj = data_ip1.select_dtypes(include=object).apply(pd.to_numeric)
# data_ip1['inundation_duration_h'] = data_ip1_obj['inundation_duration_h']
# data_ip1['warning_time_h'] = data_ip1_obj['warning_time_h']

# data_ip1 = data_ip1.apply(pd.to_numeric)
print(data_ip1.info())


In [None]:
data_ip1.columns

In [None]:
vars_bui["id"] = range(len(df))

## select cases with recent events and non specific cols
data_ip2 = pd.concat(
  [vars_bui.loc[:, :"flood_year_r"],#[:,:4],
    vars_bui.loc[:, ["bage_r", "bage_ren1"]],
    vars_bui.loc[:, "shp_building_value_mVND": "id"],#[:,[5,7]],
    ], axis=1
)
data_ip2.columns = data_ip2.columns.str.replace('1','')
data_ip2.columns = data_ip2.columns.str.replace('_r$', '', regex=True) 


## if not identical event select serious events
for i in range(len(data_ip2.loc[:,:])): 
  if (vars_dam.same[i]==0):
    sev = pd.concat(
        [vars_bui.loc[i, "lu_cert":"building_area"],
            vars_bui.loc[i, ["flood_year_s", "bage_s", 'bage_ren2']],  # eig. [4,6,7]
            vars_bui.loc[i, "shp_building_value_mVND": "id"]
        ]
      )
    ## convert to 1-row df, drop unique endings to merge both dfs by cols names
    sev = pd.DataFrame(sev).T
    sev.columns = sev.columns.str.replace('_s', '')
    sev.columns = sev.columns.str.replace('2$', '', regex=True) 

    ## append events which are not identical to a recent event
    data_ip2 = pd.concat([data_ip2, sev], ignore_index=True)


In [None]:
vars_soc["id"] = range(len(df))

## select cases with recent events and non specific cols
data_ip3 = pd.concat(
  [vars_soc.loc[:, ["Target_directloss_mVND_r", "Target_businessreduction_r", "shp_closed_d_r", "hh_education"]],#[:,:4],
    vars_soc.loc[:, "hh_monthly_income_cat":]#[:,[5,7]],
 ], axis=1
)

#sev_list = []
data_ip3.columns = data_ip3.columns.str.replace('_r$', '', regex=True) 

## if not identical event select serious events
for i in range(len(data_ip3.loc[:,:])): 
  if (vars_dam.same[i]==0):  
    sev = pd.concat(
        [vars_soc.loc[i, ["Target_directloss_mVND_s", "Target_businessreduction_s", "shp_closed_d_s", "hh_education"]],
            vars_soc.loc[i, "hh_monthly_income_cat":]
        ], 
      )
    ## convert to 1-row df, drop unique endings to merge both dfs by cols names
    sev = pd.DataFrame(sev).T
    sev.columns = sev.columns.str.replace('_s$', '', regex=True) 

    #sev_list.append(sev)
    ## append not identical events to end of df
    data_ip3 = pd.concat([data_ip3, sev], ignore_index=True)


In [None]:
data_ip3.columns;

### Combine all important variabels 

In [None]:
## merged cases
all_input = pd.concat(
        [data_ip1.loc[:, data_ip1.columns != 'id'], 
        data_ip2.loc[:, data_ip2.columns != 'id'],
        data_ip3
        ], axis=1
)

In [None]:
print(all_input.shape) # is larger than from Rscript, due that col "same" incoporates flood times and more varibles are used.
# rscript based df has shape: 387rows x 78cols, in my df i dont have org1-5= is in organisation (Veteran, Youth org..) 
# - but have: Target, busin_red, flood_time, flood_experience, 'resilience*, perception* variables


#### data cleaning from commas and 99

Empty cells or the interviewee could not answer the question (“I don’t know”) were all set to missing number (not a number: NaN) and for dates to missing date (not a date: NaT). Nulls were left unchanged, ie. for the target variables for which it is assumed that the interviewee had no direct or indirect content losses.

In [None]:
## fix obj columns - convert to numeric by keeping nan values

all_input_obj = all_input.select_dtypes(include=object)  # all obj cols
#all_input_obj = all_input_obj.drop(["geometry"], axis=1)# TODO rm as soon as spatial var included

for c in all_input_obj.columns:
    all_input_obj[c] = all_input_obj[c].replace({'':np.nan, ' ':np.nan}, regex=True)
    all_input_obj[c] = all_input_obj[c].replace({r'^,':'0.', ',':'.'}, regex=True)
    all_input[f"{c}"]  = all_input_obj[f"{c}"].apply(pd.to_numeric)  # convert to int or float,  handles NAN

all_input = all_input.replace(99, np.nan)

## replace 98 values in damage levels
all_input[["shpdamage_level_furniture","shpdamage_level_equipment","shpdamage_level_electronics","shpdamage_level_products"]] = all_input[["shpdamage_level_furniture","shpdamage_level_equipment","shpdamage_level_electronics","shpdamage_level_products"]].replace(98, np.nan)


#### Unify monetary values to euros in 2020 

In [None]:
## check for 99
vars_money = all_input.filter(regex="_mVND", axis=1)


# hp_building_value_mVND	shp_content_value_mVND	Target_directloss_mVND	hh_monthly_income_catego	shp_avgmonthly_sale_catego	shp_capital_mVND

## TODO les welche vars (reconst_cost, income, capital etc) inflation bereinigt
## TODO check ob shp_capital_mVND, income, target_direct etc wegen conversion to mVND und dann zu int so ungenau sind 
##     -- excpeted mVND im captial, income, target_direct mit decimal stelen


vars_money.tail(3)#[vars_money.shp_capital_mVND <=1.0]
#vars_money.building_value_mVND[8] * 1000000

In [None]:
## covnert all columns with million VND to VND
vars_money = all_input.filter(regex="_mVND", axis=1)

vars_money = np.where( (vars_money.values != np.nan),
            vars_money.values * 1000000, # convert to VND
            vars_money.values)

## rename columns
new_cols = all_input.filter(regex="_mVND", axis=1).columns.str.replace("_mVND", "_VND")
vars_money = pd.DataFrame(vars_money, columns=new_cols)
vars_money

##### list of all monetary vars
- building_value_mVND	# price level for 2020 (year when survey was done)
- shp_building_value_mVND	# price level for 2020
- shp_content_value_mVND	# price level for 2020
- Target_directloss_mVND	# price levels based on flood time
- shp_capital_mVND    # price level for 2020
- hh_monthly_income_cat  # categircal [value ranges in mVND], # price level for 2020
- shp_avgmonthly_sale_cat   # categorical [value ranges in mVND], # price level for 2020

keep categorical monetary variables unchanged (hh_monthly_income_cat, shp_avgmonthly_sale_cat) [in mVND]
All other vars are inlfation corrected based on flood time or when survey was done 
cpi_2020 = 168.8  # 2020 = year when the survey was done
   

##### Inflation correction
adapt to direct losses to price level of 2021

Based on JRC, p.7:
The price-level update is based on global CPI information from World Bank (2015). 
Correction is performed using the following equation: 

damage2021 = damageyear_of_issue * (CPI2021 ) / (CPIyear_of_issue) 

where: 
- max_damageyear_of_issue = maximum damage in year of issue 
- max_damage2021 = maximum damage for price level 2021 
- CPIyear_of_issue = CPI for year of issue 
- CPI2020 = CPI for 2020


##### Conversion of VND to euro (or US$)

Based on JRC, p.8 and Paprotny2018, eg.p245
The reported maximum damage values have been converted to Euro using the following exchange rates for the year 2010 (mean annual value)

Vietnam (Dong) 0.000039, for 2022 €
   - for jan-juli2023: from oanda: 0.0000393304 for €, 
    - 0.0000424798 for dollar
*Source:* 
- www.oanda.com/currency/historical-rates
-  www.ecb.europa.eu/stats/exchange/eurofxref/html/eurofxref-graph-idr.en.html

Equation: 
damage2021_euro = damage2021_vnd * avg_exchange_rate


*Further sources*
Paprotny 2018: also used country-level GDP deflators for adjusting nomnal to real losses in 2011 prices , p153, p244
Sairam 2020


In [None]:
## CPI for VND
cpi_year_of_issue = {
    2010.0: 100.0,
    2011.0: 118.7,
    2012.0: 129.5,
    2013.0: 138.0,
    2014.0: 143.6,
    2015.0: 144.6,
    2016.0: 148.4,
    2017.0: 153.6,
    2018.0: 159.1,
    2019.0: 163.5,
    2020.0: 168.8
}
cpi_2020 = 168.8 
cpi_year = all_input.flood_year.replace(cpi_year_of_issue)  # series of cpi for each year of flood event

## exchange rate between VND and euro (in year 2020)
annual_avg_exchange_rate = 1 / 27155  #  dong-> euro (based on eurostat: https://ec.europa.eu/eurostat/databrowser/view/ERT_BIL_EUR_A/default/line?lang=en )
print(annual_avg_exchange_rate) #= ~ 0.0000368

In [None]:
vars_money.building_value_VND.info()

In [None]:
##  only direct losses needs inflation correction in respect to flood time
for r in range(len(vars_money.Target_directloss_VND)):
    ## inflation correction [VND_2020]
    vars_money.Target_directloss_VND[r] = vars_money.Target_directloss_VND[r] * cpi_2020 / cpi_year[r]
    ## convert VND_2020 to €_2020 [euro in 2020]
    vars_money.Target_directloss_VND[r] = round((vars_money.Target_directloss_VND[r]*annual_avg_exchange_rate), 1)#.astype(int)


# ##  for all other monetary continous vars: only need exchange conversion
for c in vars_money.drop("Target_directloss_VND", axis=1).columns:
    for r in range(len(vars_money[c])):
        ## convert VND_2020 to €_2020
        vars_money[c][r] = round((vars_money[c][r]*annual_avg_exchange_rate), 1)#.astype(int)


## rename columns
new_cols = vars_money.filter(regex="_VND", axis=1).columns.str.replace("_VND", "_euro")
vars_money.columns = new_cols

vars_money

In [None]:
# update all_input with unified VND
all_input.drop(all_input.filter(regex="_mVND", axis=1).columns, axis=1, inplace=True) 
all_input = pd.concat([all_input, vars_money], axis=1)
all_input.filter(regex="euro", axis=1).columns

# print(all_input.filter(regex="VND", axis=1).head(3))


In [None]:
all_input[all_input["shp_content_value_euro"] >= 100000.0];

In [None]:
#all_input_obj = all_input.select_dtypes(include=object)  # all obj cols
#all_input_obj.columns

# ## convert floats such as "2.6", ".55" from mVND to VND in columns for precaution costs (ending with "*.spend$")

# for c in df_only_numeric_vars.filter(regex="_spnd$").columns:  # exclude floats
#     df_only_numeric_vars[c] = np.where(df_only_numeric_vars[c] % 1 != 0, df_only_numeric_vars[c] * 1000000, df_only_numeric_vars[c])


df.protect_valuables_impl.value_counts();#filter(regex="_impl", axis=1)


##### Test: get information about number of stories
check amount of buildings with number of stories in HCMC by utulizing ohsome API


In [None]:
import requests
#from ohsome import OhsomeClient
URL = 'https://api.ohsome.org/v1/elements/count'
data = {"bboxes": "106.593238,10.6971085,106.7740687,10.8401006", "format": "json", "filter": "building=* and builing!=no and building:levels=* or level=* or building:level=* or stories=* or levels=* or building:part:levels=*"}
response = requests.post(URL, data=data)
print(response.json())

## --> building tagged in HCMC: 64495.0 from those have information about stories 2746.0 --> ~ <5% story information

#### write to disk

In [None]:
## drop empty column: shp_suppliers_location.6
## less tha n40 data points: perception_who_responsible4protection.Rank4, Rank5, Rank6, Rank7, Rank88
all_input = all_input.drop([
    "bloss", "rloss",
    "perception_who_responsible4protection.Rank4",
    "perception_who_responsible4protection.Rank5",
    "perception_who_responsible4protection.Rank6",
    "perception_who_responsible4protection.Rank7",
    "perception_who_responsible4protection.Rank88",
    "shp_suppliers_location.6",
    'overall_problem_house.1', 'overall_problem_house.2', 
    'overall_problem_house.3', 'overall_problem_house.4',
    'overall_problem_house.5', 'overall_problem_house.6', 
    'overall_problem_house.7', 'overall_problem_house.8', 'shp_finance_investments.88'
        ], axis=1)
    

In [None]:
## move targets to beginning , spatial vars to end

all_input.insert(0, "Target_directloss_euro", all_input.pop("Target_directloss_euro"))
all_input.insert(1, "Target_businessreduction", all_input.pop("Target_businessreduction"))

all_input.insert(len(all_input.columns)-1, "geometry", all_input.pop("geometry"))
all_input.insert(len(all_input.columns)-2, "lat", all_input.pop("lat"))
all_input.insert(len(all_input.columns)-3, "lon", all_input.pop("lon"))
all_input.insert(len(all_input.columns)-4, "elevation_m", all_input.pop("elevation_m"))


In [None]:
## save to 
all_input.to_excel("../input_survey_data/input_data_business.xlsx", index=False)


In [None]:
## export distribution statistics 
pd.set_option('display.float_format', lambda x: '%.1f' % x)
tbl = all_input.describe().T
tbl = tbl.drop(["flood_time","id", "lat", "lon"], axis=0)
tbl.to_excel("../input_survey_data/input_data_business_distrib.xlsx", index=True)
tbl#.head(20)

In [None]:
## non merged cases

all_cases = pd.concat(
        [vars_dam.loc[:, vars_dam.columns != 'id'], 
        vars_bui.loc[:, vars_bui.columns != 'id'], 
        vars_soc
        ], axis=1
)
all_cases.insert(0, "Target_directloss_mVND_r", all_cases.pop("Target_directloss_mVND_r"))
all_cases.insert(1, "Target_directloss_mVND_s", all_cases.pop("Target_directloss_mVND_s"))
all_cases.insert(2, "Target_businessreduction_r", all_cases.pop("Target_businessreduction_r"))
all_cases.insert(3, "Target_businessreduction_s", all_cases.pop("Target_businessreduction_s"))
print(all_cases.shape)


## select only locations within HCMC  - 2 businesses lay outside HCMC
all_cases = all_cases[all_cases.lon >= 10.6000]


## fix weird values, eg. dot instead commata
all_cases_obj = all_cases.select_dtypes(include=object)  # all obj cols

for c in all_cases_obj.columns:
    all_cases_obj[c] = all_cases_obj[c].replace({'':np.nan, ' ':np.nan}, regex=True)
    all_cases_obj[c] = all_cases_obj[c].replace({r'^,':'0.', ',':'.'}, regex=True)
    all_cases[f"{c}"]  = all_cases_obj[f"{c}"].apply(pd.to_numeric)  # convert to int or float,  handles NAN

all_cases = all_cases.replace(99, np.nan)


## save to disk
all_cases.to_excel("../input_survey_data/input_data_business_notmerged.xlsx", index=False)


##   rests   ########

In [None]:
all_input.shp_avgmonthly_sale_cat.describe()

In [None]:
### damage levels
#print(all_input.filter("shpdamage_level_furniture*", axis=1).columns)
#all_input.damage_level_doors_r

#print(all_input.shpdamage_level_products.value_counts())  
#print(all_input.shpdamage_level_electronics.isna().sum())  

all_input[["shpdamage_level_furniture","shpdamage_level_equipment","shpdamage_level_electronics","shpdamage_level_products"]] = all_input[["shpdamage_level_furniture","shpdamage_level_equipment","shpdamage_level_electronics","shpdamage_level_products"]].replace(98, np.nan)
print(all_input.shpdamage_level_products.describe())  

In [None]:
### damage levels
df_damage_levels = all_input[["shpdamage_level_furniture","shpdamage_level_equipment","shpdamage_level_electronics","shpdamage_level_products"]] 

df_damage_levels_melt = df_damage_levels.melt(value_vars=["shpdamage_level_furniture","shpdamage_level_equipment","shpdamage_level_electronics","shpdamage_level_products"])
#id_vars=
#df_damage_levels

fig, ax = plt.subplots()
#sns.barplot(x = df_damage_levels_melt["variable"], y = df_damage_levels_melt["value"])#, estimator = median)
#sns.boxplot(x=df_damage_levels_melt)#.set(title=f"{c}", xlabel=None)
sns.histplot(data=df_damage_levels, stat='count', multiple="stack")#.set(title=f"{c}", xlabel=None) # bins=12, stat="percent"

plt.xlabel("damage levels [1-no damage, 4-major damage]")#['furniture', 'equipment', 'electronics', 'products'])
plt.title("damage levels of business content")



In [None]:
df.shpdamage_level_equipment_r.value_counts()#.sum()
#df_damage_levels.describe()# .groupby("variable")["value"]
#t = df_damage_levels.replace(1.0, np.nan)

#t = all_input.Target_businessreduction.replace(0.0, np.nan)
#t.describe()

In [None]:
# #all_input.groupby(all_input.columns.tolist(),as_index=False).size()
# t = all_input.drop_duplicates()
# t
fig, ax = plt.subplots()

d = all_input.filter(regex="business",axis=1)
d = d.replace(r'^,', '0.', regex=True) 
d = d.replace(r',', '.', regex=True) 
d = d.replace(' ', 99) 

d = d.replace("", np.nan) 
d = d.replace(np.nan, 99) 

d#.value_counts()
sns.histplot(data=d.Target_businessreduction)
#ax.set_yscale('log')
#ax.set_ylim(0,100)
#ax.set_xlim(0,100)


In [None]:
fig, ax = plt.subplots()
d = all_input.filter(regex="Target",axis=1)
#d = d.drop(["contaminations_s.99", "contaminations_s.88"], axis=1)
#sns.histplot(bins=3, data=d, multiple="stack")
d = d.replace(r'^,', '0.', regex=True) 
d = d.replace(',', '.') 
sns.histplot(data=d)

#ax.set_yscale('log')
#ax.set_ylabel("")
#ax.legend(['no contamination', 'sewage water', 'fuel oil', 'chemicals', 'garbage'])

ax.set_title(f"Contamination types: \nContamination with fuel oil occures {all_input['contaminations.3'].value_counts()[1]} times")


### Distribution targets 


In [None]:
df = df.replace("^,", "0.", regex=True) 
df = df.replace(",", ".", regex=True) 


df_targets = df[["Target_directloss_mVND_r", "Target_directloss_mVND_s"]]
fig, (ax_hist) = plt.subplots(len(df_targets), 2, sharex=False, figsize=(7, 40), constrained_layout=True)#, gridspec_kw={"height_ratios": (.15, .85)})
#fig.tight_layout()  # alternative for tight_layout() and subplots_adjust(): constrained_layout=True
fig.suptitle('Distributions of target variables')

for i, c in enumerate(df_targets.columns[:]):
    #plt.subplots_adjust(hspace=.2)
    sns.boxplot(x=df_targets[c].astype(float), ax=ax_hist[i, 0]).set(title=f"{c}", xlabel=None)
    sns.histplot(x=df_targets[c].astype(float), stat='count', ax=ax_hist[i, 1]).set(title=f"{c}", xlabel=None) # bins=12, stat="percent"


## TODO : shp_closed, shp_duration_back2normal, shp_damage_level from conitnous -> intervals

# sns.despine(ax=ax_hist)  # arrange boxplots above bar charts
# sns.despine(ax=ax_box, left=True)
# ax_box.set(yticks=[])


In [None]:
fig, ax = plt.subplots()
d = all_input.filter(regex="Target",axis=1)
#d = d.drop(["contaminations_s.99", "contaminations_s.88"], axis=1)
#sns.histplot(bins=3, data=d, multiple="stack")
d = d.replace(r'^,', '0.', regex=True) 
d = d.replace(',', '.') 
sns.histplot(data=d)

#ax.set_yscale('log')
#ax.set_ylabel("")
#ax.legend(['no contamination', 'sewage water', 'fuel oil', 'chemicals', 'garbage'])

ax.set_title(f"Contamination types: \nContamination with fuel oil occures {all_input['contaminations.3'].value_counts()[1]} times")


In [None]:
df_targets = df[["Target_direct_r", "Target_direct_s"]]
fig, (ax_hist) = plt.subplots(len(df_targets), 2, sharex=False, figsize=(7, 40), constrained_layout=True)#, gridspec_kw={"height_ratios": (.15, .85)})
#fig.tight_layout()  # alternative for tight_layout() and subplots_adjust(): constrained_layout=True
fig.suptitle('Distributions of target variables')

for i, c in enumerate(df_targets.columns[:]):
    #plt.subplots_adjust(hspace=.2)
    sns.boxplot(x=df_targets[c].astype(float), ax=ax_hist[i, 0]).set(title=f"{c}", xlabel=None)
    sns.histplot(x=df_targets[c].astype(float), stat='count', ax=ax_hist[i, 1]).set(title=f"{c}", xlabel=None) # bins=12, stat="percent"


## TODO : shp_closed, shp_duration_back2normal, shpdamage_level from conitnous -> intervals

# sns.despine(ax=ax_hist)  # arrange boxplots above bar charts
# sns.despine(ax=ax_box, left=True)
# ax_box.set(yticks=[])


In [None]:
## save to disk
df_candidates_combined.to_excel("../input_survey_data/survey_data_candidates_coords.xlsx", index=False)



## TODO possible further candidates which needs to be checked and maybe developed: 
## - Indicator of flood warning information 
## - Lead time period elapsed without using it for emergency measure
## - Precautionary measure indicator
## - Knowledge of flood hazard	- as binary
## - Building quality	(Content value in USD - not in HCMC survey data, only indirect derivable via shp_sector)
## - Socioeconomic status according to Plapp [31]
##      'P1Q5.2.2':'shp_damage_level_furniture', 'P1Q5.3.2':'shp_damage_level_electronics', "shp_damage_level_others"

### Aggregate multiple choice answers

In [None]:
df_a = df_only_numeric_vars.copy() #deep=True)

for c in col_names.values():
    #c = "contaminations_r"
    df_agg = df_only_numeric_vars.filter(regex=f'^{c}$|^{c}' + r'.*[^y]$', axis=1)  # exclude col ending with "specifcy"
    #df_agg = df_only_numeric_vars.filter(regex=f'^{c}$|^{c}' + r'|.*[^(specify)]', axis=1)  # exclude col ending with "specifcy"
    df_agg = df_agg.select_dtypes(include=np.number) # agg only numeric cols

    ##  count occurences of yes within multiple choice answers and leaves single-choice answers unchanged (e.g water_depth_cm)
    if len(df_agg.columns) >= 2:
        df_a[c] = df_agg.eq(1).sum(axis=1)
    else:
        df_a[c] = df_agg.sum(axis=1)  

## df_a contains aggregated and org columns e.g conatimnations_r.3, conatimnations_r


    #df_a[c] = df_agg.agg("max", axis="columns")  #  binary values for multile choice answers

    #print(df_a[c].head(1))
    ## TODO make nicer: .{,2}$ = matches also col.names ending with 89, 99 etc.
    #df_agg = df_only_numeric_vars.filter(regex=f'^{c}'+ r'.{,2}$|.*[^88][^99][^y]$', axis=1)  # match columns with specfied names and not containing non-numeric values
    #print(df_agg.columns.sort_values(ascending=False))#[:-1])
    #df_a[c]  =  df_a[df_agg.columns[:]].apply(lambda x: ','.join(x.dropna().astype(str)),  axis=1) # rm old cols and keep new aggregated column

    #if len(df_agg.columns) >= 1:
    #    df_a = df_a.drop(df_agg.columns.sort_values(ascending=False)[:-1], axis=1)


In [None]:
## fix numeric variables containing integers
#df_only_numeric_vars = df_only_numeric_vars_org

df_only_numeric_vars  = df_only_numeric_vars.replace(r'^,', '', regex=True)  # fix remaining values beginning with random commas e.g. ,5
df_only_numeric_vars  = df_only_numeric_vars.replace(r' ', None, regex=True)

for c in df_only_numeric_vars.columns.drop(df_only_numeric_vars.filter(regex=f'{pattern_float_cols}|others_.$|specify$|^GPS').columns):  # exclude floats
   # df_only_numeric_vars[f"{c}"]  = df_only_numeric_vars[f"{c}"].astype(str).apply(lambda x: np.where(x.isdigit(), x, np.nan)) # set remaining errors as nan e.g 1,5
   ## df_only_numeric_vars[f"{c}"] = df_only_numeric_vars[f"{c}"].loc[:, df_only_numeric_vars.columns != 'GPS'].astype(float).astype(pd.Int64Dtype())
    df_only_numeric_vars[f"{c}"]  = df_only_numeric_vars[f"{c}"].astype(float)#.astype(pd.Int64Dtype())

## TODO fix: adds somehow more rows
#df_only_numeric_vars = pd.concat([df_only_numeric_vars, df.GPS])
#df_only_numeric_vars["GPS"] = df_only_numeric_vars[0]


In [None]:
## set all 99 (i dont know) and 2 (didnt cost anything) to 0

pattern_int_cols = re.compile("repair_costs_building_VND_*|repair_costs_building_complete_*|repair_costs_residentail_contents_*|insurance_b*|Rank|shp_capital_mVND")
df_only_numeric_vars_99 = df_only_numeric_vars.filter(regex=re.compile(pattern_int_cols.pattern + pattern_float_cols.pattern), axis=1).replace(99, 0) 


for c in df_only_numeric_vars_99.columns:
    df_only_numeric_vars[c] = df_only_numeric_vars_99[c]  

## TODO fix float and int cells with 2 and 1, 88
## poss float cols
## INT P1.3.8 (r+s) # repair_costs_building_VND_* (2,99) = 0
# INT .311.1 +.2 # repair_costs_building_complete_* 99=0
#  INT 4.6.1 + .2 crepair_costs_damaged_contents_* ,set 99 to 0 and ,1=0
# FLOAT 5.6 target 99=0
# FLOAT P2. _spend costs P2.1.x.spend  2,99=0
# FLOAT P2Q2.4 elevation_building_material_costs 1,99=0 , keep 3,95 as FLOAT ,rest INT
# INT P2Q3.1.1 insurance_buildinge , 2+99=0
# INT P2Q3.2.1 insurance_business 2+99=0
# INT P3Q2.3 (r+s) Rank 99,88 = 0
# FLOAT P4Q2.3 building_floorsize_sqm (99=real number)
# 4.5 renov costs (r+s) ",5"-> 0.5 , 99=0
# FLOAT P5Q1.7 shp_capital_mVND, 99=0

## Create indicators
indictator for precautionary measure
- as a ratio between b measures implemented prior to the flood (nI) divided by the nb measures potentailly could have implemented (nP)
- (https://agupubs.onlinelibrary.wiley.com/doi/10.1029/2020WR027649)
- ! check out paper from Sieg et al. (2017) - they didnt combined adapt+mitig+emerg= have more predictors

Maybe socio-economic status indicator:
- according to Plapp 2003, applied by Thieken et al 2005

indicator of warning information, 
indicator of emergency measures, 
perception of efficiency of private precaution, 
building quality, 
building/content value
