## Preprocessing of the Can Tho dataset and comparison with the HCMC dataset
Aim: Validate that the two datasets do not differ too much from each other, so that they can be compared with each other.


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Data preprocessing for Can Tho survey dataset"""

__author__ = "Anna Buch, Heidelberg University"
__email__ = "anna.buch@uni-heidelberg.de"


import sys, os
import re
from pathlib import Path
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import to_rgba

import src.settings as s

# Project source folder, resolved against the notebook's working directory.
SRC_PATH = os.path.join(os.path.abspath(""), "../", "src")

# Folder for exported figures; created if missing, reused otherwise.
OUTPATH_FIGURES = Path("../../figures")
OUTPATH_FIGURES.mkdir(parents=True, exist_ok=True)

#s.init()
# seed = s.seed

# Silence all warnings for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# pandas display settings and NA handling
pd.set_option('display.float_format', '{:.2f}'.format)  # two decimals in printed frames
pd.set_option('display.max_columns', None)              # never truncate the column list
pd.set_option('use_inf_as_na', True)                    # +/-inf is treated as missing by isna()/notna()

# NOTE(review): this opens one empty 20x10 figure; presumably a default figure
# size was intended -- confirm before relying on it.
plt.figure(figsize=(20, 10))



In [None]:
## Can Tho survey on the 2011 flood event (multiple header rows were removed from the sheet beforehand)
df_cantho_raw = pd.read_excel(
    io=f"../{s.INPATH_DATA}/input_data_cantho_2013_raw_no_multiheader.xlsx"
)

## HCMC datasets for flood events between 2010-2020: one per target ...
df_hcmc_rloss = pd.read_excel(
    io=f"../{s.INPATH_DATA}/input_data_contentloss_tueb.xlsx"
)
df_hcmc_bred = pd.read_excel(
    io=f"../{s.INPATH_DATA}/input_data_businessreduction_tueb.xlsx"
)

## ... and one holding both targets
df_hcmc_bothtargets = pd.read_excel(
    io=f"../{s.INPATH_DATA}/input_data_bothtargets_tueb.xlsx"
)



In [None]:
## missing values: the codes 99 and 88 are recoded to NaN in every column.
## The raw frame stays untouched because replace() returns a new frame.
# NOTE(review): the recoding is applied to all columns, so a genuine measurement
# of 88 or 99 would also become NaN -- confirm that no variable uses these values.
df_cantho = df_cantho_raw.replace({99.0: np.nan, 88.0: np.nan})

In [None]:

# Summary statistics of the raw survey column "1.5" (raw question code, not yet renamed).
df_cantho_raw[["1.5"]].describe()

### rename columns 
rename cols to the ones used in HCMC ds

In [None]:

## create empty placeholder columns for values that are derived later in this
## script (e.g. relative content loss)
for placeholder in ("Target_relative_contentloss_euro", "Target_contentloss_euro"):
    df_cantho[placeholder] = None



# Mapping: regular expression on the raw question code -> variable name used in
# the HCMC dataset. Each key is applied with re.sub() to every column name, one
# entry after the other in the order written here, so earlier entries can change
# a name before later entries see it.
# NOTE(review): the dots are not escaped and therefore match any character, and
# keys without "^" also match longer codes that merely end with the pattern.
# NOTE(review): keys without "$" ('3.7.', '4.1.', '3.18.') keep the remainder of
# the code as suffix (e.g. "3.7.1" -> "emergency_measures1"); they would also match
# a four-character code such as "4.10" -- check against the questionnaire.
col_names = {
       r'3.31$' : 'Target_eloss_VND', 
       r'^3.34$' : 'Target_gloss_VND', 
       r'3.40$' : 'Target_businessreduction',

       r'3.13.b$' : 'inundation_duration_day/month',
       r'3.13.c$' :'inundation_duration_hour/day',
       r'^3.12$' : 'water_depth_cm', 
       r'3.16$' : 'flowvelocity', 
       r'3.14$' : 'contaminations', 
       r'^2.2$' : 'flood_experience', 

       '3.7.' : 'emergency_measures', 
       '4.1.' : 'precautionary_measures',  
       # not incl. pumping equipment and saving of valuables,
       # but incl. cheap furniture and low-value usage

       r'7.3$' : 'bage', 
       r'11.8$' : 'b_area',
       r'11.9$' : 'floors',
       r'7.4$' : 'building_value_cat',
       '3.18.' : 'overall_problem_house',
       r'11.6$' : 'content_value_g_VND',
       r'11.7$' : 'content_value_e_VND',
                  #  "P1Q5.8.1":"shp_business_limitation_r",
                  #    "P1Q5.8.2":"shp_business_limitation_s", # needed for modelling monetary loss (abs. loss) of business reduction 

       r'11.2$' : 'shp_employees', 
       r'11.3.1$' : 'shp_avgmonthly_sale_VND', 
       r'10.5$' : 'hh_monthly_income_cat',
       r'11.5$' : 'shp_registered_capital_VND_cat', # 11.5.	How much did you pay for opening your shop


       ## further variables which might be interesting but are not needed for feature selection
       r'^1.4$' : 'shp_sector',
       r'^3.2$' : 'flood_type',
       r'^3.5$' : 'warning_time_day',  # for HCMC Q1P2.9 warning_time_hours
       r'3.9$' : 'effect_emergency_measures',
       r'3.32$' : 'replaced_cost_e',
       # NOTE(review): column "3.34" has already been renamed to Target_gloss_VND by
       # the '^3.34$' entry above, so the next entry can no longer match it -- the
       # question code for replaced_cost_g is probably a different one.
       r'3.34$' : 'replaced_cost_g',

        # Risk perception and resilience
       r'^5.1$' : 'risk_future_flood',
       r'^5.2$' : 'risk_consequents_future_flood',
       r'4.9.1$' : 'perception_too_destructive_floods',  # NOTE == resilienceLeftAlone
       # NOTE(review): a column "4.1.3" has already been renamed to
       # "precautionary_measures3" by the '4.1.' entry above, so the next entry
       # does not produce resilience_joined_neighborhood_network for it.
       r'4.1.3' : 'resilience_joined_neighborhood_network', # 3 categories [Year	Before flood 2011	After flood 2011]
       r'6.1.1' : 'resilience_flood_management', # binary
       r'6.1.2' : 'resilience_govern_careing', # binary
       r'6.3$' : 'resilience_city_protection', # scaled [1-6]

       # '6.1' : 'resilience_govern_management',
       # '6.2' : 'perception_govern_increase_management',
       #'6.3' : 'perception_govern_protection', # is scaled 1-6
       #'resilience', 

       r'7.2$' : 'ownership',
       r'11.15$' : 'builing_elevation',
       'Type of house' : 'builing_type',
}

# Apply every pattern -> name substitution to all column names, one mapping entry
# after the other (same order as in col_names), then write the names back once.
renamed_columns = list(df_cantho.columns)
for pattern_str, new_name in col_names.items():
    renamed_columns = [re.sub(pattern_str, new_name, column) for column in renamed_columns]
df_cantho.columns = renamed_columns

## drop unneeded columns: every column whose name still starts with a digit kept
## its raw question code, i.e. it was not renamed above.
# The pattern is a raw string ("\d" in a normal string is an invalid escape sequence).
# The boolean mask is applied directly, so each column is selected exactly once
# even if several columns share a name.
df_cantho = df_cantho.loc[:, ~df_cantho.columns.str.match(r"^\d")]


# Move the target columns to the front of the frame; the list order is the final
# column order (positions 0..4).
leading_columns = [
    "Target_eloss_VND",                  # absolute content loss [VND]
    "Target_gloss_VND",                  # absolute content loss [VND]
    "Target_contentloss_euro",           # placeholder, filled later
    "Target_relative_contentloss_euro",  # placeholder, filled later
    "Target_businessreduction",          # monthly reduction of business [%]
]
for position, column in enumerate(leading_columns):
    # pop() removes the column and returns it, insert() puts it back at `position`
    df_cantho.insert(position, column, df_cantho.pop(column))


df_cantho.columns[:]

In [None]:
# Display the current Can Tho frame (visual check only, nothing is modified).
df_cantho

In [None]:
## NOTE: resilience_flood_management (question 6.1.1) is not binary in the data,
## although it is noted as binary elsewhere in this notebook (cf. questionary.docx).
# Value counts before (raw) and after the missing-value recoding; in a notebook
# only the result of the last line is displayed.
df_cantho_raw["6.1.1"].value_counts()
df_cantho.resilience_flood_management.value_counts()

### Absolute content loss

In [None]:
## merge both absolute content loss targets into one total.
## NaN propagates through "+": the total is missing as soon as one part is missing.
df_cantho["Target_contentloss_VND"] = df_cantho["Target_eloss_VND"] + df_cantho["Target_gloss_VND"]
np.round(df_cantho["Target_contentloss_VND"], 2).describe()

## merge both content values the same way
df_cantho["shp_content_value_VND"] = df_cantho["content_value_g_VND"] + df_cantho["content_value_e_VND"]

## the four source columns are no longer needed
merged_source_columns = [
    "Target_eloss_VND",
    "Target_gloss_VND",
    "content_value_g_VND",
    "content_value_e_VND",
]
df_cantho = df_cantho.drop(columns=merged_source_columns)


### Hydrological variables


In [None]:
## inundation duration [h]: the hour/day answer is used directly, because it is
## more similar to HCMC than day/month multiplied by hour/day
df_cantho["inundation_duration_h"] = df_cantho["inundation_duration_hour/day"]


## verify with HCMC: key statistics of inundation duration and water depth [cm]
for variable in ("inundation_duration_h", "water_depth_cm"):
    print("CanTHo", df_cantho[variable].describe(), "\n", "HCMC", df_hcmc_rloss[variable].describe())
# median: 0.20cm, mean: 0.25m, max: 0.80m



In [None]:
## contamination
## ranked by occurrence and damage potential:
## 0: no contamination, 1: light contamination, 2: heavy contamination

df_contaminations = df_cantho.filter(like="contaminations", axis=1)

# survey code -> rank
contamination_rank = {
    0.00 : 0,  # no contamination
    4.00 : 0,  # no contamination
    1.00 : 1,  # light
    2.00 : 2,  # heavy
    3.00 : 2,  # heavy
}
df_cantho["contaminations"] = df_cantho["contaminations"].replace(contamination_rank)
df_cantho["contaminations"].value_counts()

In [None]:
# assign according to HCMC distribution


## HCMC scales
# all_input.flood_experience = all_input.flood_experience.replace(
#     {
#       1:3,     # less than once a year , freq: 1
#       2:8,     # about once a year, freq: 14
#       3:16,      # twice a year, freq: 37
#       4:36,      # 4 times a year, freq: 58
#       5:76,      # 8 times a year, freq: 98
#       6:151     # around 15 times a year - used range 100-200, freq: 112 (more than 10 times a year, 100 times since 2010)
#     }
# )

## adapt to flood experience scales in HCMC
# Survey category -> value on the HCMC scale.
# Fix: "three times a year" was keyed as 4.0 as well; a dict keeps only the last
# value of a repeated key, so that mapping was silently dropped and category 3
# was never recoded. It is keyed as 3.0 here.
flood_experience_scale = {
    # "less than once a year" does not exist in Can Tho
    1.0 : 8,     # once a year
    2.0 : 16,    # twice a year
    3.0 : 26,    # three times a year (does not exist for HCMC)
    4.0 : 36,    # 4 times a year
    5.0 : 76,    # more than four times a year
    # "around 15 times a year" does not exist in Can Tho
}
df_cantho.flood_experience = df_cantho.flood_experience.replace(flood_experience_scale)
df_cantho.flood_experience.value_counts()


## TODOD  bag

In [None]:
## flowvelo

## Paprotny 2021:  As for flow velocity, the respondents assessed i based on a qualitative scale, 
# providing a value from 1 to 6, with half-points possible (Thieken et al. 2005). 
# A value of 0.1 m/s was assigned to each full step of this qualitative scale. 


## flowvelocity high depends on the exepience and Einschätzung of the intrviewee
## e.g. has the interviewee ever seen turbulent water
## still the variable shows the general characteristics, if flood water is fast or not
## Extrapolation: use fv as input and also for damage-funs(due that maybe most important var in FI) 
## - flowvelo is highly related to inundation duration --> from inundation duration possible to derive flovelo 

## adapt to the flow-velocity scale used for HCMC
# survey rating (1: calm ... 6: torrential) -> velocity value; 0 is treated as missing
velocity_by_rating = {
    0 : np.nan,
    1 : 0.1,  # calm
    2 : 0.1,  # calm
    3 : 0.2,
    4 : 0.3,
    5 : 0.4,
    6 : 0.5,  # torrential
}
df_cantho["flowvelocity"] = df_cantho["flowvelocity"].replace(velocity_by_rating)

df_cantho["flowvelocity"].value_counts()


### Building variables

In [None]:
# Building value: category code 1..10 -> value in million VND.
# The list position (starting at 1) is the category code.
building_values_mvnd = [2.5, 7.5, 15, 35, 75, 150, 350, 750, 1500, 3500]
building_value_by_category = {
    float(category): value
    for category, value in enumerate(building_values_mvnd, start=1)
}
df_cantho["building_value_mVND"] = df_cantho["building_value_cat"].replace(building_value_by_category)

# the categorical source column is no longer needed
df_cantho = df_cantho.drop(columns="building_value_cat")


In [None]:
## building value, for reference: median 75 mVND, mean 75 mVND, max 3500 mVND

## fix floors:
## 0 probably stands for a bungalow with only a ground floor, which would be
## counted as 1 floor in HCMC, so every count is shifted by one
df_cantho["floors"] = df_cantho["floors"].add(1)

## keep only records with at most 3 floors (removes ~20 records).
## Records without a floor count do not satisfy the comparison and are removed too.
max_floors = 3
df_cantho = df_cantho[df_cantho["floors"].le(max_floors)]



## verify with HCMC
print("CanTHo", df_cantho.floors.describe(), "\n", "HCMC: median and mean are at 2 floors ")



In [None]:
# Frequency of each value of the building-age variable "bage".
df_cantho["bage"].value_counts()

In [None]:
# in HMCC oldest bui is around 90 to 100 years old

In [None]:
## bage
# convert from the categorical scale to the building age [years] at the time of
# the flood event

## codes without usable information become missing:
## 0 (construction year not given) and 1 (house exists, but year not mentioned)
df_cantho["bage"] = df_cantho["bage"].replace([0, 1], np.nan)

## categories --> continuous scale [construction year]
construction_year_by_category = {
    2 : 1950,
    3 : 1965,
    4 : 1975,
    5 : 1985,
    6 : 1995,
    7 : 2003,
    # 8 : 2010 -- category not present in the Can Tho responses
}
df_cantho["bage"] = df_cantho["bage"].replace(construction_year_by_category)

flood_year = 2011  # year of flood event in Can Tho

## NOTE: "bage" now holds the building age at the time of the flood event.
## The nullable integer type keeps missing years missing during the subtraction.
age_at_flood = flood_year - df_cantho["bage"].astype("Int64")
df_cantho["bage"] = age_at_flood.astype(float)

print(df_cantho["bage"].describe())

## --> most buildings were built in the 90s (and in the 80s)

## --> most bui buid in 90s (and in 80s)


### socio economic status
check if income , monthly sale etc is similar to the HCMC shophouses


In [None]:
# Household income: category code 1..8 -> value in million VND.
# The list position (starting at 1) is the category code.
income_values_mvnd = [0.25, 0.75, 1.5, 3.5, 7.5, 15, 35, 50]
income_by_category = {
    float(category): value
    for category, value in enumerate(income_values_mvnd, start=1)
}
df_cantho["hh_monthly_income_mVND"] = df_cantho["hh_monthly_income_cat"].replace(income_by_category)

# the categorical source column is no longer needed
df_cantho = df_cantho.drop(columns="hh_monthly_income_cat")

In [None]:
# Registered capital of the shop: category code 1..7 -> value in million VND.
# The list position (starting at 1) is the category code.
capital_values_mvnd = [25, 75, 150, 350, 750, 1500, 3500]
capital_by_category = {
    float(category): value
    for category, value in enumerate(capital_values_mvnd, start=1)
}
df_cantho["shp_registered_capital_mVND"] = df_cantho["shp_registered_capital_VND_cat"].replace(capital_by_category)

# the categorical source column is no longer needed
df_cantho = df_cantho.drop(columns="shp_registered_capital_VND_cat")
# df_cantho["shp_registered_capital_mVND"].describe()

## compared to HCMC
##  --> medians are similar, 1. and 3.Q: are also more or less similar when considering the categrical sturcutre for CanTho
## a bit too high values for CanTHo probably due to the categorical intervals were smallest registed captial value caputres most cases ,
# ## while for HCMC a continous scale was used in the questionary

In [None]:

# Key statistics of the socio-economic variables, for comparison with the HCMC shophouses.
df_cantho[["hh_monthly_income_mVND", "shp_employees", "shp_avgmonthly_sale_VND", "shp_registered_capital_mVND"]].describe()
# df_cantho[df_cantho.number_employees>=20]  # 5 businesses with more than 20 employees

## --> first keep also larger shops due that their absolute losses ae similar high aas for HCMC around 2500 €


In [None]:
# df_cantho[df_cantho.number_employees>=20]  # 5 businesses with more than 20 employees

## --> first keep also larger shops due that their absolute losses ae similar high aas for HCMC around 2500 €

### Business-characteristics


In [None]:
## HCMC
# hh_income	[category]
# median 300 €, mean: 425 €, max: 3320 €

# number_employees	
# median: 2,  mean: 2,  max: 10

# shp_avg_monthly_sale
# median: 290 €, mean: 370 €, max: 2760 €



## Can Tho
# hh_income	[category]
# median: ~ 140€, mean 140€-380€, max > 2000€

# number_employees	
# median: 1 mean: 1,  max: 37

# shp_avg_monthly_sale [mVND]
# median: 192 €, mean: 1670 €, max: 346500 €

## --> surveyed people in CanTHo have smaller income than in HCMC --> TODO check ownership (more employee stauts than in HCMC ? )
## --> maybe remove shop in CanTho with many employees and high sale


## Indicators

In [None]:
## reset index to avoid problems when re-merging with the indicators
# Records were filtered out earlier (floors), which leaves gaps in the index.
# drop=True discards the old index instead of keeping it as a column, so the
# frame gets a fresh 0..n-1 index for the steps that align on the index later.
df_cantho = df_cantho.reset_index(drop=True)


In [None]:
## emergency

# df_cantho.emergency_measures1 - 9

## emergency measures as indicator : range: 0:no measures - 6: applied all measures 

# All columns whose name starts with "emergency_measures" (one per surveyed measure).
df_emergency = df_cantho.filter(regex=r"^emergency_measures.?", axis=1)

## indicator: ratio between implemented (answer code 1) and potentially
## implemented emergency measures; answers other than 1 count as not implemented
n_emergency_measures = df_emergency.shape[1]
df_cantho["emergency_measures"] = df_emergency.eq(1).sum(axis=1) / n_emergency_measures

# keep only the indicator in the final df: drop the single-measure columns
# (those whose name ends with a digit)
single_measure_columns = df_emergency.filter(regex=r"^(?:.+\d)$").columns
df_cantho.drop(columns=single_measure_columns, inplace=True)
df_cantho.emergency_measures.describe()

In [None]:
## overall_problem_house

## Cantho survey question
# 1: My house was collapsed or washed away 
# 2: My house was damage partly (specify)
# 3: My house was full of water which caused damage to the floor, walls, etc.


# Select the single-answer columns overall_problem_house<k>, except the one
# ending in 4 (no problem / problem).
pattern = [r"overall_problem_house.(?<!4)$"]
pattern_cols = re.compile('|'.join(pattern))
# .copy(): the typo fix below must not write into a view of df_cantho
df_problem_house = df_cantho.filter(regex=pattern_cols, axis=1).copy()
df_problem_house.describe()

## fix typos in overall_problem_house3 (10 and 3 were entered instead of 1)
df_problem_house["overall_problem_house3"] = df_problem_house["overall_problem_house3"].replace({10:1, 3:1})

## create indicator (based on ranking scheme from HCMC)
df_cantho["overall_problem_house"] = None

# (answer column, rank), applied in this order. The rank is written with a
# boolean mask through .loc instead of chained assignment
# (df.col[positions] = value), which depended on a 0..n-1 index and is not
# guaranteed to write into the frame.
# NOTE(review): later entries overwrite earlier ones, so a record that ticked
# both "collapsed" and "full of water" ends up with rank 3, not 6 -- confirm
# that this matches the HCMC ranking scheme.
damage_ranks = (
    ("overall_problem_house1", 6),  # heavy building damage
    ("overall_problem_house2", 4),  # partly building damage
    ("overall_problem_house3", 3),  # floor and wall damage
)
for answer_column, rank in damage_ranks:
    was_reported = df_problem_house[answer_column].eq(1)
    df_cantho.loc[was_reported, "overall_problem_house"] = rank


## keep only indicator in final df
df_cantho.drop(
    df_cantho.filter(regex=r"^overall_problem_house.$").columns,
    axis=1, inplace=True
)
print(df_cantho.columns)

In [None]:
# # # df_cantho.precautionary_measures1 - 10
# # #df_cantho.precautionary_measures1.value_counts()
# # df_precautionary.describe()

# pattern = [r"^precautionary_measures.*(?<![2,3])$"] # exclude information measures
# pattern_cols = re.compile('|'.join(pattern))
# df_precautionary = df_cantho.filter(regex=pattern_cols, axis=1)
# print(df_precautionary.columns)


In [None]:
## precaution measures 
# ## --> test first as seperate features to keep as much information as possible

# Precautionary-measure columns: names ending in 2 or 3 (information measures)
# are excluded by the pattern, measure 1 is dropped explicitly. The value 40 is
# a typo and becomes missing; every answer code 1-4 is then collapsed to
# 1 (implemented), 0 stays 0 (not at all).
df_precautionary = (
    df_cantho.filter(regex=r"^precautionary_measures.*(?<![2,3])$", axis=1)
    .drop(columns="precautionary_measures1")
    .replace({40.00: np.nan})
    .replace({4: 1, 3: 1, 2: 1, 1: 1, 0: 0})
)

## divide into expensive and low-cost precautionary measures, since expensive
## measures seem to be better in flood-loss reduction
expensive_measure_columns = ["precautionary_measures9", "precautionary_measures10", "precautionary_measures7"]
df_precautionary_expensive = df_precautionary[expensive_measure_columns]
df_precautionary_low = df_precautionary.drop(columns=expensive_measure_columns)

# Indicators: ratio between implemented and potentially implemented measures.
# range: 0.0 (no measure was implemented before the flood event) to
#        1.0 (all potential measures were implemented before the reported flood)
# --> splitting into low-cost and expensive measures reduces the importance of
#     the features for the models remarkably
n_low = len(df_precautionary_low.columns)
n_expensive = len(df_precautionary_expensive.columns)
df_cantho["precautionary_measures_lowcost"] = df_precautionary_low.sum(axis=1) / n_low
df_cantho["precautionary_measures_expensive"] = df_precautionary_expensive.sum(axis=1) / n_expensive


## keep only the indicators in the final df: every column whose name ends with a
## digit is dropped (a one-character wildcard would miss precautionary_measures10)
digit_suffixed_columns = df_cantho.filter(regex=r"\d$").columns
df_cantho.drop(columns=digit_suffixed_columns, inplace=True)
print(df_cantho.columns)


In [None]:
# Key statistics of both precautionary indicators, to compare with the HCMC values.
df_cantho[["precautionary_measures_lowcost", "precautionary_measures_expensive"]].describe()

## HCMC for both precautionary indicators
# keys stats are similar between both DS (similar mean and median ) 
## --> for CanTHo it seems that at least lesss often all types of low and expensive meausres were implemented compared to HCMC, but on average more than in HCMC (compare medians between both DS for low and expensive precautions)

In [None]:
# df_resilience.resilience_flood_management.value_counts()

# has one anser with 2 , and one reply with 6 
# --> seems like outliers whiche are removed or entire variable is not used in reslience indeicator due that its only binary

In [None]:
# ## resilience as indicator:
# ## 1 : strong disagree - 5 : strong agree


## select based on findings from PCA
# for HCMC DS: df_resilience = df_cantho[["resilience_city_protection", "resilience_govern_careing","resilience_neighbor_management"]]
# for CanTHo DS
# For the Can Tho DS only resilience_city_protection is currently used.
# (Alternative that is not used: also include resilience_flood_management and
#  resilience_govern_careing, rescale the binary resilience_govern_careing with
#  {1: 6, 0: 1} and take the row mean.)
df_resilience = df_cantho["resilience_city_protection"]
print(df_resilience.describe())

# the indicator is a plain copy of that single variable
df_cantho["resilience"] = df_resilience
print(df_cantho["resilience"].value_counts())


## keep only the indicator in the final df: drop every column whose name
## contains "resilience_" (the indicator "resilience" itself has no underscore
## suffix and stays)
resilience_item_columns = [column for column in df_cantho.columns if "resilience_" in column]
df_cantho.drop(columns=resilience_item_columns, inplace=True)
print(df_cantho.columns)
## 1: disagree, 5: agree
## 1: disagree, 5: agree


In [None]:
# ## perception as indicator
# ## o	Orderer rank okay as long its  quantitative

# # Reduced /Porrer  -> 1
# # Maintained /Same  -> 2
# # Increased /Richer -> 3


# # ## based on findings from PCA
# # df_perception = df[["perception_govern_support_past", "perception_govern_support_future"]]
# # print(df_perception.columns)

# # df["perception"] = None
# # df["perception"] = df_perception.sum(axis=1) / len(df_perception.columns)

# ## keep only indicator in final df
# df.drop(
#     df.filter(like="perception_").columns, 
#     axis=1, inplace=True
# )



## Unify monetary values

In [None]:
# ## check for very small registered capital
# vars_money = df_cantho.filter(regex="_mVND", axis=1)
# try:
#     vars_money[vars_money.shp_registered_capital_mVND <=1.0]  # less than 40 euros
# except:
#     pass

###  Variables inflation corrected for 2020 (align with HCMC ds)

The survey was conducted in 2012; therefore all monetary variables are inflation-corrected for the period from 2012 to 2020 (reference year). The only exception are the damage variables, which are inflation-corrected from the year of the flood event (2011) to the price level of 2020.


**Variables inflation corrected for year when survey was conducted**

- building_value_mVND	- price level for 2012 (year when survey was done)
- shp_building_value_mVND	- price level for 2012
- shp_content_value_VND	- price level for 2012
- shp_registered_capital_mVND  - price level for 2012
- hh_monthly_income_mVND     - continous [value ranges in mVND], # price level for 2012
- shp_avgmonthly_sale_VND   - continous [value ranges in mVND], # price level for 2012 


**Variables inflation corrected for flood year**
Damage variables ('Target_eloss_VND', 'Target_gloss_VND' -- merged in Target_contentloss_VND) need to be inflation corrected based on flood time which was 2011
- 'Target_contentloss_VND'  - price levels based on flood time (2011)



In [None]:
## convert all columns given in million VND (name contains "_mVND") to VND
# The previous np.where(values != np.nan, ...) guard was always True, because
# NaN compares unequal to everything, so it was a plain multiplication.
# Multiplying the frame directly does the same (missing values stay missing)
# and keeps the row index of df_cantho.
mvnd_columns = df_cantho.filter(regex="_mVND", axis=1).columns
vars_money = df_cantho[mvnd_columns] * 1000000  # convert to VND

## rename columns: *_mVND -> *_VND
new_cols = mvnd_columns.str.replace("_mVND", "_VND")
vars_money.columns = new_cols
print(vars_money.columns)

## the million-VND columns are replaced by vars_money later on
df_cantho.drop(mvnd_columns, axis=1, inplace=True)

In [None]:
## add the remaining monetary variables: all columns of df_cantho whose name contains "VND"
remaining_vnd_columns = df_cantho.filter(regex="VND", axis=1).columns.tolist()
vars_money = pd.concat([vars_money, df_cantho[remaining_vnd_columns]], axis=1)
vars_money.columns  # should all end with _VND (no _mVND)

*Conversion of VND to euro (or US$)*

Based on JRC, p.8 and Paprotny2018, eg.p245
The reported damage values are converted to euro using the exchange rate for the reference year 2020.

*Source:* 
- www.oanda.com/currency/historical-rates


In [None]:
## GDP deflator, source: https://jp.tradingeconomics.com/vietnam/gdp-deflator
n_records = len(vars_money)

gdp_price_index_2020 = 163.58  # price level 2020 based on GDP-deflation

## exchange rate dong -> euro in 2020, ~ 3.68e-05
## (based on https://www.oanda.com/currency-converter/de/?from=VND&to=EUR&amount=1 )
exchange_rate = 1 / 27155

## one price index per record:
## flood year -- all records refer to the severe flood event in Can Tho in 2011
gdp_price_index_year_of_flood = np.full(n_records, 121.26, dtype=float)

## survey year -- 2012
gdp_price_index_year_of_survey = np.full(n_records, 140.91, dtype=float)

## when different flood-years exist
# gdp_price_index_year = data_ip2["flood_year"].astype("Int64").map(gdp_price_index_year_of_issue)  # series of cpi for each year of flood event

## when different flood-years exist
# gdp_price_index_year = data_ip2["flood_year"].astype("Int64").map(gdp_price_index_year_of_issue)  # series of cpi for each year of flood event


*Inflation correction via GDP-deflator*


\begin{align*}
uninflated_{2020} &= losses_y \cdot exchangerate_{2020} \\
corrected_{2020} &= uninflated_{2020} \cdot \frac{pindex_{2020}}{pindex_y}
\end{align*}

where:
- $losses_y$ : losses in VND for year $y$
- $uninflated_{2020}$ : uninflated losses in euro for 2020
- $corrected_{2020}$ : inflation-corrected losses in euro at the price level of 2020
- $exchangerate_{2020}$ : exchange rate for VND to euro in year 2020
- $pindex_{2020}$ : price index from GDP-deflator for 2020
- $pindex_y$ : price index from GDP-deflator in year $y$

Given that inflation is the percentage change in the overall price of an item in an economy, we can use the GDP deflator to calculate the inflation rate since its a measure of the price level.


*Further sources* \
*Paprotny 2018*: also used country-level GDP deflators for adjusting nominal to real losses in 2011 prices , p153, p244 \
*Sairam et al. 2020*


In [None]:
## GDP-deflator: convert VND to euro and express all values at the 2020 price level.
# Both steps are computed column-wise. The previous version wrote every cell
# through chained indexing (vars_money[col][row] = ...) inside Python loops.
# Order of operations and rounding are unchanged.

## direct losses are inflation-corrected with respect to the flood year:
## VND -> euro (uninflated), then scaled by price index 2020 / price index flood year
contentloss_vnd = pd.to_numeric(vars_money["Target_contentloss_VND"])
vars_money["Target_contentloss_VND_gdp"] = (
    contentloss_vnd * exchange_rate * gdp_price_index_2020 / gdp_price_index_year_of_flood
).round(1)


## all other continuous monetary variables: exchange conversion plus inflation
## correction for the period 2012 (year of survey) to 2020 (reference year)
survey_priced_columns = vars_money.drop(["Target_contentloss_VND_gdp","Target_contentloss_VND"], axis=1).columns
for c in survey_priced_columns:
    values_vnd = pd.to_numeric(vars_money[c])
    ## exchange rate: VND -> euro, rounded to one decimal as before
    uninflated = (values_vnd * exchange_rate).round(1)
    ## price index from GDP-deflator
    vars_money[c] = (uninflated * gdp_price_index_2020 / gdp_price_index_year_of_survey).round(1)




In [None]:

## rename columns: every "_VND" in a column name becomes "_euro", then make sure
## all columns are numeric
new_cols = vars_money.filter(regex="_VND", axis=1).columns.str.replace("_VND", "_euro")
vars_money = vars_money.set_axis(new_cols, axis=1).apply(pd.to_numeric)


In [None]:
# Replace the monetary columns of df_cantho by the unified, inflation-corrected
# ones: drop the old *_mVND / *_VND columns, append vars_money, remove every
# column named "Target_contentloss_euro" and give that name to the
# GDP-deflated target instead.
outdated_money_columns = df_cantho.filter(regex=r"_mVND|_VND", axis=1).columns
df_cantho = (
    pd.concat([df_cantho.drop(columns=outdated_money_columns), vars_money], axis=1)
    .drop(columns="Target_contentloss_euro")
    .rename(columns={"Target_contentloss_euro_gdp" : "Target_contentloss_euro"})
)

df_cantho.filter(regex=r"euro|VND", axis=1).columns  # should be only euros


In [None]:
# Key statistics of the socio-economic variables after conversion to euro.
df_cantho[["hh_monthly_income_euro", "shp_employees", "shp_avgmonthly_sale_euro", "shp_registered_capital_euro"]].describe()


## Targets
### absolute loss and Target business reduction 

In [None]:
# key statistics of both targets in Can Tho ...
cantho_contentloss_stats = df_cantho["Target_contentloss_euro"].describe()  # absolute content loss
cantho_bred_stats = df_cantho["Target_businessreduction"].describe()        # business reduction
# ... and of the business reduction in HCMC
hcmc_bred_stats = df_hcmc_bred["Target_businessreduction"].describe()


## verify with HCMC
print("*CanTHo*\n", cantho_contentloss_stats, "\n\n", "*HCMC*\n: median: 0€ , mean:131€, 3.Q:78€, max:2600 €")
## --> absolute losses are very similar in both DS
## --> very similar in all key statistics (min-max, median); interesting that the
##     fraction of zero-loss records is similarly high as in HCMC

## verify with HCMC
print("*CanTHo*\n", cantho_bred_stats, "\n\n", "*HCMC*\n", hcmc_bred_stats)
## --> business reduction differs quite a lot: on average < 10% reduction in HCMC
##     and 40% in Can Tho
## --> it might not be good to enrich the bred DS with Can Tho
## --> business reduction differ quite much: between avg < 10% in reduction in HCMC and 40% in Can THo 
## --> might not good to enrich bred ds with CanTHo

### content value
verify cv calculation for HCMC

In [None]:
# Key statistics of all numeric columns of the Can Tho frame.
df_cantho.describe()

### Target relative closs

In [None]:
## relative content loss = absolute content loss / content value reported in the survey
df_cantho["Target_relative_contentloss_euro"] = df_cantho["Target_contentloss_euro"] / df_cantho["shp_content_value_euro"]
df_cantho = df_cantho.apply(pd.to_numeric)

## set all records without a computable ratio to 0 (e.g. zero loss / zero value).
# Written through .loc with a boolean mask; the chained form
# df.col[mask] = 0.0 is not guaranteed to modify the frame.
# NOTE(review): this also sets records to 0 whose loss or content value is
# missing -- confirm that those should count as zero-loss cases.
ratio_missing = df_cantho["Target_relative_contentloss_euro"].isna()
df_cantho.loc[ratio_missing, "Target_relative_contentloss_euro"] = 0.0


In [None]:
## relative loss >= 1: the reported loss reaches or exceeds the total content value
# One mask is used for both the report and the filter. Before, the count used
# "> 1.0" while the filter used ">= 1.0", so the printed number could be
# smaller than the number of dropped records.
loss_reaches_content_value = df_cantho["Target_relative_contentloss_euro"] >= 1.0
print("Records with relative content loss >= total content value :", loss_reaches_content_value.sum())

## drop these records
df_cantho = df_cantho.loc[~loss_reaches_content_value, :]


# NOTE(review): "Target_relative_contentloss_euro_self" and
# "shp_content_value_euro_estimated" are not created in the visible part of
# this notebook; this selection fails with a KeyError unless they are added
# elsewhere -- confirm where they come from.
df_cantho[["Target_contentloss_euro", "Target_relative_contentloss_euro", "Target_relative_contentloss_euro_self", 
           "shp_content_value_euro", "shp_content_value_euro_estimated", "Target_businessreduction"]].describe()


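**cv self** (the estimated content value) is similar to the content value reported in the questionnaire. Therefore the loss ratio derived from the questionnaire value is removed and the **target version derived from the estimated content value** is kept, so that both datasets (HCMC, Can Tho) share a common approach to content-value estimation.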

In [None]:
# Drop the loss ratio based on the reported content value, then rename:
#   *_self                           -> the target name
#   shp_content_value_euro           -> *_obs (reported value)
#   shp_content_value_euro_estimated -> shp_content_value_euro (estimated value)
final_names = {
    "Target_relative_contentloss_euro_self" : "Target_relative_contentloss_euro",
    "shp_content_value_euro": "shp_content_value_euro_obs",  # obs
    "shp_content_value_euro_estimated" : "shp_content_value_euro", # pred
}
df_cantho = df_cantho.drop(columns=["Target_relative_contentloss_euro"]).rename(columns=final_names)


In [None]:
# ## remove shp sector # due that no expalnation which category revers to which sector
# df_cantho.shp_sector.value_counts()


## Explore variable's key statistics - are they similar to HCMC ?


In [None]:
# List the columns that are available at this point.
df_cantho.columns

### missing data

In [None]:
## report the 15 features with the highest share of missing values [%]
## (features with more than 10% missing values are candidates for removal;
## nothing is dropped in this cell)
# Fix: the "2" was passed to print() as an extra argument instead of to round()
# as the number of decimals.
missing_share_percent = df_cantho.isna().mean().sort_values(ascending=False)[:15] * 100
print("Percentage of missing values per feature [%]\n", missing_share_percent.round(2))


## Microbusinesses

In [None]:
# Microbusinesses are kept: shops with fewer than 10 employees.
# The message now states the threshold that is actually counted (>= 10); it said "> 10".
# Records without an employee count do not satisfy "< 10" and are removed as well.
employees = df_cantho["shp_employees"]
print("Businesses which are not microbusinesses (>= 10 employees)", int((employees >= 10).sum()))
# .copy(): later cells add columns to this frame, which should be an
# independent frame rather than a slice of the unfiltered one
df_cantho = df_cantho.loc[employees < 10, :].copy()
df_cantho

## correlation between value of commercial building part and business content value
Check whether the content value can be used as a proxy for the building value (approach of the thesis).

In [None]:
# Value of the commercial building part: total building value divided by the
# number of floors.
df_cantho["bv_commercial"] = df_cantho["building_value_euro"].div(df_cantho["floors"])

# Pearson correlation between that value and the reported content value
commercial_vs_content = df_cantho[["bv_commercial", "shp_content_value_euro_obs"]]
commercial_vs_content.corr(method="pearson")

# import seaborn as sns
# sns.lmplot(x="building_value_euro", y="shp_content_value_euro_obs", data=df_cantho, ci=None)
# plt.xlim(0,5000)
# plt.ylim(0,5000)


## correlation between building area and business content value


In [None]:
# Pearson correlation between building area and reported business content value.
df_cantho[["b_area", "shp_content_value_euro_obs"]].corr(method="pearson")


## single dataset for bred and rloss

In [None]:
# df_cantho.rename(columns={
#     "shp_avgmonthly_sale_euro": "shp_avgmonthly_sale_euro",
# }, inplace=True)
# (rows, columns) of the cleaned frame before it is split per target.
df_cantho.shape

In [None]:
## bring the relative content loss to the same unit as Bred [%]
# NOTE: re-running this cell multiplies by 100 again.
df_cantho["Target_relative_contentloss_euro"] = df_cantho["Target_relative_contentloss_euro"].mul(100)


# Dataset for the relative-loss target: everything except the other target.
# Target_contentloss_euro is kept to calculate absolute losses,
# shp_content_value_euro is kept to estimate absolute loss after the BN.
df_cantho_rloss = df_cantho.drop(columns=["Target_businessreduction"])
print(df_cantho_rloss.shape)
df_cantho_rloss.columns


In [None]:
# Dataset for the business-reduction target: everything except both
# content-loss targets.
content_loss_targets = ["Target_contentloss_euro", "Target_relative_contentloss_euro"]
df_cantho_bred = df_cantho.drop(columns=content_loss_targets)

print(df_cantho_bred.shape)
df_cantho_bred.columns

In [None]:
# Move each dataset's target to the first column.
for frame, target in (
    (df_cantho_rloss, "Target_relative_contentloss_euro"),
    (df_cantho_bred, "Target_businessreduction"),
):
    frame.insert(0, target, frame.pop(target))

df_cantho_rloss.describe()


### Verify estimation of CV by error rate of estimated CV and obs. Cv in Can THo

In [None]:
# Overlayed histograms of reported vs. estimated content values (same bins for both).
bins = np.linspace(0, 40000, 100)

plt.hist((df_cantho.shp_content_value_euro_obs), bins, alpha=0.5, label='reported content values' )
plt.hist((df_cantho.shp_content_value_euro), bins, alpha=0.5, label='estimated content values')
plt.legend(loc='upper right')
plt.title("Verification of the approach for deriving content values (Can Tho)")
#plt.ylim(0, 80)

# Save before showing: plt.show() hands the figure over for display, and a
# savefig() issued afterwards writes an empty canvas.
plt.savefig( OUTPATH_FIGURES / "histo_estimation_contentvalues.png", dpi=300, bbox_inches="tight")  #format='jpg'
plt.show()



In [None]:
## residuals of the uncorrected content-value estimate against "shp_content_value_euro"
cv_reference = df_cantho["shp_content_value_euro"]
cv_uncorrected = df_cantho["shp_content_value_euro_estimated_nocorrection"]
cv_residual_notcorr = cv_uncorrected - df_cantho_rloss["shp_content_value_euro"]

df_cv_residuals = pd.DataFrame(
    {
        "cv_true": cv_reference,
        "cv_pred": cv_uncorrected,
        "cv_residual_notcorr": cv_residual_notcorr,
    }
)
print(df_cv_residuals.describe())


## distribution of the residuals (other options tried: inner="quart", gap=.1, inner="stick")
sns.violinplot(data=df_cv_residuals, x="cv_residual_notcorr", split=True)
# plt.xlim(-10000, 10000)

In [None]:
# list the columns currently contained in the relative-content-loss dataset
df_cantho_rloss.columns

In [None]:
## residuals of the estimated content value against the reported (observed) one
cv_reported = df_cantho_rloss["shp_content_value_euro_obs"]
cv_estimated = df_cantho_rloss["shp_content_value_euro"]

df_cv_residuals = pd.DataFrame(
    {
        "cv_true": cv_reported,
        "cv_pred": cv_estimated,
        "cv_residual": cv_estimated - cv_reported,
    }
)
print(df_cv_residuals.describe())

## distribution of the residuals, x-axis limited to +/- 10000
## (other options tried: inner="quart", gap=.1, inner="stick", log-scaled x-axis via plt.semilogx)
sns.violinplot(data=df_cv_residuals, x="cv_residual", split=True)
plt.xlabel("Residual between estimated and reported content value [in €]")
plt.xlim(-10000, 10000)

# around 50% of records have an underestimation of 300€ or overestimation of less than 500€, median is ~60€



In [None]:
## size of the subset whose residual is smaller than 2000
residual_below_2000 = df_cv_residuals["cv_residual"] < 2000
print(df_cv_residuals.loc[residual_below_2000].shape)

df_cv_residuals

## Major cleaning
Drop variables which are not part of the HCMC dataset

In [None]:
## keep only the HCMC columns, in HCMC order; "geometry" is left out because
## cantho has no GPS-information, "shp_business_limitation" is left out as well
hcmc_only_cols = ["geometry", "shp_business_limitation"]

rloss_cols = df_hcmc_rloss.drop(columns=hcmc_only_cols).columns
df_cantho_rloss = df_cantho_rloss[rloss_cols]

bred_cols = df_hcmc_bred.drop(columns=hcmc_only_cols).columns
df_cantho_bred = df_cantho_bred[bred_cols]

print(df_cantho_rloss.columns, "\n", df_cantho_bred.columns)

In [None]:
# summary statistics of the cleaned relative-content-loss dataset
# NOTE(review): the trailing "313" is presumably the number of records - confirm
df_cantho_rloss.describe() # 313

## Merge with HCMC dataset

In [None]:
# common_cols_rloss = list(set.intersection(*(set(df.columns) for df in [df_hcmc_rloss, df_cantho_rloss])))
# common_cols_bred = list(set.intersection(*(set(df.columns) for df in [df_hcmc_bred, df_cantho_bred])))

# df_rloss_joined = pd.concat([df[common_cols_rloss] for df in [df_hcmc_rloss, df_cantho_rloss]], ignore_index=True)
# df_bred_joined = pd.concat([df[common_cols_bred] for df in [df_hcmc_bred, df_cantho_bred]], ignore_index=True)


# print(df_rloss_joined.shape)
# df_rloss_joined.head(3)   # not containing HCMCs shp_sector, 'resilience', 'resilienceLeftAlone',

# print(df_bred_joined.shape)
# df_bred_joined.head(3)   # not containing HCMCs shp_sector, 'resilience', 'resilienceLeftAlone',

In [None]:
# df_hcmc_rloss.columns

In [None]:
# df_cantho.Target_contentloss_euro.describe()

# ## Test if also removing high abs losses imporves Logistic Reg (HCMC: rm 4 highest abs losses)
# print("Absolute loss higher than 500€: ", len(df_cantho.Target_contentloss_euro>500.0))
# df_cantho = df_cantho.loc[ ~(df_cantho.Target_contentloss_euro>500.0), :]
# df_cantho.Target_contentloss_euro.describe()


In [None]:
# summary statistics of the cleaned business-reduction dataset
df_cantho_bred.describe()

### Plot distribution cantho and hcmc

In [None]:
## overlay the relative content loss of both cities, 100 bin edges between 0 and 10
bins = np.linspace(0, 10, 100)

rloss_per_city = (
    (df_cantho.Target_relative_contentloss_euro, 'cantho rloss'),
    (df_hcmc_rloss.Target_relative_contentloss_euro, 'hcmc rloss'),
)
for rloss_values, city_label in rloss_per_city:
    plt.hist(rloss_values, bins, alpha=0.5, label=city_label)
plt.legend(loc='upper right')
plt.show()

## --> with removing 4 highest abs loss in HCMC and similar much in CanTho


## Write to disk

In [None]:
# Hand-calculated shares for the two cities.
# NOTE(review): it is not stated here what the numerators count - confirm and label them.
# 159/313
# 50% cantho

191/320
# 60% hcmc

In [None]:

# summary statistics of the HCMC relative-content-loss dataset
# (disabled alternatives: show the Can Tho frame or the value counts of the targets)
# df_cantho_rloss
df_hcmc_rloss.describe()
# df_hcmc_rloss.Target_relative_contentloss_euro.value_counts()
# df_cantho_bred.Target_businessreduction.value_counts()

In [None]:
# only for CanTHo: write both datasets as Excel files, without the index column
cantho_exports = (
    (df_cantho_rloss, f"../{s.INPATH_DATA}/input_data_contentloss_cantho.xlsx"),
    (df_cantho_bred, f"../{s.INPATH_DATA}/input_data_businessreduction_cantho.xlsx"),
)
for df_out, xlsx_path in cantho_exports:
    df_out.to_excel(xlsx_path, index=False)

In [None]:
# df_rloss_joined.insert(0, "Target_relative_contentloss_euro", df_rloss_joined.pop("Target_relative_contentloss_euro"))
# df_bred_joined.insert(0, "Target_businessreduction", df_bred_joined.pop("Target_businessreduction"))

# df_rloss_joined.describe()


In [None]:
# ## save to disk

# # combined for HCMC and CanTHo
# df_rloss_joined.to_excel("../input_survey_data/input_data_contentloss_tueb_cantho.xlsx", index=False)
# df_bred_joined.to_excel("../input_survey_data/input_data_businessreduction_tueb_cantho.xlsx", index=False)

In [None]:
# print("Relative content loss dataset")
# print("Number of candidate predictors ", df_rloss_joined.shape[1])
# print("Number of cases ", df_rloss_joined.shape[0])

# print("\nBusiness reduction dataset")
# print("Number of candidate predictors ", df_bred_joined.shape[1])
# print("Number of cases ", df_bred_joined.shape[0])

## Plot distributions HCMC ~ Can Tho

**plot targets**

In [None]:
## Histogram of the relative content loss (zero-loss records excluded), HCMC vs. Can Tho
target = "Target_relative_contentloss_euro"
target_plot = "Target_relative_contentloss_euro_categ"


## HCMC variables which are not used for the comparison
df_hcmc_p = df_hcmc_bothtargets.drop([
  "geometry",
  "Target_contentloss_euro", "shp_content_value_euro",
  "shp_registered_capital_euro", "shp_sector",
  "overall_problem_house", "shp_business_limitation"
  ], axis=1)
## explicit copy: a column is added below, which must not be done on a selection of df_cantho
df_cantho_p = df_cantho[df_hcmc_p.columns].copy()


## add col for hue plotting
df_hcmc_p["cityname"] = "HCMC"
df_cantho_p["cityname"] = "Can Tho"

## merge df for plotting
df_merged_p = pd.concat([df_hcmc_p, df_cantho_p], axis=0).reset_index(drop=True)

if target == "Target_relative_contentloss_euro":
    ## keep only records with a loss; copied so that the class column can be added safely
    df_merged_p = df_merged_p.loc[df_merged_p.Target_relative_contentloss_euro > 0.0, :].copy()

## group the target into classes of width 5: floor division by 5, times 5, gives the lower class edge
df_merged_p[target_plot] = [value // 5 * 5 for value in df_merged_p[target]]
# print(df_merged_p[target_plot].value_counts())


## style settings
sns.set_style("whitegrid", {"axes.grid" : False})
fig, ax = plt.subplots(figsize=(5, 5))

hue_colors = ("teal", "firebrick")
alpha = 0.2
color_dict = {
    "HCMC": to_rgba(hue_colors[0], alpha),  # set transparency for each class independently
    "Can Tho": to_rgba(hue_colors[1], alpha),
}

## draw explicitly on the axes created above, the hatching below relies on ax.containers
p = sns.histplot(
    df_merged_p,
    x=target_plot,
    hue="cityname", stat="count",
    multiple="dodge",
    bins=range(0, 101, 5),
    palette=color_dict,
    ax=ax,
)

## hatch the first bar group so that both cities stay distinguishable for color-blind people
## NOTE(review): the hatch goes to ax.containers[0]; confirm that this is the intended city
for hues, hatch in zip(ax.containers, ["//", ""]):
    for hue in hues:
        hue.set_hatch(hatch)

# degree rcloss:
plt.xlabel("Relative content loss without zero-loss", fontsize=12)
plt.ylabel("Number of cases", fontsize=12)
plt.legend(loc="upper right", labels=["Can Tho", "HCMC"])
fig.savefig(OUTPATH_FIGURES / "histo_degree_hcmc_cantho.png", dpi=300, bbox_inches="tight")


In [None]:
## Histogram of the relative business reduction, HCMC vs. Can Tho
target = "Target_businessreduction"
target_plot = "Target_businessreduction_categ"


## HCMC variables which are not used for the comparison
df_hcmc_p = df_hcmc_bothtargets.drop([
  "geometry",
  "Target_contentloss_euro", "shp_content_value_euro",
  "shp_registered_capital_euro", "shp_sector",
  "overall_problem_house", "shp_business_limitation"
  ], axis=1)
## explicit copy: a column is added below, which must not be done on a selection of df_cantho
df_cantho_p = df_cantho[df_hcmc_p.columns].copy()


## add col for hue plotting
df_hcmc_p["cityname"] = "HCMC"
df_cantho_p["cityname"] = "Can Tho"

## merge df for plotting
df_merged_p = pd.concat([df_hcmc_p, df_cantho_p], axis=0).reset_index(drop=True)

if target == "Target_relative_contentloss_euro":
    ## only relevant when the content-loss target is selected: drop zero-loss records
    df_merged_p = df_merged_p.loc[df_merged_p.Target_relative_contentloss_euro > 0.0, :].copy()

## group the target into classes of width 5: floor division by 5, times 5, gives the lower class edge
df_merged_p[target_plot] = [value // 5 * 5 for value in df_merged_p[target]]
# print(df_merged_p[target_plot].value_counts())


## style settings
sns.set_style("whitegrid", {"axes.grid" : False})
fig, ax = plt.subplots(figsize=(5, 5))

hue_colors = ("teal", "firebrick")
alpha = 0.2
color_dict = {
    "HCMC": to_rgba(hue_colors[0], alpha),  # set transparency for each class independently
    "Can Tho": to_rgba(hue_colors[1], alpha),
}

## draw explicitly on the axes created above, the hatching below relies on ax.containers
p = sns.histplot(
    df_merged_p,
    x=target_plot,
    hue="cityname", stat="count",
    multiple="dodge",
    bins=range(0, 101, 5),
    palette=color_dict,
    ax=ax,
)

## hatch the first bar group so that both cities stay distinguishable for color-blind people
## NOTE(review): the hatch goes to ax.containers[0]; confirm that this is the intended city
for hues, hatch in zip(ax.containers, ["//", ""]):
    for hue in hues:
        hue.set_hatch(hatch)

# rbred:
plt.xlabel("Relative interruption loss", fontsize=12)
plt.ylabel("Number of cases", fontsize=12)
plt.legend(loc="upper right", labels=["Can Tho", "HCMC"])
fig.savefig(OUTPATH_FIGURES / "histo_rbred_hcmc_cantho.png", dpi=300, bbox_inches="tight")



In [None]:
## Two-class histogram (zero-loss vs. loss) of the content loss, HCMC vs. Can Tho
target = "Target_relative_contentloss_euro"


## HCMC variables which are not used for the comparison
df_hcmc_p = df_hcmc_bothtargets.drop([
  "geometry",
  "Target_contentloss_euro", "shp_content_value_euro",
  "shp_registered_capital_euro", "shp_sector",
  "overall_problem_house", "shp_business_limitation"
  ], axis=1)
## explicit copy: values are overwritten below, which must not be done on a selection of df_cantho
df_cantho_p = df_cantho[df_hcmc_p.columns].copy()


## binarize the target column only: every record with a loss > 0 is set to 1.0,
## all other values and all other columns are left untouched
df_hcmc_p.loc[df_hcmc_p[target] > 0.0, target] = 1.0
df_cantho_p.loc[df_cantho_p[target] > 0.0, target] = 1.0

# print(df_hcmc_p[target].describe())
# print(df_cantho_p[target].describe())


## style settings
sns.set_style("whitegrid", {"axes.grid" : False})
fig, ax = plt.subplots(figsize=(5, 5))

## one histogram with 2 bins per city, drawn side by side
ax.hist(
    (df_hcmc_p[target], df_cantho_p[target]), 2,
    rwidth=.2, label=("HCMC", "Can Tho"),
    alpha=0.8, color=("teal", "firebrick"))
## hatch the second dataset (Can Tho), to distinguish bars for color-blind people
for hues, hatch in zip(ax.containers, ["", "//"]):
    for hue in hues:
        hue.set_hatch(hatch)

## label only the two tick positions next to the bar groups; the leading blanks
## shift the text to the right of its tick
## NOTE(review): relies on the automatically chosen tick positions - confirm the rendered labels
labels = [''] * 9
labels[1] = "                 zero-loss"
labels[6] = "                 loss"
ax.set_xticklabels(labels)

# plt.xlabel("Relative loss due to business interruption (rbred)")
plt.xlabel("Chance of content loss", fontsize=12)
plt.ylabel("Number of cases", fontsize=12)
plt.legend(loc='upper right', labels=["HCMC", "Can Tho"])

fig.savefig(OUTPATH_FIGURES / "histo_chance_hcmc_cantho.png", dpi=300, bbox_inches="tight")



**plot predictors**

In [None]:
## Grid of histograms, one panel per column of the merged frame, HCMC vs. Can Tho
import seaborn as sns
from matplotlib.colors import to_rgba

## prepare HCMC and cantho datasets for plotting, drop unneeded vars incl targets
df_hcmc_p = df_hcmc_bothtargets.drop([
  "Target_relative_contentloss_euro", "Target_businessreduction",
  "Target_contentloss_euro",
    "geometry",
    "shp_content_value_euro",
    "shp_registered_capital_euro", "shp_sector",
    "overall_problem_house", "shp_business_limitation"
  ], axis=1)
## explicit copy: a column is added below, which must not be done on a selection of df_cantho
df_cantho_p = df_cantho[df_hcmc_p.columns].copy()


## add col for hue plotting
df_hcmc_p["cityname"] = "HCMC"
df_cantho_p["cityname"] = "Can Tho"

## merge df for plotting
df_merged_p = pd.concat([df_hcmc_p, df_cantho_p], axis=0).reset_index(drop=True)

## nice feature labels
df_merged_p = df_merged_p.rename(columns=s.feature_names_plot)


## style settings
sns.set_style("whitegrid", {"axes.grid" : False})

hue_colors = ("teal", "firebrick")
alpha = 0.3
color_dict = {
    "HCMC": to_rgba(hue_colors[0], alpha),  # set transparency for each class independently
    "Can Tho": to_rgba(hue_colors[1], alpha),
}
numcols = 4
bins = 30 #100 # np.linspace(0, 10, 100)

## the layout is handled by one tight_layout() call after plotting, therefore no
## constrained_layout here (the two layout mechanisms exclude each other)
fig, axes = plt.subplots(1 + len(df_merged_p.columns) // numcols, numcols, figsize=(15, 15))

## shorten value ranges of few vars for better plotting: values above the limit become NaN.
## Series.mask is used instead of chained indexing (df[col][mask] = ...), which does not
## write back to the frame when pandas copy-on-write is active.
upper_limits = {
    "inundation duration": 120,  # 5 cases only in hcmc
    "building area": 1000,  # 3 cases only in cantho
    "mthly. sales": 5000,  # 3 cases only in cantho
}
for capped_col, upper_limit in upper_limits.items():
    df_merged_p[capped_col] = df_merged_p[capped_col].mask(df_merged_p[capped_col] > upper_limit)


## plot histos
for col, ax in zip(df_merged_p.columns, axes.flat):
    sns.histplot(
        df_merged_p,
        x=col, stat="count",
        hue="cityname", hue_order=["HCMC", "Can Tho"],
        bins=bins,
        # edgecolor="black",
        palette=color_dict,
        legend=False,
        # binwidth=.5,
        ax=ax).set_ylabel("")

    ## hatch the first bar group so that both cities stay distinguishable for color-blind people
    ## NOTE(review): the hatch goes to ax.containers[0]; confirm that this is the intended city
    for hues, hatch in zip(ax.containers, ["//", ""]):
        for hue in hues:
            hue.set_hatch(hatch)

## arrange the panels once, after all of them are drawn
fig.tight_layout()

# plt.legend(loc="upper right")  # FIXME legend

fig.savefig(OUTPATH_FIGURES / "histo_predictors_hcmc_cantho.png", dpi=300, bbox_inches="tight")
# sns.move_legend(ax, bbox_to_anchor=(1, 0.5), loc='center left', frameon=False)

## Left overs

In [None]:
# ## Target relative closs

# # #t[["abs_closs_VND", "shp_content_value_VND"]] = t[["abs_closs_VND", "shp_content_value_VND"]].fillna(0, inplace=True)
# # df_cantho["Target_rcloss"] = df_cantho["abs_closs_VND"] / df_cantho["shp_content_value_VND"]
# # # set all zero-loss cases to 0
# # df_cantho = df_cantho.apply(pd.to_numeric)
# # df_cantho.Target_rcloss[df_cantho.Target_rcloss.isna()] = 0.0

# ## rloss > cv
# print("Records with relative content loss exceding the content values for businesses:", sum(df_cantho.Target_rcloss > 1.0) )
# # t.Target_rcloss[t.Target_rcloss > 1.0]  = 1.0
# # print(all_input_contentloss[all_input_contentloss.Target_relative_contentloss_euro > 0.99 ])
# df_cantho[["abs_closs_VND", "shp_content_value_VND", "Target_rcloss", "Target_bred"]].describe()

# #df_cantho["Target_rcloss"] = df_cantho["abs_closs_VND"] / df_cantho["shp_content_value_VND"]
# df_cantho["Target_rcloss"] = df_cantho["abs_closs_VND"] / df_cantho["shp_content_value_VND_self"]

# # set all zero-loss cases to 0
# df_cantho = df_cantho.apply(pd.to_numeric)
# df_cantho.Target_rcloss[df_cantho.Target_rcloss.isna()] = 0.0

# ## rloss > cv
# print("Records with relative content loss > total content value :", sum(df_cantho.Target_rcloss > 1.0) )

# ## drop these records where rloss > cv
# df_cantho = df_cantho.loc[~(df_cantho.Target_rcloss >= 1.0), :]

# df_cantho[["abs_closs_VND", "shp_content_value_VND_self", "Target_rcloss", "Target_bred"]].describe()


# # Overestimation of CV for small businesses 
# # --> businesses with overestimated Cv is charcterized by low number of employees

# shps_with_overeestimated_cv = df_cantho.loc[df_cantho["shp_content_value_VND"]  <= 2000000.00, :]  # using 25% of busineeses with smallest CV [1.Qunatile]
# shps_with_overeestimated_cv.number_employees.value_counts()  
# ## --> most of the shops with small cv are indeed very small businesses

# # ## drop these records where rloss > cv
# # df_cantho = df_cantho.loc[~(df_cantho.Target_rcloss >= 0.5), :]

# # df_cantho[["abs_closs_VND", "shp_content_value_VND_self", "Target_rcloss", "Target_bred"]].describe()

# # df_cantho.abs_closs_VND.describe()  # max abs loss is 2300 €

# # ## explore cases where rloss > cv
# # tt = t.loc[t.Target_rcloss > 1.0, :]
# # tt.sort_values("abs_closs_VND", ascending=False)


# ## get rloss to similar ratio as for HCMC (rloss=0.3)
# df_cantho =  df_cantho.loc[~(df_cantho.Target_rcloss >= 1.0), :] 
# #df_cantho =  df_cantho.loc[~(df_cantho.Target_rcloss >= .5), :]   # removed ~ 15 records with higher loss ratio than 50%
# df_cantho[["abs_closs_VND", "shp_content_value_VND", "Target_rcloss", "Target_bred"]].describe()

# ## Can Tho
# # Abs Closs: median: 1 €, 3.Quantile: 50 €, mean 74 € , max:  2310 €
# # rloss : median: 0.0 , mean: 0.05,  max: 0.48
# # CV: mean 3340 €, median: 310 € , max: 392.610 € (no inflation corrected)
# ## Bred : mean 40%, median 40%