### Aim: explorative data analysis

In [217]:
import sys, os
import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


pd.set_option('display.max_columns', None)


In [218]:
# load raw survey data

raw_data = pd.read_excel("../input_survey_data/Select_data.xlsx")

raw_data.head(5);

In [219]:
## filter for most recent and serous flood hazards (2018/2019)

raw_data.tail(5)
raw_data.filter(regex=r'^P1Q3', axis=1)  # P1 = first section, second block of questions, 


Unnamed: 0,P1Q3.2.1,P1Q3.3.1,P1Q3.4.1,P1Q3.5.1,P1Q3.6.1,P1Q3.7.1,P1Q3.88.1,P1Q3.88.1.specify,P1Q3.8.1,P1Q3.11.1
0,3,2,3,1,1,98,,,2,100
1,4,4,1,4,1,98,,,100000000,100
2,1,1,1,1,1,98,,,2,99
3,1,2,1,1,1,98,,,100000,500
4,2,1,1,1,1,98,,,2,99
...,...,...,...,...,...,...,...,...,...,...
247,2,2,1,1,1,98,,,5000000,100
248,1,1,1,1,1,98,,,2,99
249,2,2,1,1,1,98,,,2,99
250,3,3,1,1,1,98,,,120000000,600


In [228]:
## small test set with preselection

df_p125 = raw_data.filter(regex=r'P1Q3.11.1|^P1|^P5', axis=1)   # P1 = first section, second block of questions, 

df_p125.insert(0, "Target_b", df_p125.pop("P1Q3.11.1"))  # target var for cost of building damages [VND]
df_p125.insert(1, "Target_c", df_p125.pop("P1Q5.6.1"))  # target var for cost of buiness content damages [VND]
df_p125.insert(2, "business_reduction", df_p125.pop("P1Q5.9.1"))  # explanatory var for monthly reduction of business [%] -> probl influences mainly economic damage costs


df_p125.tail(2)



Unnamed: 0,Target_b,Target_c,business_reduction,P1Q1,P1Q1.specify,P1Q2.1.1,P1Q2.2.1,P1Q2.3.1,P1Q2.4.1,P1Q2.5.1.0,P1Q2.5.1.1,P1Q2.5.1.2,P1Q2.5.1.3,P1Q2.5.1.4,P1Q2.5.1.88,P1Q2.5.1.99,P1Q2.5.1.specify,P1Q2.6.1,P1Q2.7.1.1,P1Q2.7.1.2,P1Q2.7.1.3,P1Q2.7.1.88,P1Q2.7.1.99,P1Q2.7.1.specify,P1Q2.8.1.1,P1Q2.8.1.2,P1Q2.8.1.3,P1Q2.8.1.4,P1Q2.8.1.5,P1Q2.8.1.6,P1Q2.8.1.7,P1Q2.8.1.8,P1Q2.8.1.9,P1Q2.8.1.10,P1Q2.8.1.99,P1Q2.8.1.specify,P1Q2.9.1,P1Q2.10.1.1,P1Q2.10.1.2,P1Q2.10.1.3,P1Q2.10.1.4,P1Q2.10.1.5,P1Q2.10.1.6,P1Q2.10.1.7,P1Q2.10.1.8,P1Q2.10.1.9,P1Q2.10.1.88,P1Q2.10.1.99,P1Q2.10.1.specify,P1Q2.11.1.1,P1Q2.11.1.2,P1Q2.11.1.3,P1Q2.11.1.4,P1Q2.11.1.5,P1Q2.11.1.6,P1Q2.11.1.7,P1Q2.11.1.8,P1Q2.11.1.9,P1Q2.11.1.88,P1Q2.11.1.99,P1Q2.11.1.specify,P1Q3.2.1,P1Q3.3.1,P1Q3.4.1,P1Q3.5.1,P1Q3.6.1,P1Q3.7.1,P1Q3.88.1,P1Q3.88.1.specify,P1Q3.8.1,P1Q5.2.1,P1Q5.3.1,P1Q5.4.1,P1Q5.5.1,P1Q5.88.1,P1Q5.88.1.specify,P1Q5.7.1,P1Q5.8.1,P1Q6.4.1.1,P1Q7.2.1.1,P1Q7.2.1.2,P1Q7.2.1.3,P1Q7.2.1.4,P1Q7.2.1.5,P1Q7.2.1.88,P1Q7.2.1.98,P1Q7.2.1.99,P1Q7.2.1.specify,P5Q1.1,P5Q1.1.Specify,P5Q1.2,P5Q1.3,P5Q1.3.Specify,P5Q1.4,P5Q1.5,P5Q1.6.1,P5Q1.6.2,P5Q1.6.3,P5Q1.6.4,P5Q1.6.5,P5Q1.6.88,P5Q1.6.Specify,P5Q1.7,P5Q1.9.1,P5Q1.9.2,P5Q1.9.3,P5Q1.9.4,P5Q1.9.5,P5Q1.9.6,P5Q1.10,P5Q1.11,P5Q1.12
250,600,2,0,4,,??t ng?p do tri?u c??ng dâng cao 2019,10/99/2019,3,5,0,1,0,0,1,0,0,,2,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,1,1,1,1,0,0,1,0,0,0,0,,0,1,0,1,0,1,0,0,0,0,0,,3,3,1,1,1,98,,,120000000,1,1,1,4,,,0,0,,0,0,0,0,0,0,1,0,,3,,2016,17,Bán t?p hóa,2,2,1,0,0,0,0,0,,100,1,1,0,0,0,0,2,1,1
251,100,0,0,4,,2018,09/99/2018,1,20,0,1,0,0,0,1,0,Bùn sình,3,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,0,0,0,0,0,0,0,0,1,0,0,,0,1,1,0,0,0,0,0,0,0,0,,1,2,1,3,1,98,,,60000000,1,1,1,1,,,0,0,3.0,0,0,0,0,0,0,1,0,,2,,2019,17,"T?p hóa, cafe",1,1,1,0,0,0,0,0,,7,1,0,0,0,0,0,3,1,1


In [229]:
## rename certain important columns

df_p125.rename(columns={#'P1Q3.11.1':'TARGET', # = total_costs_damage
                     'P1Q2.3.1':'inundation_duration_h',
                     'P1Q2.4.1':'water_depth_cm',
                     'P1Q2.5.1':'contaminations',
                     'P1Q2.9.2':'warning_time_h',
                     'P1Q2.10.1.*':'emergency_measures', # bis .9, .88 ,.99 , .specify
                     'P1Q2.11.1.*':'overall_problem_house',
                     'P1Q3.2.1':'damage_level_floor',
                     'P1Q3.3.1':'damage_level_walls',
                     'P1Q3.4.1':'damage_level_foundation',
                     'P1Q3.5.1':'damage_level_doors',
                     'P1Q3.6.1':'damage_level_roof',
                     'P1Q3.7.1':'damage_level_basement',
                     'P1Q3.88.1':'damage_level_other',
                     'P1Q3.88.1':'damage_level_specify',
                     'P1Q3.8.1':'repair_costs_building_VND',
                     'P1Q3.11.1':'repair_costs_building_complete',

                     'P5Q1.1':'shp_position',
                     'P5Q1.2':'shp_established',
                     'P5Q1.3':'shp_sector', # check
                     'P5Q1.4':'shp_employees',
                     'P5Q1.5':'shp_avgmonthly_sale_mVND',
                     r'P5Q1.6.*':'shp_finance_investments',
                     'P5Q1.7':'shp_capital_mVND',
                     'P5Q1.9':'shp_suppliers_location',
                     'P5Q1.10':'shp_benefits_last5years',
                     'P5Q1.1':'shp_risk_behaviour',
                     'P5Q1.1':'shp_monetary_resources_prenvention'
 
                     }, inplace=True)

# P1Q7.1.1 and .2 = from whom recieved help
df_p125.head(3)

Unnamed: 0,Target_b,Target_c,business_reduction,P1Q1,P1Q1.specify,P1Q2.1.1,P1Q2.2.1,inundation_duration_h,water_depth_cm,P1Q2.5.1.0,P1Q2.5.1.1,P1Q2.5.1.2,P1Q2.5.1.3,P1Q2.5.1.4,P1Q2.5.1.88,P1Q2.5.1.99,P1Q2.5.1.specify,P1Q2.6.1,P1Q2.7.1.1,P1Q2.7.1.2,P1Q2.7.1.3,P1Q2.7.1.88,P1Q2.7.1.99,P1Q2.7.1.specify,P1Q2.8.1.1,P1Q2.8.1.2,P1Q2.8.1.3,P1Q2.8.1.4,P1Q2.8.1.5,P1Q2.8.1.6,P1Q2.8.1.7,P1Q2.8.1.8,P1Q2.8.1.9,P1Q2.8.1.10,P1Q2.8.1.99,P1Q2.8.1.specify,P1Q2.9.1,P1Q2.10.1.1,P1Q2.10.1.2,P1Q2.10.1.3,P1Q2.10.1.4,P1Q2.10.1.5,P1Q2.10.1.6,P1Q2.10.1.7,P1Q2.10.1.8,P1Q2.10.1.9,P1Q2.10.1.88,P1Q2.10.1.99,P1Q2.10.1.specify,P1Q2.11.1.1,P1Q2.11.1.2,P1Q2.11.1.3,P1Q2.11.1.4,P1Q2.11.1.5,P1Q2.11.1.6,P1Q2.11.1.7,P1Q2.11.1.8,P1Q2.11.1.9,P1Q2.11.1.88,P1Q2.11.1.99,P1Q2.11.1.specify,damage_level_floor,damage_level_walls,damage_level_foundation,damage_level_doors,damage_level_roof,damage_level_basement,damage_level_specify,P1Q3.88.1.specify,repair_costs_building_VND,P1Q5.2.1,P1Q5.3.1,P1Q5.4.1,P1Q5.5.1,P1Q5.88.1,P1Q5.88.1.specify,P1Q5.7.1,P1Q5.8.1,P1Q6.4.1.1,P1Q7.2.1.1,P1Q7.2.1.2,P1Q7.2.1.3,P1Q7.2.1.4,P1Q7.2.1.5,P1Q7.2.1.88,P1Q7.2.1.98,P1Q7.2.1.99,P1Q7.2.1.specify,shp_monetary_resources_prenvention,P5Q1.1.Specify,shp_established,shp_sector,P5Q1.3.Specify,shp_employees,shp_avgmonthly_sale_mVND,P5Q1.6.1,P5Q1.6.2,P5Q1.6.3,P5Q1.6.4,P5Q1.6.5,P5Q1.6.88,P5Q1.6.Specify,shp_capital_mVND,P5Q1.9.1,P5Q1.9.2,P5Q1.9.3,P5Q1.9.4,P5Q1.9.5,P5Q1.9.6,shp_benefits_last5years,P5Q1.11,P5Q1.12
0,100,0,0,5,,2015,99/99/2015,2,10,1,0,0,0,0,0,0,,2,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,3.0,0,0,0,0,0,0,0,0,1,0,0,,0,0,0,0,0,1,0,0,0,0,0,,3,2,3,1,1,98,,,2,1,1,1,1,,,0,0,,0,0,0,0,0,0,1,0,,3,,2010,17,S?a qu?n áo c?,1,2,1,0,0,0,0,0,,5,1,0,0,0,0,0,4,1,1
1,100,0,0,5,,2023-09-17 00:00:00,09/99/2017,2,15,0,1,0,0,0,0,0,,2,1,1,0,0,0,,0,0,0,0,0,0,0,0,0,1,0,,,0,0,0,0,0,0,0,0,1,0,0,,0,0,0,0,0,0,1,0,0,0,0,,4,4,1,4,1,98,,,100000000,1,1,1,1,,,0,0,,0,0,0,0,0,0,1,0,,3,,2019,22,,1,2,1,0,0,0,0,0,,10,0,1,0,0,0,0,4,1,1
2,99,0,0,5,,c?n m?a l?n tháng 8/2020,8/99/2020,1,2,1,0,0,0,0,0,0,,2,0,1,0,0,0,,0,0,0,0,0,0,0,0,0,1,0,,,0,0,0,0,0,0,0,0,1,0,0,,0,1,0,0,0,0,0,0,0,0,0,,1,1,1,1,1,98,,,2,1,1,1,1,,,0,0,,0,0,0,0,0,0,1,0,,1,,1999,31,,1,1,1,0,0,0,0,0,,10,1,0,0,0,0,0,3,1,1


## select for shophouses 
- all rows with entries at business types


In [223]:

#df_p125["P1Q2.5.1.2"].unique() #.dtype


## categorize non-numeric variables

## Map spatial distribution GPS

In [235]:
df_p125["Target_c"].unique()  # VND

array([0, ',8', ',5', 50, 1, 40, 6, ',2', 2, 4, 99, 5, 15, 20, '1,5', 10,
       3, '3,2', 7, ',6', 30, ',7', '1,2', 280, 70, 6000, ',3', 700, 11,
       '7,5', '4,5'], dtype=object)

In [232]:
df_p125

Unnamed: 0,Target_b,Target_c,business_reduction,P1Q1,P1Q1.specify,P1Q2.1.1,P1Q2.2.1,inundation_duration_h,water_depth_cm,P1Q2.5.1.0,P1Q2.5.1.1,P1Q2.5.1.2,P1Q2.5.1.3,P1Q2.5.1.4,P1Q2.5.1.88,P1Q2.5.1.99,P1Q2.5.1.specify,P1Q2.6.1,P1Q2.7.1.1,P1Q2.7.1.2,P1Q2.7.1.3,P1Q2.7.1.88,P1Q2.7.1.99,P1Q2.7.1.specify,P1Q2.8.1.1,P1Q2.8.1.2,P1Q2.8.1.3,P1Q2.8.1.4,P1Q2.8.1.5,P1Q2.8.1.6,P1Q2.8.1.7,P1Q2.8.1.8,P1Q2.8.1.9,P1Q2.8.1.10,P1Q2.8.1.99,P1Q2.8.1.specify,P1Q2.9.1,P1Q2.10.1.1,P1Q2.10.1.2,P1Q2.10.1.3,P1Q2.10.1.4,P1Q2.10.1.5,P1Q2.10.1.6,P1Q2.10.1.7,P1Q2.10.1.8,P1Q2.10.1.9,P1Q2.10.1.88,P1Q2.10.1.99,P1Q2.10.1.specify,P1Q2.11.1.1,P1Q2.11.1.2,P1Q2.11.1.3,P1Q2.11.1.4,P1Q2.11.1.5,P1Q2.11.1.6,P1Q2.11.1.7,P1Q2.11.1.8,P1Q2.11.1.9,P1Q2.11.1.88,P1Q2.11.1.99,P1Q2.11.1.specify,damage_level_floor,damage_level_walls,damage_level_foundation,damage_level_doors,damage_level_roof,damage_level_basement,damage_level_specify,P1Q3.88.1.specify,repair_costs_building_VND,P1Q5.2.1,P1Q5.3.1,P1Q5.4.1,P1Q5.5.1,P1Q5.88.1,P1Q5.88.1.specify,P1Q5.7.1,P1Q5.8.1,P1Q6.4.1.1,P1Q7.2.1.1,P1Q7.2.1.2,P1Q7.2.1.3,P1Q7.2.1.4,P1Q7.2.1.5,P1Q7.2.1.88,P1Q7.2.1.98,P1Q7.2.1.99,P1Q7.2.1.specify,shp_monetary_resources_prenvention,P5Q1.1.Specify,shp_established,shp_sector,P5Q1.3.Specify,shp_employees,shp_avgmonthly_sale_mVND,P5Q1.6.1,P5Q1.6.2,P5Q1.6.3,P5Q1.6.4,P5Q1.6.5,P5Q1.6.88,P5Q1.6.Specify,shp_capital_mVND,P5Q1.9.1,P5Q1.9.2,P5Q1.9.3,P5Q1.9.4,P5Q1.9.5,P5Q1.9.6,shp_benefits_last5years,P5Q1.11,P5Q1.12
0,100,0,0,5,,2015,99/99/2015,2,10,1,0,0,0,0,0,0,,2,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,3,0,0,0,0,0,0,0,0,1,0,0,,0,0,0,0,0,1,0,0,0,0,0,,3,2,3,1,1,98,,,2,1,1,1,1,,,0,0,,0,0,0,0,0,0,1,0,,3,,2010,17,S?a qu?n áo c?,1,2,1,0,0,0,0,0,,5,1,0,0,0,0,0,4,1,1
1,100,0,0,5,,2023-09-17 00:00:00,09/99/2017,2,15,0,1,0,0,0,0,0,,2,1,1,0,0,0,,0,0,0,0,0,0,0,0,0,1,0,,,0,0,0,0,0,0,0,0,1,0,0,,0,0,0,0,0,0,1,0,0,0,0,,4,4,1,4,1,98,,,100000000,1,1,1,1,,,0,0,,0,0,0,0,0,0,1,0,,3,,2019,22,,1,2,1,0,0,0,0,0,,10,0,1,0,0,0,0,4,1,1
2,99,0,0,5,,c?n m?a l?n tháng 8/2020,8/99/2020,1,2,1,0,0,0,0,0,0,,2,0,1,0,0,0,,0,0,0,0,0,0,0,0,0,1,0,,,0,0,0,0,0,0,0,0,1,0,0,,0,1,0,0,0,0,0,0,0,0,0,,1,1,1,1,1,98,,,2,1,1,1,1,,,0,0,,0,0,0,0,0,0,1,0,,1,,1999,31,,1,1,1,0,0,0,0,0,,10,1,0,0,0,0,0,3,1,1
3,500,0,99,3,,m?a l?n tháng 7/2020,7/99/2020,3,10,1,0,0,0,0,0,0,,1,0,1,0,0,0,,0,1,0,0,0,0,0,0,0,0,0,,8,0,0,0,0,0,0,0,0,1,0,0,,0,1,1,1,0,0,0,0,0,0,0,,1,2,1,1,1,98,,,100000,1,1,1,1,,,1,1,0,0,0,0,0,0,0,1,0,,1,,2015,11,,1,1,1,0,0,0,0,0,,99,1,0,0,0,0,0,3,1,1
4,99,0,0,3,,tr?n m?a l?n tháng 7/2020,7/99/2020,3,5,0,1,0,0,1,0,0,,3,0,1,0,0,0,,0,0,0,0,0,0,0,0,0,1,0,,,0,0,0,0,0,0,0,0,1,0,0,,0,1,0,0,0,0,0,0,0,0,0,,2,1,1,1,1,98,,,2,1,1,1,1,,,0,0,,0,0,0,0,0,0,1,0,,1,,2014,11,,1,1,1,0,0,0,0,0,,1,1,0,0,0,0,0,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,100,0,99,5,,Ng?p 2019,10/99/2019,4,50,0,1,0,0,1,0,0,,1,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,1,0,1,0,0,1,0,0,0,0,0,,0,1,1,0,0,0,0,0,0,0,0,,2,2,1,1,1,98,,,5000000,1,1,1,1,,,0,0,,0,0,0,0,0,0,1,0,,1,,2014,17,V?t li?u xây d?ng,2,3,1,0,0,0,0,0,,300,1,1,0,0,0,0,4,3,3
248,99,0,0,4,,Ng?p 2019,09/99/2019,1,5,0,1,0,0,1,0,0,,1,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,1,0,1,0,0,0,0,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,1,1,1,1,1,98,,,2,1,1,1,1,,,0,0,0,0,0,0,0,0,0,1,0,,1,,2015,11,,2,3,1,0,0,0,0,0,,20,1,1,0,0,0,0,4,3,4
249,99,0,0,4,,Ng?p 2017,08/99/2017,5,20,0,1,0,0,1,0,0,,1,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,0,0,1,0,0,0,0,0,0,1,0,Xây kè xung quanh nhà,0,1,1,1,0,0,0,0,0,0,0,,2,2,1,1,1,98,,,2,1,1,1,1,,,0,0,,0,0,0,0,0,0,1,0,,3,,2005,11,,5,3,1,0,0,0,0,0,,300,1,1,1,0,0,0,2,1,2
250,600,2,0,4,,??t ng?p do tri?u c??ng dâng cao 2019,10/99/2019,3,5,0,1,0,0,1,0,0,,2,1,0,0,0,0,,1,0,0,0,0,0,0,0,0,0,0,,99,1,1,1,1,0,0,1,0,0,0,0,,0,1,0,1,0,1,0,0,0,0,0,,3,3,1,1,1,98,,,120000000,1,1,1,4,,,0,0,,0,0,0,0,0,0,1,0,,3,,2016,17,Bán t?p hóa,2,2,1,0,0,0,0,0,,100,1,1,0,0,0,0,2,1,1


## PCA

*Example from: https://towardsdatascience.com/principal-component-analysis-with-python-an-example-for-beginners-by-a-beginner-ac052eff45c*


In [231]:
## select only numeric columns, exclude als .specfic-columns or with string or datetime
df_p125_t = df_p125.select_dtypes(include = ['int64'], exclude=["object"])  
df_p125_t

Unnamed: 0,Target_b,P1Q1,water_depth_cm,P1Q2.5.1.0,P1Q2.5.1.1,P1Q2.5.1.2,P1Q2.5.1.3,P1Q2.5.1.4,P1Q2.5.1.88,P1Q2.5.1.99,P1Q2.6.1,P1Q2.7.1.1,P1Q2.7.1.2,P1Q2.7.1.3,P1Q2.7.1.88,P1Q2.7.1.99,P1Q2.8.1.1,P1Q2.8.1.2,P1Q2.8.1.3,P1Q2.8.1.4,P1Q2.8.1.5,P1Q2.8.1.6,P1Q2.8.1.7,P1Q2.8.1.8,P1Q2.8.1.9,P1Q2.8.1.10,P1Q2.8.1.99,P1Q2.10.1.1,P1Q2.10.1.2,P1Q2.10.1.3,P1Q2.10.1.4,P1Q2.10.1.5,P1Q2.10.1.6,P1Q2.10.1.7,P1Q2.10.1.8,P1Q2.10.1.9,P1Q2.10.1.88,P1Q2.10.1.99,P1Q2.11.1.1,P1Q2.11.1.2,P1Q2.11.1.3,P1Q2.11.1.4,P1Q2.11.1.5,P1Q2.11.1.6,P1Q2.11.1.7,P1Q2.11.1.8,P1Q2.11.1.9,P1Q2.11.1.88,P1Q2.11.1.99,damage_level_floor,damage_level_walls,damage_level_foundation,damage_level_doors,damage_level_roof,damage_level_basement,repair_costs_building_VND,P1Q5.2.1,P1Q5.3.1,P1Q5.4.1,P1Q5.5.1,P1Q7.2.1.1,P1Q7.2.1.2,P1Q7.2.1.3,P1Q7.2.1.4,P1Q7.2.1.5,P1Q7.2.1.88,P1Q7.2.1.98,P1Q7.2.1.99,shp_monetary_resources_prenvention,shp_established,shp_sector,shp_employees,shp_avgmonthly_sale_mVND,P5Q1.6.1,P5Q1.6.2,P5Q1.6.3,P5Q1.6.4,P5Q1.6.5,P5Q1.6.88,P5Q1.9.1,P5Q1.9.2,P5Q1.9.3,P5Q1.9.4,P5Q1.9.5,P5Q1.9.6,shp_benefits_last5years,P5Q1.11,P5Q1.12
0,100,5,10,1,0,0,0,0,0,0,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,3,2,3,1,1,98,2,1,1,1,1,0,0,0,0,0,0,1,0,3,2010,17,1,2,1,0,0,0,0,0,1,0,0,0,0,0,4,1,1
1,100,5,15,0,1,0,0,0,0,0,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,4,4,1,4,1,98,100000000,1,1,1,1,0,0,0,0,0,0,1,0,3,2019,22,1,2,1,0,0,0,0,0,0,1,0,0,0,0,4,1,1
2,99,5,2,1,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,1,1,98,2,1,1,1,1,0,0,0,0,0,0,1,0,1,1999,31,1,1,1,0,0,0,0,0,1,0,0,0,0,0,3,1,1
3,500,3,10,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,1,2,1,1,1,98,100000,1,1,1,1,0,0,0,0,0,0,1,0,1,2015,11,1,1,1,0,0,0,0,0,1,0,0,0,0,0,3,1,1
4,99,3,5,0,1,0,0,1,0,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,2,1,1,1,1,98,2,1,1,1,1,0,0,0,0,0,0,1,0,1,2014,11,1,1,1,0,0,0,0,0,1,0,0,0,0,0,2,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
247,100,5,50,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,2,2,1,1,1,98,5000000,1,1,1,1,0,0,0,0,0,0,1,0,1,2014,17,2,3,1,0,0,0,0,0,1,1,0,0,0,0,4,3,3
248,99,4,5,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,98,2,1,1,1,1,0,0,0,0,0,0,1,0,1,2015,11,2,3,1,0,0,0,0,0,1,1,0,0,0,0,4,3,4
249,99,4,20,0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,2,2,1,1,1,98,2,1,1,1,1,0,0,0,0,0,0,1,0,3,2005,11,5,3,1,0,0,0,0,0,1,1,1,0,0,0,2,1,2
250,600,4,5,0,1,0,0,1,0,0,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,3,3,1,1,1,98,120000000,1,1,1,4,0,0,0,0,0,0,1,0,3,2016,17,2,2,1,0,0,0,0,0,1,1,0,0,0,0,2,1,1


In [225]:
## determine only important variables

y_b = df_p125['Target_b'] # assign y variable - the target 
y_c = df_p125['Target_c'] # assign y variable - the target 
X = df_p125.loc[ :, df_p125.columns != ['Target_b', 'Target_c'] ]
X = X.iloc[:,:] # assign column 9 to 21 as x variable - the features

X = StandardScaler().fit_transform(X) # standarize the variables
#df['Attrition_Flag'].replace('Existing Customer','1',inplace=True)  # categorize string columns
#df['Attrition_Flag'].replace('Attrited Customer','0',inplace=True)


KeyError: 'Target_c'

In [None]:
## We will start by using only the first 2 leading principal components, and then explore 3 principal components and 4 principal components.
pca=PCA(n_components=2)
PC=pca.fit_transform(X)
principalDF=pd.DataFrame(data=PC, columns=['pc1','pc2'])
df_pca = pd.concat([principalDF, df_p125[['Target']]], axis = 1)
df_pca.head()

Unnamed: 0,pc1,pc2,Target
0,-3.201654,-0.524676,100
1,-0.740731,1.984172,100
2,-4.558331,-0.353355,99
3,-3.545795,-0.803994,500
4,-3.139464,-0.323059,99


In [None]:
X.columns.tolist()
#components #=components[9:21]


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
PCloadings = pca.components_.T * np.sqrt(pca.explained_variance_)
components = df_p125.loc[:, df_p125.columns != 'Target'].columns.tolist()  # use X before standarized

loadingdf = pd.DataFrame(PCloadings,columns=('PC1','PC2'))
loadingdf["variable"] = components
loadingdf

Unnamed: 0,PC1,PC2,variable
0,0.300467,-0.104960,P1Q1
1,0.455455,0.278457,water_depth_cm
2,-0.316068,-0.031204,P1Q2.5.1.0
3,0.289412,0.040595,P1Q2.5.1.1
4,0.187977,-0.072894,P1Q2.5.1.2
...,...,...,...
82,0.118116,0.009240,P5Q1.9.5
83,-0.000000,-0.000000,P5Q1.9.6
84,-0.087950,0.002755,shp_benefits_last5years
85,-0.111235,0.241011,P5Q1.11


### Visual selection of important variables by weights of PCs

In [236]:
fig = ex.scatter(x=loadingdf['PC1'],y=loadingdf['PC2'],text=loadingdf['variable'],)
fig.update_layout(
height = 600, width = 500,
title_text='loadings plot')
fig.update_traces(textposition='bottom center')
fig.add_shape(type="line",
    x0 = -0, y0 = -0.5, x1 = -0, y1 = 2.5,
    line = dict(color="RoyalBlue",width=3)
)
fig.add_shape(type="line",
    x0 = -1, y0 = 0, x1 = 1, y1 = 0,
    line=dict(color="RoyalBlue",width=3)
)
fig.show()

## Discretized continous variabels: 
E.g. by an equal frequency discretization
use coarse discrete classes for each varible e.g. 5 classes used in Can ho City-DS

## examine data distribution of variabels
- types of businesses
- water depth and inundation duration
- educational level
- distribution and amount of obs for business disruptions [monthly reduction %] P1Q5.9
- target 1 : actual total damage cost for business building [VND]
- target 2 : actual total damage cost for business contents  [VND]  P1Q5.6


#########################

## ungepürfte col name conversions

                    'P1Q5.1.1':'shp_flood_event',  # = recent or Q5.1.2?=most serious
                     'P1Q5.2.2':'shp_damage_level_furniture',
                     'P1Q5.3.2':'shp_damage_level_electronics',
                     'P1Q5.4.2':'shp_damage_level_business',
                     'P1Q5.5.2':'shp_damage_level_products',
                     'P1Q5.88.2':'shp_damage_level_others',
                     'P1Q5.6.2':'shp_total_direct_loss_content',
                     'P1Q5.7.2':'shp_closed_d',
                     'P1Q5.8.2':'shp_duration_back2normal',
                     'P1Q5.9.2':'shp_monthly_business_reduced_duringFlood',

                     'P2Q1.1.1':'protect_valuables',
                     'P2Q1.2.1':'water_barries',
                     'P2Q1.3.1':'pumping_equipment',
                     'P2Q1.4.1':'elevation_building',
                     'P2Q1.5.1':'resistant_material_building',
                     'P2Q1.6.1':'electricity_higher',
                     'P2Q1.7.1':'flood_protections',
                     'P2Q2.1.1':'elevation_building_year',
                     'P2Q2.2.1':'elevation_building_height_cm',
                     'P2Q2.3.1':'elevation_building_elements',

                     'P3Q1.1.1':'city_protection',
                     'P3Q1.2.1':'more_future_affected',
                     'P3Q1.3.1':'government_warnings',
                     'P3Q1.4.1':'government_careing',

                     'P4Q1.1':'household_inhabitants_number',
                     'P4Q1.8':'household_education',
                     'P4Q1.10':'household_income_aviable_monthly_mVND',
                     'P4Q2.1':'building_movingin',
                     'P4Q2.2':'building_year', # check 3.
                     'P4Q2.3':'building_floorsize_m2',
                     'P4Q3.1':'building_foundation', # check 3.
                     'P4Q3.2':'building_floor',
                     'P4Q3.3':'building_wall',

                     'P4Q3.5':'building_doors',
                     'P4Q3.6':'building_elevation_rel2surrounding_cm', # check3.
                     'P4Q4.3':'building_renovation_elements',
                     'P4Q4.4':'building_renovation_reasons',
                     'P4Q4.5':'building_renovation_cost_mVND',

                     'P5Q1.1':'shp_position',
                     'P5Q1.2':'shp_established',
                     'P5Q1.3':'shp_sector', # check
                     'P5Q1.4':'shp_employees',
                     'P5Q1.5':'shp_avgmonthly_sale_mVND',
                     'P5Q1.6':'shp_finance_investments',
                     'P5Q1.7':'shp_capital_mVND',
                     'P5Q1.9':'shp_suppliers_location',
                     'P5Q1.10':'shp_benefits_last5years',
                     'P5Q1.1':'shp_risk_behaviour',
                     'P5Q1.1':'shp_monetary_resources_prenvention'