In [119]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [120]:
# Load the curated dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(
    '../data/curated/visulisation_df.csv',
)

In [121]:
# Remove the 'id' column so it is not treated as a feature.
# errors='ignore' keeps this cell re-runnable: because the result is assigned
# back to `df`, a second run would otherwise raise KeyError ('id' already gone).
df = df.drop(columns=['id'], errors='ignore')


## Encoding

In [122]:
# One-hot encode the nominal categorical columns; every other column is passed through unchanged.
encoded_df = pd.get_dummies(
    data=df,
    columns=[
        'region',
        'primary_purpose',
        'primary_type',
        'spillway',
        'assessment',
    ],
)

In [123]:
# Inspect the raw value counts of regulated_dam before it is encoded.
encoded_df['regulated_dam'].value_counts()

Yes    12052
No      8754
Name: regulated_dam, dtype: int64

In [124]:
# Encode the hazard label as an integer code.
# NOTE(review): 'Significant' is coded above 'High' here. If the intended
# ordering is Low < Significant < High, swap the 2 and the 3 -- confirm with
# whoever owns the data definition before changing it.
encoded_df['hazard'] = encoded_df['hazard'].replace({
    'Low': 1,
    'High': 2,
    'Significant': 3,
    'Undetermined': 0
})

# Binary-encode regulated_dam from its OWN Yes/No values.
# The source must be the 'regulated_dam' column: reading from 'hazard' here
# would overwrite regulated_dam with a copy of the hazard codes and discard
# the Yes/No information entirely.
encoded_df['regulated_dam'] = encoded_df['regulated_dam'].replace({
    'Yes': 1,
    'No': 0
})

## Normalisation

In [125]:
# take log transformation
# np.log is applied to each whole column at once (vectorised) rather than
# element by element through a Python lambda.
# NOTE: np.log yields -inf for 0 and NaN for negative inputs; -inf would make
# the later MinMaxScaler fit fail, so these columns are expected to be strictly positive.
# NOTE: this cell overwrites the columns in place, so running it twice applies the log twice.
for feature in ['height', 'volume', 'drainage', 'length', 'surface']:
    encoded_df[feature] = np.log(encoded_df[feature])

In [126]:
# Everything except the four prediction targets is treated as a feature to be scaled.
feature_df = encoded_df.drop(
    columns=[
        'dam_repair_loss',
        'damage_loss',
        'business_interruption_loss',
        'probability_of_failure',
    ],
)
# Scaler instance that the next cell fits on feature_df.
min_max_scaler = MinMaxScaler()

In [127]:
# Min-max scale every feature column and write the result back into encoded_df.
# fit_transform returns a bare array, so the wrapper DataFrame is given
# feature_df's own index: the assignment aligns on index labels, and a default
# 0..n-1 index would misplace rows (or insert NaN) if encoded_df's index were
# ever not a plain RangeIndex.
# NOTE(review): the scaler is fitted on all rows before the train/val/test
# split, so validation/test rows influence the scaling -- consider fitting on
# the training partition only.
encoded_df[feature_df.columns] = pd.DataFrame(
    min_max_scaler.fit_transform(feature_df),
    columns=feature_df.columns,
    index=feature_df.index,
)

In [128]:
# Snapshot the scaled frame under its own name.
# .copy() makes normalised_df independent of encoded_df; a plain assignment
# would only alias it, so any later in-place change to encoded_df (e.g.
# re-running an earlier cell) would silently change normalised_df as well.
normalised_df = encoded_df.copy()

## Train Val Test Split

In [129]:
# split dataset into null, train, val, test set, with 80%, 10%, 10% distribution by default
def null_train_val_test_split(df, label, val_test_size=0.2, random_state=42):
    """Split ``df`` into unlabelled, train, validation and test partitions.

    Rows whose ``label`` value is missing are set aside first. The remaining
    rows are split into train and a held-out part, and the held-out part is
    then halved into validation and test. With the default arguments this
    gives an 80% / 10% / 10% split of the labelled rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``label`` column.
    label : str
        Name of the target column used to detect unlabelled rows.
    val_test_size : float, default 0.2
        Fraction of the labelled rows held out from training; this part is
        divided equally between validation and test.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls so the split is
        reproducible.

    Returns
    -------
    tuple
        ``(null_df, train_df, val_df, test_df)``.
    """
    # Compute the "has a label" mask once and reuse it for both partitions.
    has_label = df[label].notna()
    null_df = df.loc[~has_label]
    labelled_df = df.loc[has_label]
    # First cut: training rows versus the combined validation+test pool.
    train_df, val_test_df = train_test_split(
        labelled_df, test_size=val_test_size, random_state=random_state
    )
    # Second cut: halve the pool into validation and test.
    val_df, test_df = train_test_split(
        val_test_df, test_size=0.5, random_state=random_state
    )
    return null_df, train_df, val_df, test_df

In [130]:
# split data set into feature and label parts
def X_Y_split(df, label):
    """Return ``(X, Y)``: ``df`` without the ``label`` column, and the ``label`` column itself.

    The input frame is not modified.
    """
    targets = df[label]
    features = df.drop(labels=label, axis=1)
    return features, targets

#### Dam Repair Loss

In [131]:
# Keep dam_repair_loss as the target and remove the other two loss columns.
# probability_of_failure is not dropped, so it remains among the features.
repair_df = normalised_df.drop(
    columns=[
        'damage_loss',
        'business_interruption_loss',
    ],
)

In [132]:
# Partition the repair frame on its target column: unlabelled rows, then train / val / test.
(
    repair_null_df,
    repair_train_df,
    repair_val_df,
    repair_test_df,
) = null_train_val_test_split(repair_df, 'dam_repair_loss')

In [133]:
# Separate the features from the dam_repair_loss target for each partition.
repair_train_X, repair_train_Y = X_Y_split(df=repair_train_df, label='dam_repair_loss')
repair_val_X, repair_val_Y = X_Y_split(df=repair_val_df, label='dam_repair_loss')
repair_test_X, repair_test_Y = X_Y_split(df=repair_test_df, label='dam_repair_loss')
# The unlabelled rows have no usable target, so only their features are kept.
repair_null_X, _ = X_Y_split(df=repair_null_df, label='dam_repair_loss')

In [134]:
# Display the dam_repair_loss validation targets as a quick sanity check.
repair_val_Y

13584    693.8
7757      34.3
8184       5.5
5938      12.2
2319      12.6
         ...  
18908    363.5
12120     47.5
10682     42.3
1219      16.5
4476      13.7
Name: dam_repair_loss, Length: 2080, dtype: float64

In [135]:
# save data
# Create the output folder first: to_csv fails when the target directory does
# not exist (for example on a fresh checkout).
repair_dir = Path('../data/curated/repair')
repair_dir.mkdir(parents=True, exist_ok=True)
# One loop instead of seven copy-pasted calls; file names are unchanged.
for file_name, frame in {
    'null_X.csv': repair_null_X,
    'train_X.csv': repair_train_X,
    'train_Y.csv': repair_train_Y,
    'val_X.csv': repair_val_X,
    'val_Y.csv': repair_val_Y,
    'test_X.csv': repair_test_X,
    'test_Y.csv': repair_test_Y,
}.items():
    frame.to_csv(repair_dir / file_name, index=False)

#### Damage Loss

In [136]:
# Keep damage_loss as the target and remove the other two loss columns.
# probability_of_failure is not dropped, so it remains among the features.
damage_df = normalised_df.drop(
    columns=[
        'dam_repair_loss',
        'business_interruption_loss',
    ],
)

In [137]:
# Partition the damage frame on its target column: unlabelled rows, then train / val / test.
(
    damage_null_df,
    damage_train_df,
    damage_val_df,
    damage_test_df,
) = null_train_val_test_split(damage_df, 'damage_loss')

In [138]:
# Separate the features from the damage_loss target for each partition.
damage_train_X, damage_train_Y = X_Y_split(df=damage_train_df, label='damage_loss')
damage_val_X, damage_val_Y = X_Y_split(df=damage_val_df, label='damage_loss')
damage_test_X, damage_test_Y = X_Y_split(df=damage_test_df, label='damage_loss')
# The unlabelled rows have no usable target, so only their features are kept.
damage_null_X, _ = X_Y_split(df=damage_null_df, label='damage_loss')

In [139]:
# Display the damage_loss training targets as a quick sanity check.
damage_train_Y

14968    184.1
5233       9.7
16018    645.2
47       863.9
11081    204.3
         ...  
11293    421.6
11974     18.0
5396      13.9
860       19.1
15805     23.2
Name: damage_loss, Length: 16635, dtype: float64

In [140]:
# save data
# Create the output folder first: to_csv fails when the target directory does
# not exist (for example on a fresh checkout).
damage_dir = Path('../data/curated/damage')
damage_dir.mkdir(parents=True, exist_ok=True)
# One loop instead of seven copy-pasted calls; file names are unchanged.
for file_name, frame in {
    'null_X.csv': damage_null_X,
    'train_X.csv': damage_train_X,
    'train_Y.csv': damage_train_Y,
    'val_X.csv': damage_val_X,
    'val_Y.csv': damage_val_Y,
    'test_X.csv': damage_test_X,
    'test_Y.csv': damage_test_Y,
}.items():
    frame.to_csv(damage_dir / file_name, index=False)

#### Business Interruption Loss

In [141]:
# Keep business_interruption_loss as the target and remove the other two loss columns.
# probability_of_failure is not dropped, so it remains among the features.
BI_df = normalised_df.drop(
    columns=[
        'damage_loss',
        'dam_repair_loss',
    ],
)

In [142]:
# Partition the business-interruption frame on its target column: unlabelled rows, then train / val / test.
(
    BI_null_df,
    BI_train_df,
    BI_val_df,
    BI_test_df,
) = null_train_val_test_split(BI_df, 'business_interruption_loss')

In [143]:
# Separate the features from the business_interruption_loss target for each partition.
BI_train_X, BI_train_Y = X_Y_split(df=BI_train_df, label='business_interruption_loss')
BI_val_X, BI_val_Y = X_Y_split(df=BI_val_df, label='business_interruption_loss')
BI_test_X, BI_test_Y = X_Y_split(df=BI_test_df, label='business_interruption_loss')
# The unlabelled rows have no usable target, so only their features are kept.
BI_null_X, _ = X_Y_split(df=BI_null_df, label='business_interruption_loss')

In [144]:
# save data
# Create the output folder first: to_csv fails when the target directory does
# not exist (for example on a fresh checkout).
BI_dir = Path('../data/curated/business_interruption')
BI_dir.mkdir(parents=True, exist_ok=True)
# One loop instead of seven copy-pasted calls; file names are unchanged.
for file_name, frame in {
    'null_X.csv': BI_null_X,
    'train_X.csv': BI_train_X,
    'train_Y.csv': BI_train_Y,
    'val_X.csv': BI_val_X,
    'val_Y.csv': BI_val_Y,
    'test_X.csv': BI_test_X,
    'test_Y.csv': BI_test_Y,
}.items():
    frame.to_csv(BI_dir / file_name, index=False)

#### Probability of Failure

In [145]:
# Keep probability_of_failure as the target and remove all three loss columns.
PF_df = normalised_df.drop(
    columns=[
        'business_interruption_loss',
        'damage_loss',
        'dam_repair_loss',
    ],
)

In [146]:
# Partition the probability-of-failure frame on its target column: unlabelled rows, then train / val / test.
(
    PF_null_df,
    PF_train_df,
    PF_val_df,
    PF_test_df,
) = null_train_val_test_split(PF_df, 'probability_of_failure')

In [147]:
# Separate the features from the probability_of_failure target for each partition.
# NOTE(review): unlike the three loss targets, PF_null_df is not split or saved
# here -- presumably this target has no missing values; confirm.
PF_train_X, PF_train_Y = X_Y_split(df=PF_train_df, label='probability_of_failure')
PF_val_X, PF_val_Y = X_Y_split(df=PF_val_df, label='probability_of_failure')
PF_test_X, PF_test_Y = X_Y_split(df=PF_test_df, label='probability_of_failure')

In [148]:
# save data
# Create the output folder first: to_csv fails when the target directory does
# not exist (for example on a fresh checkout).
PF_dir = Path('../data/curated/probability_of_failure')
PF_dir.mkdir(parents=True, exist_ok=True)
# One loop instead of six copy-pasted calls; file names are unchanged.
for file_name, frame in {
    'train_X.csv': PF_train_X,
    'train_Y.csv': PF_train_Y,
    'val_X.csv': PF_val_X,
    'val_Y.csv': PF_val_Y,
    'test_X.csv': PF_test_X,
    'test_Y.csv': PF_test_Y,
}.items():
    frame.to_csv(PF_dir / file_name, index=False)