# Train, Test, and Validation Dataset Creation

We use a 60/20/20 split to create the training, validation, and test datasets. Because we are going to use the hurdle method for predictions, we split the modeling data into two categories: base and truncated. No additional processing (filtering) is required to create the "base_" datasets. To create the "truncated_" datasets, we first filter the modeling data by the outcome variable (damage_perc >= 10) and then apply the same 60/20/20 split.

In [0]:
# Load the Dataiku R package; dkuReadDataset() is called right below.
library(dataiku)

# Recipe inputs
# Read the "modeling_data" dataset into `modeling_data`; the base_ splits are
# drawn from the rows of this object.
# NOTE(review): samplingMethod="head" with nbRows=100000 presumably caps the
# read at the first 100000 rows, so a larger input would be truncated before
# the split. Confirm this cap is intended, or switch to a full read.
modeling_data <- dkuReadDataset("modeling_data", samplingMethod="head", nbRows=100000)

In [0]:
# Splits for base_ datasets ----
# Randomly partition the rows of modeling_data into 60% train, 20% validation
# and 20% test. The three index sets are disjoint and together cover every
# row; any rounding remainder ends up in the test set.

# Number of rows in modeling_data
n <- nrow(modeling_data)

# Seeding for reproducibility
set.seed(12345)

# Randomly pick 60% of the row indices for training.
# seq_len(n) is used instead of 1:n so that an empty input yields an empty
# index pool rather than c(1, 0).
base_train_id <- sample(seq_len(n), floor(n * 0.6))

# Indices left over after the training selection (about 40% of the rows)
base_remaining_id <- setdiff(seq_len(n), base_train_id)

# Draw the validation indices (20% of all rows) from the remainder ...
base_val_id <- sample(base_remaining_id, floor(n * 0.2))

# ... and whatever is left goes to test
base_test_id <- setdiff(base_remaining_id, base_val_id)

# Compute recipe outputs for base_ datasets.
# The comma matters: modeling_data[ids, ] selects ROWS, whereas
# modeling_data[ids] selects COLUMNS of a data frame. drop = FALSE keeps the
# result a data frame even if the input has a single column.
base_train <- modeling_data[base_train_id, , drop = FALSE]

base_test <- modeling_data[base_test_id, , drop = FALSE]

base_validation <- modeling_data[base_val_id, , drop = FALSE]

In [0]:
# Splits for truncated_ datasets ----
# Keep only the rows of modeling_data whose damage outcome is at least 10,
# then split them 60/20/20 into train, validation and test.

# NOTE(review): the outcome column is taken to be DAM_perc_dmg, while the
# notebook text calls it damage_perc. Confirm the name against the dataset
# schema; the check below fails loudly if the column is absent.
outcome_col <- "DAM_perc_dmg"
if (!outcome_col %in% names(modeling_data)) {
  stop("Column '", outcome_col, "' not found in modeling_data", call. = FALSE)
}

## Rows with damage >= 10. which() skips rows where the outcome is NA, so no
## all-NA rows are introduced by the subsetting.
high_rows <- which(modeling_data[[outcome_col]] >= 10)
df_cleaned_high <- modeling_data[high_rows, , drop = FALSE]

## Reset row IDs so they run from 1 to the number of kept rows
## (assigning NULL also works when no rows were kept)
rownames(df_cleaned_high) <- NULL

## Number of observations with damage >= 10
n_high <- nrow(df_cleaned_high)

## Seed here as well so this split is reproducible even when the cell is run
## on its own, independent of how much of the random stream was used earlier
set.seed(12345)

## 60% of the filtered rows go to training
train_id_high <- sample(seq_len(n_high), floor(n_high * 0.6))

## Get the remaining 40% indices
remaining_id_high <- setdiff(seq_len(n_high), train_id_high)

## Randomly select 50% of the remaining (which is 20% of the total) for
## validation
val_id_high <- sample(remaining_id_high, floor(n_high * 0.2))

## The rest (remaining 20%, plus any rounding remainder) goes to test
test_id_high <- setdiff(remaining_id_high, val_id_high)

## Create train, validation, and test datasets (row subsetting)
df_high_train <- df_cleaned_high[train_id_high, , drop = FALSE]
df_high_val <- df_cleaned_high[val_id_high, , drop = FALSE]
df_high_test <- df_cleaned_high[test_id_high, , drop = FALSE]

# Compute recipe outputs for truncated_ datasets
truncated_train <- df_high_train
truncated_validation <- df_high_val
truncated_test <- df_high_test

In [0]:
# Recipe outputs
# Each split is written to the Dataiku dataset that shares its variable name:
# the base_ datasets first, then the truncated_ datasets.
output_names <- c(
  "base_train",
  "base_test",
  "base_validation",
  "truncated_train",
  "truncated_validation",
  "truncated_test"
)

for (dataset_name in output_names) {
  # get() looks the data frame up by name only when its turn comes, so the
  # writes happen one at a time in the listed order
  dkuWriteDataset(get(dataset_name), dataset_name)
}