In [0]:
# Libraries
library(dataiku)
library(rpart)
library(dplyr)

In [0]:
# Recipe inputs
# Training data: head sample of "base_train", capped at 100000 rows
df_base_train <- dkuReadDataset(
  "base_train",
  samplingMethod = "head",
  nbRows = 100000
)

# Validation data: head sample of "base_validation", capped at 100000 rows
df_base_validation <- dkuReadDataset(
  "base_validation",
  samplingMethod = "head",
  nbRows = 100000
)

In [0]:
# Structural equation for wind speed:
#   wind_max = f(track_min_dist, eps)
# Fitted as a regression tree (method = "anova") on the training data.
base_wind_model <- rpart(
  formula = wind_max ~ track_min_dist,
  data = df_base_train,
  method = "anova"
)

In [0]:
# Structural equation for total rainfall:
#   rain_total = f(track_min_dist, eps)
# Fitted as a regression tree (method = "anova") on the training data.
base_rain_model <- rpart(
  formula = rain_total ~ track_min_dist,
  data = df_base_train,
  method = "anova"
)

In [0]:
# Add the predicted parent variables (wind and rainfall) to the training
# dataset. Both predictions come from the structural-equation trees fitted
# on track_min_dist and are evaluated on the training rows themselves.
df_base_train <- mutate(
  df_base_train,
  # predicted wind_max from the wind structural equation
  wind_max_pred = predict(base_wind_model, newdata = df_base_train),
  # predicted rain_total from the rainfall structural equation
  rain_total_pred = predict(base_rain_model, newdata = df_base_train)
)

In [0]:
# Classification tree for damage_binary (method = "class").
# Predictors: the predicted wind/rain parents, the roof/wall material
# columns, the ls/ss risk percentages and the wind_*/rain_* *_ss columns.
damage_fit_class_min <- rpart(
  formula = damage_binary ~
    wind_max_pred + rain_total_pred +
    roof_strong_wall_strong + roof_strong_wall_light + roof_strong_wall_salv +
    roof_light_wall_strong + roof_light_wall_light + roof_light_wall_salv +
    roof_salv_wall_strong + roof_salv_wall_light + roof_salv_wall_salv +
    ls_risk_pct + ss_risk_pct +
    wind_blue_ss + wind_yellow_ss + wind_orange_ss + wind_red_ss +
    rain_blue_ss + rain_yellow_ss + rain_orange_ss + rain_red_ss,
  data = df_base_train,
  method = "class"
)

In [0]:
# Parameter tuning
# Manual grid search over rpart control parameters for the regression tree on
# DAM_perc_dmg. Every combination is trained on df_base_train and scored by
# RMSE on the validation data; the combination with the lowest RMSE is printed.

# Define a grid of hyperparameters
cp_values <- seq(0.0001, 0.05, by = 0.005)
maxdepth_values <- c(3, 5, 7, 10)
minsplit_values <- c(10, 20, 30, 40)
minbucket_values <- c(5, 10, 20)

# Preallocate one row per combination instead of growing a data frame with
# rbind() inside the loop. expand.grid() varies its first argument fastest, so
# the arguments are listed innermost-first to keep the row order of a nested
# cp > maxdepth > minsplit > minbucket loop (this matters for ties in which.min).
results <- expand.grid(
  minbucket = minbucket_values,
  minsplit = minsplit_values,
  maxdepth = maxdepth_values,
  cp = cp_values,
  KEEP.OUT.ATTRS = FALSE
)
results <- results[, c("cp", "maxdepth", "minsplit", "minbucket")]
results$RMSE <- NA_real_

# Predict wind and rainfall for the validation dataset using the structural
# equation trees fitted on the training data, so the validation rows carry the
# same wind_max_pred / rain_total_pred inputs the damage tree is trained on.
df_val_base_tune <- df_base_validation %>%
  mutate(
    wind_max_pred = predict(base_wind_model, newdata = df_base_validation),
    rain_total_pred = predict(base_rain_model, newdata = df_base_validation)
  )

# Model formula is loop-invariant, so build it once.
# NOTE(review): unlike the classification tree, this formula also includes
# ruggedness_mean and slope_mean -- confirm the difference is intended.
damage_formula <- DAM_perc_dmg ~ wind_max_pred + rain_total_pred +
  roof_strong_wall_strong + roof_strong_wall_light +
  roof_strong_wall_salv + roof_light_wall_strong +
  roof_light_wall_light + roof_light_wall_salv +
  roof_salv_wall_strong + roof_salv_wall_light +
  roof_salv_wall_salv + ls_risk_pct + ss_risk_pct +
  ruggedness_mean + slope_mean + wind_blue_ss +
  wind_yellow_ss + wind_orange_ss + wind_red_ss +
  rain_blue_ss + rain_yellow_ss + rain_orange_ss + rain_red_ss

# Train and score one model per hyperparameter combination
for (i in seq_len(nrow(results))) {
  # Train the model with the i-th set of hyperparameters
  model <- rpart(
    damage_formula,
    data = df_base_train,
    method = "anova",  # Regression tree
    control = rpart.control(
      cp = results$cp[i],
      maxdepth = results$maxdepth[i],
      minsplit = results$minsplit[i],
      minbucket = results$minbucket[i]
    )
  )

  # Make predictions on the validation set
  val_predictions <- predict(model, newdata = df_val_base_tune)

  # Compute RMSE in base R (no rmse() helper is provided by the loaded
  # packages); rows with a missing observed value are ignored.
  rmse_value <- sqrt(mean(
    (df_val_base_tune$DAM_perc_dmg - val_predictions)^2,
    na.rm = TRUE
  ))

  # Store results
  results$RMSE[i] <- rmse_value
}

# Print the best hyperparameter combination (lowest validation RMSE)
best_params <- results[which.min(results$RMSE), ]
print(best_params)

In [0]:
# Recipe outputs
# Resolves the Dataiku managed folder with id "8jrmex16" via
# dkuManagedFolderPath() and keeps the result for later use.
# NOTE(review): nothing is written to this folder in the visible code --
# presumably the fitted classification model is meant to be saved here
# (e.g. with saveRDS); confirm and add the write step if it is missing.
base_scm_classification_min_model <- dkuManagedFolderPath("8jrmex16")