In [0]:
library(dataiku)
library(rpart)
library(dplyr)
library(caret)
library(data.table)
library(mlflow)
library(reticulate)
library(Metrics)

In [0]:
# Recipe inputs: read the training and validation datasets using "head"
# sampling capped at 100000 rows each.
truncated_train <- dkuReadDataset(
  "truncated_train",
  samplingMethod = "head",
  nbRows = 100000
)
truncated_validation <- dkuReadDataset(
  "truncated_validation",
  samplingMethod = "head",
  nbRows = 100000
)

In [0]:
# First-stage regression trees ("anova" method): one predicts wind_max and
# the other rain_total, each from track_min_dist alone, fitted on the
# training data.
trunc_wind_model <- rpart(
  formula = wind_max ~ track_min_dist,
  data = truncated_train,
  method = "anova"
)

trunc_rain_model <- rpart(
  formula = rain_total ~ track_min_dist,
  data = truncated_train,
  method = "anova"
)

In [0]:
# Attach the first-stage wind and rainfall predictions to the training data
# as the columns wind_max_pred and rain_total_pred.
df_trunc_train <- mutate(
  truncated_train,
  wind_max_pred = predict(trunc_wind_model, newdata = truncated_train),
  rain_total_pred = predict(trunc_rain_model, newdata = truncated_train)
)

In [0]:
# Score the validation data with the first-stage wind and rainfall trees
# that were fitted on the training data.
# These predicted columns are required for hyperparameter tuning.
df_trunc_val <- mutate(
  truncated_validation,
  wind_max_pred = predict(trunc_wind_model, newdata = truncated_validation),
  rain_total_pred = predict(trunc_rain_model, newdata = truncated_validation)
)

In [0]:
# Manual grid search over rpart hyperparameters for the damage model.
# Every combination is fitted on df_trunc_train and scored by RMSE on
# df_trunc_val; the combination with the lowest validation RMSE is printed.

# Define a grid of hyperparameters same as used for the base model
cp_values <- seq(0.0001, 0.05, by = 0.0005)
maxdepth_values <- c(3, 5, 7, 10)
minsplit_values <- c(10, 20, 30, 40)
minbucket_values <- c(5, 10, 20)

# expand.grid() varies its first factor fastest, so the innermost parameter
# is listed first to reproduce the order of a nested loop with cp outermost
# and minbucket innermost. which.min() below breaks ties by row order.
param_grid <- expand.grid(
  minbucket = minbucket_values,
  minsplit = minsplit_values,
  maxdepth = maxdepth_values,
  cp = cp_values,
  KEEP.OUT.ATTRS = FALSE
)[, c("cp", "maxdepth", "minsplit", "minbucket")]

# Model formula shared by every fit in the search.
# NOTE(review): the final model fitted later in this file additionally uses
# ruggedness_mean and slope_mean; confirm whether tuning should use the same
# predictors.
tuning_formula <- damage_perc ~ wind_max_pred +
  rain_total_pred +
  roof_strong_wall_strong +
  roof_strong_wall_light +
  roof_strong_wall_salv +
  roof_light_wall_strong +
  roof_light_wall_light +
  roof_light_wall_salv +
  roof_salv_wall_strong +
  roof_salv_wall_light +
  roof_salv_wall_salv +
  ls_risk_pct +
  ss_risk_pct +
  wind_blue_ss +
  wind_yellow_ss +
  wind_orange_ss +
  wind_red_ss +
  rain_blue_ss +
  rain_yellow_ss +
  rain_orange_ss +
  rain_red_ss

# Preallocate the scores instead of growing a data frame inside the loop
rmse_values <- rep(NA_real_, nrow(param_grid))

# Train and score one model per hyperparameter combination
for (i in seq_len(nrow(param_grid))) {

  # Train the model with specific hyperparameters
  model <- rpart(
    tuning_formula,
    data = df_trunc_train,
    method = "anova",  # Regression tree
    control = rpart.control(
      cp = param_grid$cp[i],
      maxdepth = param_grid$maxdepth[i],
      minsplit = param_grid$minsplit[i],
      minbucket = param_grid$minbucket[i]
    )
  )

  # Make predictions on the validation set
  val_predictions <- predict(model, newdata = df_trunc_val)

  # Compute RMSE against the observed target of the same validation frame
  # the predictions were made on (rmse() takes actual first, then predicted)
  rmse_values[i] <- rmse(df_trunc_val$damage_perc, val_predictions)
}

# One row per combination: cp, maxdepth, minsplit, minbucket, RMSE
results_high <- cbind(param_grid, RMSE = rmse_values)

# Print the best hyperparameter combination
best_params_high <- results_high[which.min(results_high$RMSE), ]
print(best_params_high)

In [0]:
# Add the first-stage wind and rainfall predictions to the training data.
# This repeats the earlier construction of df_trunc_train so the cell can be
# run on its own.
df_trunc_train <- truncated_train %>%
  mutate(
    wind_max_pred = predict(trunc_wind_model, newdata = truncated_train),
    rain_total_pred = predict(trunc_rain_model, newdata = truncated_train)
  )

# Fit the damage regression tree on the training frame built just above.
# The fit previously pointed at `df_high_train`, which is never defined in
# this file.
# NOTE(review): this fit uses rpart's default control settings; the tuned
# values in best_params_high are not applied here. Confirm whether that is
# intended.
trunc_damage_fit_reg <- rpart(
  damage_perc ~ wind_max_pred +
    rain_total_pred +
    roof_strong_wall_strong +
    roof_strong_wall_light +
    roof_strong_wall_salv +
    roof_light_wall_strong +
    roof_light_wall_light +
    roof_light_wall_salv +
    roof_salv_wall_strong +
    roof_salv_wall_light +
    roof_salv_wall_salv +
    ls_risk_pct +
    ss_risk_pct +
    ruggedness_mean +
    slope_mean +
    wind_blue_ss +
    wind_yellow_ss +
    wind_orange_ss +
    wind_red_ss +
    rain_blue_ss +
    rain_yellow_ss +
    rain_orange_ss +
    rain_red_ss,
  method = "anova",
  data = df_trunc_train
)

In [0]:
# Recipe outputs
# Look up the path of the managed folder with id "dL4i4SKb".
# NOTE(review): nothing visible in this file writes a model to this path;
# confirm whether trunc_damage_fit_reg is meant to be saved here.
trunk_scm_min_model <- dkuManagedFolderPath("dL4i4SKb")