In [0]:
# Libraries
library(dataiku)
library(rpart)
library(dplyr)
library(caret)
library(pROC) # For AUC calculation
library(data.table)

In [0]:
# Recipe inputs ----
# Both datasets are read with the same "head" sampling and the same row cap.
# NOTE(review): if either dataset holds more rows than the cap, the rest is
# not read here -- confirm that this is intended.
row_cap <- 100000

# Training data
df_base_train <- dkuReadDataset(
  "base_train",
  samplingMethod = "head",
  nbRows = row_cap
)

# Validation data
df_base_validation <- dkuReadDataset(
  "base_validation",
  samplingMethod = "head",
  nbRows = row_cap
)

In [0]:
# Structural equation for wind speed ----
# wind_max = f(track_min_dist, eps)
# Fitted as an rpart tree with method = "anova" on the training data only.
base_wind_model <- rpart(
  formula = wind_max ~ track_min_dist,
  method = "anova",
  data = df_base_train
)

In [0]:
# Structural equation for total rainfall ----
# rain_total = f(track_min_dist, eps)
# Fitted as an rpart tree with method = "anova" on the training data only.
base_rain_model <- rpart(
  formula = rain_total ~ track_min_dist,
  method = "anova",
  data = df_base_train
)

In [0]:
# Add the first-stage predictions to the training dataset ----
# wind_max_pred and rain_total_pred are the fitted values of the wind and
# rain trees on the training rows; the damage classifier below uses these
# columns as predictors.
df_base_train <- mutate(
  df_base_train,
  wind_max_pred = predict(base_wind_model, newdata = df_base_train),
  rain_total_pred = predict(base_rain_model, newdata = df_base_train)
)

In [0]:
# Hyperparameter tuning for the damage classifier ----
# Manual grid search: every combination of the values below is fitted on the
# training data and scored by AUC on the validation data.
cp_values <- seq(0.0001, 0.05, by = 0.005)
maxdepth_values <- c(3, 5, 7, 10)
minsplit_values <- c(10, 20, 30, 40)
minbucket_values <- c(5, 10, 20)

# Score the validation rows with the first-stage wind and rain models so the
# validation set carries the same predictor columns as the training set.
df_val_base_tune <- df_base_validation %>%
  mutate(
    wind_max_pred = predict(base_wind_model, newdata = df_base_validation),
    rain_total_pred = predict(base_rain_model, newdata = df_base_validation)
  )

# Full grid of combinations. expand.grid() varies its first argument fastest,
# so listing minbucket first and cp last keeps the evaluation order of a
# nested loop with cp outermost (this matters only for breaking AUC ties).
param_grid <- expand.grid(
  minbucket = minbucket_values,
  minsplit = minsplit_values,
  maxdepth = maxdepth_values,
  cp = cp_values
)

# Preallocate one slot per combination instead of growing the list.
results_list <- vector("list", nrow(param_grid))

for (grid_id in seq_len(nrow(param_grid))) {
  cp <- param_grid$cp[grid_id]
  maxdepth <- param_grid$maxdepth[grid_id]
  minsplit <- param_grid$minsplit[grid_id]
  minbucket <- param_grid$minbucket[grid_id]

  # Train the classifier with this combination of hyperparameters
  model <- rpart(
    damage_binary ~ wind_max_pred +
      rain_total_pred +
      roof_strong_wall_strong +
      roof_strong_wall_light +
      roof_strong_wall_salv +
      roof_light_wall_strong +
      roof_light_wall_light +
      roof_light_wall_salv +
      roof_salv_wall_strong +
      roof_salv_wall_light +
      roof_salv_wall_salv +
      ls_risk_pct +
      ss_risk_pct +
      wind_blue_ss +
      wind_yellow_ss +
      wind_orange_ss +
      wind_red_ss +
      rain_blue_ss +
      rain_yellow_ss +
      rain_orange_ss +
      rain_red_ss,
    data = df_base_train,
    method = "class",
    control = rpart.control(
      cp = cp,
      maxdepth = maxdepth,
      minsplit = minsplit,
      minbucket = minbucket
    )
  )

  # Predicted probability of the second class (column 2 of the matrix)
  val_predictions <- predict(
    model,
    newdata = df_val_base_tune,
    type = "prob"
  )[, 2]

  # Validation AUC.
  # direction = "<" fixes the orientation (higher score = second class).
  # pROC's default "auto" flips the curve whenever a model ranks the wrong
  # way round, which biases AUC upward when models are being compared.
  # quiet = TRUE silences the per-call messages; as.numeric() keeps only the
  # scalar instead of storing the "auc" object and its attributes.
  auc_value <- as.numeric(
    auc(
      df_val_base_tune$damage_binary,
      val_predictions,
      direction = "<",
      quiet = TRUE
    )
  )

  results_list[[grid_id]] <- data.frame(
    cp = cp,
    maxdepth = maxdepth,
    minsplit = minsplit,
    minbucket = minbucket,
    AUC = auc_value
  )
}

# One row per combination
results <- rbindlist(results_list)

# Best combination = highest validation AUC (first one wins on ties)
best_params <- results[which.max(results$AUC), ]
print(best_params)

In [0]:
# Final damage classifier, refit with the tuned hyperparameters ----

# Pool the training and validation rows for the final fit. Both data frames
# already carry the wind_max_pred and rain_total_pred columns.
final_training_df <- rbind(df_base_train, df_val_base_tune)

# The control values are read from `best_params`, the row with the highest
# validation AUC selected by the grid search. The earlier reference to
# `best_params_high` pointed at an object that is never created in this
# script, so the call could not run.
damage_fit_class_min <- rpart(
  damage_binary ~ wind_max_pred +
    rain_total_pred +
    roof_strong_wall_strong +
    roof_strong_wall_light +
    roof_strong_wall_salv +
    roof_light_wall_strong +
    roof_light_wall_light +
    roof_light_wall_salv +
    roof_salv_wall_strong +
    roof_salv_wall_light +
    roof_salv_wall_salv +
    ls_risk_pct +
    ss_risk_pct +
    wind_blue_ss +
    wind_yellow_ss +
    wind_orange_ss +
    wind_red_ss +
    rain_blue_ss +
    rain_yellow_ss +
    rain_orange_ss +
    rain_red_ss,
  method = "class",
  control = rpart.control(
    cp = best_params$cp,
    maxdepth = best_params$maxdepth,
    minsplit = best_params$minsplit,
    minbucket = best_params$minbucket
  ),
  data = final_training_df
)

In [0]:
# Recipe outputs
# Result of dkuManagedFolderPath() for the managed folder with id "8jrmex16"
# (presumably the folder's filesystem path -- confirm against the Dataiku R API).
# NOTE(review): nothing in the visible code writes damage_fit_class_min (or any
# other object) into this folder -- confirm whether a save step is missing.
base_scm_classification_min_model <- dkuManagedFolderPath("8jrmex16")

In [0]:
# Inspection only: show the hyperparameter row selected by the grid search.
best_params

In [0]:
# Inspection only: list the columns of the scored validation data frame.
colnames(df_val_base_tune)

In [0]:
# Interactive help lookup for rpart; has no effect on the recipe's results.
?rpart