In [2]:
%%configure -f
{
    "executorMemory": "2g",
    "driverMemory": "2g",
    "executorCores": 2,
    "driverCores": 2,
    "numExecutors": 10
}

In [3]:
# Show which node this R session is running on (informational only)
print(Sys.info()["nodename"])

# Draw 40 candidate alpha values uniformly from [0.01, 0.99];
# the fixed seed makes the draw reproducible across runs
set.seed(1)
alpha <- runif(40, min = 0.01, max = 0.99)

# Collect the candidates in a local R data frame (single column named "alpha")
params_df <- data.frame(alpha = alpha)
print(params_df)

# Ship the candidates to Spark, asking for as many partitions as there are rows
# so that each hyperparameter value can be processed by its own task
params_sdf <- createDataFrame(
    data = params_df,
    numPartitions = nrow(params_df)
)


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
16,application_1550601622592_0017,sparkr,idle,Link,Link,✔


SparkSession available as 'spark'.


                      nodename 
"mssql-storage-pool-default-1" 
        alpha
1  0.27019849
2  0.37468142
3  0.57139630
4  0.90004363
5  0.20764829
6  0.89042189
7  0.93578176
8  0.65758184
9  0.62653176
10 0.07055055
11 0.21185508
12 0.18302562
13 0.68328239
14 0.38642164
15 0.76444459
16 0.49774526
17 0.71326614
18 0.98206797
19 0.38243448
20 0.77189632
21 0.92601113
22 0.21789967
23 0.64864029
24 0.13304399
25 0.27187626
26 0.38839181
27 0.02312253
28 0.38474020
29 0.86229703
30 0.34354202
31 0.48243851
32 0.59757451
33 0.49367048
34 0.19249325
35 0.82082585
36 0.66509740
37 0.78835506
38 0.11578475
39 0.71923673
40 0.41304894
closing unused connection 3 (->localhost:44545) 

In [7]:
fit_and_evaluate <- function(hyperparams) {
    # Fit a glmnet binomial regularization path for one alpha value and report
    # the lambda on that path with the highest test-set ROC AUC.
    #
    # The function is self-contained (it sets the library path, loads packages
    # and fetches its own data) so that it can be shipped to Spark workers via
    # dapply()/dapplyCollect().
    #
    # Args:
    #   hyperparams: data.frame (or vector) whose first column/element holds
    #                the alpha value to pass to glmnet(). Exactly one value is
    #                expected per call, i.e. one row per Spark partition.
    #
    # Returns:
    #   Named numeric vector with elements `alpha`, `lambda` and `auc`: the
    #   alpha that was used, the best lambda found, and its test-set AUC.
    #
    # Raises:
    #   An error if more or fewer than one alpha value is supplied, if the HDFS
    #   copy fails, or if no AUC could be computed.

    # Library directory where glmnet and pROC are expected to be installed
    .libPaths(c("/tmp/extra-libs"))
    library(glmnet)
    library(pROC)

    # Extract alpha as a plain numeric scalar instead of relying on implicit
    # coercion of a one-column data.frame further down
    alpha <- as.numeric(hyperparams[[1]])
    if (length(alpha) != 1 || is.na(alpha)) {
        stop("fit_and_evaluate() expects exactly one non-missing alpha value per call, got ",
             length(alpha))
    }

    # Private scratch directory; on.exit() guarantees removal even when a later
    # step fails, so failed tasks do not leak temp files on the node
    local_data_dir <- tempfile()
    dir.create(local_data_dir)
    on.exit(unlink(local_data_dir, recursive = TRUE), add = TRUE)

    # Copy data from HDFS to the local temporary directory.
    # system2() runs the command through a shell, so the glob is quoted to make
    # sure it is expanded by `hdfs dfs` against HDFS rather than by the local
    # shell against the local filesystem.
    copy_status <- system2(
        "hdfs",
        args = c("dfs", "-copyToLocal",
                 shQuote("/tmp/data/*.rds"),
                 shQuote(local_data_dir))
    )
    if (copy_status != 0) {
        stop("hdfs dfs -copyToLocal failed with exit status ", copy_status)
    }

    read_local <- function(file_name) readRDS(file.path(local_data_dir, file_name))
    X_train <- read_local("X_train.rds")
    X_test <- read_local("X_test.rds")
    y_train <- read_local("y_train.rds")
    y_test <- read_local("y_test.rds")

    # Fit models on training data (one model per lambda on the path)
    models <- glmnet(X_train, y_train,
                     alpha = alpha, nlambda = 50,
                     family = "binomial")

    # Predict on test data; one column of predictions per lambda
    test_pred <- predict(models, X_test, type = "response")

    # Evaluate ROC AUC of each model
    test_set_aucs <- vapply(
        seq_along(models$lambda),
        function(i) {
            as.numeric(pROC::auc(factor(y_test), test_pred[, i], direction = "<"))
        },
        numeric(1)
    )

    # Identify the best model (highest AUC); which.max() keeps the first
    # maximum, matching a strict ">" scan, and ignores missing values
    best_idx <- which.max(test_set_aucs)
    if (length(best_idx) == 0) {
        stop("no test-set AUC could be computed for alpha = ", alpha)
    }

    # Return the hyperparameters and AUC of the best model
    c(alpha = alpha,
      lambda = models$lambda[[best_idx]],
      auc = test_set_aucs[[best_idx]])
}

In [8]:
# Apply fit_and_evaluate() to every partition of params_sdf in parallel and
# collect the per-partition results back into the local R session
tuning_results_matrix <- dapplyCollect(params_sdf, func = fit_and_evaluate)

# Convert to a data frame and rank the hyperparameter settings, best AUC first
tuning_results <- as.data.frame(tuning_results_matrix)
sorted_results <- tuning_results[
    with(tuning_results, order(auc, decreasing = TRUE)),
]

print(sorted_results)

        alpha     lambda       auc
10 0.07055055 0.69599773 0.9280610
27 0.02312253 2.33286300 0.9275246
38 0.11578475 0.42408882 0.9275158
24 0.13304399 0.36907355 0.9271217
12 0.18302562 0.24421930 0.9257864
34 0.19249325 0.23220757 0.9255563
5  0.20764829 0.21526008 0.9252166
11 0.21185508 0.21098568 0.9251205
22 0.21789967 0.20513289 0.9249732
1  0.27019849 0.16542797 0.9237309
25 0.27187626 0.16440711 0.9236879
30 0.34354202 0.10781504 0.9214702
2  0.37468142 0.08998720 0.9204354
19 0.38243448 0.08816290 0.9201978
28 0.38474020 0.08763454 0.9201257
14 0.38642164 0.08725322 0.9200705
26 0.38839181 0.08681061 0.9200045
40 0.41304894 0.07430620 0.9190673
31 0.48243851 0.05791196 0.9165451
33 0.49367048 0.05659435 0.9161414
16 0.49774526 0.05613104 0.9160028
3  0.57139630 0.04450989 0.9135842
32 0.59757451 0.04256002 0.9127596
9  0.62653176 0.03695171 0.9118195
23 0.64864029 0.03569223 0.9110858
8  0.65758184 0.03520690 0.9108012
36 0.66509740 0.03480906 0.9105806
13 0.68328239 0.0338

In [6]:
%%cleanup -f