This repository has been archived by the owner on Oct 12, 2023. It is now read-only.
/
sas_resources_files_example.R
80 lines (63 loc) · 3.25 KB
/
sas_resources_files_example.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
library(doAzureParallel)

# Authenticate doAzureParallel against Azure Batch and Storage using the
# shared-key credentials file.
doAzureParallel::setCredentials("credentials.json")

# Using rAzureBatch directly for storage uploads.
# (Removed a no-op paste0() wrapper around the literal file name.)
config <- rjson::fromJSON(file = "credentials.json")

# Shared-key credentials for the storage account declared in credentials.json.
storageCredentials <- rAzureBatch::SharedKeyCredentials$new(
  name = config$sharedKey$storageAccount$name,
  key = config$sharedKey$storageAccount$key
)

storageAccountName <- storageCredentials$name
inputContainerName <- "datasets"

# Blob-service client pointed at this account's blob endpoint,
# e.g. https://<account>.blob.core.windows.net
storageClient <- rAzureBatch::StorageServiceClient$new(
  authentication = storageCredentials,
  url = sprintf(
    "https://%s.blob.%s",
    storageCredentials$name,
    config$sharedKey$storageAccount$endpointSuffix
  )
)
# Generate SAS tokens with the storage client's generateSasToken method.
# NOTE(review): the unnamed second argument "c" presumably scopes the token
# to the container resource level — confirm against rAzureBatch docs.
# Write-only SAS. Will be used for uploading files to storage.
writeSasToken <- storageClient$generateSasToken(permission = "w", "c", path = inputContainerName)
# Read-only SAS. Will be used for downloading files from storage.
readSasToken <- storageClient$generateSasToken(permission = "r", "c", path = inputContainerName)
# Create the target container in the Azure Storage account
# (content = "response" returns the raw HTTP response object).
storageClient$containerOperations$createContainer(inputContainerName, content = "response")
# Upload the input CSVs to the container, authorizing each request with the
# write-only SAS token.
inputFiles <- c("1989.csv", "1990.csv")
for (inputFile in inputFiles) {
  storageClient$blobOperations$uploadBlob(
    inputContainerName,
    fileDirectory = inputFile,
    sasToken = writeSasToken,
    accountName = storageAccountName
  )
}
# Build read-only download URLs (blob URL + read SAS token) for each
# uploaded CSV so cluster nodes can fetch them.
blobUrls <- lapply(
  c("1989.csv", "1990.csv"),
  function(blobName) {
    rAzureBatch::createBlobUrl(
      storageAccount = storageAccountName,
      containerName = inputContainerName,
      sasToken = readSasToken,
      fileName = blobName
    )
  }
)
csvFileUrl1 <- blobUrls[[1]]
csvFileUrl2 <- blobUrls[[2]]
# Create a list of files to download to the cluster using read-only permissions.
# Each file is placed under a directory called 'data' on every node.
# (Fixed: use `<-` rather than `=` for top-level assignment.)
resource_files <- list(
  rAzureBatch::createResourceFile(httpUrl = csvFileUrl1, filePath = "data/1989.csv"),
  rAzureBatch::createResourceFile(httpUrl = csvFileUrl2, filePath = "data/1990.csv")
)

# Create the cluster; resourceFiles are downloaded onto each node at start-up.
cluster <- makeCluster("sas_resource_files_cluster.json", resourceFiles = resource_files)
# Register the cluster as the foreach parallel backend and ask how many
# workers it exposes.
registerDoAzureParallel(cluster)
workers <- getDoParWorkers()

# Files downloaded to the cluster are placed in a directory called 'wd' under
# the node's startup directory; our resource files were mapped into the
# 'data' subdirectory there. Use the pre-defined environment variable
# 'AZ_BATCH_NODE_STARTUP_DIR' to locate it.
listFiles <- foreach(i = seq_len(workers), .combine = "rbind") %dopar% {
  fileDirectory <- file.path(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "wd", "data")
  files <- list.files(fileDirectory)
  # The last expression of a foreach body is the iteration's result;
  # an explicit return() is an anti-pattern here (the body is an
  # expression, not a function).
  data.frame("node" = i, "files" = files)
}

# List the files downloaded to each node in the cluster
listFiles

stopCluster(cluster)