This repository has been archived by the owner on Mar 20, 2023. It is now read-only.

Microsoft Azure Distributed Linear Learner Recipe #195

Merged (106 commits, Jun 27, 2018)
Commits (106 total; changes shown from 98 commits)
5712333 - Create Dockerfile (danyrouh, May 2, 2018)
0af28f2 - Create config.yaml (danyrouh, May 2, 2018)
9a0dd67 - Add files via upload (danyrouh, May 2, 2018)
1b16aa7 - Add files via upload (danyrouh, May 2, 2018)
c163932 - Add files via upload (danyrouh, May 2, 2018)
98a07e9 - Update README.md (danyrouh, May 2, 2018)
89ef9b0 - Update README.md (danyrouh, May 2, 2018)
3fc37a2 - Update run_parasail.sh (danyrouh, May 2, 2018)
888dc52 - Update Dockerfile (danyrouh, May 2, 2018)
856ea50 - Update README.md (danyrouh, May 8, 2018)
4e60f5a - Update run_parasail.sh (danyrouh, May 8, 2018)
2555c27 - Update run_parasail.sh (danyrouh, May 8, 2018)
55d85c1 - Update README.md (danyrouh, May 8, 2018)
67359d0 - Update Dockerfile (danyrouh, May 8, 2018)
3bf546c - Update Dockerfile (danyrouh, May 8, 2018)
67f804f - Update pool.yaml (danyrouh, May 8, 2018)
f4b8e18 - Update jobs.yaml (danyrouh, May 8, 2018)
9c5216c - Update config.yaml (danyrouh, May 8, 2018)
a66a901 - Update README.md (danyrouh, May 8, 2018)
4e5f426 - Update README.md (danyrouh, May 8, 2018)
143a10d - Update README.md (danyrouh, May 8, 2018)
5bb6665 - Update README.md (danyrouh, May 8, 2018)
1cd05dc - Update README.md (danyrouh, May 8, 2018)
bc0efe3 - Update README.md (danyrouh, May 8, 2018)
f05eafc - Update Dockerfile (danyrouh, May 8, 2018)
1c1d640 - Update README.md (danyrouh, May 8, 2018)
7683b4b - Update README.md (danyrouh, May 8, 2018)
41d084e - Update README.md (danyrouh, May 8, 2018)
4b639fa - Update README.md (danyrouh, May 8, 2018)
5f376d8 - Update README.md (danyrouh, May 8, 2018)
6b1b88b - Update jobs.yaml (danyrouh, May 8, 2018)
a2068a8 - Update config.yaml (danyrouh, May 8, 2018)
248593f - Update Dockerfile (danyrouh, May 8, 2018)
76641c8 - Update Dockerfile (danyrouh, May 8, 2018)
01e95f9 - Update Dockerfile (danyrouh, May 8, 2018)
c84ba61 - Create Dockerfile (danyrouh, May 9, 2018)
ea28c16 - Add files via upload (danyrouh, May 9, 2018)
fc05011 - Update run_parasail.sh (danyrouh, May 9, 2018)
4de9274 - Update run_parasail.sh (danyrouh, May 9, 2018)
4f17145 - Update README.md (danyrouh, May 9, 2018)
fb75179 - Update Dockerfile (danyrouh, May 9, 2018)
9c26e6a - Create config.yaml (danyrouh, May 9, 2018)
79a813a - Add files via upload (danyrouh, May 9, 2018)
78b9ed1 - Update pool.yaml (danyrouh, May 9, 2018)
9b813d3 - Update jobs.yaml (danyrouh, May 9, 2018)
9f76fd5 - Add files via upload (danyrouh, May 9, 2018)
307a564 - Update README.md (danyrouh, May 9, 2018)
ac26280 - Delete config.yaml (danyrouh, May 9, 2018)
f6ac2b7 - Delete credentials.yaml (danyrouh, May 9, 2018)
bd18958 - Delete jobs.yaml (danyrouh, May 9, 2018)
81b286e - Delete pool.yaml (danyrouh, May 9, 2018)
cb7ac7e - Delete Dockerfile (danyrouh, May 9, 2018)
ee16f40 - Delete README.md (danyrouh, May 9, 2018)
f7ee9d8 - Delete run_parasail.sh (danyrouh, May 9, 2018)
673b9f2 - Delete ssh_config (danyrouh, May 9, 2018)
e8c9598 - Delete README.md (danyrouh, May 9, 2018)
88bcf5a - Update README.md (danyrouh, May 9, 2018)
9813f15 - Update Dockerfile (danyrouh, May 9, 2018)
2ebb344 - Update Dockerfile (danyrouh, May 9, 2018)
96cd421 - Update README.md (danyrouh, May 9, 2018)
4e8ee95 - Update README.md (danyrouh, May 9, 2018)
83d1eac - Update README.md (danyrouh, May 9, 2018)
62ea978 - Update README.md (danyrouh, May 9, 2018)
ccb261a - Update README.md (danyrouh, May 9, 2018)
d36459a - Update README.md (danyrouh, May 9, 2018)
074fb02 - Update README.md (danyrouh, May 9, 2018)
69390c1 - Update README.md (danyrouh, May 9, 2018)
73ab17d - Update README.md (danyrouh, May 10, 2018)
3b0c82d - Update Dockerfile (danyrouh, May 10, 2018)
975f375 - Update README.md (danyrouh, Jun 25, 2018)
d8388be - Update README.md (danyrouh, Jun 25, 2018)
5a61bae - Update config.yaml (danyrouh, Jun 25, 2018)
d31011c - Update jobs.yaml (danyrouh, Jun 25, 2018)
1e614d3 - Update pool.yaml (danyrouh, Jun 25, 2018)
a69baa9 - Update Dockerfile (danyrouh, Jun 25, 2018)
49e535a - Update run_parasail.sh (danyrouh, Jun 25, 2018)
812c61d - Update README.md (danyrouh, Jun 25, 2018)
d00f273 - Create configuration-template.json (danyrouh, Jun 25, 2018)
827cb82 - Add files via upload (danyrouh, Jun 25, 2018)
f2f62ce - Update README.md (danyrouh, Jun 25, 2018)
7f58039 - Add files via upload (danyrouh, Jun 25, 2018)
d54e457 - Update README.md (danyrouh, Jun 25, 2018)
8779498 - Update Dockerfile (danyrouh, Jun 25, 2018)
af7af06 - Update README.md (danyrouh, Jun 25, 2018)
5060e2d - Update README.md (danyrouh, Jun 25, 2018)
1b3bb9b - Update README.md (danyrouh, Jun 25, 2018)
2fb8df8 - Update README.md (danyrouh, Jun 25, 2018)
3aafb4c - Update README.md (danyrouh, Jun 25, 2018)
6f68e2f - Update README.md (danyrouh, Jun 25, 2018)
088245a - Update README.md (danyrouh, Jun 25, 2018)
d7fcd30 - Update README.md (danyrouh, Jun 25, 2018)
19c123a - Update README.md (danyrouh, Jun 25, 2018)
ded5f11 - Update README.md (danyrouh, Jun 25, 2018)
5b7786a - Update README.md (danyrouh, Jun 25, 2018)
f5e2a81 - Update README.md (danyrouh, Jun 25, 2018)
4b9e891 - Update README.md (danyrouh, Jun 25, 2018)
3bf6aff - Update README.md (danyrouh, Jun 25, 2018)
61abda8 - Update README.md (danyrouh, Jun 25, 2018)
549b575 - Update README.md (danyrouh, Jun 25, 2018)
6769a76 - Update Dockerfile (danyrouh, Jun 25, 2018)
327c1ac - Update README.md (danyrouh, Jun 25, 2018)
ef9e4bb - Update README.md (danyrouh, Jun 25, 2018)
707ed25 - Update README.md (danyrouh, Jun 25, 2018)
179b49e - Update README.md (danyrouh, Jun 25, 2018)
68328d7 - Update README.md (danyrouh, Jun 25, 2018)
012e2dd - Update README.md (danyrouh, Jun 25, 2018)
149 changes: 149 additions & 0 deletions recipes/MADL-CPU-OpenMPI/Data-Shredding/DataShredding.py
@@ -0,0 +1,149 @@
from __future__ import print_function
import json
import os
import sys
import tempfile
from azure.storage.blob import BlockBlobService
from os import listdir
from os.path import isfile, join

# make sure configuration values are encoded correctly
def encode(value):
    if isinstance(value, str):
        return value
    return value.encode('utf-8')

# configuration class
class Configuration:

    def __init__(self, file_name):
        if not os.path.exists(file_name):
            raise ValueError('Cannot find configuration file "{0}"'.
                             format(file_name))

        with open(file_name, 'r') as f:
            conf = json.load(f)

        try:
            self.node_count = encode(conf['node_count'])
            self.thread_count = encode(conf['thread_count'])
            self.training_data_shred_count = encode(conf['training_data_shred_count'])
            self.dataset_local_directory = encode(conf['dataset_local_directory'])
            self.shredded_dataset_local_directory = encode(conf['shredded_dataset_local_directory'])
            self.shredded_dataset_Per_Node = encode(conf['shredded_dataset_Per_Node'])
            self.container_name = encode(conf['container_name'])
            self.trainind_dataset_name = encode(conf['trainind_dataset_name'])
            self.training_data_container_name = encode(conf['training_data_container_name'])
            self.subscription_id = encode(conf['subscription_id'])
            self.secret_key = encode(conf['secret_key'])
            self.resource_group = encode(conf['resource_group'])
            self.storage_account_name = encode(conf['storage_account']['name'])
            self.storage_account_key = encode(conf['storage_account']['key'])
        except KeyError as err:
            raise AttributeError('Please provide a value for the "{0}" configuration key'.format(err.args[0]))

# load the configuration data
cfg = Configuration('configuration.json')

# azure block service object
blob_service = BlockBlobService(cfg.storage_account_name, cfg.storage_account_key)

# container name
azure_blob_container_name = cfg.container_name

# training data container name
azure_blob_training_data_container_name = cfg.training_data_container_name

# create the container that will host the data blobs
blob_service.create_container(azure_blob_container_name, fail_on_exist=False)


# load the data from the training container, partition it,
# and upload the partitions to the destination container
def partition_and_upload_dataset_to_blob(blob_service, azure_blob_container_name):

    # list the blobs in the training data container and total their size
    blobs = []
    marker = None
    blobs_size = 1
    while True:
        batch = blob_service.list_blobs(azure_blob_training_data_container_name, marker=marker)
        blobs.extend(batch)
        if not batch.next_marker:
            break
        marker = batch.next_marker
    for blob in blobs:
        blobs_size += blob.properties.content_length
        print(blob.name)

    # worker count: (node_count - 1) VMs with thread_count threads each
    vm_thread_count = (int(cfg.node_count) - 1) * int(cfg.thread_count)

    # number of shard files assigned to each worker
    file_count = int(cfg.training_data_shred_count) // vm_thread_count

    # approximate size in bytes of each shard file
    file_size = blobs_size // int(cfg.training_data_shred_count)

    # local directory for the downloaded training data
    dataset_local_directory = os.path.normpath(cfg.dataset_local_directory)

    # local directories for the shredded data
    shredded_dataset_local_directory = os.path.normpath(cfg.shredded_dataset_local_directory)
    shredded_dataset_Per_Node = os.path.normpath(cfg.shredded_dataset_Per_Node)

    # download the data from the training blobs and slice it
    print('downloading dataset from blob and creating the shards locally...')
    i = 0
    for itr in range(len(blobs)):
        blob = blobs[itr]
        blob_service.get_blob_to_path(azure_blob_training_data_container_name,
                                      blob.name, os.path.join(dataset_local_directory, blob.name))
        file_name_no_extension, file_extension = os.path.splitext(blob.name)

        lines_bytes_size = 0
        alist = []
        with open(os.path.join(dataset_local_directory, blob.name), 'r') as in_file:
            for line in in_file:
                lines_bytes_size += sys.getsizeof(line)
                alist.append(line)
                # once enough lines are buffered for one shard, write it out
                if lines_bytes_size >= file_size:
                    with open(os.path.join(shredded_dataset_local_directory,
                              file_name_no_extension + '_' + str(itr) + '_' + str(i) + file_extension), 'w') as wr:
                        for item in alist:
                            wr.write(item)
                    lines_bytes_size = 0
                    alist = []
                    i += 1

    # combine the shredded files into one file per worker
    alldatafiles = [f for f in listdir(shredded_dataset_local_directory) if isfile(join(shredded_dataset_local_directory, f))]
    low_index = 0
    high_index = file_count
    filename = "data.lst"
    for vm_count in range(vm_thread_count):
        blob_name = cfg.trainind_dataset_name + "-" + "%05d" % (vm_count,)
        if high_index > len(alldatafiles):
            high_index = len(alldatafiles)
        if not os.path.exists(os.path.join(shredded_dataset_Per_Node, blob_name)):
            os.makedirs(os.path.join(shredded_dataset_Per_Node, blob_name))
        with open(os.path.join(shredded_dataset_Per_Node, blob_name, filename), 'w') as outfile:
            for itr in range(low_index, high_index):
                with open(os.path.join(shredded_dataset_local_directory, alldatafiles[itr])) as infile:
                    for line in infile:
                        outfile.write(line)
        low_index += file_count
        high_index += file_count

    # upload the combined per-worker files to the destination container
    for subdir, dirs, files in os.walk(shredded_dataset_Per_Node):
        for file in files:
            print(os.path.basename(subdir))
            print(os.path.join(subdir, file))
            blob_service.create_blob_from_path(azure_blob_container_name, os.path.basename(subdir) + '/' + file, os.path.join(subdir, file))

    print('Done')

# begin loading, partitioning and deploying training data
partition_and_upload_dataset_to_blob(blob_service, azure_blob_container_name)
27 changes: 27 additions & 0 deletions recipes/MADL-CPU-OpenMPI/Data-Shredding/README.md
@@ -0,0 +1,27 @@
## MADL-CPU-OpenMPI Data Shredding
We included a Python script that shows how to shred and deploy your training data prior to running a training job on Azure VMs via Open MPI.
Collaborator: Avoid using "We". Perhaps reword as:

    This Data Shredding recipe shows how to shred...

Collaborator: Please do a quick search on all your markdown files for use of "we" and replace with something else.

> Azure VMs via Open MPI.

Collaborator: Remove this line since you modified above.

Collaborator: Looks like you might have missed the prior comment: remove line 3.


### Data Shredding Configuration
Rename `configuration-template.json` to `configuration.json`. The configuration should set the following properties:
* `node_count` the number of VMs in the compute pool.
* `thread_count` the thread count per VM.
* `training_data_shred_count` the number of data shreds to create. Setting this number high is advised: the shredding step then only needs to be done once and the shreds can be reused for different VM configurations.
* `dataset_local_directory` a local directory used to download and shred the training data according to `training_data_shred_count`.
* `shredded_dataset_local_directory` a local directory that holds the intermediate shredded files.
* `shredded_dataset_Per_Node` a local directory that holds the final per-node data shreds before deploying them to Azure blobs.
* `container_name` the container name where the sliced data will be stored.
* `trainind_dataset_name` the name for the dataset, used when creating the data blobs.
* `subscription_id` the Azure subscription id.
* `secret_key` the Azure password.
* `resource_group` the resource group name.
* `storage_account` the storage account name and access key.
* `training_data_container_name` the container name where the training data is hosted.

You can use your own access mechanism (password, access key, etc.); the above is only one example. However, make sure to update the Python script whenever you change the configuration.
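
As an illustration of how these settings interact, the sketch below (all values are hypothetical, not part of the recipe) reproduces the arithmetic that `DataShredding.py` performs: the shreds are grouped into `(node_count - 1) * thread_count` per-worker folders named `<trainind_dataset_name>-00000`, `<trainind_dataset_name>-00001`, and so on, each containing a single `data.lst`. Note that the script expects `dataset_local_directory`, `shredded_dataset_local_directory`, and `shredded_dataset_Per_Node` to already exist; it only creates the per-worker subfolders.

```python
# Sketch only: hypothetical settings used to illustrate how DataShredding.py
# groups shreds into per-worker folders. These values are not from the recipe.
node_count = 3                    # VMs in the compute pool
thread_count = 1                  # threads per VM
training_data_shred_count = 200   # total number of shreds to create

workers = (node_count - 1) * thread_count           # worker count used by the script
shreds_per_worker = training_data_shred_count // workers

dataset_name = 'mydataset'        # hypothetical trainind_dataset_name
for worker in range(workers):
    # each worker ends up with a folder such as mydataset-00000/data.lst
    print('%s-%05d/data.lst <- %d shreds' % (dataset_name, worker, shreds_per_worker))
```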

You must agree to the following licenses prior to use:
* [High Performance ML Algorithms License](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/High%20Performance%20ML%20Algorithms%20-%20Standalone%20(free)%20Use%20Terms%20V2%20(06-06-18).docx)
* [TPN Ubuntu Container](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/TPN_Ubuntu%20Container_16-04-FINAL.docx)
* [Microsoft Third Party Notice](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/MicrosoftThirdPartyNotice.txt)
@@ -0,0 +1,18 @@
{
    "node_count": <node count>,
    "thread_count": <thread count per node>,
    "training_data_shred_count": <number of data shreds>,
    "dataset_local_directory": <local dir for training data>,
    "shredded_dataset_local_directory": <local dir for sliced data>,
    "shredded_dataset_Per_Node": <local directory for final shredded data per node>,
    "container_name": <container name where the sliced data will be stored>,
    "trainind_dataset_name": <name for the dataset, used when creating the data blobs>,
    "subscription_id": <azure subscription id>,
    "secret_key": <password>,
    "resource_group": <resource group name>,
    "storage_account": {
        "name": <storage account name>,
        "key": <storage key>
    },
    "training_data_container_name": <container name where the training data is hosted>
}
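
For illustration only, the sketch below writes a filled-in `configuration.json` with hypothetical values. Numeric settings are given as strings because the script's `encode()` helper assumes string values and calls `.encode()` on anything else:

```python
import json

# Hypothetical example values; replace them with your own before running
# DataShredding.py.
example_config = {
    "node_count": "3",
    "thread_count": "1",
    "training_data_shred_count": "200",
    "dataset_local_directory": "/data/madl/dataset",
    "shredded_dataset_local_directory": "/data/madl/shreds",
    "shredded_dataset_Per_Node": "/data/madl/per-node",
    "container_name": "shreddeddata",
    "trainind_dataset_name": "mydataset",
    "subscription_id": "<your subscription id>",
    "secret_key": "<your password>",
    "resource_group": "<your resource group>",
    "storage_account": {"name": "<storage account name>", "key": "<storage key>"},
    "training_data_container_name": "trainingdata"
}

# write the configuration next to DataShredding.py, which reads configuration.json
with open('configuration.json', 'w') as f:
    json.dump(example_config, f, indent=4)
```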
61 changes: 61 additions & 0 deletions recipes/MADL-CPU-OpenMPI/README.md
@@ -0,0 +1,61 @@
# MADL-CPU-OpenMPI
This recipe shows how to run the High Performance ML Algorithms Learner on CPUs across
Azure VMs via Open MPI.

## Configuration
Please refer to this [set of sample configuration files](./config) for
this recipe.

### Pool Configuration
The pool configuration should enable the following properties:
* `vm_size` should be a CPU-only instance, for example, `STANDARD_D2_V2`.
* `inter_node_communication_enabled` must be set to `true`.
* `max_tasks_per_node` must be set to 1 or omitted.

### Global Configuration
The global configuration should set the following properties:
* `docker_images` array must have a reference to a valid MADL
Docker image that can be run with Open MPI. The image denoted with the `0.0.1` tag in [msmadl/symsgd:0.0.1](https://hub.docker.com/r/msmadl/symsgd/)
is compatible with Azure Batch Shipyard VMs.

### MPI Jobs Configuration (MultiNode)
The jobs configuration should set the following properties within the `tasks`
array, which should have a task definition containing:
* `docker_image` should be the name of the Docker image for this container invocation.
For this example, this should be
`msmadl/symsgd:0.0.1`.
Please note that the `docker_images` in the Global Configuration should match
this image name.
* `command` should contain the command to pass to the Docker run invocation.
For this example, the task runs the MADL training example in the `msmadl/symsgd:0.0.1` Docker image. The
application `command` to run would be:
`"/parasail/run_parasail.sh -w /parasail/supersgd -l 1e-4 -k 32 -m 1e-2 -e 10 -r 10 -f $AZ_BATCH_NODE_SHARED_DIR/azblob/<container_name from the data shredding configuration file> -t 1 -g 1 -d $AZ_BATCH_TASK_WORKING_DIR/models -b $AZ_BATCH_NODE_SHARED_DIR/azblob/<container_name from the data shredding configuration file>"`
* [`run_parasail.sh`](docker/run_parasail.sh) has these parameters
    * `-w` the MADL superSGD directory
    * `-l` learning rate
    * `-k` approximation rank constant
    * `-m` model combiner convergence threshold
    * `-e` total epochs
    * `-r` rounds per epoch
    * `-f` training file prefix
    * `-t` number of threads
    * `-g` log global models every this many epochs
    * `-d` log global models to this directory on the host
    * `-b` location of the algorithm's binary

* The training data will need to be shredded to match the number of VMs and the thread count per VM, and then deployed to a mounted Azure blob container to which the VM Docker containers have read/write access (see the verification sketch after this list).
Collaborator: You should provide the configuration example for this here, e.g.:

    * shared_data_volumes should contain the shared data volume with an azureblob volume driver as specified in the global configuration file.

Contributor (author): I included the part you added with a link to the full configuration file.

A basic Python script is included that can be used to shred and deploy the training data to a blob container identified by the user.
Data shredding files can be found [here](./DataShredding).
* `shared_data_volumes` should contain the shared data volume with an `azureblob` volume driver as specified in the global configuration file found [here](./config/config.yaml).

* `multi_instance` property must be defined
* `num_instances` should be set to `pool_current_dedicated`, or
`pool_current_low_priority`
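
Before submitting the job, it can be useful to confirm that the shredded per-worker blobs are actually present in the container that will be mounted. The sketch below is only an illustration (the account and container names are hypothetical placeholders) and uses the same `azure.storage.blob` client that `DataShredding.py` uses:

```python
from azure.storage.blob import BlockBlobService

# Hypothetical values: use the storage account and container_name from your
# data shredding configuration file.
blob_service = BlockBlobService('<storage account name>', '<storage account key>')
container_name = '<container_name from the data shredding configuration file>'

# Each worker should have a blob named <trainind_dataset_name>-NNNNN/data.lst.
for blob in blob_service.list_blobs(container_name):
    print(blob.name, blob.properties.content_length)
```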

## Dockerfile and supplementary files
Supplementary files can be found [here](./docker).

You must agree to the following licenses prior to use:
* [High Performance ML Algorithms License](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/High%20Performance%20ML%20Algorithms%20-%20Standalone%20(free)%20Use%20Terms%20V2%20(06-06-18).docx)
* [TPN Ubuntu Container](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/TPN_Ubuntu%20Container_16-04-FINAL.docx)
* [Microsoft Third Party Notice](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/MicrosoftThirdPartyNotice.txt)
13 changes: 13 additions & 0 deletions recipes/MADL-CPU-OpenMPI/config/config.yaml
@@ -0,0 +1,13 @@
batch_shipyard:
  storage_account_settings: mystorageaccount
global_resources:
  docker_images:
    - msmadl/symsgd:0.0.1
  volumes:
    shared_data_volumes:
      azureblob_vol:
        volume_driver: azureblob
        storage_account_settings: mystorageaccount
        azure_blob_container_name: <blob container name>
        container_path: $AZ_BATCH_NODE_SHARED_DIR/azblob
        bind_options: rw
9 changes: 9 additions & 0 deletions recipes/MADL-CPU-OpenMPI/config/credentials.yaml
@@ -0,0 +1,9 @@
credentials:
  batch:
    account_key: <batch account key>
    account_service_url: <batch account service url>
  storage:
    mystorageaccount:
      account: <storage account name>
      account_key: <storage account key>
      endpoint: core.windows.net
10 changes: 10 additions & 0 deletions recipes/MADL-CPU-OpenMPI/config/jobs.yaml
@@ -0,0 +1,10 @@
job_specifications:
  - id: <job id>
    auto_complete: true
    shared_data_volumes:
      - azureblob_vol
    tasks:
      - docker_image: msmadl/symsgd:0.0.1
        multi_instance:
          num_instances: pool_current_dedicated
        command: /parasail/run_parasail.sh -w /parasail/supersgd -l 1e-4 -k 32 -m 1e-2 -e 10 -r 10 -f $AZ_BATCH_NODE_SHARED_DIR/azblob/<container_name from the data shredding configuration file> -t 1 -g 1 -d $AZ_BATCH_TASK_WORKING_DIR/models -b $AZ_BATCH_NODE_SHARED_DIR/azblob/<container_name from the data shredding configuration file>
12 changes: 12 additions & 0 deletions recipes/MADL-CPU-OpenMPI/config/pool.yaml
@@ -0,0 +1,12 @@
pool_specification:
  id: <pool id>
  vm_configuration:
    platform_image:
      offer: UbuntuServer
      publisher: Canonical
      sku: 16.04-LTS
  vm_count:
    dedicated: 3
    low_priority: 0
  vm_size: STANDARD_D2_V2
  inter_node_communication_enabled: true
45 changes: 45 additions & 0 deletions recipes/MADL-CPU-OpenMPI/docker/Dockerfile
@@ -0,0 +1,45 @@
# Dockerfile for MADL (Microsoft Distributed Learners)

FROM ubuntu:16.04
MAINTAINER Saeed Maleki Todd Mytkowicz Madan Musuvathi Dany rouhana <https://github.com/Azure/batch-shipyard>
Collaborator: Use the real URL here

ENV DEBIAN_FRONTEND=noninteractive

# install base system
RUN apt-get update && apt-get install -y --no-install-recommends \
    openssh-client \
    openssh-server \
    libopenblas-dev \
    libatlas-base-dev \
    liblapacke-dev \
    openmpi-bin \
    openmpi-common \
    libopenmpi-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# configure ssh server and keys
RUN mkdir -p /root/.ssh && \
    echo "Host *\n\tPort 23\n\tStrictHostKeyChecking no\n\tUserKnownHostsFile /dev/null" > /root/.ssh/config && \
    mkdir /var/run/sshd && \
    ssh-keygen -A && \
    sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
    sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
    ssh-keygen -f /root/.ssh/id_rsa -t rsa -N '' && \
    chmod 600 /root/.ssh/config && \
    chmod 700 /root/.ssh && \
    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys

# set parasail dir
WORKDIR /parasail

# to create your own image, first download the supersgd binary from the link supplied in the README,
# and then put it in the same directory as this file.
COPY supersgd /parasail
COPY run_parasail.sh /parasail

# remove romio314 bits
RUN rm -rf /usr/lib/openmpi/lib/openmpi/mca_io_romio.so

# make sshd listen on port 23 and run by default
EXPOSE 23
CMD ["/usr/sbin/sshd", "-D", "-p", "23"]
9 changes: 9 additions & 0 deletions recipes/MADL-CPU-OpenMPI/docker/README.md
@@ -0,0 +1,9 @@
# Dockerfile for msmadl/symsgd
This image can be found on [Docker Hub](https://hub.docker.com/r/msmadl/symsgd/).

The prebuilt image can be pulled from Docker Hub (`docker pull msmadl/symsgd`). To build your own image, first download the High Performance ML Algorithms Learner binary from this [link](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/supersgd), and include it in the same directory as your Dockerfile.

You must agree to the following licenses prior to use:
* [High Performance ML Algorithms License](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/High%20Performance%20ML%20Algorithms%20-%20Standalone%20(free)%20Use%20Terms%20V2%20(06-06-18).docx)
* [TPN Ubuntu Container](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/TPN_Ubuntu%20Container_16-04-FINAL.docx)
* [Microsoft Third Party Notice](https://github.com/saeedmaleki/Distributed-Linear-Learner/blob/master/MicrosoftThirdPartyNotice.txt)