# Datasets

In this notebook we will explore various ways of creating datasets.

In [None]:
#import required libraries
from azure.ml import MLClient
from azure.ml.entities import Dataset, CommandJob, JobInput
from azure.identity import InteractiveBrowserCredential

To start with, let's create a handle to the AML workspace.

In [None]:
# Identifiers of the AML workspace to connect to.
# Replace each placeholder with the value for your own workspace.
subscription_id = "<SUBSCRIPTION_ID>"
resource_group = "<RESOURCE_GROUP>"
workspace = "<AML_WORKSPACE_NAME>"

In [None]:
# Build the workspace handle: sign in interactively through the browser and
# connect using the subscription, resource group and workspace name set earlier.
ml_client = MLClient(
    InteractiveBrowserCredential(),
    subscription_id,
    resource_group,
    workspace,
)

### Create a dataset from a local file or folder

In [None]:
# Dataset backed by a single file on the local machine
local_dataset = Dataset(
    name="local-file-example",
    description="Dataset created from local file.",
    local_path="./data/titanic.csv",
)

# Create (or update) the dataset through the workspace handle
ml_client.create_or_update(local_dataset)

# Dataset backed by an entire local folder rather than a single file
local_folder_dataset = Dataset(
    name="local-folder-example",
    description="Dataset created from local folder.",
    local_path="./data",
)

# Create (or update) the dataset through the workspace handle
ml_client.create_or_update(local_folder_dataset)

### Create a dataset from files or folders in the cloud

In [None]:
# Dataset pointing at a file addressed through an azureml:// datastore URI
# (here: the "workspaceblobstore" datastore)
cloud_ds_aml_file = Dataset(
    name="cloud-file-example",
    description="Dataset created from file in cloud.",
    paths=[
        {"file": "azureml://datastores/workspaceblobstore/paths/example-data/titanic.csv"}
    ],
)
ml_client.create_or_update(cloud_ds_aml_file)

# Dataset pointing at a public file addressed by its https URL
cloud_ds_file = Dataset(
    name="public-file-https-example",
    description="Dataset created from a publicly available file using https URL.",
    paths=[
        {"file": "https://azuremlexamples.blob.core.windows.net/datasets/titanic.csv"}
    ],
)
ml_client.create_or_update(cloud_ds_file)

# Dataset pointing at a whole folder in the cloud, addressed by its https URL.
# NOTE(review): the URL names one specific storage account and container --
# presumably it has to be replaced with a location you can access; confirm.
cloud_ds_folder = Dataset(
    name="cloud-folder-https-example",
    description="Dataset created from folder in cloud using https URL.",
    paths=[
        {
            "folder": "https://mainstorage9c05dabf5c924.blob.core.windows.net/azureml-blobstore-54887b46-3cb0-485b-bb15-62e7b5578ee6/example-data/"
        }
    ],
)
ml_client.create_or_update(cloud_ds_folder)

# Create a dataset from a single file addressed with a wasbs URL.
# A wasbs URI has the form
#     wasbs://<container>@<account>.blob.core.windows.net/<path>
# i.e. the container goes before the "@"; it is not the first path segment as
# it would be in an https blob URL.
# NOTE(review): the account and container are carried over unchanged and look
# specific to one workspace -- presumably replace them with storage you can access.
cloud_ds_wasbs_file = Dataset(
    paths=[dict(file="wasbs://azureml-blobstore-54887b46-3cb0-485b-bb15-62e7b5578ee6@mainstorage9c05dabf5c924.blob.core.windows.net/example-data/titanic.csv")],
    name="cloud-file-wasbs-example",
    description="Dataset created from a file in cloud using wasbs URL."
)
ml_client.create_or_update(cloud_ds_wasbs_file)

# Create a dataset from a folder addressed with a wasbs URL.
# A wasbs URI has the form
#     wasbs://<container>@<account>.blob.core.windows.net/<path>
# i.e. the container goes before the "@"; it is not the first path segment as
# it would be in an https blob URL.
# NOTE(review): the account and container are carried over unchanged and look
# specific to one workspace -- presumably replace them with storage you can access.
cloud_ds_wasbs_folder = Dataset(
    paths=[dict(folder="wasbs://azureml-blobstore-54887b46-3cb0-485b-bb15-62e7b5578ee6@mainstorage9c05dabf5c924.blob.core.windows.net/example-data/")],
    name="cloud-folder-wasbs-example",
    description="Dataset created from folder in cloud using wasbs URL."
)
ml_client.create_or_update(cloud_ds_wasbs_folder)

# Use dataset in a Job

You can now use any of the above datasets in a job (or a pipeline).

To illustrate, let us use the dataset `public-file-https-example` in a `CommandJob`. We will look for a file _titanic.csv_ in the `dataset`, and print out the column names and number of rows in the file.


In [None]:
# Define a command job that runs main.py and hands it a dataset as input.
# NOTE(review): the ":1" suffix on the dataset name presumably selects version 1 -- confirm.
job = CommandJob(
    display_name="use-dataset-in-a-job",
    code_local_path="./src",  # local path where the code is stored
    command="python main.py --input-dataset ${{inputs.input_dataset}}",
    inputs={
        "input_dataset": JobInput(dataset="public-file-https-example:1"),
    },
    environment="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu:9",
    compute="cpu-cluster",  # replace this with compute in your workspace
)

In [None]:
# Submit the job through the workspace handle and keep the object it returns
returned_job = ml_client.create_or_update(job)

# Display the "Studio" service endpoint: a URL for following the job's status
returned_job.services["Studio"].endpoint