In [None]:
# Report which version of the Azure ML core SDK is installed in this kernel
import azureml.core

sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

#### Connect to workspace via config.json

In [None]:
from azureml.core import Workspace
# Connect to the workspace described by the local config.json file.
ws = Workspace.from_config()
# Bare last expression so the notebook displays the workspace object.
ws

# Register Datastore

Currently, managed datasets support four different datastore types:
* Azure Blob Container
* Azure File Share
* Azure Data Lake
* Azure Data Lake Gen2


[Datastore Documents](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.datastore.datastore?view=azure-ml-py)

## Register Azure Blob Container Datastore

In [None]:
from azureml.core import Datastore

# Register an Azure Blob container with the workspace as a datastore.
# Replace every "<...>" placeholder with your own values before running.
# NOTE(review): avoid saving or committing the notebook with a real account key in it.
blob_datastore = Datastore.register_azure_blob_container(
    workspace=ws,  # workspace
    datastore_name="<datastore name you want to register to workspace>",
    account_name="<Azure storage account name>",
    container_name="<container name in the Azure blob storage>",
    account_key="<access key for the storage account>",
)

The account key can be found under [storage account] -> [Settings] -> [Access Keys].

## Register Azure File Share Datastore

In [None]:
from azureml.core import Datastore

# Register an Azure File Share with the workspace as a datastore.
# Replace every "<...>" placeholder with your own values before running.
# NOTE(review): avoid saving or committing the notebook with a real account key in it.
fileshare_datastore = Datastore.register_azure_file_share(
    workspace=ws,  # workspace
    datastore_name="<datastore name you want to register to workspace>",
    account_name="<Azure storage account name>",
    file_share_name="<file share name in the Azure storage account>",
    account_key="<access key for the storage account>",
)

## Register Azure Data Lake Storage Gen1 (ADLS Gen1) Datastore

For ADLS Gen1 and Gen2, you need a service principal to access the storage. The service principal must be assigned the proper RBAC roles on the storage it interacts with. For example, for ADLS Gen2, you need to assign the service principal the Azure blob container contributor/owner role.

[How to use portal to create Azure AD service principal.](https://docs.microsoft.com/en-us/azure/active-directory/develop/howto-create-service-principal-portal)

In [None]:
# Imported in this cell, like the other registration cells, so that it can be
# run on its own without first running another datastore cell.
from azureml.core import Datastore

# Register an ADLS Gen1 store with the workspace as a datastore.
# Access goes through a service principal (tenant id / client id / client secret).
# Replace every "<...>" placeholder with your own values before running.
# NOTE(review): avoid saving or committing the notebook with a real client secret in it.
adls_datastore = Datastore.register_azure_data_lake(
    workspace=ws,
    datastore_name="<datastore name you want to register to workspace>",
    subscription_id="<subscription id of ADLS Gen1 stoarge account>",
    resource_group="<resource group of ADLS Gen1 storage account>",
    store_name="<name of ADLS Gen1 storage account>",
    tenant_id="<tenant id of service principal>",
    client_id="<client id of service principal>",
    client_secret="<the secret of service principal>",
)

## Register Azure Data Lake Storage Gen2 (ADLS Gen2) Datastore

In [None]:
# `Dataset` is not used in this cell; the import is kept so that the name stays
# available to the rest of the notebook.
from azureml.core import Dataset, Datastore

# Register an ADLS Gen2 filesystem (container) with the workspace as a datastore.
# Access goes through a service principal (tenant id / client id / client secret).
# Replace every "<...>" placeholder with your own values before running.
# NOTE(review): avoid saving or committing the notebook with a real client secret in it.
adls_datastore_gen2 = Datastore.register_azure_data_lake_gen2(
    workspace=ws,
    datastore_name="<datastore name you want to register to workspace>",
    account_name="<name of ADLS Gen2 storage account>",
    filesystem="<ADLS gen2 container name>",
    tenant_id="<tenant id of service principal>",
    client_id="<client id of service principal>",
    client_secret="<the secret of service principal>",
)

## Common Methods for Datastore

In [None]:
# Fetch a datastore that is already registered with the workspace, looked up by name.
from azureml.core import Datastore

# NOTE(review): this string matches the Python variable used in the file share
# cell, not necessarily the name the datastore was registered under - confirm
# and replace it with your registered datastore name if they differ.
registered_name = 'fileshare_datastore'
datastore = Datastore.get(ws, registered_name)

In [None]:
# Display the datastore's details by dumping its instance attributes.
# NOTE(review): the dump may include credential fields - check the cell output
# before sharing or committing the notebook.
vars(datastore)

In [None]:
## unregister a datastore
# Acts on whichever datastore object the `datastore` variable currently holds.
# NOTE(review): presumably this removes only the workspace registration and
# leaves the underlying storage untouched - confirm against the SDK docs.
datastore.unregister()

## [Upload a Spark dataframe and register as a dataset] register_spark_dataframe

```python
def register_spark_dataframe(dataframe, target, name, show_progress=True):
    """Create a dataset from a Spark dataframe.

    :param dataframe: The in-memory dataframe to be uploaded.
    :type dataframe: pyspark.sql.DataFrame
    :param target: The datastore path where the dataframe's parquet data will be uploaded.
        A GUID folder is generated under the target path to avoid conflicts.
    :type target: azureml.data.datapath.DataPath, azureml.core.datastore.Datastore
        or tuple(azureml.core.datastore.Datastore, str) object
    :param name: The name of the registered dataset.
    :type name: str
    :param show_progress: Indicates whether to show progress in the console. Defaults to True.
    :type show_progress: bool, optional
    :return: The registered dataset.
    :rtype: azureml.data.TabularDataset
    """
```

#### Currently, register_spark_dataframe only supports Azure Blob storage, ADLS Gen1 and ADLS Gen2. File share is not supported.
You can use a Spark dataframe that you created yourself, or use the sample code below to get a Spark dataframe from an existing dataset.

```python
from azureml.core import Dataset

# `ws` is the Workspace object loaded at the top of this notebook.
dstore = ws.get_default_datastore()
# Glob pattern selecting the sample parquet files on the default datastore.
datastore_path = [(dstore, 'weather-data-florida/*/*/data.parquet')]
dataset = Dataset.Tabular.from_parquet_files(path=datastore_path)
spark_df = dataset.to_spark_dataframe()
```

In [None]:
# Imported in this cell so it does not depend on the ADLS Gen2 registration cell
# having been run first.
from azureml.core import Dataset

# Upload a Spark dataframe to a datastore and register it as a tabular dataset.
# Usage: dataset=Dataset.Tabular.register_spark_dataframe(<spark dataframe>, <datastore>, "<name of registered dataset>", show_progress=True)
# Prerequisite: `spark_df` must already be defined, either as your own Spark
# dataframe or one created from an existing dataset; no code cell in this
# notebook creates it for you.
# Using blob_datastore that we created earlier in this sample notebook
dataset = Dataset.Tabular.register_spark_dataframe(spark_df, blob_datastore, "ds_from_spark_df", show_progress=True)

## [Upload a Pandas dataframe and register as a dataset] register_pandas_dataframe

```python
def register_pandas_dataframe(dataframe, target, name, show_progress=True):
    """Create a dataset from a pandas dataframe.

    The datastore must be an Azure Data Lake store or an Azure Storage store.

    :param dataframe: The in-memory dataframe to be uploaded.
    :type dataframe: pandas.DataFrame
    :param target: The datastore path where the dataframe's parquet data will be uploaded.
        A GUID folder is generated under the target path to avoid conflicts.
    :type target: azureml.data.datapath.DataPath, azureml.core.datastore.Datastore or tuple(azureml.core.datastore.Datastore, str) object
    :param name: The name of the registered dataset.
    :type name: str
    :param show_progress: Indicates whether to show progress in the console. Defaults to True.
    :type show_progress: bool, optional
    :return: The registered dataset.
    :rtype: azureml.data.TabularDataset
    """
```

You can use a Pandas dataframe that you created yourself, or use the sample code below to get a Pandas dataframe from an existing dataset.

```python
from azureml.core import Dataset

# `ws` is the Workspace object loaded at the top of this notebook.
dstore = ws.get_default_datastore()
# Glob pattern selecting the sample parquet files on the default datastore.
datastore_path = [(dstore, 'weather-data-florida/*/*/data.parquet')]
dataset = Dataset.Tabular.from_parquet_files(path=datastore_path)
pandas_df = dataset.to_pandas_dataframe()
```

In [None]:
# Imported in this cell so it does not depend on the ADLS Gen2 registration cell
# having been run first.
from azureml.core import Dataset

# Upload a pandas dataframe to a datastore and register it as a tabular dataset.
# Usage: dataset=Dataset.Tabular.register_pandas_dataframe(<pandas dataframe>, <datastore>, "<name of registered dataset>", show_progress=True)
# Prerequisite: `pandas_df` must already be defined, either as your own pandas
# dataframe or one created from an existing dataset; no code cell in this
# notebook creates it for you.
# Using blob_datastore that we created earlier in this sample notebook
dataset = Dataset.Tabular.register_pandas_dataframe(pandas_df, blob_datastore, "ds_from_pandas_df", show_progress=True)