
 Before you start using this notebook, install the **com.databricks.spark.xml** Maven library on your Apache Spark cluster, and then change the Data Lake Store account name in the **datalake_name** Python variable and in any file system command that hard-codes it.

In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Azure Data Lake Store account name; interpolated into the adl:// URLs built
# in later cells. Replace it with your own account name.
datalake_name = 'cadlstoremke4wb3ui2s6q'

In [None]:
# List the root of the Data Lake Store. The URL is built from datalake_name so
# the account name only has to be changed in one place; dbutils.fs.ls is the
# Python form of the `%fs ls` magic.
display(dbutils.fs.ls(f'adl://{datalake_name}.azuredatalakestore.net/'))

In [None]:
%python
configs = {
  'fs.adl.oauth2.access.token.provider.type': 'CustomAccessTokenProvider',
  'fs.adl.oauth2.access.token.custom.provider': spark.conf.get('spark.databricks.passthrough.adls.tokenProviderClassName')
}
dbutils.fs.mount(
source = f'adl://{datalake_name}.azuredatalakestore.net/',
mount_point = '/mnt/datalake',
extra_configs = configs)

In [None]:
# Input XML file and Parquet destination, both directly under the root of the
# Data Lake Store identified by datalake_name.
path, users_output_path = (
    f'adl://{datalake_name}.azuredatalakestore.net/Users.xml',
    f'adl://{datalake_name}.azuredatalakestore.net/users.parquet',
)

In [None]:
# Obtain the Spark session: getOrCreate() hands back the already-active session
# when there is one and only builds a new one otherwise.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Explicit schema for the rows read from the XML file; all fields are nullable.
# The leading underscore presumably corresponds to spark-xml's attribute prefix,
# i.e. each field is an XML attribute of the row element — TODO confirm.
# NOTE(review): _Reputation is read as a string; confirm that is intended
# rather than a numeric type.
schema = StructType([
    StructField('_Id', IntegerType()),
    StructField('_Reputation', StringType()),
    StructField('_CreationDate', TimestampType()),
    StructField('_LastAccessDate', TimestampType()),
    StructField('_Location', StringType()),
])

In [None]:
# Load the XML file into a DataFrame with the spark-xml data source, producing
# one record per 'row' element and applying the explicit schema.
users_df = (
    spark.read
    .format('com.databricks.spark.xml')
    .options(rootTag='users', rowTag='row')
    .schema(schema)
    .load(path)
)

In [None]:
# Preview the first rows of the parsed XML data.
users_df.show()

In [None]:
# Number of rows parsed from the XML file.
users_df.count()

In [None]:
# Strip the leading underscore from every column name and append the result to
# the Parquet dataset at users_output_path.
# The chain is wrapped in parentheses rather than relying on backslash
# continuations, where a single missing backslash (as before `.write`) turns
# the whole cell into a syntax error.
# NOTE(review): mode('append') adds the same rows again every time this cell
# runs; confirm that is intended before re-running the notebook.
(
    users_df
    .withColumnRenamed('_Id', 'Id')
    .withColumnRenamed('_Reputation', 'Reputation')
    .withColumnRenamed('_CreationDate', 'CreationDate')
    .withColumnRenamed('_LastAccessDate', 'LastAccessDate')
    .withColumnRenamed('_Location', 'Location')
    .write
    .mode('append')
    .format('parquet')
    .option('path', users_output_path)
    .save()
)

In [None]:
# Read the Parquet dataset back from users_output_path and preview it, as a
# check on what was written.
users_based_parquet_df = spark.read.format('parquet').load(users_output_path)

users_based_parquet_df.show()

In [None]:
# Number of rows in the Parquet dataset read back from users_output_path.
users_based_parquet_df.count()