
Before you start using this notebook, install the **com.databricks.spark.xml** Maven library on your Apache Spark cluster, and then change the data lake name in both the **datalake_name** Python variable and the file system command.

In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Azure Data Lake Store account name; later cells interpolate it into
# adl://<name>.azuredatalakestore.net/ URIs. Replace with your own account.
datalake_name = 'cadlstoreo54jdyoinsvde'

In [None]:
# List the root of the data lake. The URI is built from `datalake_name` so the
# account name only has to be changed in one place.
display(dbutils.fs.ls(f'adl://{datalake_name}.azuredatalakestore.net/'))

In [None]:
%python
configs = {
  'fs.adl.oauth2.access.token.provider.type': 'CustomAccessTokenProvider',
  'fs.adl.oauth2.access.token.custom.provider': spark.conf.get('spark.databricks.passthrough.adls.tokenProviderClassName')
}
dbutils.fs.mount(
source = f'adl://{datalake_name}.azuredatalakestore.net/',
mount_point = '/mnt/datalake',
extra_configs = configs)

In [None]:
# Both locations live at the root of the same data lake account.
datalake_root = f'adl://{datalake_name}.azuredatalakestore.net'

# Source XML dump and the Parquet output written from it.
path = f'{datalake_root}/Comments.xml'
comments_output_path = f'{datalake_root}/comments.parquet'

In [None]:
%sh

# Stop at the first failing command so a broken install or download is never
# followed by an extraction attempt.
set -e

sudo apt-get -y install p7zip-full

# -c resumes a partial download and skips the transfer when the archive is
# already complete, instead of saving a second copy on re-run.
wget -c https://archive.org/download/stackexchange/stackoverflow.com-Comments.7z

# -y answers "yes" to overwrite prompts so a re-run does not stop for input.
7za x -y stackoverflow.com-Comments.7z

ls -al

In [None]:
%python

dbutils.fs.mv('file:/databricks/driver/Comments.xml', f'{path}')

In [None]:
# Obtain the Spark session used by the remaining cells.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Explicit schema for the comment rows; only these five attributes are read.
schema = StructType([
    StructField('_Id', IntegerType()),
    StructField('_PostId', IntegerType()),
    StructField('_CreationDate', TimestampType()),
    StructField('_UserDisplayName', StringType()),
    StructField('_UserId', IntegerType()),
])

In [None]:
# Load the XML dump with the spark-xml data source: each <row> element under
# <comments> becomes one DataFrame row, typed by the explicit schema.
comments_df = (
    spark.read
    .format('com.databricks.spark.xml')
    .options(rootTag='comments', rowTag='row')
    .schema(schema)
    .load(path)
)

In [None]:
# Preview the parsed comments (first 20 rows, long values truncated).
comments_df.show(n=20, truncate=True)

In [None]:
# Keep only the comments that have a user id.
comments_with_user_df = comments_df.where(col('_UserId').isNotNull())

# Drop the leading underscore from every column name.
comments_renamed_df = (
    comments_with_user_df
    .withColumnRenamed('_Id', 'Id')
    .withColumnRenamed('_PostId', 'PostId')
    .withColumnRenamed('_CreationDate', 'CreationDate')
    .withColumnRenamed('_UserDisplayName', 'UserDisplayName')
    .withColumnRenamed('_UserId', 'UserId')
)

# Persist as Parquet, replacing any output left by a previous run.
(
    comments_renamed_df.write
    .format('parquet')
    .mode('overwrite')
    .save(comments_output_path)
)

In [None]:
# Read the Parquet output back and preview it to confirm the write.
comments_based_parquet_df = (
    spark.read
    .format('parquet')
    .load(comments_output_path)
)
comments_based_parquet_df.show(n=20, truncate=True)

In [None]:
# Number of rows in the Parquet copy (rows with a null _UserId were filtered
# out before it was written).
comments_based_parquet_df.count()