
Before you start using this notebook, install the **com.databricks.spark.xml** Maven library on your Apache Spark cluster, and then set **datalake_name** to the name of your Azure Data Lake Store account wherever it appears (the Python variable and the file system command).

In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Azure Data Lake Store account name; it is interpolated into the
# adl://<name>.azuredatalakestore.net/ URIs built elsewhere in this notebook.
datalake_name = 'cadlstoreeey2zo52uynoq'

In [None]:
# List the root of the Data Lake Store. The URI is derived from
# `datalake_name` instead of a second hard-coded account name, so the listing
# always targets the same store as the rest of the notebook.
display(dbutils.fs.ls(f'adl://{datalake_name}.azuredatalakestore.net/'))

In [None]:
%python
configs = {
  'fs.adl.oauth2.access.token.provider.type': 'CustomAccessTokenProvider',
  'fs.adl.oauth2.access.token.custom.provider': spark.conf.get('spark.databricks.passthrough.adls.tokenProviderClassName')
}
dbutils.fs.mount(
source = f'adl://{datalake_name}.azuredatalakestore.net/',
mount_point = '/mnt/datalake',
extra_configs = configs)

In [None]:
# Destination of the converted data set (Parquet) in the Data Lake Store.
tags_output_path = f'adl://{datalake_name}.azuredatalakestore.net/tags.parquet'
# Location of the raw Tags.xml dump in the Data Lake Store.
path = f'adl://{datalake_name}.azuredatalakestore.net/Tags.xml'

In [None]:
%sh
# Download the Stack Overflow "Tags" dump and extract it into the current
# working directory.

# Stop at the first failing command so a broken download is not extracted.
set -e

sudo apt-get -y install p7zip-full

# -O writes to a fixed file name, so re-running the cell replaces the archive
# instead of piling up "stackoverflow.com-Tags.7z.1", ".2", ... copies.
wget -O stackoverflow.com-Tags.7z https://archive.org/download/stackexchange/stackoverflow.com-Tags.7z

# -y answers "yes" to overwrite prompts; the cell has no interactive terminal.
7za x -y stackoverflow.com-Tags.7z

ls -al

In [None]:
%python

dbutils.fs.mv('file:/databricks/driver/Tags.xml', f'{path}')

In [None]:
# Obtain the active SparkSession, creating one only if none exists yet.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Explicit schema for the XML rows: only the tag name and its usage count
# are read.
schema = StructType([
    StructField('_TagName', StringType()),
    StructField('_Count', IntegerType()),
])

In [None]:
# Parse Tags.xml with the spark-xml reader, using 'row' as the row tag and
# applying the explicit schema rather than inferring one.
tags_df = (
    spark.read
    .format('com.databricks.spark.xml')
    .option('rootTag', 'tags')
    .option('rowTag', 'row')
    .schema(schema)
    .load(path)
)

In [None]:
# Preview the parsed rows (first 20, long values truncated).
tags_df.show(n=20, truncate=True)

In [None]:
# Persist the tags as Parquet, dropping the leading underscore from the
# column names.
#
# The source is a complete snapshot, so the output is written with
# mode('overwrite'): with 'append', every re-run of the notebook would add a
# second full copy of the data and inflate the row count.
(
    tags_df
    .withColumnRenamed('_TagName', 'TagName')
    .withColumnRenamed('_Count', 'Count')
    .write
    .mode('overwrite')
    .format('parquet')
    .option('path', tags_output_path)
    .save()
)

In [None]:
# Read the Parquet output back and preview it.
tags_based_parquet_df = (
    spark.read
    .parquet(tags_output_path)
)
tags_based_parquet_df.show(n=20, truncate=True)

In [None]:
# Number of rows in the Parquet-backed DataFrame.
tags_based_parquet_df.count()