
Before you start using this notebook, install the **com.databricks.spark.xml** Maven library on your Apache Spark cluster, and then change **datalake_name** in both the Python variable and the `%fs` file system command below.

In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Azure Data Lake Store account name; it is interpolated into every adl:// URI built in this notebook.
datalake_name = 'cadlstoremke4wb3ui2s6q'

In [None]:
%fs ls adl://cadlstoremke4wb3ui2s6q.azuredatalakestore.net/

In [None]:
%python
configs = {
  'fs.adl.oauth2.access.token.provider.type': 'CustomAccessTokenProvider',
  'fs.adl.oauth2.access.token.custom.provider': spark.conf.get('spark.databricks.passthrough.adls.tokenProviderClassName')
}
dbutils.fs.mount(
source = f'adl://{datalake_name}.azuredatalakestore.net/',
mount_point = '/mnt/datalake',
extra_configs = configs)

In [None]:
%sh

# Stop at the first failing command instead of carrying on without the tool or the archive.
set -e

# p7zip-full provides the `7za` binary used below to unpack the .7z archive.
sudo apt-get -y install p7zip-full

# -c resumes a partial download and does nothing when the file is already complete,
# so re-running the cell does not leave a second copy (stackoverflow.com-Users.7z.1).
wget -c https://archive.org/download/stackexchange/stackoverflow.com-Users.7z

# -y answers "yes" to any overwrite prompt so a re-run cannot block waiting for input.
7za x -y stackoverflow.com-Users.7z

# List the working directory to confirm that Users.xml was extracted.
ls -al

In [None]:
%python

dbutils.fs.mv('file:/databricks/driver/Users.xml', f'{path}')

In [None]:
# Root URI of the Data Lake Store account. Every dataset below sits directly
# under it, so the account or scheme only has to be changed in one place.
adl_root = f'adl://{datalake_name}.azuredatalakestore.net'

# Raw users dump (XML) and the parquet copy this notebook writes from it.
path = f'{adl_root}/Users.xml'
users_output_path = f'{adl_root}/users.parquet'


# Posts carrying the most popular tags. These are only read in this notebook.
most_popular_questions_with_unique_tag_parquet_path = f'{adl_root}/most_popular_questions_with_unique_tag.parquet'
most_popular_answers_with_unique_tag_parquet_path = f'{adl_root}/most_popular_answers_with_unique_tag.parquet'
most_popular_accepted_answers_with_unique_tag_parquet_path = f'{adl_root}/most_popular_accepted_answers_with_unique_tag.parquet'


# Posts carrying the least popular tags. These are only read in this notebook.
lowest_popular_questions_with_unique_tag_parquet_path = f'{adl_root}/lowest_popular_questions_with_unique_tag.parquet'
lowest_popular_answers_with_unique_tag_parquet_path = f'{adl_root}/lowest_popular_answers_with_unique_tag.parquet'
lowest_popular_accepted_answers_with_unique_tag_parquet_path = f'{adl_root}/lowest_popular_accepted_answers_with_unique_tag.parquet'


# Outputs: users joined with their most-popular-tag posts.
users_questions_with_most_popular_tags_parquet_path = f'{adl_root}/users_questions_with_most_popular_tags.parquet'
users_answers_with_most_popular_tags_parquet_path = f'{adl_root}/users_answers_with_most_popular_tags.parquet'
users_accepted_answers_with_most_popular_tags_parquet_path = f'{adl_root}/users_accepted_answers_with_most_popular_tags.parquet'
users_questions_and_answers_with_most_popular_tags_parquet_path = f'{adl_root}/users_questions_and_answers_with_most_popular_tags.parquet'


# Outputs: users joined with their least-popular-tag posts.
users_questions_with_lowest_popular_tags_parquet_path = f'{adl_root}/users_questions_with_lowest_popular_tags.parquet'
users_answers_with_lowest_popular_tags_parquet_path = f'{adl_root}/users_answers_with_lowest_popular_tags.parquet'
users_accepted_answers_with_lowest_popular_tags_parquet_path = f'{adl_root}/users_accepted_answers_with_lowest_popular_tags.parquet'
users_questions_and_answers_with_lowest_popular_tags_parquet_path = f'{adl_root}/users_questions_and_answers_with_lowest_popular_tags.parquet'

In [None]:
# Bind `spark` to the active SparkSession, creating one if none exists yet.
# NOTE(review): an earlier cell already reads `spark.conf`, so the notebook
# presumably relies on a session predefined by the platform - confirm.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Columns requested from each <row> of Users.xml, with the type each is parsed as.
# The leading underscore presumably comes from the XML reader's attribute
# prefix - confirm against the spark-xml options in use.
schema = StructType([
    StructField('_Id', IntegerType()),
    StructField('_Reputation', StringType()),
    StructField('_CreationDate', TimestampType()),
    StructField('_LastAccessDate', TimestampType()),
    StructField('_Location', StringType()),
])

In [None]:
# Parse Users.xml with the spark-xml data source: every <row> element under the
# <users> root becomes one record, shaped by the explicit `schema`.
users_df = (
    spark.read
    .format('com.databricks.spark.xml')
    .options(rootTag='users', rowTag='row')
    .load(path, schema=schema)
)

In [None]:
# Print a sample of the parsed rows as a quick sanity check of the schema.
users_df.show()

In [None]:
# Number of user rows parsed from the XML dump.
users_df.count()

In [None]:
# Rename the underscore-prefixed XML attribute columns to plain names and store
# the users as parquet for the joins later in the notebook.
# The chain is wrapped in parentheses so no line depends on a trailing
# backslash; a single missing one turns the whole cell into a syntax error.
(
    users_df
    # Keep every account except Id -1 (presumably the Stack Exchange system
    # "Community" account - confirm). Filtering on `== -1` would keep only
    # that single row and leave the later joins with almost nothing to match.
    .filter(col('_Id') != -1)
    .withColumnRenamed('_Id', 'Id')
    .withColumnRenamed('_Reputation', 'Reputation')
    .withColumnRenamed('_CreationDate', 'CreationDate')
    .withColumnRenamed('_LastAccessDate', 'LastAccessDate')
    .withColumnRenamed('_Location', 'Location')
    .write
    # NOTE(review): 'append' adds the same rows again on every re-run of this
    # cell, while every other write in the notebook uses 'overwrite' - confirm
    # that this is intended.
    .mode('append')
    .format('parquet')
    .option('path', users_output_path)
    .save()
)

In [None]:
# Re-read the users from the parquet copy written above; the joins below use
# this frame and its renamed columns (Id, Reputation, CreationDate, ...).
users_based_parquet_df = spark.read.parquet(users_output_path)

# Print a sample of rows to confirm the parquet round trip.
users_based_parquet_df.show()

In [None]:
# Number of user rows available to the joins below.
users_based_parquet_df.count()

## Correlating Users data with Questions and Answers - Most popular tags

In [None]:
# Load the question / answer / accepted-answer datasets for the most popular tags.
# NOTE(review): none of these parquet paths is written anywhere in this notebook;
# presumably a separate notebook produces them - confirm it has run first.
most_popular_questions_with_unique_tag_df = spark.read.parquet(most_popular_questions_with_unique_tag_parquet_path)
most_popular_answers_with_unique_tag_df = spark.read.parquet(most_popular_answers_with_unique_tag_parquet_path)
most_popular_accepted_answers_with_unique_tag_df = spark.read.parquet(most_popular_accepted_answers_with_unique_tag_parquet_path)

# Same three datasets for the least popular tags; they are used in the
# "Least" section further down.
lowest_popular_questions_with_unique_tag_df = spark.read.parquet(lowest_popular_questions_with_unique_tag_parquet_path)
lowest_popular_answers_with_unique_tag_df = spark.read.parquet(lowest_popular_answers_with_unique_tag_parquet_path)
lowest_popular_accepted_answers_with_unique_tag_df = spark.read.parquet(lowest_popular_accepted_answers_with_unique_tag_parquet_path)

In [None]:
# Pair each user with the most-popular-tag questions they own.
# Inner join on OwnerUserId: users without such a question are dropped, and a
# user is repeated once per matching question row.
users_questions_with_most_popular_tags_df = (
    users_based_parquet_df.alias('u')
    .join(
        most_popular_questions_with_unique_tag_df.alias('q'),
        col('u.Id') == col('q.OwnerUserId'),
    )
    # 'u.*' already carries the user's own CreationDate, so the question's
    # timestamp is renamed. Two columns both called CreationDate make the
    # parquet write below fail with a duplicate-column error.
    .select('u.*', 'q.Tag', col('q.CreationDate').alias('PostCreationDate'))
)

# Persist the result, replacing the output of any previous run.
(
    users_questions_with_most_popular_tags_df
    .write
    .mode('overwrite')
    .format('parquet')
    .option('path', users_questions_with_most_popular_tags_parquet_path)
    .save()
)

In [None]:

# Pair each user with the most-popular-tag answers they own.
# Inner join on OwnerUserId: users without such an answer are dropped, and a
# user is repeated once per matching answer row.
users_answers_with_most_popular_tags_df = (
    users_based_parquet_df.alias('u')
    .join(
        most_popular_answers_with_unique_tag_df.alias('a'),
        col('u.Id') == col('a.OwnerUserId'),
    )
    # 'u.*' already carries the user's own CreationDate, so the answer's
    # timestamp is renamed. Two columns both called CreationDate make the
    # parquet write below fail with a duplicate-column error.
    .select('u.*', 'a.Tag', col('a.CreationDate').alias('PostCreationDate'))
)

# Persist the result, replacing the output of any previous run.
(
    users_answers_with_most_popular_tags_df
    .write
    .mode('overwrite')
    .format('parquet')
    .option('path', users_answers_with_most_popular_tags_parquet_path)
    .save()
)

In [None]:
# Pair each user with the most-popular-tag accepted answers they own.
# Inner join on OwnerUserId: users without such an answer are dropped, and a
# user is repeated once per matching answer row.
users_accepted_answers_with_most_popular_tags_df = (
    users_based_parquet_df.alias('u')
    .join(
        most_popular_accepted_answers_with_unique_tag_df.alias('a'),
        col('u.Id') == col('a.OwnerUserId'),
    )
    # 'u.*' already carries the user's own CreationDate, so the answer's
    # timestamp is renamed. Two columns both called CreationDate make the
    # parquet write below fail with a duplicate-column error.
    .select('u.*', 'a.Tag', col('a.CreationDate').alias('PostCreationDate'))
)

# Persist the result, replacing the output of any previous run.
(
    users_accepted_answers_with_most_popular_tags_df
    .write
    .mode('overwrite')
    .format('parquet')
    .option('path', users_accepted_answers_with_most_popular_tags_parquet_path)
    .save()
)

In [None]:
# Stack the user/question rows and the user/answer rows for the most popular
# tags into one dataset. union() pairs columns by position, so both inputs
# must keep the same column order.
users_questions_and_answers_with_most_popular_tags_df = (
    users_questions_with_most_popular_tags_df
    .union(users_answers_with_most_popular_tags_df)
)

# Persist the combined rows, replacing the output of any previous run.
(
    users_questions_and_answers_with_most_popular_tags_df
    .write
    .mode('overwrite')
    .format('parquet')
    .option('path', users_questions_and_answers_with_most_popular_tags_parquet_path)
    .save()
)

## Correlating Users data with Questions and Answers - Least popular tags

In [None]:
# Pair each user with the least-popular-tag questions they own.
# Inner join on OwnerUserId: users without such a question are dropped, and a
# user is repeated once per matching question row.
users_questions_with_lowest_popular_tags_df = (
    users_based_parquet_df.alias('u')
    .join(
        lowest_popular_questions_with_unique_tag_df.alias('q'),
        col('u.Id') == col('q.OwnerUserId'),
    )
    # 'u.*' already carries the user's own CreationDate, so the question's
    # timestamp is renamed. Two columns both called CreationDate make the
    # parquet write below fail with a duplicate-column error.
    .select('u.*', 'q.Tag', col('q.CreationDate').alias('PostCreationDate'))
)

# Persist the result, replacing the output of any previous run.
(
    users_questions_with_lowest_popular_tags_df
    .write
    .mode('overwrite')
    .format('parquet')
    .option('path', users_questions_with_lowest_popular_tags_parquet_path)
    .save()
)



In [None]:
# Pair each user with the least-popular-tag answers they own.
# Inner join on OwnerUserId: users without such an answer are dropped, and a
# user is repeated once per matching answer row.
users_answers_with_lowest_popular_tags_df = (
    users_based_parquet_df.alias('u')
    .join(
        lowest_popular_answers_with_unique_tag_df.alias('a'),
        col('u.Id') == col('a.OwnerUserId'),
    )
    # 'u.*' already carries the user's own CreationDate, so the answer's
    # timestamp is renamed. Two columns both called CreationDate make the
    # parquet write below fail with a duplicate-column error.
    .select('u.*', 'a.Tag', col('a.CreationDate').alias('PostCreationDate'))
)

# Persist the result, replacing the output of any previous run.
(
    users_answers_with_lowest_popular_tags_df
    .write
    .mode('overwrite')
    .format('parquet')
    .option('path', users_answers_with_lowest_popular_tags_parquet_path)
    .save()
)

In [None]:
# Pair each user with the least-popular-tag accepted answers they own.
# Inner join on OwnerUserId: users without such an answer are dropped, and a
# user is repeated once per matching answer row.
users_accepted_answers_with_lowest_popular_tags_df = (
    users_based_parquet_df.alias('u')
    .join(
        lowest_popular_accepted_answers_with_unique_tag_df.alias('a'),
        col('u.Id') == col('a.OwnerUserId'),
    )
    # 'u.*' already carries the user's own CreationDate, so the answer's
    # timestamp is renamed. Two columns both called CreationDate make the
    # parquet write below fail with a duplicate-column error.
    .select('u.*', 'a.Tag', col('a.CreationDate').alias('PostCreationDate'))
)

# Persist the result, replacing the output of any previous run.
(
    users_accepted_answers_with_lowest_popular_tags_df
    .write
    .mode('overwrite')
    .format('parquet')
    .option('path', users_accepted_answers_with_lowest_popular_tags_parquet_path)
    .save()
)

In [None]:
# Stack the user/question rows and the user/answer rows for the least popular
# tags into one dataset. union() pairs columns by position, so both inputs
# must keep the same column order.
users_questions_and_answers_with_lowest_popular_tags_df = (
    users_questions_with_lowest_popular_tags_df
    .union(users_answers_with_lowest_popular_tags_df)
)

# Persist the combined rows, replacing the output of any previous run.
(
    users_questions_and_answers_with_lowest_popular_tags_df
    .write
    .mode('overwrite')
    .format('parquet')
    .option('path', users_questions_and_answers_with_lowest_popular_tags_parquet_path)
    .save()
)