In [None]:
# Show the `spark` object as the cell result.
# NOTE(review): nothing above this cell defines `spark`; presumably it is the
# session object pre-provisioned by the notebook runtime — confirm.
spark

In [None]:
# List the entries at the DBFS root and render them as a table.
root_listing = dbutils.fs.ls('/')
display(root_listing)

path,name,size,modificationTime
dbfs:/FileStore/,FileStore/,0,0
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/mnt/,mnt/,0,0
dbfs:/my_work/,my_work/,0,0
dbfs:/project/,project/,0,0


In [None]:
# Create the /project directory; the call's return value is shown as the cell result.
# NOTE(review): the root listing captured earlier already contains project/, so
# this call may be a no-op on re-run — confirm.
dbutils.fs.mkdirs('/project')

Out[6]: True

In [None]:
# Re-list the DBFS root to inspect it after the mkdirs call.
root_listing = dbutils.fs.ls('/')
display(root_listing)

path,name,size,modificationTime
dbfs:/FileStore/,FileStore/,0,0
dbfs:/databricks-datasets/,databricks-datasets/,0,0
dbfs:/databricks-results/,databricks-results/,0,0
dbfs:/my_work/,my_work/,0,0
dbfs:/project/,project/,0,0


In [None]:
# Configuration for AWS S3.
# The access key and secret key are read from a Databricks secret scope instead
# of being written into the notebook, where they would be stored in the source
# file, its exports and its revision history.
# NOTE(review): the scope and key names below are placeholders — point them at
# the secret scope that actually holds the credentials for this bucket.
AWS_SECRET_SCOPE = "aws"
configs = {
    "fs.s3a.aws.credentials.provider": "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    "fs.s3a.access.key": dbutils.secrets.get(scope=AWS_SECRET_SCOPE, key="s3-access-key"),
    "fs.s3a.secret.key": dbutils.secrets.get(scope=AWS_SECRET_SCOPE, key="s3-secret-key"),
}

S3_SOURCE = "s3a://deltalake1"  # Bucket URL
MOUNT_POINT = "/mnt/projects"  # Mount point in Databricks

# Mounting S3 bucket.
# Mounting onto a mount point that is already in use raises an error, so the
# mount is skipped when it already exists; this keeps the cell re-runnable.
if any(mount.mountPoint == MOUNT_POINT for mount in dbutils.fs.mounts()):
    print(f"{MOUNT_POINT} is already mounted; skipping mount.")
else:
    mounted = dbutils.fs.mount(
        source=S3_SOURCE,
        mount_point=MOUNT_POINT,
        extra_configs=configs,
    )
    print(f"Mounted {S3_SOURCE} at {MOUNT_POINT}: {mounted}")


Out[15]: True

In [None]:
# List the top-level folders under the /mnt/projects mount point.
display(dbutils.fs.ls('/mnt/projects/'))

path,name,size,modificationTime
dbfs:/mnt/projects/buyers_raw/,buyers_raw/,0,0
dbfs:/mnt/projects/countries_raw/,countries_raw/,0,0
dbfs:/mnt/projects/sellers_raw/,sellers_raw/,0,0
dbfs:/mnt/projects/user_raw/,user_raw/,0,0


In [None]:
# List the files inside the buyers_raw folder of the mount.
display(dbutils.fs.ls('/mnt/projects/buyers_raw/'))

path,name,size,modificationTime
dbfs:/mnt/projects/buyers_raw/Buyers-repartition-by-country.csv,Buyers-repartition-by-country.csv,8038,1715148211000


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
     

In [None]:
# Get (or create) the SparkSession used by the rest of the notebook, requesting
# the application name "EcomDataAnalysis".
session_builder = SparkSession.builder.appName("EcomDataAnalysis")
spark = session_builder.getOrCreate()

In [None]:
# Show the SparkSession bound to `spark` as the cell result.
spark

In [None]:
# Read the raw user CSV data from the mounted source folder.
# The first row is treated as the header and column types are inferred from the data.
userDF = spark.read.load(
    "/mnt/projects/user_raw",
    format="csv",
    header='true',
    inferSchema='true',
)

In [None]:
# Preview the first five user rows.
userDF.show(n=5)

+--------------------+----+----------+--------+-----------------+---------------+-------------------+--------------+------------+----------------+--------------+--------------+------+----------------+-------------+---------+-------------+---------+-----------------+------------------+---------+-----------------+----------------+-----------+
|      identifierHash|type|   country|language|socialNbFollowers|socialNbFollows|socialProductsLiked|productsListed|productsSold|productsPassRate|productsWished|productsBought|gender|civilityGenderId|civilityTitle|hasAnyApp|hasAndroidApp|hasIosApp|hasProfilePicture|daysSinceLastLogin|seniority|seniorityAsMonths|seniorityAsYears|countryCode|
+--------------------+----+----------+--------+-----------------+---------------+-------------------+--------------+------------+----------------+--------------+--------------+------+----------------+-------------+---------+-------------+---------+-----------------+------------------+---------+-----------------+-

In [None]:
# Print userDF's columns with their inferred types and nullability as a tree.
userDF.printSchema()

root
 |-- identifierHash: long (nullable = true)
 |-- type: string (nullable = true)
 |-- country: string (nullable = true)
 |-- language: string (nullable = true)
 |-- socialNbFollowers: integer (nullable = true)
 |-- socialNbFollows: integer (nullable = true)
 |-- socialProductsLiked: integer (nullable = true)
 |-- productsListed: integer (nullable = true)
 |-- productsSold: integer (nullable = true)
 |-- productsPassRate: double (nullable = true)
 |-- productsWished: integer (nullable = true)
 |-- productsBought: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- civilityGenderId: integer (nullable = true)
 |-- civilityTitle: string (nullable = true)
 |-- hasAnyApp: boolean (nullable = true)
 |-- hasAndroidApp: boolean (nullable = true)
 |-- hasIosApp: boolean (nullable = true)
 |-- hasProfilePicture: boolean (nullable = true)
 |-- daysSinceLastLogin: integer (nullable = true)
 |-- seniority: integer (nullable = true)
 |-- seniorityAsMonths: double (nullable = true

In [None]:
# Write the user data out in Delta format to the bronze users path,
# overwriting whatever was stored there before.
userDF.write.save(
    "/mnt/delta/tables/bronze/users",
    format="delta",
    mode="overwrite",
)

In [None]:
# List the files written for the bronze users Delta table.
display(dbutils.fs.ls('/mnt/delta/tables/bronze/users/'))

path,name,size,modificationTime
dbfs:/mnt/delta/tables/bronze/users/_delta_log/,_delta_log/,0,0
dbfs:/mnt/delta/tables/bronze/users/part-00000-4f0bc485-69a3-40f4-88d2-7299a463fa51-c000.snappy.parquet,part-00000-4f0bc485-69a3-40f4-88d2-7299a463fa51-c000.snappy.parquet,283955,1715151351000


In [None]:

# Read the raw buyers CSV data from the mounted source folder.
# The first row is treated as the header and column types are inferred from the data.
buyersDF = spark.read.load(
    "/mnt/projects/buyers_raw",
    format="csv",
    header='true',
    inferSchema='true',
)

In [None]:
# Preview the first five buyers rows.
buyersDF.show(n=5)

+-----------+------+---------+-------------+------------+----------+---------------+-------------+-----------------+--------------------+----------------------+------------------+-------------------------+---------------------+-------------------+-------------------+------------------+----------------------+----------------------+---------------------+------------------+------------------+-----------------+---------------------+---------------------+--------------------+---------------+------------------+-------------+-------------+----------------+----------------+
|    country|buyers|topbuyers|topbuyerratio|femalebuyers|malebuyers|topfemalebuyers|topmalebuyers|femalebuyersratio|topfemalebuyersratio|boughtperwishlistratio|boughtperlikeratio|topboughtperwishlistratio|topboughtperlikeratio|totalproductsbought|totalproductswished|totalproductsliked|toptotalproductsbought|toptotalproductswished|toptotalproductsliked|meanproductsbought|meanproductswished|meanproductsliked|topmeanproductsbo

In [None]:
# Write the buyers data out in Delta format to the bronze buyers path,
# overwriting whatever was stored there before.
buyersDF.write.save(
    "/mnt/delta/tables/bronze/buyers",
    format="delta",
    mode="overwrite",
)

In [None]:
# Read the raw sellers CSV data from the mounted source folder.
# The first row is treated as the header and column types are inferred from the data.
sellersDF = spark.read.load(
    "/mnt/projects/sellers_raw",
    format="csv",
    header='true',
    inferSchema='true',
)


In [None]:
# Preview the first five sellers rows.
sellersDF.show(n=5)

+---------+------+---------+----------------+------------------+------------------+-----------------+-------------------+------------------+------------------+-----------------+-----------+-----------+------------------+-------------+-----------+-----------------+-----------------+------------------+
|  country|   sex|nbsellers|meanproductssold|meanproductslisted|meansellerpassrate|totalproductssold|totalproductslisted|meanproductsbought|meanproductswished|meanproductsliked|totalbought|totalwished|totalproductsliked|meanfollowers|meanfollows|percentofappusers|percentofiosusers|     meanseniority|
+---------+------+---------+----------------+------------------+------------------+-----------------+-------------------+------------------+------------------+-----------------+-----------+-----------+------------------+-------------+-----------+-----------------+-----------------+------------------+
|Allemagne|Female|      116|            4.03|              2.72|             27.33|           

In [None]:
# Write the sellers data out in Delta format to the bronze sellers path,
# overwriting whatever was stored there before.
sellersDF.write.save(
    "/mnt/delta/tables/bronze/sellers",
    format="delta",
    mode="overwrite",
)

In [None]:
# List the table folders currently present under the bronze path.
display(dbutils.fs.ls('/mnt/delta/tables/bronze/'))

path,name,size,modificationTime
dbfs:/mnt/delta/tables/bronze/buyers/,buyers/,0,0
dbfs:/mnt/delta/tables/bronze/sellers/,sellers/,0,0
dbfs:/mnt/delta/tables/bronze/users/,users/,0,0


In [None]:
# Read the raw countries CSV data from the mounted source folder.
# The first row is treated as the header and column types are inferred from the data.
countriesDF = spark.read.load(
    "/mnt/projects/countries_raw",
    format="csv",
    header='true',
    inferSchema='true',
)

In [None]:
# Preview the first five countries rows.
countriesDF.show(n=5)

+---------+-------+----------+--------------+------------------+---------------------+-------------+-----------+----------------+--------------+----------------+-------------+--------------------+-----------------+----------------------+-------------------+-------------------+---------------------+-----------------+------------------+---------------+------------------+-------------+-------------+----------------+----------------+
|  country|sellers|topsellers|topsellerratio|femalesellersratio|topfemalesellersratio|femalesellers|malesellers|topfemalesellers|topmalesellers|countrysoldratio|bestsoldratio|toptotalproductssold|totalproductssold|toptotalproductslisted|totalproductslisted|topmeanproductssold|topmeanproductslisted| meanproductssold|meanproductslisted|meanofflinedays|topmeanofflinedays|meanfollowers|meanfollowing|topmeanfollowers|topmeanfollowing|
+---------+-------+----------+--------------+------------------+---------------------+-------------+-----------+----------------+---

In [None]:

# Write the countries data out in Delta format to the bronze countries path,
# overwriting whatever was stored there before.
countriesDF.write.save(
    "/mnt/delta/tables/bronze/countries",
    format="delta",
    mode="overwrite",
)
     

In [None]:
# List the bronze path again to check which table folders are present.
display(dbutils.fs.ls('/mnt/delta/tables/bronze/'))

path,name,size,modificationTime
dbfs:/mnt/delta/tables/bronze/buyers/,buyers/,0,0
dbfs:/mnt/delta/tables/bronze/countries/,countries/,0,0
dbfs:/mnt/delta/tables/bronze/sellers/,sellers/,0,0
dbfs:/mnt/delta/tables/bronze/users/,users/,0,0


In [None]:
# List the files written for the bronze countries Delta table.
display(dbutils.fs.ls('/mnt/delta/tables/bronze/countries'))

path,name,size,modificationTime
dbfs:/mnt/delta/tables/bronze/countries/_delta_log/,_delta_log/,0,0
dbfs:/mnt/delta/tables/bronze/countries/part-00000-2f5704a7-bcb8-406b-a3b9-2c81a3b5039d-c000.snappy.parquet,part-00000-2f5704a7-bcb8-406b-a3b9-2c81a3b5039d-c000.snappy.parquet,10131,1715152703000
