In [17]:
# Importing necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, from_json, map_entries, explode, col
from pyspark.sql.types import MapType, StringType
import pandas as pd

In [18]:

# If an earlier run of this notebook left a `spark` global behind, stop it
# before building the session below (presumably so getOrCreate() does not
# simply hand back the old session -- TODO confirm intent).
if 'spark' in globals():
    spark.stop()

# Local session using every available core. The memory-related settings are
# grouped together; the spark-xml package is pulled in for format("xml").
spark = (
    SparkSession.builder
    .appName("ExplorationApp")
    .master("local[*]")
    .config("spark.sql.debug.maxToStringFields", "100")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.12:0.15.0")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "4g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "2g")
    .getOrCreate()
)

In [6]:
# Read the shipping address file as plain text lines
shipping_address_file_path = "data/shippuingaddress_20240521.csv.csv"
address_rdd = spark.sparkContext.textFile(shipping_address_file_path)

# zipWithIndex pairs every line with its zero-based position. Keeping only
# positions >= 6 drops the first six lines; the index is then discarded so
# only the line text remains.
indexed_lines = address_rdd.zipWithIndex()
remaining_lines = indexed_lines.filter(lambda pair: pair[1] >= 6)
shipping_address_rdd = remaining_lines.map(lambda pair: pair[0])

# Parse the remaining lines as CSV: first line is the header, types inferred
address_csv_reader = spark.read.options(header='true', inferSchema='true')
shipping_address_df = address_csv_reader.csv(shipping_address_rdd)

shipping_address_df.printSchema()
shipping_address_df.show()

root
 |-- id: string (nullable = true)
 |-- customerid: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- effstart: string (nullable = true)
 |-- effend: string (nullable = true)
 |-- streetadd: string (nullable = true)

+------+----------+------------------+----------------+-------------+-----------+----------+----------+--------------------+
|    id|customerid|              city|           state|      country|postal_code|  effstart|    effend|           streetadd|
+------+----------+------------------+----------------+-------------+-----------+----------+----------+--------------------+
|100000|  AA-10315|     San Francisco|      California|United States|      94122|27/08/2005|      NULL|2320 Lisa Forest ...|
|100001|  AA-10375|       Los Angeles|      California|United States|      90008|17/11/2004|29/08/2004|54642 Harry Loop ...|
|100002|  AA-10480|

In [7]:
# Loading product data

product_file_path = "data/product.json"

# multiLine is enabled so a JSON document spanning several lines can be parsed
product_json_reader = spark.read.options(multiLine="true")
product_single_json_df = product_json_reader.json(product_file_path)

# Display the records without truncating cell contents.
product_single_json_df.show(truncate=False)

# Then display the inferred schema.
product_single_json_df.printSchema()

25/05/22 02:12:40 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 5:>                                                          (0 + 1) / 1]

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

The JSON file is structured as a single JSON object with several keys. Each key appears to hold a dictionary whose keys are IDs and whose values are the corresponding values for that field. To work with the data as individual product records, we need to "flatten" or "explode" these map columns into rows.

In [8]:
def _explode_id_map(maps_df, map_column, value_name):
    """Turn one map column into one row per map entry.

    Parameters:
        maps_df: DataFrame that contains the map column.
        map_column: name of the map column to explode.
        value_name: name to give the column holding the map values.

    Returns:
        A DataFrame with two columns: "id" (the map key) and `value_name`
        (the map value).
    """
    # explode() on a map column emits a key column and a value column per
    # entry; the two-name alias names them directly.
    return maps_df.select(explode(col(map_column)).alias("id", value_name))


# (column in the product JSON, column name used in the output).
# Only "Sub-Category" differs: its hyphen becomes an underscore.
product_fields = [
    ("Category", "Category"),
    ("Product_ID", "Product_ID"),
    ("Product_Name", "Product_Name"),
    ("Sub-Category", "Sub_Category"),
]

# We convert each struct to JSON and then parse it as a
# MapType(StringType(), StringType()), giving one "<name>Map" column per field.
id_to_value_type = MapType(StringType(), StringType())
product_transformed_df = product_single_json_df.select(
    *[
        from_json(to_json(col(source_name)), id_to_value_type).alias(f"{output_name}Map")
        for source_name, output_name in product_fields
    ]
)

product_transformed_df.printSchema()
product_transformed_df.show(truncate=False)

# Now, for each map column, we convert map entries into separate rows
cat_df = _explode_id_map(product_transformed_df, "CategoryMap", "Category")
pid_df = _explode_id_map(product_transformed_df, "Product_IDMap", "Product_ID")
pname_df = _explode_id_map(product_transformed_df, "Product_NameMap", "Product_Name")
sub_df = _explode_id_map(product_transformed_df, "Sub_CategoryMap", "Sub_Category")

# Join all the exploded DataFrames on the common key "id".
# These are inner joins, so an id must be present in all four maps to appear.
product_df = (
    cat_df
    .join(pid_df, "id")
    .join(pname_df, "id")
    .join(sub_df, "id")
)

product_df.show(truncate=False)
product_df.printSchema()

root
 |-- CategoryMap: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- Product_IDMap: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- Product_NameMap: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- Sub_CategoryMap: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Load region data: tab-delimited text with a header row, types inferred
region_file_path = "data/regiontxt"
region_reader = spark.read.options(delimiter="\t", header='true', inferSchema="true")
region_df = region_reader.csv(region_file_path)

# Display the rows untruncated, followed by the inferred schema
region_df.show(truncate=False)
region_df.printSchema()

+---+---------+----------------+--------------+------+--------------+
|_c0|Region_ID|State           |Country       |Market|Region        |
+---+---------+----------------+--------------+------+--------------+
|0  |6554348  |California      |United States |US    |West          |
|2  |6554349  |National Capital|Philippines   |APAC  |Southeast Asia|
|6  |6554350  |New York        |United States |US    |East          |
|8  |6554351  |Distrito Federal|Mexico        |LATAM |North         |
|12 |6554352  |Madrid          |Spain         |EU    |South         |
|14 |6554353  |Sinaloa         |Mexico        |LATAM |North         |
|15 |6554354  |Texas           |United States |US    |Central       |
|19 |6554355  |Berlin          |Germany       |EU    |Central       |
|20 |6554356  |England         |United Kingdom|EU    |North         |
|24 |6554357  |Uttar Pradesh   |India         |APAC  |Central Asia  |
|25 |6554358  |Western Visayas |Philippines   |APAC  |Southeast Asia|
|28 |6554359  |Minne

25/05/22 02:12:44 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Region_ID, State, Country, Market, Region
 Schema: _c0, Region_ID, State, Country, Market, Region
Expected: _c0 but found: 
CSV file: file:///Users/aramide/Documents/selfdev/data-wrangling/data/regiontxt


In [10]:
# Load invoice data
invoice_file_path = "data/invoice.xml"

# Configure an XML reader that uses the "row" tag as the row element,
# then load the invoice file with it
invoice_reader = spark.read.format("xml").options(rowTag="row")
invoice_df = invoice_reader.load(invoice_file_path)

# Display the rows untruncated, followed by the schema
invoice_df.show(truncate=False)
invoice_df.printSchema()

[Stage 14:>                                                         (0 + 1) / 1]

+-----------+--------+-------+----------+------------+--------------+--------------+--------+--------+----------+----------+--------------+-------------+
|Customer_ID|Discount|Line_No|Order_Date|Order_ID    |Order_Priority|Product_ID    |Profit  |Quantity|Sale_Value|Ship_Date |Ship_Mode     |Shipping_Cost|
+-----------+--------+-------+----------+------------+--------------+--------------+--------+--------+----------+----------+--------------+-------------+
|PO-8865    |0.7     |1      |03/10/2011|AE-2011-9160|Medium        |OFF/STO-002594|-157.086|2       |82.674    |07/10/2011|Standard Class|5.69         |
|PO-8865    |0.7     |2      |03/10/2011|AE-2011-9160|Medium        |TEC/MAC-003333|-88.992 |6       |78.408    |07/10/2011|Standard Class|3.87         |
|EB-4110    |0.7     |1      |14/10/2013|AE-2013-1130|High          |FUR/BOO-000034|-232.272|6       |224.748   |14/10/2013|Same Day      |60.08        |
|EB-4110    |0.7     |2      |14/10/2013|AE-2013-1130|High          |OFF/FAS

                                                                                

In [21]:
# Load customer data
customer_file_path = "data/cust.xlsx"

# The workbook is read with pandas first; the resulting pandas frame is then
# converted into a Spark DataFrame.
customer_pd_df = pd.read_excel(io=customer_file_path)
customer_df = spark.createDataFrame(data=customer_pd_df)

# Display the rows untruncated, followed by the schema
customer_df.show(truncate=False)
customer_df.printSchema()


+----------------+------------------+-----------+
|cusid           |cusnm             |sgmnt      |
+----------------+------------------+-----------+
|ZZXJM-553500000 |Jessica Myrick    |Consumer   |
|ZZXGZ-454500000 |George Zrebassa   |Corporate  |
|ZZXBS-136500000 |Bill Shonely      |Corporate  |
|ZZXSR-2042500000|Sharelle Roach    |Home Office|
|ZZXHG-496500000 |Henry Goldwyn     |Corporate  |
|ZZXSC-2030500000|Sean Christensen  |Consumer   |
|ZZXTG-1131000000|Toby Gnade        |Consumer   |
|ZZXRS-1976500000|Roland Schwarz    |Corporate  |
|ZZXAC-1061500000|Ann Chong         |Corporate  |
|ZZXJL-1583500000|John Lee          |Consumer   |
|ZZXGH-1466500000|Greg Hansen       |Consumer   |
|ZZXCA-226500000 |Christina Anderson|Consumer   |
|ZZXBP-1118500000|Ben Peterman      |Corporate  |
|ZZXNP-1867000000|Nora Paige        |Consumer   |
|ZZXRL-961500000 |Rob Lucas         |Consumer   |
|ZZXEH-412500000 |Eugene Hildebrand |Home Office|
|ZZXRL-1961500000|Rob Lucas         |Consumer   |
