# Spark Analytics - Sales

## Import Modules and Initiate Session

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

import os
import zipfile

In [2]:
# Obtain the notebook's Spark session (getOrCreate reuses an active session if there is one)
spark = (
    SparkSession.builder
    .appName("SparkAnalyticsSales")
    .getOrCreate()
)

25/02/26 19:18:34 WARN Utils: Your hostname, Cesars-MBP.local resolves to a loopback address: 127.0.0.1; using 192.168.7.230 instead (on interface en0)
25/02/26 19:18:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


25/02/26 19:18:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/02/26 19:18:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## Data Preparation

In [3]:
# Explicit read schema: every column is declared as a nullable string,
# so no type conversion is attempted while the CSV files are parsed.
sales_column_names = [
    "Order ID",
    "Product",
    "Quantity Ordered",
    "Price Each",
    "Order Date",
    "Purchase Address",
]

schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in sales_column_names]
)

In [None]:
# Input location: name of the zipped sales archive and the folder that contains it
file_name = "salesdata.zip"
directory_path = "../../data/input"

In [5]:
# Name of the archive without its extension; used as the output folder name
archive_stem = os.path.splitext(file_name)[0]

# Build the archive path and the path of the folder the archive is unpacked into
zip_file_path = os.path.join(directory_path, file_name)
extraction_directory = os.path.join(directory_path, archive_stem)

# Create the output folder if it is missing (an existing folder is left as is)
os.makedirs(extraction_directory, exist_ok=True)

# Unpack every member of the archive into the output folder
with zipfile.ZipFile(zip_file_path, mode="r") as archive:
    archive.extractall(path=extraction_directory)

print(f"Extraction complete. Files extracted to: {extraction_directory}")

Extraction complete. Files extracted to: ../../data/input/salesdata


In [6]:
# Glob matching every CSV file directly inside the extraction folder
sales_csv_glob = f"{extraction_directory}/*.csv"

# Reader configured with the explicit schema and with the first line of each file treated as a header
sales_csv_reader = spark.read.schema(schema).option("header", "true")

sales_data_df = sales_csv_reader.csv(sales_csv_glob)

In [7]:
# Preview the first 10 rows of the loaded sales data
sales_data_df.show(n=10)

                                                                                

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  295665|  Macbook Pro Laptop|               1|      1700|12/30/19 00:01|136 Church St, Ne...|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...|
|  295669|USB-C Charging Cable|               1|     11.95|12/18/19 12:38|43 Hill St, Atlan...|
|  295670|AA Batteries (4-p...|               1|      3.84|12/31/19 22:58|200 Jefferson St,...|
|  295671|USB-C Charging Cable|               1|     11.95|12/16/19 15:10|928 12th St, Port...|
|  295672|USB-C Charging Cable|         

In [8]:
# Print the DataFrame's column names, types and nullability to verify the applied schema
sales_data_df.printSchema()

root
 |-- Order ID: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: string (nullable = true)
 |-- Price Each: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)



In [9]:
# Shut down the Spark session now that the notebook is finished with it
spark.stop()