In [59]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, mean, to_date, to_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

In [21]:
spark = SparkSession.builder.appName("Internal2").getOrCreate()

In [84]:
csv_path = "Datasets/Sales Data.csv"

sales_df = spark.read.csv(csv_path, header = True, inferSchema = True)

In [85]:
for cols in sales_df.columns:
    null_cnt = sales_df.filter(col(cols).isNull()).count()
    print(f"Column {cols} has {null_cnt} null values")

24/12/30 06:05:18 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///home/aRKNitro/Tutorials/Datasets/Sales%20Data.csv


Column _c0 has 0 null values
Column Order ID has 0 null values
Column Product has 0 null values
Column Quantity Ordered has 0 null values
Column Price Each has 0 null values
Column Order Date has 0 null values
Column Purchase Address has 0 null values
Column Month has 0 null values
Column Sales has 3 null values
Column City has 0 null values
Column Hour has 0 null values


In [86]:
df = sales_df

In [87]:
df.select(mean("Sales")).collect()

[Row(avg(Sales)=185.49375999620437)]

In [88]:
df = df.fillna({"Sales": df.select(mean("Sales")).collect()[0][0]})

In [89]:
for cols in df.columns:
    null_cnt = df.filter(col(cols).isNull()).count()
    print(f"Column {cols} has {null_cnt} null values")

24/12/30 06:05:21 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 
 Schema: _c0
Expected: _c0 but found: 
CSV file: file:///home/aRKNitro/Tutorials/Datasets/Sales%20Data.csv


Column _c0 has 0 null values
Column Order ID has 0 null values
Column Product has 0 null values
Column Quantity Ordered has 0 null values
Column Price Each has 0 null values
Column Order Date has 0 null values
Column Purchase Address has 0 null values
Column Month has 0 null values
Column Sales has 0 null values
Column City has 0 null values
Column Hour has 0 null values


In [90]:
df = df.dropDuplicates()

In [91]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Sales: double (nullable = false)
 |-- City: string (nullable = true)
 |-- Hour: integer (nullable = true)



In [92]:
from pyspark.sql.functions import to_timestamp

# Correctly parse Order Date in dd-MM-yyyy HH:mm format
df = df.withColumn("Order Date", to_date("Order Date", "dd-MM-yyyy HH:mm"))


In [93]:
df.select("Order Date").show(10, truncate=False)


24/12/30 06:05:25 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Order ID, Product, Quantity Ordered, Price Each, Order Date, Purchase Address, Month, Sales, City, Hour
 Schema: _c0, Order ID, Product, Quantity Ordered, Price Each, Order Date, Purchase Address, Month, Sales, City, Hour
Expected: _c0 but found: 
CSV file: file:///home/aRKNitro/Tutorials/Datasets/Sales%20Data.csv


+----------+
|Order Date|
+----------+
|2019-12-31|
|2019-12-29|
|2019-12-10|
|2019-12-11|
|2019-12-24|
|2019-12-10|
|2019-12-09|
|2019-12-16|
|2019-12-18|
|2019-12-19|
+----------+
only showing top 10 rows



In [94]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: date (nullable = true)
 |-- Purchase Address: string (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Sales: double (nullable = false)
 |-- City: string (nullable = true)
 |-- Hour: integer (nullable = true)



In [95]:
df.show()


24/12/30 06:05:54 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Order ID, Product, Quantity Ordered, Price Each, Order Date, Purchase Address, Month, Sales, City, Hour
 Schema: _c0, Order ID, Product, Quantity Ordered, Price Each, Order Date, Purchase Address, Month, Sales, City, Hour
Expected: _c0 but found: 
CSV file: file:///home/aRKNitro/Tutorials/Datasets/Sales%20Data.csv


+----+--------+--------------------+----------------+----------+----------+--------------------+-----+------+--------------+----+
| _c0|Order ID|             Product|Quantity Ordered|Price Each|Order Date|    Purchase Address|Month| Sales|          City|Hour|
+----+--------+--------------------+----------------+----------+----------+--------------------+-----+------+--------------+----+
| 297|  295941|     ThinkPad Laptop|               1|    999.99|2019-12-31|64 Dogwood St, Po...|   12|999.99|      Portland|  16|
| 464|  296105|Lightning Chargin...|               1|     14.95|2019-12-29|134 Dogwood St, S...|   12| 14.95| San Francisco|  17|
| 532|  296169|              iPhone|               1|     700.0|2019-12-10|111 Hickory St, S...|   12| 700.0|       Seattle|  22|
| 628|  296263|    Wired Headphones|               1|     11.99|2019-12-11|229 Pine St, San ...|   12| 11.99| San Francisco|  18|
| 721|  296351|     ThinkPad Laptop|               1|    999.99|2019-12-24|168 10th St, Po

In [96]:
df = df.filter((col("Sales") >= 0) & (col("Price Each") >= 0) & (col("Quantity Ordered") >= 0))

In [98]:
total_sales = df.groupBy("Product").agg(sum("Sales").alias("Total Sales"))

In [99]:
total_sales.show()

24/12/30 06:09:00 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Order ID, Product, Quantity Ordered, Price Each, Order Date, Purchase Address, Month, Sales, City, Hour
 Schema: _c0, Order ID, Product, Quantity Ordered, Price Each, Order Date, Purchase Address, Month, Sales, City, Hour
Expected: _c0 but found: 
CSV file: file:///home/aRKNitro/Tutorials/Datasets/Sales%20Data.csv


+--------------------+------------------+
|             Product|       Total Sales|
+--------------------+------------------+
|    Wired Headphones| 246651.9337599966|
|  Macbook Pro Laptop|         8037600.0|
|Apple Airpods Hea...|         2349150.0|
|              iPhone|         4794300.0|
|Lightning Chargin...| 347094.1500000096|
|Bose SoundSport H...|1345565.4300000193|
|USB-C Charging Cable|286674.79376000474|
|AAA Batteries (4-...| 92740.82999999715|
|        20in Monitor|  454148.710000002|
|    27in FHD Monitor| 1132424.500000006|
|     Vareebadd Phone|          827200.0|
|34in Ultrawide Mo...| 2355558.009999994|
|            LG Dryer|          387600.0|
|AA Batteries (4-p...|106300.05375999837|
|        Google Phone|         3319200.0|
|       Flatscreen TV|         1445700.0|
|  LG Washing Machine|          399600.0|
|27in 4K Gaming Mo...| 2435097.559999993|
|     ThinkPad Laptop|4129958.6999999806|
+--------------------+------------------+



In [100]:
output_path = "Datasets/output/Sales_Data"

In [101]:
df.write.csv(output_path, header = True)

24/12/30 06:11:27 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Order ID, Product, Quantity Ordered, Price Each, Order Date, Purchase Address, Month, Sales, City, Hour
 Schema: _c0, Order ID, Product, Quantity Ordered, Price Each, Order Date, Purchase Address, Month, Sales, City, Hour
Expected: _c0 but found: 
CSV file: file:///home/aRKNitro/Tutorials/Datasets/Sales%20Data.csv
                                                                                