# Challenge 3

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

In [2]:
spark = (
    SparkSession.
    builder.
    appName('Pyspark_Challenge_3').
    getOrCreate()
)

In [3]:
sales_df = (
    spark.
    read.
    option('inferSchema', True).
    option('header', True).
    csv('data/salesdata')
)

In [4]:
sales_df.show(10)

+--------+--------------------+----------------+----------+--------------+--------------------+
|Order ID|             Product|Quantity Ordered|Price Each|    Order Date|    Purchase Address|
+--------+--------------------+----------------+----------+--------------+--------------------+
|  295665|  Macbook Pro Laptop|               1|    1700.0|12/30/19 00:01|136 Church St, Ne...|
|  295666|  LG Washing Machine|               1|     600.0|12/29/19 07:03|562 2nd St, New Y...|
|  295667|USB-C Charging Cable|               1|     11.95|12/12/19 18:21|277 Main St, New ...|
|  295668|    27in FHD Monitor|               1|    149.99|12/22/19 15:13|410 6th St, San F...|
|  295669|USB-C Charging Cable|               1|     11.95|12/18/19 12:38|43 Hill St, Atlan...|
|  295670|AA Batteries (4-p...|               1|      3.84|12/31/19 22:58|200 Jefferson St,...|
|  295671|USB-C Charging Cable|               1|     11.95|12/16/19 15:10|928 12th St, Port...|
|  295672|USB-C Charging Cable|         

In [5]:
sales_df.printSchema()

root
 |-- Order ID: integer (nullable = true)
 |-- Product: string (nullable = true)
 |-- Quantity Ordered: integer (nullable = true)
 |-- Price Each: double (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Purchase Address: string (nullable = true)



In [6]:
sales_df = (
    sales_df.
    select(
        F.col('Order ID'),
        F.col('Product'),
        F.col('Quantity Ordered'),
        F.col('Price Each'),
        F.col('Purchase Address'),
        F.to_timestamp(F.col('Order Date'), 'MM/dd/yy HH:mm').alias('Order Date')
    )
)

### The best month for sales

In [7]:
(
    sales_df.
    select(
        '*',
        F.col('Price Each') * F.col('Quantity Ordered'),
        F.month(F.col('Order Date')).alias('Month')
    ).
    dropna().
    groupBy('Month').
    agg(F.sum('Price Each').alias('Price Each')).
    orderBy('Price Each', ascending=False).
    show()
)

+-----+------------------+
|Month|        Price Each|
+-----+------------------+
|   12| 4588415.410001603|
|   10|3715554.8300010366|
|    4| 3367671.020000747|
|   11| 3180600.680000648|
|    5|3135125.1300005997|
|    3|2791207.8300004015|
|    7| 2632539.560000266|
|    6| 2562025.610000203|
|    8|2230345.4200000055|
|    2|2188884.7199999737|
|    9|2084992.0899999128|
|    1|1811768.3799999235|
+-----+------------------+



### Which city actually sold the most products?

In [8]:
(
    sales_df.
    select(
        F.col('Quantity Ordered'),
        F.split(F.col('Purchase Address'), ', ').getItem(1).alias('City'),
        F.split(F.split(F.col('Purchase Address'), ', ').getItem(2), ' ').getItem(0).alias('State')
    ).
    dropna().
    groupBy('City').
    agg(F.sum('Quantity Ordered').alias('Quantity Ordered')).
    orderBy('Quantity Ordered', ascending=False).
    show()
)

+-------------+----------------+
|         City|Quantity Ordered|
+-------------+----------------+
|San Francisco|           50239|
|  Los Angeles|           33289|
|New York City|           27932|
|       Boston|           22528|
|       Dallas|           16730|
|      Atlanta|           16602|
|      Seattle|           16553|
|     Portland|           14053|
|       Austin|           11153|
+-------------+----------------+



### What time should we display advertisements to maximize the likelihood of customers buying products?

In [9]:
(
    sales_df.
    select(
        'Order ID',
        F.col('Price Each') * F.col('Quantity Ordered'),
        F.date_format(F.col('Order Date'), 'HH').cast('int').alias('Hour')
    ).
    distinct().
    dropna().
    groupBy('Hour').
    agg(F.count('Hour').alias('TotalOrders')).
    orderBy('TotalOrders', ascending=False).
    show()
)

+----+-----------+
|Hour|TotalOrders|
+----+-----------+
|  19|      12886|
|  12|      12573|
|  11|      12392|
|  18|      12263|
|  20|      12218|
|  13|      12115|
|  14|      10965|
|  10|      10929|
|  21|      10905|
|  17|      10883|
|  16|      10359|
|  15|      10158|
|  22|       8808|
|   9|       8740|
|  23|       6262|
|   8|       6252|
|   7|       4002|
|   0|       3902|
|   6|       2481|
|   1|       2347|
+----+-----------+
only showing top 20 rows



### What products are often sold together in the state “NY”?

In [10]:
(
    sales_df.
    select(
        'Order ID',
        'Product',
        F.split(F.split(F.col('Purchase Address'), ', ').getItem(2), ' ').getItem(0).alias('State')
    ).
    where('State == "NY"').
    groupBy('Order ID', 'State').
    agg(F.collect_list('Product').alias('ProductList')).
    withColumn('ProductListSize', F.size('ProductList')).
    filter('ProductListSize > 1').
    groupBy('ProductList').
    count().
    orderBy('count', ascending=False).
    show(10)
)

+--------------------+-----+
|         ProductList|count|
+--------------------+-----+
|[iPhone, Lightnin...|  127|
|[Google Phone, US...|  124|
|[Google Phone, Wi...|   52|
|[Vareebadd Phone,...|   49|
|[iPhone, Wired He...|   46|
|[iPhone, Apple Ai...|   43|
|[Google Phone, Bo...|   23|
|[Vareebadd Phone,...|   17|
|[Apple Airpods He...|   12|
|[Google Phone, US...|   11|
+--------------------+-----+
only showing top 10 rows

