In [1]:
from pyspark.sql import SparkSession
# Build (or reuse, via getOrCreate) a Spark session named "orders1" on the 'local' master.
spark = (
    SparkSession.builder
    .appName("orders1")
    .master('local')
    .getOrCreate()
)

In [2]:
# Load every file matched by the orders glob as an RDD of text lines.
# NOTE(review): this is an absolute path on a local D: drive — consider making it configurable.
orders_rdd = spark.sparkContext.textFile(
    "D:\\Data_Engg\\ultimate_datasets\\trendytech\\retail_db\\orders\\*"
)

In [4]:
# Preview the first five raw order lines.
# take(5) fetches only five elements instead of collecting the whole RDD to the driver.
orders_rdd.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [5]:
# Turn each order line into a (status, 1) pair; the status is the 4th comma-separated field.
mapped_rdd = orders_rdd.map(
    lambda line: (line.split(",")[3], 1)
)

In [7]:
# Preview the first five (status, 1) pairs.
# take(5) fetches only five elements instead of collecting the whole RDD to the driver.
mapped_rdd.take(5)

[('CLOSED', 1),
 ('PENDING_PAYMENT', 1),
 ('COMPLETE', 1),
 ('CLOSED', 1),
 ('COMPLETE', 1)]

In [8]:
# Add up the 1s for each status key, giving one (status, order_count) pair per status.
reduced_rdd = mapped_rdd.reduceByKey(
    lambda left, right: left + right
)

#### Number of orders in each status category (COMPLETE, PENDING_PAYMENT, CLOSED, ...)

In [9]:
# Bring the aggregated (status, order_count) pairs back to the driver for display.
reduced_rdd.collect()

[('CLOSED', 7556),
 ('PENDING_PAYMENT', 15030),
 ('COMPLETE', 22899),
 ('PROCESSING', 8275),
 ('PAYMENT_REVIEW', 729),
 ('PENDING', 7610),
 ('ON_HOLD', 3798),
 ('CANCELED', 1428),
 ('SUSPECTED_FRAUD', 1558)]

In [10]:
# Order the (status, order_count) pairs by count, largest first.
reduced_sorted = reduced_rdd.sortBy(
    lambda pair: pair[1],
    ascending=False,
)

In [11]:
# Display the status counts in descending order of count.
reduced_sorted.collect()

[('COMPLETE', 22899),
 ('PENDING_PAYMENT', 15030),
 ('PROCESSING', 8275),
 ('PENDING', 7610),
 ('CLOSED', 7556),
 ('ON_HOLD', 3798),
 ('SUSPECTED_FRAUD', 1558),
 ('CANCELED', 1428),
 ('PAYMENT_REVIEW', 729)]

#### Premium customers (top 10 customers who placed the most orders)

In [12]:
# Turn each order line into a (customer_id, 1) pair; the customer id is the 3rd comma-separated field.
customers_mapped = orders_rdd.map(
    lambda line: (line.split(",")[2], 1)
)

In [14]:
# Preview the first five (customer_id, 1) pairs.
# take(5) fetches only five elements instead of collecting the whole RDD to the driver.
customers_mapped.take(5)

[('11599', 1), ('256', 1), ('12111', 1), ('8827', 1), ('11318', 1)]

In [15]:
# Add up the 1s for each customer id, giving one (customer_id, order_count) pair per customer.
customers_aggregated = customers_mapped.reduceByKey(
    lambda left, right: left + right
)

In [16]:
# Peek at 20 of the (customer_id, order_count) pairs (not yet sorted).
customers_aggregated.take(20)

[('11599', 5),
 ('256', 10),
 ('12111', 6),
 ('8827', 6),
 ('11318', 6),
 ('7130', 7),
 ('4530', 10),
 ('2911', 6),
 ('5657', 12),
 ('5648', 13),
 ('918', 5),
 ('1837', 6),
 ('9149', 4),
 ('9842', 7),
 ('2568', 6),
 ('7276', 5),
 ('2667', 7),
 ('1205', 7),
 ('9488', 7),
 ('9198', 7)]

In [17]:
# Order the (customer_id, order_count) pairs by order count, largest first.
customers_sorted = customers_aggregated.sortBy(
    lambda pair: pair[1],
    ascending=False,
)

In [18]:
# Show the first 10 entries of the descending sort, i.e. the customers with the most orders.
customers_sorted.take(10)

[('6316', 16),
 ('12431', 16),
 ('5897', 16),
 ('569', 16),
 ('4320', 15),
 ('5283', 15),
 ('12284', 15),
 ('5654', 15),
 ('221', 15),
 ('5624', 15)]

#### Distinct customers who placed at least 1 order

In [19]:
# Extract the customer id (3rd comma-separated field) from every line, then drop duplicates.
distinct_customers = (
    orders_rdd
    .map(lambda line: line.split(",")[2])
    .distinct()
)

In [20]:
# Number of unique customer ids that appear in the orders data.
distinct_customers.count()

12405

In [21]:
# Total number of order lines, for comparison with the distinct-customer count.
orders_rdd.count()

68883

#### Customers with the highest number of CLOSED orders

In [22]:
# Keep only the order lines whose status (4th comma-separated field) is exactly 'CLOSED'.
filtered_orders = orders_rdd.filter(
    lambda line: line.split(",")[3] == 'CLOSED'
)

In [23]:
# Peek at 20 of the CLOSED order lines to confirm the filter.
filtered_orders.take(20)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '12,2013-07-25 00:00:00.0,1837,CLOSED',
 '18,2013-07-25 00:00:00.0,1205,CLOSED',
 '24,2013-07-25 00:00:00.0,11441,CLOSED',
 '25,2013-07-25 00:00:00.0,9503,CLOSED',
 '37,2013-07-25 00:00:00.0,5863,CLOSED',
 '51,2013-07-25 00:00:00.0,12271,CLOSED',
 '57,2013-07-25 00:00:00.0,7073,CLOSED',
 '61,2013-07-25 00:00:00.0,4791,CLOSED',
 '62,2013-07-25 00:00:00.0,9111,CLOSED',
 '87,2013-07-25 00:00:00.0,3065,CLOSED',
 '90,2013-07-25 00:00:00.0,9131,CLOSED',
 '101,2013-07-25 00:00:00.0,5116,CLOSED',
 '116,2013-07-26 00:00:00.0,8763,CLOSED',
 '129,2013-07-26 00:00:00.0,9937,CLOSED',
 '133,2013-07-26 00:00:00.0,10604,CLOSED',
 '191,2013-07-26 00:00:00.0,16,CLOSED',
 '201,2013-07-26 00:00:00.0,9055,CLOSED',
 '211,2013-07-26 00:00:00.0,10372,CLOSED']

In [24]:
# Turn each CLOSED order line into a (customer_id, 1) pair; the customer id is the 3rd field.
filtered_mapped = filtered_orders.map(
    lambda line: (line.split(",")[2], 1)
)

In [26]:
# Add up the 1s for each customer id, giving the number of CLOSED orders per customer.
filtered_aggregated = filtered_mapped.reduceByKey(
    lambda left, right: left + right
)

In [27]:
# Peek at 20 of the (customer_id, closed_order_count) pairs (not yet sorted).
filtered_aggregated.take(20)

[('11599', 1),
 ('8827', 1),
 ('1837', 2),
 ('1205', 3),
 ('11441', 2),
 ('9503', 1),
 ('5863', 1),
 ('12271', 2),
 ('7073', 1),
 ('4791', 1),
 ('9111', 1),
 ('3065', 2),
 ('9131', 1),
 ('5116', 2),
 ('8763', 1),
 ('9937', 1),
 ('10604', 2),
 ('16', 1),
 ('9055', 3),
 ('10372', 3)]

In [28]:
# Order the (customer_id, closed_order_count) pairs by count, largest first.
filtered_sorted = filtered_aggregated.sortBy(
    lambda pair: pair[1],
    ascending=False,
)

In [29]:
# Show the first 10 entries of the descending sort, i.e. the customers with the most CLOSED orders.
filtered_sorted.take(10)

[('1833', 6),
 ('1363', 5),
 ('1687', 5),
 ('5493', 5),
 ('5011', 4),
 ('8974', 4),
 ('2321', 4),
 ('3736', 4),
 ('8368', 4),
 ('9740', 4)]