# E-Commerce User Behavior Analysis from Multi Category Store

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.context import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession, SQLContext, Row
import os

In [3]:
conf = SparkConf()

In [4]:
sc = SparkContext(conf = conf)
sc.setLogLevel("Error")

24/03/27 18:30:28 WARN Utils: Your hostname, Dynaneshwaris-MacBook-Air.local resolves to a loopback address: 127.0.0.1; using 10.0.0.10 instead (on interface en0)
24/03/27 18:30:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/27 18:30:29 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
spark = SparkSession.builder \
    .appName("2019-Oct") \
    .getOrCreate()

In [6]:
df = spark.read.csv("2019-Oct.csv", header=True, inferSchema=True)
df.show()

                                                                                

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-09-30 20:00:00|      view|  44600062|2103807459595387724|                NULL|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|
|2019-09-30 20:00:00|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|
|2019-09-30 20:00:01|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|  543.1|519107250|566511c2-e2e3-422...|
|2019-09-30 20:00:01|      view|   1307067|2053013558920217191|  computers.notebook|  lenovo| 251.74|550050854|7c90fc70-0e80-459...|
|2019-09-30 20:00:04|      view|   1004237|2053013555631882655|electr

In [7]:
df = df.drop("event_time", "user_session", "category_code", "brand")
df.show()

+----------+----------+-------------------+-------+---------+
|event_type|product_id|        category_id|  price|  user_id|
+----------+----------+-------------------+-------+---------+
|      view|  44600062|2103807459595387724|  35.79|541312140|
|      view|   3900821|2053013552326770905|   33.2|554748717|
|      view|  17200506|2053013559792632471|  543.1|519107250|
|      view|   1307067|2053013558920217191| 251.74|550050854|
|      view|   1004237|2053013555631882655|1081.98|535871217|
|      view|   1480613|2053013561092866779| 908.62|512742880|
|      view|  17300353|2053013553853497655| 380.96|555447699|
|      view|  31500053|2053013558031024687|  41.16|550978835|
|      view|  28719074|2053013565480109009| 102.71|520571932|
|      view|   1004545|2053013555631882655| 566.01|537918940|
|      view|   2900536|2053013554776244595|  51.46|555158050|
|      view|   1005011|2053013555631882655| 900.64|530282093|
|      view|   3900746|2053013552326770905| 102.38|555444559|
|      v

In [8]:
num_rows = df.count()

columns = df.columns

num_columns = len(columns)

print("Number of rows:", num_rows)
print("Number of columns:", num_columns)



Number of rows: 42448764
Number of columns: 5


                                                                                

In [9]:
for col_names in df.columns:
    null_count = df.filter(col(col_names).isNull()).count()
    print("Number of Null in column:", col_names, null_count)

                                                                                

Number of Null in column: event_type 0


                                                                                

Number of Null in column: product_id 0


                                                                                

Number of Null in column: category_id 0


                                                                                

Number of Null in column: price 0




Number of Null in column: user_id 0


                                                                                

In [10]:
df.groupBy("event_type").count().show()



+----------+--------+
|event_type|   count|
+----------+--------+
|  purchase|  742849|
|      view|40779399|
|      cart|  926516|
+----------+--------+



                                                                                

In [11]:
df.printSchema()

root
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)



In [12]:
from pyspark.ml.feature import(
    StringIndexer, 
    VectorAssembler
)

In [13]:
indexer = StringIndexer(inputCol = "event_type", outputCol = "Event_Type")
indexed = indexer.fit(df).transform(df)

                                                                                

In [14]:
indexed.show()

+----------+----------+-------------------+-------+---------+
|Event_Type|product_id|        category_id|  price|  user_id|
+----------+----------+-------------------+-------+---------+
|       0.0|  44600062|2103807459595387724|  35.79|541312140|
|       0.0|   3900821|2053013552326770905|   33.2|554748717|
|       0.0|  17200506|2053013559792632471|  543.1|519107250|
|       0.0|   1307067|2053013558920217191| 251.74|550050854|
|       0.0|   1004237|2053013555631882655|1081.98|535871217|
|       0.0|   1480613|2053013561092866779| 908.62|512742880|
|       0.0|  17300353|2053013553853497655| 380.96|555447699|
|       0.0|  31500053|2053013558031024687|  41.16|550978835|
|       0.0|  28719074|2053013565480109009| 102.71|520571932|
|       0.0|   1004545|2053013555631882655| 566.01|537918940|
|       0.0|   2900536|2053013554776244595|  51.46|555158050|
|       0.0|   1005011|2053013555631882655| 900.64|530282093|
|       0.0|   3900746|2053013552326770905| 102.38|555444559|
|       

In [15]:
assembler = VectorAssembler(inputCols=["product_id", "category_id", "price", "user_id"], outputCol="features")
data = assembler.transform(indexed)

In [16]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

kmeans = KMeans(featuresCol="features", k=3)  
model = kmeans.fit(data)

predictions = model.transform(data)

evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = {:.2f}".format(silhouette))

print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)



Silhouette with squared euclidean distance = 0.98
Cluster Centers: 
[8.96876017e+06 2.05317315e+18 2.93991366e+02 5.33595561e+08]
[3.78016307e+07 2.10491584e+18 1.72294601e+02 5.32173873e+08]
[3.45795435e+07 2.15294218e+18 3.01964853e+02 5.33087647e+08]


                                                                                

In [17]:
total_count = data.count()

train_ratio = 0.7

train_count = int(train_ratio * total_count)
test_count = total_count - train_count

train_data = data.limit(train_count)
test_data = data.subtract(train_data)

                                                                                

In [18]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

rf = RandomForestClassifier(labelCol="Event_Type", featuresCol="features", numTrees=10)
model = rf.fit(train_data)

predictions = model.transform(test_data)

evaluator = MulticlassClassificationEvaluator(labelCol="Event_Type", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = {:.2f}%".format(accuracy * 100))



Test Accuracy = 95.54%


                                                                                