## Data transformation and Feature Engineering

In [232]:
import os
os.chdir('/home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system')

from pyspark.sql import SparkSession,Window
from pyspark.sql.types import StructField,StructType,StringType,IntegerType,FloatType,TimestampType
from pyspark.sql import functions as F
import seaborn as sns
import matplotlib.pyplot as plt



In [233]:
#  creat spark session

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("ProductRecommendationSystem") \
    .getOrCreate()

In [234]:
schema = StructType([
    StructField('event_time',TimestampType(),False),
    StructField('event_type',StringType(),True),
    StructField('product_id',StringType(),True),
    StructField('category_id',StringType(),True),
    StructField('category_code',StringType(),True),
    StructField('brand',StringType(),True),
    StructField('price',FloatType(),True),
    StructField('user_id',StringType(),True),
    StructField('user_session',StringType(),True)]
)

In [235]:
oct_data = spark.read.csv('/home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system/Artifacts/ingested_data/2019-Oct.csv',header=True,schema=schema)
nov_data = spark.read.csv('/home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system/Artifacts/ingested_data/2019-Nov.csv',header=True,schema=schema)
dec_data = spark.read.csv('/home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system/Artifacts/ingested_data/2019-Dec.csv',header=True,schema=schema)
jan_data = spark.read.csv('/home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system/Artifacts/ingested_data/2020-Jan.csv',header=True,schema=schema)
feb_data = spark.read.csv('/home/azureuser/cloudfiles/code/Users/oviemunooboro/Product_recommendation_system/Artifacts/ingested_data/2020-feb.csv',header=True,schema=schema)

In [236]:
oct_data.show()

+-------------------+----------------+----------+-------------------+-------------+--------+-----+---------+--------------------+
|         event_time|      event_type|product_id|        category_id|category_code|   brand|price|  user_id|        user_session|
+-------------------+----------------+----------+-------------------+-------------+--------+-----+---------+--------------------+
|2019-10-01 00:00:00|            cart|   5773203|1487580005134238553|         NULL|  runail| 2.62|463240011|26dd6e6e-4dac-477...|
|2019-10-01 00:00:03|            cart|   5773353|1487580005134238553|         NULL|  runail| 2.62|463240011|26dd6e6e-4dac-477...|
|2019-10-01 00:00:07|            cart|   5881589|2151191071051219817|         NULL|  lovely|13.48|429681830|49e8d843-adf3-428...|
|2019-10-01 00:00:07|            cart|   5723490|1487580005134238553|         NULL|  runail| 2.62|463240011|26dd6e6e-4dac-477...|
|2019-10-01 00:00:15|            cart|   5881449|1487580013522845895|         NULL|  lovel

In [237]:
datasets = [oct_data,nov_data,dec_data,jan_data,feb_data]

for data in datasets:
    data.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: float (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: float (nullable = true)
 |-- user_id: string (nullable = true)
 |-- user_session: string (nullable = true)

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (

In [238]:
# split into training , validation and test data

training_data = oct_data.union(nov_data).union(dec_data)
validation_data = jan_data
testing_data = feb_data

In [239]:
# check to see if the user_id are both traingin,validation and test data

train_user_id = training_data.select('user_id').distinct()
validation_user_id = validation_data.select('user_id').distinct()
test_user_id = testing_data.select('user_id').distinct()


print(f'total number of users in training data {train_user_id.count()}')
print(f'total number of users in validation data {validation_user_id.count()}')
print(f'total number of users in test data {test_user_id.count()}')

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

total number of users in training data 1012561


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

total number of users in validation data 410073




total number of users in test data 391055


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [240]:
# Users in Validation but not in Training:

val_not_in_train = validation_user_id.subtract(train_user_id)
print(f"Number of users in validation not in training: {val_not_in_train.count()}")



Number of users in validation not in training: 328938


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [241]:
val_not_in_test = test_user_id.subtract(validation_user_id)
print(f"Number of users in test not in validation: {val_not_in_test.count()}")




Number of users in test not in validation: 327020


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [242]:
test_not_in_train = test_user_id.subtract(train_user_id)
print(f"Number of users in test not in train: {test_not_in_train.count()}")



Number of users in test not in train: 331069


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

There is an issue of cold start in the data given that majority of the user in the validation and test data are not present in the training data

In [243]:
train_product_id = training_data.select('product_id').distinct()
val_product_id = validation_data.select('product_id').distinct()
test_product_id = testing_data.select('product_id').distinct()


print(f'total number of product in training data {train_product_id.count()}')
print(f'total number of product in validation data {val_product_id.count()}')
print(f'total number of product in test data {test_product_id.count()}')


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

total number of product in training data 49341


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

total number of product in validation data 45484




total number of product in test data 48579


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [244]:
val_not_in_train = val_product_id.subtract(train_product_id)
print(f"Number of product in validation not in training: {val_not_in_train.count()}")



Number of product in validation not in training: 2087


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [245]:
val_not_in_test = val_product_id.subtract(test_product_id)
print(f"Number of product in validation not in test: {val_not_in_test.count()}")



Number of product in validation not in test: 2345


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [246]:
test_not_in_train = test_product_id.subtract(train_product_id)
print(f"Number of product in test not in train: {test_not_in_train.count()}")



Number of product in test not in train: 4623


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [247]:
# check missing values 
train_missing_values = training_data.select([F.sum(F.col(c).isNull().cast('int')).alias(c) for c in training_data.columns])
val_missing_values = validation_data.select([F.sum(F.col(c).isNull().cast('int')).alias(c) for c in validation_data.columns])


train_missing_values.show() , val_missing_values.show()

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|     12069716|5155579|    0|      0|        2229|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+





+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|      4190033|1775630|    0|      0|        1314|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+



                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

(None, None)

In [248]:
# making a copy of the training , val dataframe and caching into memory before begining preprocessing

new_training_data = training_data.select('*')
new_val_data = validation_data.select('*')

# cache into memory

new_training_data = new_training_data.cache()
new_val_data = new_val_data.cache()



24/12/18 19:32:39 WARN CacheManager: Asked to cache already cached data.
24/12/18 19:32:39 WARN CacheManager: Asked to cache already cached data.


####  handling the missing values

In [249]:
# handling the missing values

new_training_data = new_training_data.drop('category_code')
new_val_data = new_val_data.drop('category_code')


new_training_data = new_training_data.fillna({'brand':'UNKNOWN'})
new_val_data = new_val_data.fillna({'brand':'UNKNOWN'})

new_training_data = new_training_data.dropna(subset=['user_session'])
new_val_data = new_val_data.dropna(subset=['user_session'])

In [250]:
# check

train_missing_values = new_training_data.select([F.sum(F.col(c).isNull().cast('int')).alias(c) for c in new_training_data.columns])
val_missing_values = new_val_data.select([F.sum(F.col(c).isNull().cast('int')).alias(c) for c in new_val_data.columns])

train_missing_values.show() ,  val_missing_values.show()

24/12/18 19:32:40 WARN MemoryStore: Not enough space to cache rdd_257_7 in memory! (computed 13.3 MiB so far)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              (1 + 4) / 13]

+----------+----------+----------+-----------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-----+-----+-------+------------+
|         0|         0|         0|          0|    0|    0|      0|           0|
+----------+----------+----------+-----------+-----+-----+-------+------------+



[Stage 1852:>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               (0 + 4) / 4]

+----------+----------+----------+-----------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-----+-----+-------+------------+
|         0|         0|         0|          0|    0|    0|      0|           0|
+----------+----------+----------+-----------+-----+-----+-------+------------+



                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

(None, None)

In [251]:
#### check for duplicates 

training_duplicates = new_training_data.count() - new_training_data.dropDuplicates().count()
val_duplicates = new_val_data.count() - new_val_data.dropDuplicates().count()


print(f'Number of duplicates rows in training data : {training_duplicates}')
print(f'Number of duplicates rows in val data : {val_duplicates}')

24/12/18 19:32:43 WARN MemoryStore: Not enough space to cache rdd_257_4 in memory! (computed 26.3 MiB so far)
24/12/18 19:32:43 WARN MemoryStore: Not enough space to cache rdd_257_7 in memory! (computed 20.1 MiB so far)
24/12/18 19:32:43 WARN MemoryStore: Not enough space to cache rdd_257_6 in memory! (computed 26.0 MiB so far)
24/12/18 19:32:45 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

Number of duplicates rows in training data : 643522
Number of duplicates rows in val data : 224922


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [252]:
duplicates  =  new_training_data.groupBy(new_training_data.columns).count().filter("count > 1")
duplicates.show()

24/12/18 19:33:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       (0 + 4) / 13]

+-------------------+----------------+----------+-------------------+----------+-----+---------+--------------------+-----+
|         event_time|      event_type|product_id|        category_id|     brand|price|  user_id|        user_session|count|
+-------------------+----------------+----------+-------------------+----------+-----+---------+--------------------+-----+
|2019-10-01 03:01:10|remove_from_cart|   5869151|1487580005092295511|cosmoprofi| 7.94|520408156|be0c7abd-aa38-45c...|    2|
|2019-10-01 04:03:13|remove_from_cart|   5700037|1487580009286598681|    runail|  0.4|526285483|1fa58edc-6d5c-43d...|    2|
|2019-10-01 06:30:44|remove_from_cart|   5687864|1487580006409307030|  airnails| 2.86|361710584|3fa1d835-8dc4-4dd...|    2|
|2019-10-01 07:28:27|remove_from_cart|   5827372|1487580009445982239|   UNKNOWN| 1.52|543404022|ab1aa345-902a-4ee...|    2|
|2019-10-01 07:37:55|remove_from_cart|   5760766|1487580005134238553|    runail| 2.62|548825399|0288ab87-831a-4ce...|    2|
|2019-10

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [253]:
event_type_counts = duplicates.groupBy('event_type').count()

event_type_counts.show()

24/12/18 19:34:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       (0 + 4) / 13]

+----------------+------+
|      event_type| count|
+----------------+------+
|        purchase|   601|
|            view|   353|
|            cart| 54812|
|remove_from_cart|555921|
+----------------+------+



                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [254]:
new_training_data = new_training_data.dropDuplicates()


In [255]:
new_val_data = new_val_data.dropDuplicates()

In [256]:
# check price column

price_data = new_training_data.filter(new_training_data['price'] < 0 )

price_data.show()
print(f' the number of rows with negative prices : {price_data.count()}')

24/12/18 19:34:35 WARN MemoryStore: Not enough space to cache rdd_257_4 in memory! (computed 26.3 MiB so far)

24/12/18 19:34:35 WARN MemoryStore: Not enough space to cache rdd_257_7 in memory! (computed 13.3 MiB so far)
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          

+-------------------+----------+----------+-------------------+-------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|  brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+-------+------+---------+--------------------+
|2019-10-02 08:30:03|  purchase|   5716855|1487580014042939619|UNKNOWN| -7.94|550375225|5ddec778-9464-451...|
|2019-10-01 19:10:56|  purchase|   5716857|1487580014042939619|UNKNOWN|-23.81|552507528|dcdd60c6-1a70-442...|
|2019-10-03 18:25:39|  purchase|   5670257|1487580014042939619|UNKNOWN|-15.87|556383221|4333d203-bc4d-4d0...|
|2019-10-03 17:37:04|  purchase|   5716859|1487580014042939619|UNKNOWN|-47.62|555414763|479149eb-1807-417...|
|2019-10-14 17:33:24|  purchase|   5716861|1487580014042939619|UNKNOWN|-79.37|541122983|b60f777d-afca-429...|
|2019-10-10 14:33:29|  purchase|   5716855|1487580014042939619|UNKNOWN| -7.94|558797258|a406cf28-f04b-436...|
|2019-10-1

24/12/18 19:34:35 WARN MemoryStore: Not enough space to cache rdd_257_7 in memory! (computed 20.1 MiB so far)
24/12/18 19:34:35 WARN MemoryStore: Not enough space to cache rdd_257_4 in memory! (computed 45.3 MiB so far)
24/12/18 19:34:36 WARN MemoryStore: Not enough space to cache rdd_257_11 in memory! (computed 27.3 MiB so far)


 the number of rows with negative prices : 53


In [257]:
# convert negative prices to positive values

new_training_data = new_training_data.withColumn('price',F.abs(new_training_data['price']))
new_val_data = new_val_data.withColumn('price',F.abs(new_val_data['price']))

In [258]:
price_data = new_training_data.filter(new_training_data['price'] < 0 )

price_data.show()
print(f' the number of rows with negative prices : {price_data.count()}')

24/12/18 19:34:37 WARN MemoryStore: Not enough space to cache rdd_257_4 in memory! (computed 45.3 MiB so far)

+----------+----------+----------+-----------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-----+-----+-------+------------+
+----------+----------+----------+-----------+-----+-----+-------+------------+



24/12/18 19:34:39 WARN MemoryStore: Not enough space to cache rdd_257_4 in memory! (computed 7.0 MiB so far)

 the number of rows with negative prices : 0


                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [259]:
new_training_data.show()

24/12/18 19:34:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       (0 + 4) / 13]

+-------------------+----------------+----------+-------------------+---------+-----+---------+--------------------+
|         event_time|      event_type|product_id|        category_id|    brand|price|  user_id|        user_session|
+-------------------+----------------+----------+-------------------+---------+-----+---------+--------------------+
|2019-10-01 00:08:01|            cart|   5844397|1487580006317032337|  UNKNOWN| 2.38|385985999|d30965e8-1101-44a...|
|2019-10-01 00:14:32|            cart|   5864474|1487580011652186237|  UNKNOWN| 27.3|550021912|7698b35e-e2dc-486...|
|2019-10-01 00:19:17|        purchase|   5803451|1487580005461394279|  bluesky| 5.24|250032160|d87285d5-0487-4cd...|
|2019-10-01 00:20:12|            view|   5864871|1889472915104072007|     kiss|13.65|555449542|800d2386-e47a-411...|
|2019-10-01 00:29:11|            view|   5549808|1487580005595612013|  UNKNOWN|  3.0|541588084|7019c33f-6a8c-438...|
|2019-10-01 00:30:21|            view|   5773221|148758000513423

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

## create user features

In [260]:
# purchases made in the last 2 days

train_purchase = new_training_data.filter(new_training_data['event_type'] == 'purchase') 

train_purchase_per_2day = train_purchase.withColumn('two_day_purchase', 
                                                             F.from_unixtime(
                                                                 F.floor(F.unix_timestamp(F.col('event_time')) / (2 * 86400))* (2 * 86400) 
                                                                 )
                                                             )

train_purchase_per_2day = train_purchase_per_2day.groupBy('user_id','two_day_purchase').agg(F.count('*').alias('Num_purchase_per_2day'))
train_purchase_per_2day = train_purchase_per_2day.withColumnRenamed("user_id", "train_user_id") # rename 'user_id to prevent ambitiuity

train_purchase_per_2day.show()

24/12/18 19:35:06 WARN MemoryStore: Not enough space to cache rdd_257_4 in memory! (computed 26.3 MiB so far)                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              (1 + 4) / 13]

+-------------+-------------------+---------------------+
|train_user_id|   two_day_purchase|Num_purchase_per_2day|
+-------------+-------------------+---------------------+
|    333709300|2019-10-01 00:00:00|                   30|
|    445702327|2019-10-01 00:00:00|                   19|
|    556122723|2019-10-03 00:00:00|                    8|
|    545287375|2019-10-05 00:00:00|                    5|
|    556850854|2019-10-03 00:00:00|                    6|
|    450686495|2019-10-03 00:00:00|                   15|
|    556271034|2019-10-03 00:00:00|                   14|
|    412895724|2019-10-05 00:00:00|                    4|
|    556983721|2019-10-05 00:00:00|                    7|
|    557668633|2019-10-07 00:00:00|                    7|
|    501762368|2019-10-03 00:00:00|                    4|
|    557760674|2019-10-07 00:00:00|                    9|
|    546741559|2019-10-07 00:00:00|                    5|
|    289331807|2019-10-05 00:00:00|                   21|
|    395968291

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [261]:
# Rename user_id in train_purchase_per_2day to avoid duplication in final result
train_purchase_per_2day = train_purchase_per_2day.withColumnRenamed("user_id", "train_user_id")

new_training_data = new_training_data.join(
    train_purchase_per_2day,
    (new_training_data['user_id'] == train_purchase_per_2day['train_user_id']) & 
    (F.floor(F.unix_timestamp(new_training_data['event_time']) / (2*86400)) ==  F.floor(F.unix_timestamp(train_purchase_per_2day['two_day_purchase']) / (2 * 86400))),
    how='left'
).drop('two_day_purchase','train_user_id')

# filling missing missin values in the num_purchase_per2_day column with 0 meaning no purchase where made in the 2 day interval
new_training_data = new_training_data.na.fill({'Num_purchase_per_2day' : 0 })

new_training_data.show()


24/12/18 19:35:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.                                                                                                                                                                                                                                                                                                                                                                                   (0 + 4) / 13][Stage 1905:>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          (0 + 0) / 13]

+-------------------+----------------+----------+-------------------+---------+-----+---------+--------------------+---------------------+
|         event_time|      event_type|product_id|        category_id|    brand|price|  user_id|        user_session|Num_purchase_per_2day|
+-------------------+----------------+----------+-------------------+---------+-----+---------+--------------------+---------------------+
|2019-10-01 01:32:24|            cart|   5723501|1487580005134238553|   runail| 2.62|555452209|8e9b90bc-3b3a-489...|                   64|
|2019-10-01 02:02:15|remove_from_cart|   5850335|1487580007852147670|  UNKNOWN| 1.11|313639969|b92ea415-66ed-40b...|                    0|
|2019-10-01 00:30:21|            view|   5773221|1487580005134238553|   runail| 2.62|446404021|abac4e23-d977-47d...|                   13|
|2019-10-01 00:34:32|remove_from_cart|   5773221|1487580005134238553|   runail| 2.62|446404021|abac4e23-d977-47d...|                   13|
|2019-10-01 03:28:29|      

In [262]:
# average purchase price 

avg_purchase_price = train_purchase.groupBy('user_id').agg(F.format_number(F.mean('price'),2).cast(FloatType()).alias('Avg_purchase_price'))
avg_purchase_price = avg_purchase_price.withColumnRenamed("user_id", "train_user_id")

new_training_data = new_training_data.join(
    avg_purchase_price,
    new_training_data['user_id'] == avg_purchase_price['train_user_id'],
    how='left'
).drop('train_user_id')

new_training_data = new_training_data.na.fill({'Avg_purchase_price' : 0})

new_training_data.show()

24/12/18 19:35:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.                                                                                                                                                                                                            (0 + 4) / 13][Stage 1921:>                                                                                                                                                                                                                                                                                                                   (0 + 0) / 13][Stage 1922:>                                                                                                                                                                                                                                                                                                                   (0 + 0) / 13]


+-------------------+----------------+----------+-------------------+---------+-----+---------+--------------------+---------------------+------------------+
|         event_time|      event_type|product_id|        category_id|    brand|price|  user_id|        user_session|Num_purchase_per_2day|Avg_purchase_price|
+-------------------+----------------+----------+-------------------+---------+-----+---------+--------------------+---------------------+------------------+
|2019-10-01 01:32:24|            cart|   5723501|1487580005134238553|   runail| 2.62|555452209|8e9b90bc-3b3a-489...|                   64|              2.53|
|2019-10-01 02:02:15|remove_from_cart|   5850335|1487580007852147670|  UNKNOWN| 1.11|313639969|b92ea415-66ed-40b...|                    0|              2.35|
|2019-10-01 00:30:21|            view|   5773221|1487580005134238553|   runail| 2.62|446404021|abac4e23-d977-47d...|                   13|               4.6|
|2019-10-01 00:34:32|remove_from_cart|   5773221|148

In [263]:
# most purchase category of user 

# groupby user id and brand and create a window specification to get the top three brands
brand_count = train_purchase.groupBy('user_id','brand').agg(F.count('brand').alias('brand_count'))

window_spec = Window.partitionBy('User_id').orderBy(F.desc('brand_count'))
ranked_brands = brand_count.withColumn('rank' ,F.row_number().over(window_spec))
top_brands = ranked_brands.filter(F.col("rank") <= 3)

top_brands = top_brands.groupBy("user_id").pivot("rank", [1, 2, 3]).agg(F.first("brand"))

# rename the columns
top_brands = top_brands.select(
    F.col("user_id"),
    F.col("1").alias("1_most_purchased_brand"),
    F.col("2").alias("2_most_purchased_brand"),
    F.col("3").alias("3_most_purchased_brand")
)

# Get the most purchased brand for each user
most_purchased_brand = brand_count.withColumn('rank', F.row_number().over(Window.partitionBy('user_id').orderBy(F.desc('brand_count'))))
most_purchased_brand = most_purchased_brand.filter(F.col('rank') == 1).select('user_id', 'brand')

# Join the top brands with the most purchased brand
top_brands_with_most_purchased = top_brands.join(most_purchased_brand, on='user_id', how='left')
top_brands_with_most_purchased = top_brands_with_most_purchased.withColumnsRenamed({'user_id':'train_user_id','brand':'preferred_brand'})
# Fill missing values with the user's most purchased brand
top_brands_with_most_purchased = top_brands_with_most_purchased.fillna({
    '1_most_purchased_brand': 'preferred_brand',
    '2_most_purchased_brand': 'preferred_brand',
    '3_most_purchased_brand': 'preferred_brand'
})


top_brands_with_most_purchased.show()

result = new_training_data.join(
    top_brands_with_most_purchased,
    new_training_data['user_id'] == top_brands_with_most_purchased['train_user_id'],
    how = 'left'
).drop('train_user_id','preferred_brand')

24/12/18 19:36:40 WARN MemoryStore: Failed to reserve initial memory threshold of 1024.0 KiB for computing block rdd_257_7 in memory.                                                                                                                                                                                                                                                                                                                                                                  (1 + 4) / 13][Stage 1943:>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          (0 + 0) / 13]

+-------------+----------------------+----------------------+----------------------+---------------+
|train_user_id|1_most_purchased_brand|2_most_purchased_brand|3_most_purchased_brand|preferred_brand|
+-------------+----------------------+----------------------+----------------------+---------------+
|    100787781|               UNKNOWN|               markell|                 estel|        UNKNOWN|
|     10079204|                   cnd|                kaaral|       preferred_brand|            cnd|
|    101025416|                 irisk|               UNKNOWN|                   cnd|          irisk|
|    103274988|             bpw.style|               UNKNOWN|             freedecor|      bpw.style|
|    103540490|               UNKNOWN|       preferred_brand|       preferred_brand|        UNKNOWN|
|    105075440|              skinlite|               UNKNOWN|                 dizao|       skinlite|
|    107113173|              farmstay|              glysolid|              skinlite|       

In [264]:
result.show()

24/12/18 19:36:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.                                                                                                                                                                                                            (0 + 4) / 13][Stage 1963:>                                                                                                                                                                                                                                                                                                                   (0 + 0) / 13][Stage 1964:>                                                                                                                                                                                                                                                                                                                   (0 + 0) / 13]


+-------------------+----------------+----------+-------------------+---------+-----+---------+--------------------+---------------------+------------------+----------------------+----------------------+----------------------+
|         event_time|      event_type|product_id|        category_id|    brand|price|  user_id|        user_session|Num_purchase_per_2day|Avg_purchase_price|1_most_purchased_brand|2_most_purchased_brand|3_most_purchased_brand|
+-------------------+----------------+----------+-------------------+---------+-----+---------+--------------------+---------------------+------------------+----------------------+----------------------+----------------------+
|2019-10-01 02:02:15|remove_from_cart|   5850335|1487580007852147670|  UNKNOWN| 1.11|313639969|b92ea415-66ed-40b...|                    0|              2.35|                runail|               UNKNOWN|               bluesky|
|2019-10-01 03:28:29|            view|   5881587|2151191071051219817|   lovely|14.84|3559341

24/12/18 19:06:59 WARN MemoryStore: Not enough space to cache rdd_257_4 in memory! (computed 45.3 MiB so far)                                                                                                                                                                                                                                                                                                                                                                                          (2 + 4) / 13][Stage 1531:>                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          (0 + 0) / 13]

+---------+----------------------+----------------------+----------------------+---------+
|  user_id|1_most_purchased_brand|2_most_purchased_brand|3_most_purchased_brand|    brand|
+---------+----------------------+----------------------+----------------------+---------+
|100787781|               UNKNOWN|               markell|                 estel|  UNKNOWN|
| 10079204|                   cnd|                kaaral|                 brand|      cnd|
|101025416|                 irisk|               UNKNOWN|                   cnd|    irisk|
|103274988|             bpw.style|               UNKNOWN|             freedecor|bpw.style|
|103540490|               UNKNOWN|                 brand|                 brand|  UNKNOWN|
|105075440|              skinlite|               UNKNOWN|                 dizao| skinlite|
|107113173|              farmstay|              glysolid|              skinlite| farmstay|
|111782974|              severina|                runail|                 brand| severina|