In [2]:
# basic imports 

import os # OS e.g directory structure
import sys
import numpy as np # linear algebra
import scipy as sc  # scientific computing
import pandas as pd # data processing, file I/O
import seaborn as sns  # visualization
import matplotlib.pyplot as plt # visualization
import math
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Spark related imports

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.ml.feature import StringIndexer
from pyspark.ml.fpm import FPGrowth
from pyspark.ml.fpm import PrefixSpan
from pyspark.ml.evaluation import RegressionEvaluator

In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("abd_recommendation").getOrCreate()

Read parquet from data preparation

In [4]:
products = spark.read.parquet("products")
brands = spark.read.parquet("brands")
categories = spark.read.parquet("categories")

In [57]:
df_user_product = spark.read.parquet("user_products")
df_user_brand = spark.read.parquet("user_brands")
df_user_categories = spark.read.parquet("user_categories")
df_user_product_v = spark.read.parquet("user_products_v")
df_user_brand_v = spark.read.parquet("user_brands_v")
df_user_categories_v = spark.read.parquet("user_categories_v")

Creating the market-basktet model for user vs (products, brands, categories) purchased and viewed

In [68]:
fpGrowth_u_p = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.001)
model_u_p = fpGrowth_u_p.fit(df_user_product)

In [69]:
# Display frequent itemsets.
model_u_p.freqItemsets.orderBy(col("freq").desc()).show()

# Display generated association rules.
model_u_p.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30)


+---------+----+
|    items|freq|
+---------+----+
|[1004856]| 117|
|[1004767]|  91|
|[1005115]|  55|
|[1004870]|  43|
|[1002544]|  41|
|[1004833]|  40|
|[4804056]|  38|
|[1004249]|  36|
|[1005100]|  27|
|[1005105]|  27|
|[1004741]|  27|
|[1004873]|  23|
|[1004836]|  22|
|[1004750]|  22|
|[1004739]|  22|
|[4804295]|  21|
|[1003317]|  19|
|[1002633]|  16|
|[1004838]|  16|
|[1004858]|  16|
+---------+----+
only showing top 20 rows

+----------+----------+--------------------+-------------------+
|antecedent|consequent|          confidence|               lift|
+----------+----------+--------------------+-------------------+
| [3601425]| [3601405]|  0.6666666666666666|  301.1666666666667|
| [4804660]| [4804056]|  0.6666666666666666|  31.70175438596491|
| [1003772]| [1004767]|  0.6666666666666666| 13.238095238095237|
| [3601405]| [3601425]|                 0.5| 301.16666666666663|
| [1002629]| [1002544]|  0.2222222222222222|  9.794037940379404|
| [1004836]| [1004767]| 0.18181818181818182| 3

In [70]:
fpGrowth_u_b = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.0)
model_u_b = fpGrowth_u_b.fit(df_user_brand)

In [71]:
# Display frequent itemsets.
model_u_b.freqItemsets.orderBy(col("freq").desc()).show()

# Display generated association rules.
model_u_b.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30)


+----------------+----+
|           items|freq|
+----------------+----+
|       [samsung]| 597|
|         [apple]| 439|
|        [xiaomi]| 182|
|        [huawei]|  98|
|          [oppo]|  36|
|            [lg]|  25|
|       [indesit]|  24|
|          [acer]|  19|
|         [midea]|  19|
|         [bosch]|  16|
|[apple, samsung]|  16|
|         [artel]|  16|
|            [hp]|  16|
|        [lenovo]|  15|
|      [elenberg]|  13|
|         [haier]|  12|
|          [vivo]|  11|
|         [casio]|   9|
|       [philips]|   9|
|          [beko]|   9|
+----------------+----+
only showing top 20 rows

+----------+----------+--------------------+-------------------+
|antecedent|consequent|          confidence|               lift|
+----------+----------+--------------------+-------------------+
|  [pulser]| [samsung]|  0.6666666666666666| 2.0178671133445003|
|   [haier]|   [apple]| 0.16666666666666666| 0.6860288534548216|
|   [bosch]|   [apple]|               0.125| 0.5145216400911161|
| [indes

In [73]:
fpGrowth_u_c = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.0)
model_u_c = fpGrowth_u_c.fit(df_user_categories)

In [76]:
# Display frequent itemsets.
model_u_c.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

# Display generated association rules.
model_u_c.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30,truncate=False)

+------------------------------------------+----+
|items                                     |freq|
+------------------------------------------+----+
|[2053013555631882655]                     |1124|
|[2053013554658804075]                     |117 |
|[2053013554415534427]                     |66  |
|[2053013563810775923]                     |65  |
|[2053013565983425517]                     |51  |
|[2053013558920217191]                     |43  |
|[2053013553341792533]                     |42  |
|[2053013563911439225]                     |18  |
|[2053013555262783879]                     |15  |
|[2172371436436455782]                     |15  |
|[2053013555573162395]                     |13  |
|[2053013561579406073]                     |13  |
|[2053013554776244595]                     |13  |
|[2053013552293216471]                     |12  |
|[2053013554658804075, 2053013555631882655]|12  |
|[2053013552326770905]                     |11  |
|[2053013554834964853]                     |9   |


In [78]:
fpGrowth_u_p_v = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.001)
model_u_p_v = fpGrowth_u_p_v.fit(df_user_product_v)

In [79]:
# Display frequent itemsets.
model_u_p_v.freqItemsets.orderBy(col("freq").desc()).show()

# Display generated association rules.
model_u_p_v.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30)


+---------+----+
|    items|freq|
+---------+----+
|[1005115]|1232|
|[1004856]|1229|
|[1004767]| 998|
|[1005105]| 769|
|[1004249]| 641|
|[1004870]| 607|
|[1004833]| 575|
|[1002544]| 515|
|[4804056]| 474|
|[1004873]| 423|
|[1005135]| 390|
|[1004741]| 352|
|[1004836]| 342|
|[1004739]| 333|
|[1002524]| 322|
|[1003317]| 313|
|[1004785]| 313|
|[1004258]| 312|
|[1005100]| 312|
|[1002633]| 293|
+---------+----+
only showing top 20 rows

+------------------+----------+-------------------+------------------+
|        antecedent|consequent|         confidence|              lift|
+------------------+----------+-------------------+------------------+
|[1004750, 1004870]| [1004767]|              0.725| 20.43003507014028|
|[1004750, 1004856]| [1004833]| 0.6101694915254238|29.843124539425204|
|[1005100, 1004833]| [1004856]|             0.5625|12.871592758340114|
|[1004836, 1004856]| [1004767]| 0.5535714285714286|15.599287861437162|
|         [3600666]| [3600661]| 0.5316455696202531| 86.92714159552546

In [80]:
fpGrowth_u_b_v = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.001)
model_u_b_v = fpGrowth_u_b_v.fit(df_user_brand_v)

In [82]:
# Display frequent itemsets.
model_u_b_v.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

# Display generated association rules.
model_u_b_v.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30, truncate =False)

+-----------------+----+
|items            |freq|
+-----------------+----+
|[samsung]        |7430|
|[apple]          |6620|
|[xiaomi]         |3591|
|[huawei]         |1719|
|[apple, samsung] |974 |
|[xiaomi, samsung]|890 |
|[lg]             |784 |
|[oppo]           |734 |
|[acer]           |647 |
|[huawei, samsung]|604 |
|[bosch]          |568 |
|[lenovo]         |510 |
|[respect]        |509 |
|[elenberg]       |491 |
|[hp]             |490 |
|[huawei, xiaomi] |449 |
|[indesit]        |442 |
|[artel]          |426 |
|[xiaomi, apple]  |423 |
|[sony]           |405 |
+-----------------+----+
only showing top 20 rows

+------------------------------+----------+------------------+------------------+
|antecedent                    |consequent|confidence        |lift              |
+------------------------------+----------+------------------+------------------+
|[oppo, huawei, xiaomi, apple] |[samsung] |0.8157894736842105|3.0878125664092932|
|[asus, hp, lenovo]            |[acer]    |0.8

In [83]:
fpGrowth_u_c_v = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.001)
model_u_c_v = fpGrowth_u_c_v.fit(df_user_categories_v)

In [84]:
# Display frequent itemsets.
model_u_c_v.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

# Display generated association rules.
model_u_c_v.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30, truncate =False)

+------------------------------------------+-----+
|items                                     |freq |
+------------------------------------------+-----+
|[2053013555631882655]                     |13275|
|[2053013554658804075]                     |1726 |
|[2053013554415534427]                     |1197 |
|[2053013553341792533]                     |1112 |
|[2053013558920217191]                     |1064 |
|[2053013563810775923]                     |850  |
|[2053013565983425517]                     |794  |
|[2053013563911439225]                     |588  |
|[2053013561579406073]                     |518  |
|[2053013557192163841]                     |439  |
|[2053013565069067197]                     |407  |
|[2172371436436455782]                     |365  |
|[2053013553970938175]                     |358  |
|[2053013555573162395]                     |335  |
|[2053013561092866779]                     |300  |
|[2053013554658804075, 2053013555631882655]|286  |
|[2053013565639492569]         

Creating the market-basktet model for session vs (products, brands, categories) purchased and viewed

In [86]:
df_session_product = spark.read.parquet("session_products")
df_session_brand = spark.read.parquet("session_brands")
df_session_categories = spark.read.parquet("session_categories")
df_session_product_v = spark.read.parquet("session_products_v")
df_session_brand_v = spark.read.parquet("session_brands_v")
df_session_categories_v = spark.read.parquet("session_categories_v")

In [88]:
fpGrowth_s_p = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.0)
model_s_p = fpGrowth_s_p.fit(df_session_product)

In [89]:
# Display frequent itemsets.
model_s_p.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

# Display generated association rules.
model_s_p.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30, truncate=False)

+---------+----+
|items    |freq|
+---------+----+
|[1004856]|121 |
|[1004767]|96  |
|[1005115]|56  |
|[1004870]|44  |
|[1004833]|44  |
|[1002544]|43  |
|[4804056]|42  |
|[1004249]|38  |
|[1004741]|29  |
|[1005105]|28  |
|[1005100]|27  |
|[1004873]|23  |
|[1004739]|23  |
|[1004750]|22  |
|[1004836]|22  |
|[4804295]|22  |
|[1003317]|19  |
|[1004838]|18  |
|[1004858]|16  |
|[1002524]|16  |
+---------+----+
only showing top 20 rows

+----------+----------+--------------------+------------------+
|antecedent|consequent|confidence          |lift              |
+----------+----------+--------------------+------------------+
|[1002629] |[1002544] |0.2222222222222222  |10.108527131782946|
|[1004836] |[1004767] |0.13636363636363635 |2.778409090909091 |
|[1004750] |[1004870] |0.09090909090909091 |4.041322314049586 |
|[1004833] |[1004856] |0.09090909090909091 |1.4695717505634862|
|[1002544] |[1002629] |0.046511627906976744|10.108527131782944|
|[1002544] |[1005115] |0.046511627906976744|1.62458471

In [92]:
fpGrowth_s_b = FPGrowth(itemsCol="items", minSupport=0.0001, minConfidence=0.0)
model_s_b = fpGrowth_s_b.fit(df_session_brand)

In [93]:
# Display frequent itemsets.
model_s_b.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

# Display generated association rules.
model_s_b.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30, truncate=False)

+----------------+----+
|items           |freq|
+----------------+----+
|[samsung]       |634 |
|[apple]         |473 |
|[xiaomi]        |198 |
|[huawei]        |100 |
|[oppo]          |38  |
|[lg]            |27  |
|[indesit]       |26  |
|[acer]          |21  |
|[midea]         |20  |
|[artel]         |19  |
|[bosch]         |17  |
|[hp]            |16  |
|[lenovo]        |15  |
|[elenberg]      |13  |
|[haier]         |12  |
|[apple, samsung]|12  |
|[vivo]          |11  |
|[beko]          |10  |
|[casio]         |10  |
|[jbl]           |9   |
+----------------+----+
only showing top 20 rows

+------------------+----------+------------------+------------------+
|antecedent        |consequent|confidence        |lift              |
+------------------+----------+------------------+------------------+
|[aerocool]        |[defender]|1.0               |1955.9999999999998|
|[defender]        |[aerocool]|1.0               |1955.9999999999998|
|[alphard, pioneer]|[hertz]   |1.0              

In [96]:
fpGrowth_s_c = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.0)
model_s_c = fpGrowth_s_c.fit(df_session_categories)

In [97]:
# Display frequent itemsets.
model_s_c.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

# Display generated association rules.
model_s_c.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30, truncate=False)

+------------------------------------------+----+
|items                                     |freq|
+------------------------------------------+----+
|[2053013555631882655]                     |1210|
|[2053013554658804075]                     |129 |
|[2053013554415534427]                     |72  |
|[2053013563810775923]                     |68  |
|[2053013565983425517]                     |52  |
|[2053013553341792533]                     |46  |
|[2053013558920217191]                     |45  |
|[2053013563911439225]                     |21  |
|[2053013555262783879]                     |16  |
|[2172371436436455782]                     |15  |
|[2053013554776244595]                     |14  |
|[2053013561579406073]                     |14  |
|[2053013555573162395]                     |14  |
|[2053013552326770905]                     |13  |
|[2053013552293216471]                     |12  |
|[2053013554658804075, 2053013555631882655]|10  |
|[2053013553031414015]                     |9   |


In [99]:
fpGrowth_s_p_v = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.0)
model_s_p_v = fpGrowth_s_p_v.fit(df_session_product_v)

In [100]:
# Display frequent itemsets.
model_s_p_v.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

# Display generated association rules.
model_s_p_v.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30, truncate=False)

+---------+----+
|items    |freq|
+---------+----+
|[1004856]|1307|
|[1005115]|1272|
|[1004767]|1067|
|[1005105]|793 |
|[1004249]|669 |
|[1004870]|629 |
|[1004833]|623 |
|[1002544]|537 |
|[4804056]|499 |
|[1004873]|445 |
|[1005135]|395 |
|[1004741]|381 |
|[1004739]|351 |
|[1004836]|351 |
|[1002524]|345 |
|[1004785]|324 |
|[1004258]|322 |
|[1005100]|321 |
|[1003317]|319 |
|[1005160]|308 |
+---------+----+
only showing top 20 rows

+------------------+----------+-------------------+------------------+
|antecedent        |consequent|confidence         |lift              |
+------------------+----------+-------------------+------------------+
|[3600666]         |[3600661] |0.525              |95.61671270718233 |
|[1004833, 1004767]|[1004856] |0.44680851063829785|11.26935160917482 |
|[1004839]         |[1004838] |0.42857142857142855|49.92175668854113 |
|[1005135, 1005115]|[1005105] |0.4230769230769231 |17.587302357163644|
|[1005135, 1005105]|[1005115] |0.4125             |10.69030070754717 

In [102]:
fpGrowth_s_b_v = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.0)
model_s_b_v = fpGrowth_s_b_v.fit(df_session_brand_v)

In [103]:
# Display frequent itemsets.
model_s_b_v.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

# Display generated association rules.
model_s_b_v.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30, truncate=False)

+-----------------+----+
|items            |freq|
+-----------------+----+
|[samsung]        |8305|
|[apple]          |7288|
|[xiaomi]         |4087|
|[huawei]         |1882|
|[apple, samsung] |868 |
|[lg]             |846 |
|[xiaomi, samsung]|816 |
|[oppo]           |775 |
|[acer]           |695 |
|[bosch]          |608 |
|[huawei, samsung]|558 |
|[lenovo]         |541 |
|[respect]        |528 |
|[hp]             |525 |
|[elenberg]       |512 |
|[indesit]        |473 |
|[artel]          |450 |
|[sony]           |424 |
|[asus]           |415 |
|[huawei, xiaomi] |410 |
+-----------------+----+
only showing top 20 rows

+------------------------+----------+------------------+------------------+
|antecedent              |consequent|confidence        |lift              |
+------------------------+----------+------------------+------------------+
|[asus, hp, lenovo]      |[acer]    |0.819672131147541 |38.87840547234344 |
|[oppo, huawei, apple]   |[samsung] |0.7924528301886793|3.145479536083

In [104]:
fpGrowth_s_c_v = FPGrowth(itemsCol="items", minSupport=0.001, minConfidence=0.0)
model_s_c_v = fpGrowth_s_c_v.fit(df_session_categories_v)

In [105]:
# Display frequent itemsets.
model_s_c_v.freqItemsets.orderBy(col("freq").desc()).show(truncate=False)

# Display generated association rules.
model_s_c_v.associationRules.orderBy(col("confidence").desc(), col("lift").desc()).show(30, truncate=False)

+---------------------+-----+
|items                |freq |
+---------------------+-----+
|[2053013555631882655]|15408|
|[2053013554658804075]|1897 |
|[2053013554415534427]|1293 |
|[2053013553341792533]|1187 |
|[2053013558920217191]|1171 |
|[2053013563810775923]|947  |
|[2053013565983425517]|857  |
|[2053013563911439225]|637  |
|[2053013561579406073]|566  |
|[2053013557192163841]|459  |
|[2053013565069067197]|425  |
|[2172371436436455782]|391  |
|[2053013553970938175]|377  |
|[2053013555573162395]|350  |
|[2053013561092866779]|334  |
|[2053013560346280633]|307  |
|[2053013560807654091]|272  |
|[2053013553945772349]|270  |
|[2053013565639492569]|269  |
|[2053013552293216471]|254  |
+---------------------+-----+
only showing top 20 rows

+---------------------+---------------------+--------------------+-------------------+
|antecedent           |consequent           |confidence          |lift               |
+---------------------+---------------------+--------------------+--------------

In [None]:
# models parquets

model_u_p.associationRules.write.mode("overwrite").parquet("model_user_products")
model_u_b.associationRules.write.mode("overwrite").parquet("model_user_brands")
model_u_c.associationRules.write.mode("overwrite").parquet("model_user_categories")

model_s_p.associationRules.write.mode("overwrite").parquet("model_session_products")
model_s_b.associationRules.write.mode("overwrite").parquet("model_session_brands")
model_s_c.associationRules.write.mode("overwrite").parquet("model_session_categories")