In [165]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import collect_set, size
from pyspark.ml.fpm import FPGrowth

In [166]:
# Create (or reuse) the Spark session used by every cell below.
spark_session = (
    SparkSession.builder
    .appName("CA3")
    .getOrCreate()
)

In [167]:
# Load the groceries CSV (header row present, column types inferred)
# and preview the first 5 rows.
df = (
    spark_session.read
    .options(header=True, inferSchema=True)
    .csv("Groceries.csv")
)
df.show(5)

+-------------+----------+----------------+
|Member_number|      Date| itemDescription|
+-------------+----------+----------------+
|         1808|21-07-2015|  tropical fruit|
|         2552|05-01-2015|      whole milk|
|         2300|19-09-2015|       pip fruit|
|         1187|12-12-2015|other vegetables|
|         3037|01-02-2015|      whole milk|
+-------------+----------+----------------+
only showing top 5 rows



In [168]:
# Dataset dimensions (row count and column count) as a quick sanity check.
num_rows = df.count()
num_columns = len(df.columns)

print(f"# of rows: {num_rows}")
print(f"# of columns: {num_columns}")

# of rows: 38765
# of columns: 3


In [169]:
# Drop the purchase date so that only the member id and the item description remain.
# NOTE: this rebinds `df`; every later cell sees the two-column frame.
df = df.drop("Date") # remove the "Date" column
df.columns # display the remaining column names

['Member_number', 'itemDescription']

In [170]:
# group based on member number purchase
grouped_df = df.groupBy("Member_number").agg(collect_set("itemDescription").alias("items"))
grouped_df.count()

3898

In [171]:
# filter based on purchase number
filtered_df = grouped_df.filter(size("items") > 10)
filtered_df.count()

1313

In [172]:
# fpgrowth algorithm
fp_growth = FPGrowth(itemsCol="items", minSupport=0.15, minConfidence=0.4)
model = fp_growth.fit(filtered_df)

In [173]:
# frequent itemset
frequent_itemsets = model.freqItemsets
frequent_itemsets.show()
frequent_itemsets.count()

+--------------------+----+
|               items|freq|
+--------------------+----+
|              [pork]| 250|
|     [bottled water]| 419|
|[bottled water, o...| 239|
|[bottled water, s...| 202|
|[bottled water, w...| 276|
|        [newspapers]| 296|
| [frozen vegetables]| 231|
|      [citrus fruit]| 354|
|[citrus fruit, wh...| 226|
|       [white bread]| 198|
|            [butter]| 263|
|        [rolls/buns]| 650|
|[rolls/buns, othe...| 372|
|[rolls/buns, othe...| 238|
|[rolls/buns, whol...| 416|
|    [tropical fruit]| 456|
|[tropical fruit, ...| 216|
|[tropical fruit, ...| 230|
|[tropical fruit, ...| 202|
|[tropical fruit, ...| 272|
+--------------------+----+
only showing top 20 rows



67

In [174]:
# association rule
association_rules = model.associationRules
association_rules.show()
association_rules.count()

+--------------------+------------------+-------------------+------------------+-------------------+
|          antecedent|        consequent|         confidence|              lift|            support|
+--------------------+------------------+-------------------+------------------+-------------------+
|  [other vegetables]|      [rolls/buns]| 0.5081967213114754|1.0265573770491803| 0.2833206397562833|
|  [other vegetables]|          [yogurt]|0.42349726775956287|0.9876588145085364| 0.2361005331302361|
|  [other vegetables]|      [whole milk]| 0.6516393442622951|1.0101563860878318| 0.3632901751713633|
|  [other vegetables]|            [soda]|0.43579234972677594|0.9649162819414111|0.24295506473724296|
|      [bottled beer]|      [whole milk]| 0.6597633136094675| 1.022749977295432|0.16984006092916984|
|[whipped/sour cream]|      [whole milk]| 0.6496815286624203|1.0071214251874356|0.15536938309215537|
|[rolls/buns, othe...|      [whole milk]| 0.6397849462365591|0.9917799697858349|0.181264280

53