In [0]:
from pyspark.ml.fpm import FPGrowth

df = spark.createDataFrame([
    (0, [1, 2, 5]),
    (1, [1, 2, 3, 5]),
    (2, [1, 2])
], ["id", "items"])
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model.transform(df).show()

+---------+----+
|    items|freq|
+---------+----+
|      [1]|   3|
|      [2]|   3|
|   [2, 1]|   3|
|      [5]|   2|
|   [5, 2]|   2|
|[5, 2, 1]|   2|
|   [5, 1]|   2|
+---------+----+

+----------+----------+------------------+----+------------------+
|antecedent|consequent|        confidence|lift|           support|
+----------+----------+------------------+----+------------------+
|       [5]|       [2]|               1.0| 1.0|0.6666666666666666|
|       [5]|       [1]|               1.0| 1.0|0.6666666666666666|
|    [5, 1]|       [2]|               1.0| 1.0|0.6666666666666666|
|    [5, 2]|       [1]|               1.0| 1.0|0.6666666666666666|
|       [2]|       [1]|               1.0| 1.0|               1.0|
|       [2]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
|    [2, 1]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
|       [1]|       [2]|               1.0| 1.0|               1.0|
|       [1]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
+-------

In [0]:
#Creating own dataset
df2 = spark.createDataFrame([
    (0, ["Pear", "Banana", "Apple", "Grapes", "Kiwi"]),
    (1, ["Pear", "Banana", "Apple", "Kiwi"]),
    (2, ["Banana", "Kiwi", "Apple", "Grapes"]),
    (3, ["Pear", "Kiwi"]),
    (4, ["Apple", "Banana"]),
    (5, ["Banana", "Kiwi", "Grapes"]),
    (6, ["Grapes", "Pear", "Kiwi"]),
    (7, ["Kiwi", "Banana"]),
    (8, ["Banana", "Kiwi", "Apple"]),
    (9, ["Apple"]),
    (10, ["Banana"]),
    (11, ["Kiwi"]),
    (12, ["Grapes", "Apple"]),
    (13, ["Grapes"]),
    (14, ["Pear", "Apple", "Grapes"]),
    (15, ["Grapes", "Pear"]),
    (16, ["Apple", "Grapes", "Banana"]),
    (17, ["Kiwi", "Pear", "Banana"])
], ["id", "items"])

fpGrowth2 = FPGrowth(itemsCol="items", minSupport=0.25, minConfidence=0.35)
model2 = fpGrowth2.fit(df2)

# Display frequent itemsets.
model2.freqItemsets.show()

# Display generated association rules.
model2.associationRules.show()



+---------------+----+
|          items|freq|
+---------------+----+
|         [Kiwi]|  10|
|       [Banana]|  10|
| [Banana, Kiwi]|   7|
|       [Grapes]|   9|
|        [Apple]|   9|
|[Apple, Grapes]|   5|
|[Apple, Banana]|   6|
|         [Pear]|   7|
|   [Pear, Kiwi]|   5|
+---------------+----+

+----------+----------+------------------+------------------+------------------+
|antecedent|consequent|        confidence|              lift|           support|
+----------+----------+------------------+------------------+------------------+
|  [Grapes]|   [Apple]|0.5555555555555556|1.1111111111111112|0.2777777777777778|
|   [Apple]|  [Grapes]|0.5555555555555556|1.1111111111111112|0.2777777777777778|
|   [Apple]|  [Banana]|0.6666666666666666|               1.2|0.3333333333333333|
|    [Pear]|    [Kiwi]|0.7142857142857143|1.2857142857142856|0.2777777777777778|
|  [Banana]|    [Kiwi]|               0.7|1.2599999999999998|0.3888888888888889|
|  [Banana]|   [Apple]|               0.6|          

In [0]:
df3 = spark.createDataFrame([
    (0, ["Pear", "Banana", "Apple", "Grapes",]),
    (1, ["Pear", "Banana", "Apple", "Kiwi"]),
    (2, ["Banana", "Apple", "Grapes"]),
    (3, ["Pear"]),
    (4, ["Apple", "Banana", "Kiwi"]),
    (5, ["Kiwi", "Grapes"]),
    (6, ["Pear", "Kiwi"])
], ["id", "items"])

fpGrowth3 = FPGrowth(itemsCol="items", minSupport=0.25, minConfidence=0.35)
model3 = fpGrowth3.fit(df3)
# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model3.transform(df3).show()

+---+--------------------+--------------------+
| id|               items|          prediction|
+---+--------------------+--------------------+
|  0|[Pear, Banana, Ap...|              [Kiwi]|
|  1|[Pear, Banana, Ap...|            [Grapes]|
|  2|[Banana, Apple, G...|        [Kiwi, Pear]|
|  3|              [Pear]|[Banana, Apple, K...|
|  4|[Apple, Banana, K...|      [Pear, Grapes]|
|  5|      [Kiwi, Grapes]|[Banana, Apple, P...|
|  6|        [Pear, Kiwi]|     [Banana, Apple]|
+---+--------------------+--------------------+



In [0]:
##References
#1. https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html
#2. Dr. Liao’s Code Examples & Tutorials: Blackboard/Pdf provided by Dr. Liao.

In [0]:
df4 = spark.read.format("text").option("header", "false").load("dbfs:/FileStore/shared_uploads/dagrawal@gmu.edu/groceries.csv").toDF("sample_items")
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, col
df4=df4.select(split(col("sample_items"),",").alias("items")).drop("sample_items")

In [0]:
TrainDF1, TestDF1 = df4.randomSplit([0.75, 0.25], seed=62)
print(TrainDF1.cache().count()) 
print(TestDF1.count())

7363
2472


In [0]:
fpGrowth4 = FPGrowth(itemsCol="items", minSupport=0.05, minConfidence=0.1)
model4 = fpGrowth4.fit(TrainDF1)

In [0]:
model4.freqItemsets.show()

+--------------------+----+
|               items|freq|
+--------------------+----+
|         [margarine]| 423|
|[whipped/sour cream]| 518|
|     [shopping bags]| 720|
|           [napkins]| 385|
|      [citrus fruit]| 603|
|        [rolls/buns]|1367|
|[rolls/buns, whol...| 412|
|       [frankfurter]| 432|
|   [root vegetables]| 808|
|       [canned beer]| 583|
|            [yogurt]|1031|
|[yogurt, whole milk]| 413|
|      [bottled beer]| 583|
|            [coffee]| 428|
|    [tropical fruit]| 749|
|[fruit/vegetable ...| 526|
|              [beef]| 391|
|            [pastry]| 642|
|  [other vegetables]|1426|
|[other vegetables...| 564|
+--------------------+----+
only showing top 20 rows



In [0]:
model4.associationRules.show()

+------------------+------------------+-------------------+------------------+-------------------+
|        antecedent|        consequent|         confidence|              lift|            support|
+------------------+------------------+-------------------+------------------+-------------------+
|[other vegetables]|      [whole milk]| 0.3955119214586255|1.5473720922953558|0.07659921227760423|
|          [yogurt]|      [whole milk]| 0.4005819592628516|1.5672077396665125|0.05609126714654353|
|      [rolls/buns]|      [whole milk]| 0.3013899049012436|1.1791359563166082|0.05595545294037756|
|      [whole milk]|      [rolls/buns]|0.21891604675876727|1.1791359563166084|0.05595545294037756|
|      [whole milk]|          [yogurt]|0.21944739638682254|1.5672077396665123|0.05609126714654353|
|      [whole milk]|[other vegetables]|0.29968119022316686| 1.547372092295356|0.07659921227760423|
+------------------+------------------+-------------------+------------------+-------------------+



In [0]:
fpGrowth4 = FPGrowth(itemsCol="items", minSupport=0.1, minConfidence=0.2)
model4 = fpGrowth4.fit(TestDF1)

In [0]:
model4.transform(df4).show()

+--------------------+----------+
|               items|prediction|
+--------------------+----------+
|[citrus fruit, se...|        []|
|[tropical fruit, ...|        []|
|        [whole milk]|        []|
|[pip fruit, yogur...|        []|
|[other vegetables...|        []|
|[whole milk, butt...|        []|
|        [rolls/buns]|        []|
|[other vegetables...|        []|
|        [pot plants]|        []|
|[whole milk, cere...|        []|
|[tropical fruit, ...|        []|
|[citrus fruit, tr...|        []|
|              [beef]|        []|
|[frankfurter, rol...|        []|
|[chicken, tropica...|        []|
|[butter, sugar, f...|        []|
|[fruit/vegetable ...|        []|
|[packaged fruit/v...|        []|
|         [chocolate]|        []|
|     [specialty bar]|        []|
+--------------------+----------+
only showing top 20 rows

