In [0]:
from pyspark.ml.fpm import FPGrowth

df = spark.createDataFrame([
    (0, [1, 2, 5]),
    (1, [1, 2, 3, 5]),
    (2, [1, 2])
], ["id", "items"])
fpGrowth = FPGrowth(itemsCol="items", minSupport=0.5, minConfidence=0.6)
model = fpGrowth.fit(df)

# Display frequent itemsets.
model.freqItemsets.show()

# Display generated association rules.
model.associationRules.show()

# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model.transform(df).show()

+---------+----+
|    items|freq|
+---------+----+
|      [1]|   3|
|      [2]|   3|
|   [2, 1]|   3|
|      [5]|   2|
|   [5, 2]|   2|
|[5, 2, 1]|   2|
|   [5, 1]|   2|
+---------+----+

+----------+----------+------------------+----+------------------+
|antecedent|consequent|        confidence|lift|           support|
+----------+----------+------------------+----+------------------+
|       [5]|       [2]|               1.0| 1.0|0.6666666666666666|
|       [5]|       [1]|               1.0| 1.0|0.6666666666666666|
|    [5, 1]|       [2]|               1.0| 1.0|0.6666666666666666|
|    [5, 2]|       [1]|               1.0| 1.0|0.6666666666666666|
|       [2]|       [1]|               1.0| 1.0|               1.0|
|       [2]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
|    [2, 1]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
|       [1]|       [2]|               1.0| 1.0|               1.0|
|       [1]|       [5]|0.6666666666666666| 1.0|0.6666666666666666|
+-------

In [0]:
#Creating own dataset
df2 = spark.createDataFrame([
    (0, ["Pear", "Banana", "Apple", "Grapes", "Kiwi"]),
    (1, ["Pear", "Banana", "Apple", "Kiwi"]),
    (2, ["Banana", "Kiwi", "Apple", "Grapes"]),
    (3, ["Pear", "Kiwi"]),
    (4, ["Apple", "Banana"]),
    (5, ["Banana", "Kiwi", "Grapes"]),
    (6, ["Grapes", "Pear", "Kiwi"]),
    (7, ["Kiwi", "Banana"]),
    (8, ["Banana", "Kiwi", "Apple"]),
    (9, ["Apple"]),
    (10, ["Banana"]),
    (11, ["Kiwi"]),
    (12, ["Grapes", "Apple"]),
    (13, ["Grapes"]),
    (14, ["Pear", "Apple", "Grapes"]),
    (15, ["Grapes", "Pear"]),
    (16, ["Apple", "Grapes", "Banana"]),
    (17, ["Kiwi", "Pear", "Banana"])
], ["id", "items"])

fpGrowth2 = FPGrowth(itemsCol="items", minSupport=0.25, minConfidence=0.35)
model2 = fpGrowth2.fit(df2)

# Display frequent itemsets.
model2.freqItemsets.show()

# Display generated association rules.
model2.associationRules.show()



+---------------+----+
|          items|freq|
+---------------+----+
|         [Kiwi]|  10|
|       [Banana]|  10|
| [Banana, Kiwi]|   7|
|       [Grapes]|   9|
|        [Apple]|   9|
|[Apple, Grapes]|   5|
|[Apple, Banana]|   6|
|         [Pear]|   7|
|   [Pear, Kiwi]|   5|
+---------------+----+

+----------+----------+------------------+------------------+------------------+
|antecedent|consequent|        confidence|              lift|           support|
+----------+----------+------------------+------------------+------------------+
|  [Grapes]|   [Apple]|0.5555555555555556|1.1111111111111112|0.2777777777777778|
|   [Apple]|  [Grapes]|0.5555555555555556|1.1111111111111112|0.2777777777777778|
|   [Apple]|  [Banana]|0.6666666666666666|               1.2|0.3333333333333333|
|    [Pear]|    [Kiwi]|0.7142857142857143|1.2857142857142856|0.2777777777777778|
|  [Banana]|    [Kiwi]|               0.7|1.2599999999999998|0.3888888888888889|
|  [Banana]|   [Apple]|               0.6|          

In [0]:
df3 = spark.createDataFrame([
    (0, ["Pear", "Banana", "Apple", "Grapes",]),
    (1, ["Pear", "Banana", "Apple", "Kiwi"]),
    (2, ["Banana", "Apple", "Grapes"]),
    (3, ["Pear"]),
    (4, ["Apple", "Banana", "Kiwi"]),
    (5, ["Kiwi", "Grapes"]),
    (6, ["Pear", "Kiwi"])
], ["id", "items"])

fpGrowth3 = FPGrowth(itemsCol="items", minSupport=0.25, minConfidence=0.35)
model3 = fpGrowth3.fit(df3)
# transform examines the input items against all the association rules and summarize the
# consequents as prediction
model3.transform(df3).show()

+---+--------------------+--------------------+
| id|               items|          prediction|
+---+--------------------+--------------------+
|  0|[Pear, Banana, Ap...|              [Kiwi]|
|  1|[Pear, Banana, Ap...|            [Grapes]|
|  2|[Banana, Apple, G...|        [Kiwi, Pear]|
|  3|              [Pear]|[Banana, Apple, K...|
|  4|[Apple, Banana, K...|      [Pear, Grapes]|
|  5|      [Kiwi, Grapes]|[Banana, Apple, P...|
|  6|        [Pear, Kiwi]|     [Banana, Apple]|
+---+--------------------+--------------------+



In [0]:
##References
#1. https://spark.apache.org/docs/latest/ml-frequent-pattern-mining.html
#2. Dr. Liao’s Code Examples & Tutorials: Blackboard/Pdf provided by Dr. Liao.

In [0]:
df4 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/dagrawal@gmu.edu/groceries.csv")

In [0]:
trainDF, testDF = df4.randomSplit([0.7, 0.2], seed=42)
print(trainDF.cache().count()) # Cache because accessing training data multiple times
print(testDF.count())

7733
2101


In [0]:
d=df4.rdd.map(lambda line: line.strip().split())

In [0]:
fpGrowth4 = FPGrowth(d, minSupport=0.25, minConfidence=0.35)
model4 = fpGrowth4.fit(df4)

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
[0;32m<command-766503604987361>[0m in [0;36m<cell line: 1>[0;34m()[0m
[0;32m----> 1[0;31m [0mfpGrowth4[0m [0;34m=[0m [0mFPGrowth[0m[0;34m([0m[0md[0m[0;34m,[0m [0mminSupport[0m[0;34m=[0m[0;36m0.25[0m[0;34m,[0m [0mminConfidence[0m[0;34m=[0m[0;36m0.35[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      2[0m [0mmodel4[0m [0;34m=[0m [0mfpGrowth4[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mdf4[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/pyspark/__init__.py[0m in [0;36mwrapper[0;34m(self, *args, **kwargs)[0m
[1;32m    131[0m     [0;32mdef[0m [0mwrapper[0m[0;34m([0m[0mself[0m[0;34m:[0m [0mAny[0m[0;34m,[0m [0;34m*[0m[0margs[0m[0;34m:[0m [0mAny[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m:[0m [0mAny[0m[0;34m)[0m [