In [1]:
from pyspark.context import SparkContext
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a local Spark session through the builder API.
# getOrCreate() keeps this cell safe to re-run: constructing SparkContext('local')
# a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master('local').getOrCreate()
# Keep `sc` defined for any code that works with the SparkContext directly.
sc = spark.sparkContext

In [3]:
# Toy market-basket data: ten transactions, each a list of item labels.
# enumerate() supplies the sequential transaction ids 0..9 for the "id" column.
df = spark.createDataFrame(
    data=list(enumerate([
        ['1', '4', '5'],
        ['1', '2', '3', '5'],
        ['1', '2', '4', '5'],
        ['1', '3', '4', '5'],
        ['2', '3', '5'],
        ['2', '4', '5'],
        ['3', '4'],
        ['1', '2', '3'],
        ['1', '4', '5'],
        ['1', '2', '4'],
    ])),
    schema=["id", "items"],
)

In [4]:
# Confirm the inferred schema: `id` should be a long and `items` an array of strings,
# which is the column layout FPGrowth is pointed at via itemsCol="items".
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [54]:
# Configure the FP-Growth estimator.
fpGrowth = FPGrowth(
    itemsCol="items",
    # Fraction of transactions an itemset must appear in to count as frequent
    # (0.3 of the 10 rows above means at least 3 transactions).
    minSupport=0.3,
    # Minimum confidence an association rule needs in order to be generated.
    minConfidence=0.8,
)

# Mine the frequent itemsets and association rules from the transactions.
model = fpGrowth.fit(df)

In [55]:
# Display the frequent itemsets found by the fitted model: each row is an itemset
# (`items`) together with the number of transactions that contain it (`freq`).
model.freqItemsets.show()

+---------+----+
|    items|freq|
+---------+----+
|      [1]|   7|
|   [1, 5]|   5|
|[1, 5, 4]|   4|
|   [1, 4]|   5|
|      [3]|   5|
|   [3, 1]|   3|
|   [3, 5]|   3|
|   [3, 2]|   3|
|      [5]|   7|
|   [5, 4]|   5|
|      [2]|   6|
|   [2, 1]|   4|
|   [2, 5]|   4|
|   [2, 4]|   3|
|      [4]|   7|
+---------+----+



In [56]:
# Display the association rules generated by the fitted model. Each row reads
# "antecedent => consequent" with the rule's confidence and lift; only rules that
# reach the minConfidence set on the estimator are produced.
model.associationRules.show()

+----------+----------+----------+-----------------+
|antecedent|consequent|confidence|             lift|
+----------+----------+----------+-----------------+
|    [1, 4]|       [5]|       0.8|1.142857142857143|
|    [5, 4]|       [1]|       0.8|1.142857142857143|
|    [1, 5]|       [4]|       0.8|1.142857142857143|
+----------+----------+----------+-----------------+



In [34]:
# transform() checks each row's items against every association rule and collects the
# consequents of the matching rules (those not already among the row's items) as the
# prediction.
# NOTE(review): the saved output of this cell looks stale. Its execution count (34)
# predates the model fit (54), and predictions such as [1, 4] for items [2, 3, 5]
# cannot come from the three rules listed by the association-rules cell.
# Restart the kernel and run all cells to refresh it.
model.transform(df).show()

+---+------------+----------+
| id|       items|prediction|
+---+------------+----------+
|  0|   [1, 4, 5]|        []|
|  1|[1, 2, 3, 5]|       [4]|
|  2|[1, 2, 4, 5]|        []|
|  3|[1, 3, 4, 5]|        []|
|  4|   [2, 3, 5]|    [1, 4]|
|  5|   [2, 4, 5]|       [1]|
|  6|      [3, 4]|    [1, 5]|
|  7|   [1, 2, 3]|    [5, 4]|
|  8|   [1, 4, 5]|        []|
|  9|   [1, 2, 4]|       [5]|
+---+------------+----------+



In [45]:
# Shut down the Spark session once the analysis is finished. Any cell run after this
# point needs the session-setup cell to be executed again first.
spark.stop()