<a href="https://colab.research.google.com/github/DimaFrank/Association_Rule_Learning/blob/test/Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [348]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [349]:
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import array, col, concat_ws, udf, array_remove, size, window, to_timestamp, date_format, concat, lit, collect_list, desc, sort_array, array_contains, array_intersect
from pyspark.sql import SparkSession 
import itertools

Resource:

https://www.softwaretestinghelp.com/apriori-algorithm/

In [350]:
# Start (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("user_ct_test")
    .getOrCreate()
)
sc = spark.sparkContext

In [351]:
# TABLE-1
# Toy transactions: one (transaction id, basket of item ids) pair per row.
transactions = [
    ("T1", ['I1', 'I2', 'I3']),
    ("T2", ['I4', 'I3', 'I2']),
    ("T3", ['I4', 'I5']),
    ("T4", ['I1', 'I2', 'I4']),
    ("T5", ['I1', 'I2', 'I3', 'I5']),
    ("T6", ['I1', 'I2', 'I3', 'I4']),
]
rdd = spark.sparkContext.parallelize(
    [Row(tid, basket) for tid, basket in transactions]
)
schema = StructType([
    StructField("Tid", StringType(), nullable=True),
    StructField("Basket", ArrayType(StringType(), containsNull=True), nullable=True),
])
# Attach the number of items held by each basket.
df = spark.createDataFrame(rdd, schema).withColumn('size', size(col('Basket')))
df.show(10)

+---+----------------+----+
|Tid|          Basket|size|
+---+----------------+----+
| T1|    [I1, I2, I3]|   3|
| T2|    [I4, I3, I2]|   3|
| T3|        [I4, I5]|   2|
| T4|    [I1, I2, I4]|   3|
| T5|[I1, I2, I3, I5]|   4|
| T6|[I1, I2, I3, I4]|   4|
+---+----------------+----+



In [352]:
# Keep each basket next to a copy of it sorted in ascending order.
sorted_basket = sort_array(col("Basket"), asc=True).alias('array_sorted')
df_test = df.select(col("Basket"), sorted_basket)
df_test.show()

+----------------+----------------+
|          Basket|    array_sorted|
+----------------+----------------+
|    [I1, I2, I3]|    [I1, I2, I3]|
|    [I4, I3, I2]|    [I2, I3, I4]|
|        [I4, I5]|        [I4, I5]|
|    [I1, I2, I4]|    [I1, I2, I4]|
|[I1, I2, I3, I5]|[I1, I2, I3, I5]|
|[I1, I2, I3, I4]|[I1, I2, I3, I4]|
+----------------+----------------+



In [353]:
# Minimum support threshold, expressed as an absolute count (compared against row counts below).
min_sup=3

In [354]:
# TABLE-2
import pyspark.sql.functions as F

# Count the occurrences of every item across all baskets, most frequent first.
item_occurrences = df_test.select(F.explode("array_sorted").alias("explode"))
F1 = item_occurrences.groupBy("explode").count().sort(F.col("count").desc())
F1.show()

+-------+-----+
|explode|count|
+-------+-----+
|     I2|    5|
|     I4|    4|
|     I3|    4|
|     I1|    4|
|     I5|    2|
+-------+-----+



In [355]:
# TABLE-3
# Keep only the items whose count reaches the minimum support.
F2 = F1.where(col("count") >= min_sup)
F2.show()

+-------+-----+
|explode|count|
+-------+-----+
|     I2|    5|
|     I4|    4|
|     I1|    4|
|     I3|    4|
+-------+-----+



In [356]:
# All possible combinations

# Bring the frequent single items back to the driver as plain strings.
frequent_rows = F2.select("explode").collect()
lst = [str(record["explode"]) for record in frequent_rows]
print(lst)
print('All Combinations: \n')

def create_possible_combinations(all_items, k):
    """Return every distinct k-item combination of ``all_items``.

    Parameters
    ----------
    all_items : list
        Items to combine. Items must be hashable.
    k : int
        Number of items in each combination.

    Returns
    -------
    list of list
        One list per distinct combination, in the order
        ``itertools.combinations`` produces them. Combinations that repeat
        (possible when ``all_items`` contains duplicate values) are returned
        once. The result is empty when ``k`` exceeds ``len(all_items)``.
    """
    # dict.fromkeys de-duplicates like a set but keeps first-seen order, so the
    # result is reproducible across runs (set iteration order of strings is not).
    unique_combinations = dict.fromkeys(itertools.combinations(all_items, k))
    return [list(combination) for combination in unique_combinations]


# Preview every candidate pair built from the frequent single items.
create_possible_combinations(lst, k=2)

['I2', 'I4', 'I3', 'I1']
All Combinations: 



[['I2', 'I1'],
 ['I2', 'I4'],
 ['I3', 'I1'],
 ['I4', 'I3'],
 ['I2', 'I3'],
 ['I4', 'I1']]

In [357]:
# Support of every candidate 2-itemset built from the frequent single items.
columns1 = StructType([StructField('Item', ArrayType(StringType()), False),
                       StructField('count', LongType(), False)])

pair_rows = []
for row in create_possible_combinations(lst, 2):
    candidate = F.array(*[F.lit(x) for x in row])
    # A basket holds the whole candidate when the intersection keeps all of its items.
    res = df_test.filter(
        size(array_intersect(col('array_sorted'), candidate)) >= len(row)
    ).count()
    if res >= min_sup:
        pair_rows.append((row, res))

# Build the DataFrame once instead of chaining one union + filter per candidate.
first_df = spark.createDataFrame(pair_rows, columns1)
first_df.show()

+--------+-----+
|    Item|count|
+--------+-----+
|[I2, I1]|    4|
|[I2, I4]|    3|
|[I3, I1]|    3|
|[I2, I3]|    4|
+--------+-----+



In [358]:
# Support of every candidate 3-itemset built from the frequent single items.
triple_rows = []
for row in create_possible_combinations(lst, 3):
    candidate = F.array(*[F.lit(x) for x in row])
    # A basket holds the whole candidate when the intersection keeps all of its items.
    res = df_test.filter(
        size(array_intersect(col('array_sorted'), candidate)) >= len(row)
    ).count()
    if res >= min_sup:
        triple_rows.append((row, res))

# Append the whole level with a single union rather than one union + filter per candidate.
# NOTE: this extends first_df in place, so re-running the cell appends the triples again.
first_df = first_df.union(spark.createDataFrame(triple_rows, first_df.schema))
first_df.show()

+------------+-----+
|        Item|count|
+------------+-----+
|    [I2, I1]|    4|
|    [I2, I4]|    3|
|    [I3, I1]|    3|
|    [I2, I3]|    4|
|[I2, I3, I1]|    3|
+------------+-----+



### Building the Algorithm

In [451]:
import pyspark.sql.functions as F


def build_support(sessions_data, items, min_support, max_k=3):
    """Compute the support count of every frequent itemset of up to ``max_k`` items.

    Parameters
    ----------
    sessions_data : pyspark.sql.DataFrame
        One row per session / transaction.
    items : str
        Name of the ``ArrayType(StringType())`` column holding the items of
        each session.
    min_support : int
        Minimum number of sessions an itemset must appear in to be kept.
        This is an absolute count (it is compared directly with row counts),
        not a fraction between 0 and 1.
    max_k : int, optional
        Largest itemset size to evaluate. Defaults to 3.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``Item`` (the itemset, items sorted ascending) and ``Support``
        (number of sessions containing every item of the itemset). Only
        itemsets with ``Support >= min_support`` are returned.

    Notes
    -----
    Uses the notebook-level ``spark`` session and the
    ``create_possible_combinations`` helper.
    """
    # Frequent single items: only these can be part of a frequent itemset.
    # The threshold is inclusive so it matches the final support filter.
    frequent_items = (sessions_data
                      .select(F.explode(F.col(items)).alias("explode"))
                      .groupBy("explode")
                      .count()
                      .filter(F.col("count") >= min_support)
                      .orderBy(F.desc("count")))
    lst = [str(record["explode"]) for record in frequent_items.collect()]

    support_rows = []
    for k in range(1, max_k + 1):
        found_frequent = False
        for combination in create_possible_combinations(lst, k):
            row = sorted(combination)
            candidate = F.array(*[F.lit(x) for x in row])
            # A session contains the candidate when the intersection of its
            # items with the candidate keeps all k candidate items.
            session_count = sessions_data.filter(
                F.size(F.array_intersect(F.col(items), candidate)) >= k
            ).count()
            if session_count >= min_support:
                support_rows.append((row, session_count))
                found_frequent = True
        # Every subset of a frequent itemset is itself frequent, so a level
        # without any frequent itemset means no larger one can be frequent.
        if not found_frequent:
            break

    # Build the result once instead of chaining one union + filter per candidate.
    support_schema = StructType([StructField('Item', ArrayType(StringType()), False),
                                 StructField('Support', LongType(), False)])
    return spark.createDataFrame(support_rows, support_schema)

In [452]:
# Frequent itemsets of the toy baskets, using a minimum support of 3.
support = build_support(sessions_data=df, items='Basket', min_support=3)
support.show()

+------------+-------+
|        Item|Support|
+------------+-------+
|        [I2]|      5|
|        [I3]|      4|
|        [I1]|      4|
|        [I4]|      4|
|    [I1, I2]|      4|
|    [I2, I4]|      3|
|    [I1, I3]|      3|
|    [I2, I3]|      4|
|[I1, I2, I3]|      3|
+------------+-------+



In [450]:
# Display the input transactions again for reference.
df.show()

+---+----------------+----+
|Tid|          Basket|size|
+---+----------------+----+
| T1|    [I1, I2, I3]|   3|
| T2|    [I4, I3, I2]|   3|
| T3|        [I4, I5]|   2|
| T4|    [I1, I2, I4]|   3|
| T5|[I1, I2, I3, I5]|   4|
| T6|[I1, I2, I3, I4]|   4|
+---+----------------+----+



In [None]:
def association_rule_mining():
    """Placeholder for the association-rule mining step; currently does nothing."""
    return None