In [1]:
import findspark
from data_preprocessing import get_cleaned_data
# Locate the local Spark installation and make pyspark importable; this has
# to run before the pyspark imports that follow.
findspark.init()

In [2]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
# Create (or reuse) a Spark session running in local mode with master
# "local[*]" under the application name "Apriori".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Apriori")
    .getOrCreate()
)

In [3]:
# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [4]:
data = get_cleaned_data()

target_value = 0

# Keep only the rows whose TARGET equals target_value. The explicit copy makes
# the column assignments below act on an independent frame instead of a slice
# of the original, which avoids pandas' SettingWithCopyWarning.
data = data[data["TARGET"] == target_value].copy()

# Turn every value into a "<column>_<value>" label so that identical values
# coming from different columns become distinct items.
for col in data.columns:
    data[col] = data[col].map({x: col + "_" + str(x) for x in data[col].unique()})

df = spark.createDataFrame(data)

In [5]:
# RDD view of the Spark DataFrame built from the preprocessed data.
rdd = df.rdd

In [6]:
def create_C1(dataset):
    """Build the candidate item sets of size 1 (C1).

    Args:
        dataset: Iterable of transactions, each an iterable of hashable items.

    Returns:
        A list of single-element frozensets, one per distinct item, in the
        order in which the items are first encountered.
    """
    # The set gives constant-time duplicate checks; the list preserves the
    # first-seen order of the items.
    seen = set()
    C1 = []
    for transaction in dataset:
        for item in transaction:
            if item not in seen:
                seen.add(item)
                C1.append(frozenset([item]))
    return C1

def scan_dataset(dataset, candidates, min_support):
    """Compute the support of each candidate and keep the frequent ones.

    Support is counted against ``dataset`` itself, so the counts and the
    denominator always refer to the same transactions.

    Args:
        dataset: Sequence of transactions (must support ``len()``); each
            transaction is an iterable of items.
        candidates: Iterable of frozenset candidate item sets.
        min_support: Minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A ``(frequent_items, support_data)`` tuple. ``frequent_items`` lists
        the candidates meeting ``min_support`` in reverse order of their first
        appearance in ``candidates``; ``support_data`` maps each of those
        candidates to its support.
    """
    num_items = len(dataset)
    if num_items == 0:
        # No transactions: nothing can be frequent (and avoids dividing by 0).
        return [], {}

    # Convert each transaction to a set once so subset tests are cheap.
    transactions = [
        t if isinstance(t, (set, frozenset)) else set(t) for t in dataset
    ]

    # Count how many transactions contain each distinct candidate.
    item_count = {}
    for candidate in candidates:
        if candidate in item_count:
            continue
        item_count[candidate] = sum(
            1 for transaction in transactions if candidate.issubset(transaction)
        )

    # Calculate support and filter out candidates below min_support.
    frequent_items = []
    support_data = {}
    for item, count in item_count.items():
        support = count / num_items
        if support >= min_support:
            frequent_items.append(item)
            support_data[item] = support
    # Callers historically received the most recently counted item first.
    frequent_items.reverse()
    return frequent_items, support_data

def apriori_gen(freq_sets, k):
    """Generate candidate item sets of size k from frequent (k-1)-item sets.

    Two frequent sets are joined when their first ``k - 2`` items, taken in
    sorted order, are identical.

    Args:
        freq_sets: List of frozensets, each of size ``k - 1``, whose items are
            mutually orderable.
        k: Size of the candidate item sets to generate.

    Returns:
        A list of frozensets, each the union of one joinable pair.
    """
    prefix_len = k - 2
    # Sort BEFORE slicing: frozenset iteration order is arbitrary, so slicing
    # first would compare arbitrary items and miss valid joins. Prefixes are
    # computed once per set instead of once per pair.
    prefixes = [sorted(items)[:prefix_len] for items in freq_sets]

    candidates = []
    len_freq_sets = len(freq_sets)
    for i in range(len_freq_sets):
        for j in range(i + 1, len_freq_sets):
            if prefixes[i] == prefixes[j]:
                candidates.append(freq_sets[i] | freq_sets[j])
    return candidates

def my_apriori(dataset, min_support=0.5, max_iterations=2):
    """Run the Apriori level-wise search for frequent item sets.

    Args:
        dataset: List of transactions, each an iterable of hashable items.
        min_support: Minimum support passed to ``scan_dataset`` at every level.
        max_iterations: Upper bound on the number of levels generated after
            the 1-item level.

    Returns:
        A ``(L, support_data)`` tuple. ``L[i]`` holds the frequent item sets
        of size ``i + 1``; the last entry may be an empty list when the search
        stopped because no larger frequent set exists. ``support_data`` maps
        every frequent item set found to its support.
    """
    C1 = create_C1(dataset)
    D = list(map(set, dataset))
    L1, support_data = scan_dataset(D, C1, min_support)
    L = [L1]
    k = 2
    # L[k - 2] is the list of frequent (k-1)-item sets from the previous level.
    while len(L[k - 2]) > 0 and max_iterations > 0:
        candidates = apriori_gen(L[k - 2], k)
        # The candidates being scanned have size k, so label them C{k}.
        print(f"Scanning dataset For C{k}")
        Lk, supK = scan_dataset(D, candidates, min_support)
        support_data.update(supK)
        L.append(Lk)
        k += 1
        max_iterations -= 1
    return L, support_data

In [7]:
def create_dataset(data, num=None):
    """Convert the first ``num`` rows of a DataFrame into string transactions.

    Args:
        data: pandas DataFrame whose rows are the transactions.
        num: Number of leading rows to convert. ``None`` means all rows; a
            value larger than ``len(data)`` is clamped to ``len(data)`` and a
            negative value yields no rows.

    Returns:
        A list with one list of ``str`` values per converted row.
    """
    total = len(data)
    if num is None:
        num = total
    # Clamp so an oversized request returns what exists instead of raising
    # IndexError, and a negative one stays empty.
    num = max(0, min(num, total))
    # Slice once and convert in bulk rather than calling iloc for every row.
    rows = data.iloc[:num].values.tolist()
    dataset = [list(map(str, row)) for row in rows]
    print("Dataset created")
    return dataset
# Build transactions from the first 1000 preprocessed rows and mine them.
dataset = create_dataset(data, 1000)
print("Dataset Created")
max_iterations = 100
L, support_data = my_apriori(dataset, min_support=0.5, max_iterations=max_iterations)
# len(L) is the number of levels returned, not the number of item sets.
print(f"Frequent Item Sets: {len(L)}")
print(f"Support Data: {len(support_data)}")
# Write one "<item set> : <support>" line per frequent item set. The context
# manager closes the file even if a write fails part-way through.
with open(f'../data/support_data{max_iterations}.txt', 'w') as file:
    for item, val in support_data.items():
        file.write(str(item) + " : " + str(val) + "\n")

Dataset created
Dataset Created
