# **Cài đặt PySpark**

In [None]:
# Install a headless OpenJDK 8 (JAVA_HOME is pointed at this JDK further down).
# stdout is discarded; -qq keeps apt-get quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [None]:
# Download the Spark 3.5.4 (Hadoop 3 build) tarball from the Apache archive; -q silences wget.
! wget -q https://archive.apache.org/dist/spark/spark-3.5.4/spark-3.5.4-bin-hadoop3.tgz

In [None]:
# Unpack the Spark tarball into the current working directory.
# NOTE(review): SPARK_HOME below assumes this directory is /content - confirm.
!tar xf spark-3.5.4-bin-hadoop3.tgz

In [None]:
# Install findspark (imported and initialised in a later cell); -q reduces pip output.
! pip install -q findspark

In [None]:
# Print the on-disk size of the downloaded tarball.
# du reports an error if the archive is not present in the working directory.
! du -sh spark-3.5.4-bin-hadoop3.tgz

du: cannot access 'spark-3.5.4-bin-hadoop3.tgz': No such file or directory


In [None]:
import os

# Tell the kernel (and any child processes) where the JDK and the unpacked
# Spark distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.5.4-bin-hadoop3",
    }
)

In [None]:
# Sanity check: the JAVA_HOME value set above is visible to shell subprocesses.
! echo $JAVA_HOME

/usr/lib/jvm/java-8-openjdk-amd64


In [None]:
# Sanity check: the SPARK_HOME value set above is visible to shell subprocesses.
! echo $SPARK_HOME

/content/spark-3.5.4-bin-hadoop3


In [None]:
import findspark
# Called without arguments; presumably locates Spark through the SPARK_HOME
# variable set earlier and makes pyspark importable - confirm against findspark docs.
findspark.init()

In [None]:
# Import PySpark and print its version to confirm the installation works.
# NOTE(review): the alias `spark` is rebound to a SparkSession in later cells,
# so this module alias is only meaningful inside this cell.
import pyspark as spark

print(spark.__version__)

3.5.4


# **Câu 2**

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_set

from google.colab import drive
# Mount Google Drive so the CSV under MyDrive is reachable from this runtime.
drive.mount('/content/drive')

# Initialize Spark session
spark = SparkSession.builder.appName("PCY_Algorithm").getOrCreate()

# Load data from Google Drive
file_path = "/content/drive/MyDrive/MMDS/baskets.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Group items by (Member_number, Date) to form baskets
baskets_df = df.groupBy("Member_number", "Date").agg(collect_set("itemDescription").alias("items"))

# Materialise the baskets as plain Python lists while the session is still
# alive. The following cells run the PCY passes on `baskets`; without this
# line they only work if a later cell has already been executed.
baskets = baskets_df.rdd.map(lambda row: row["items"]).collect()

# Show sample baskets
baskets_df.show(truncate=False)

# Stop Spark session (the remaining work operates on the collected lists)
spark.stop()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+-------------+----------+--------------------------------------------------+
|Member_number|Date      |items                                             |
+-------------+----------+--------------------------------------------------+
|1000         |15/03/2015|[whole milk, sausage, yogurt, semi-finished bread]|
|1000         |24/06/2014|[pastry, whole milk, salty snack]                 |
|1000         |24/07/2015|[misc. beverages, canned beer]                    |
|1000         |25/11/2015|[sausage, hygiene articles]                       |
|1000         |27/05/2015|[pickled vegetables, soda]                        |
|1001         |02/07/2014|[whole milk, sausage, rolls/buns]                 |
|1001         |05/02/2015|[frankfurter, curd]                               |
|1001         |12/12/2014|[whole milk, soda]                                |
|1001        

In [None]:
from itertools import combinations
from collections import defaultdict

class PCYAlgorithm:
    """Two-pass PCY (Park-Chen-Yu) miner for frequent item pairs.

    Pass one counts single items and hashes every pair of a basket into a
    fixed number of buckets. Pass two counts only those pairs whose two items
    are frequent and whose bucket reached the support threshold.

    Pairs are unordered: every basket is de-duplicated and sorted before pairs
    are formed, so ``("a", "b")`` and ``("b", "a")`` are counted as one pair
    regardless of the order in which a basket lists its items. Items therefore
    have to be hashable and mutually orderable (e.g. all strings).

    Attributes:
        s: Absolute support threshold (minimum number of baskets).
        c: Minimum confidence for association rules.
        num_buckets: Size of the pair hash table.
        hash_buckets: Pair counts per bucket, filled by ``pass_one``.
        frequent_items: Items with support >= ``s``, filled by ``pass_one``.
        bitmap: ``bitmap[i]`` is True when bucket ``i`` reached ``s``.
        pair_counts: ``{(item1, item2): count}`` of frequent pairs after
            ``pass_two``; each key is a sorted tuple.
    """

    def __init__(self, support_threshold, confidence_threshold, num_buckets=1000):
        self.s = support_threshold
        self.c = confidence_threshold
        self.num_buckets = num_buckets
        self.hash_buckets = [0] * num_buckets
        self.frequent_items = set()
        self.pair_counts = defaultdict(int)
        self.bitmap = []

    @staticmethod
    def _canonical_items(basket):
        """Return the distinct items of ``basket`` in sorted order."""
        return sorted(set(basket))

    def hash_function(self, item1, item2):
        """Map a pair of items to a bucket index in ``[0, num_buckets)``.

        XOR is symmetric, so the bucket does not depend on argument order.
        Python salts ``str`` hashes per interpreter process, so bucket numbers
        can differ between runs; they are stable within one run, which is all
        the two passes need.
        """
        return (hash(item1) ^ hash(item2)) % self.num_buckets

    def pass_one(self, baskets):
        """First pass: count individual items and hash pairs into buckets.

        Args:
            baskets: Iterable of baskets, each an iterable of items.
        """
        item_counts = defaultdict(int)
        # Start from empty buckets so that re-running the pass does not
        # add to the counts left behind by a previous run.
        self.hash_buckets = [0] * self.num_buckets

        for basket in baskets:
            items = self._canonical_items(basket)
            for item in items:
                item_counts[item] += 1
            for item1, item2 in combinations(items, 2):
                self.hash_buckets[self.hash_function(item1, item2)] += 1

        self.frequent_items = {item for item, count in item_counts.items() if count >= self.s}
        self.bitmap = [count >= self.s for count in self.hash_buckets]

    def pass_two(self, baskets):
        """Second pass: count candidate pairs and keep the frequent ones.

        A pair is a candidate when both items are frequent and its bucket is
        set in the bitmap. ``pair_counts`` is rebuilt from scratch on every
        call.

        Args:
            baskets: The same baskets that were given to ``pass_one``.
        """
        candidate_counts = defaultdict(int)

        for basket in baskets:
            valid_items = [
                item for item in self._canonical_items(basket)
                if item in self.frequent_items
            ]
            # valid_items is sorted, so each pair comes out as a sorted tuple.
            for item1, item2 in combinations(valid_items, 2):
                if self.bitmap[self.hash_function(item1, item2)]:
                    candidate_counts[(item1, item2)] += 1

        self.pair_counts = {
            pair: count for pair, count in candidate_counts.items() if count >= self.s
        }

    def generate_association_rules(self, baskets):
        """Generate association rules that meet the confidence threshold.

        Args:
            baskets: Sized collection of baskets (``len`` is used for support).

        Returns:
            List of ``(antecedent, consequent, support, confidence)`` tuples,
            where support is the pair count divided by the number of baskets.
        """
        rules = []
        if not self.pair_counts:
            return rules

        # Count, in a single sweep, how many baskets contain each item that
        # occurs in a frequent pair (instead of rescanning per pair).
        wanted = {item for pair in self.pair_counts for item in pair}
        item_totals = dict.fromkeys(wanted, 0)
        for basket in baskets:
            for item in wanted.intersection(basket):
                item_totals[item] += 1

        basket_total = len(baskets)
        for (item1, item2), count in self.pair_counts.items():
            support = count / basket_total

            total_item1 = item_totals[item1]
            total_item2 = item_totals[item2]

            confidence1 = count / total_item1 if total_item1 > 0 else 0
            confidence2 = count / total_item2 if total_item2 > 0 else 0

            if confidence1 >= self.c:
                rules.append((item1, item2, support, confidence1))
            if confidence2 >= self.c:
                rules.append((item2, item1, support, confidence2))

        return rules

# Run both PCY passes with an absolute support threshold of 5 baskets.
# `baskets` is a notebook global that must already hold the collected baskets.
pcy = PCYAlgorithm(support_threshold=5, confidence_threshold=0.6)
pcy.pass_one(baskets)
pcy.pass_two(baskets)

# Derive rules from the frequent pairs (confidence threshold 0.6, set above).
association_rules = pcy.generate_association_rules(baskets)

print("Frequent Pairs:", pcy.pair_counts)
print("Association Rules:", association_rules)

# `spark` is the notebook-global session created in an earlier cell.
spark.stop()

Frequent Pairs: {('whole milk', 'sausage'): 134, ('whole milk', 'yogurt'): 167, ('whole milk', 'semi-finished bread'): 25, ('sausage', 'yogurt'): 86, ('sausage', 'semi-finished bread'): 9, ('yogurt', 'semi-finished bread'): 12, ('pastry', 'whole milk'): 97, ('pastry', 'salty snack'): 10, ('whole milk', 'salty snack'): 29, ('misc. beverages', 'canned beer'): 8, ('sausage', 'hygiene articles'): 13, ('pickled vegetables', 'soda'): 12, ('whole milk', 'rolls/buns'): 209, ('sausage', 'rolls/buns'): 80, ('frankfurter', 'curd'): 20, ('whole milk', 'soda'): 174, ('beef', 'white bread'): 13, ('frankfurter', 'soda'): 46, ('frankfurter', 'whipped/sour cream'): 22, ('soda', 'whipped/sour cream'): 51, ('frozen vegetables', 'other vegetables'): 47, ('whole milk', 'butter'): 70, ('sugar', 'tropical fruit'): 15, ('butter milk', 'specialty chocolate'): 6, ('detergent', 'root vegetables'): 7, ('whole milk', 'pip fruit'): 99, ('whole milk', 'tropical fruit'): 123, ('pip fruit', 'tropical fruit'): 31, ('re

In [None]:
class AssociationRuleGenerator:
    """Derive association rules between the two items of each frequent pair.

    Args:
        frequent_pairs: Mapping ``{(item1, item2): count}`` of frequent pairs.
        confidence_threshold: Minimum confidence a rule needs to be kept.
        baskets: Sized collection of baskets; each basket is an iterable of
            hashable items.

    Attributes:
        rules: Result of the most recent ``generate_rules`` call, a list of
            ``(antecedent, consequent, support, confidence)`` tuples.
    """

    def __init__(self, frequent_pairs, confidence_threshold, baskets):
        self.frequent_pairs = frequent_pairs  # Dictionary with pair counts
        self.c = confidence_threshold
        self.baskets = baskets
        self.rules = []

    def _count_baskets_per_item(self):
        """Count how many baskets contain each item that occurs in a frequent pair."""
        wanted = {item for pair in self.frequent_pairs for item in pair}
        totals = dict.fromkeys(wanted, 0)
        for basket in self.baskets:
            # intersection() de-duplicates, so a basket counts once per item.
            for item in wanted.intersection(basket):
                totals[item] += 1
        return totals

    def generate_rules(self):
        """Generate association rules that meet the confidence threshold.

        For every frequent pair both directions are evaluated; confidence is
        the pair count divided by the number of baskets containing the
        antecedent, and support is the pair count divided by the number of
        baskets. The result replaces ``self.rules``, so calling the method
        again does not append duplicates.

        Returns:
            List of ``(antecedent, consequent, support, confidence)`` tuples.
        """
        rules = []

        if self.frequent_pairs:
            # One sweep over the baskets instead of two scans per pair.
            item_totals = self._count_baskets_per_item()
            basket_total = len(self.baskets)

            for (item1, item2), count in self.frequent_pairs.items():
                support = count / basket_total

                total_item1 = item_totals[item1]
                total_item2 = item_totals[item2]

                confidence1 = count / total_item1 if total_item1 > 0 else 0
                confidence2 = count / total_item2 if total_item2 > 0 else 0

                if confidence1 >= self.c:
                    rules.append((item1, item2, support, confidence1))
                if confidence2 >= self.c:
                    rules.append((item2, item1, support, confidence2))

        self.rules = rules
        return self.rules


# ------------------ Step 4: Run PCY and Generate Rules ------------------

# Build rules from the pairs found by the `pcy` object of the previous cell;
# `pcy` and `baskets` are notebook globals.
rule_generator = AssociationRuleGenerator(pcy.pair_counts, confidence_threshold=0.6, baskets=baskets)
association_rules = rule_generator.generate_rules()

print("Frequent Pairs:", pcy.pair_counts)
print("Association Rules:", association_rules)

# `spark` is the notebook-global session created in an earlier cell.
spark.stop()

Frequent Pairs: {('whole milk', 'sausage'): 134, ('whole milk', 'yogurt'): 167, ('whole milk', 'semi-finished bread'): 25, ('sausage', 'yogurt'): 86, ('sausage', 'semi-finished bread'): 9, ('yogurt', 'semi-finished bread'): 12, ('pastry', 'whole milk'): 97, ('pastry', 'salty snack'): 10, ('whole milk', 'salty snack'): 29, ('misc. beverages', 'canned beer'): 8, ('sausage', 'hygiene articles'): 13, ('pickled vegetables', 'soda'): 12, ('whole milk', 'rolls/buns'): 209, ('sausage', 'rolls/buns'): 80, ('frankfurter', 'curd'): 20, ('whole milk', 'soda'): 174, ('beef', 'white bread'): 13, ('frankfurter', 'soda'): 46, ('frankfurter', 'whipped/sour cream'): 22, ('soda', 'whipped/sour cream'): 51, ('frozen vegetables', 'other vegetables'): 47, ('whole milk', 'butter'): 70, ('sugar', 'tropical fruit'): 15, ('butter milk', 'specialty chocolate'): 6, ('detergent', 'root vegetables'): 7, ('whole milk', 'pip fruit'): 99, ('whole milk', 'tropical fruit'): 123, ('pip fruit', 'tropical fruit'): 31, ('re

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_set
from google.colab import drive
from itertools import combinations
from collections import defaultdict

# Make the Drive folder that holds the dataset available to this runtime.
drive.mount('/content/drive')

# ------------------ Step 1: Load Data and Identify Baskets ------------------

spark = (
    SparkSession.builder
    .appName("PCY_Algorithm")
    .getOrCreate()
)

# Read the transactions; the first row is the header and column types are inferred.
file_path = "/content/drive/MyDrive/MMDS/baskets.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# One basket = the distinct items a member bought on one date.
baskets_df = (
    df.groupBy("Member_number", "Date")
    .agg(collect_set("itemDescription").alias("items"))
)

# Bring the baskets to the driver as plain Python lists of item names.
baskets = [row["items"] for row in baskets_df.collect()]

baskets_df.show(truncate=False)


# ------------------ Step 2: Implement PCY Algorithm ------------------
class PCYAlgorithm:
    """Two-pass PCY (Park-Chen-Yu) miner for frequent item pairs.

    Pass one counts single items and hashes every pair of a basket into a
    fixed number of buckets. Pass two counts only those pairs whose two items
    are frequent and whose bucket reached the support threshold.

    Pairs are unordered: every basket is de-duplicated and sorted before pairs
    are formed, so ``("a", "b")`` and ``("b", "a")`` are counted as one pair
    regardless of the order in which a basket lists its items. Items therefore
    have to be hashable and mutually orderable (e.g. all strings).

    Args:
        baskets: Re-iterable collection of baskets (it is traversed once per
            pass), each basket an iterable of items.
        support_threshold: Absolute minimum number of baskets.
        confidence_threshold: Stored as ``c``; not used by this class itself.
        num_buckets: Size of the pair hash table.

    Attributes:
        hash_buckets: Pair counts per bucket, filled by ``pass_one``.
        frequent_items: Items with support >= ``s``, filled by ``pass_one``.
        bitmap: ``bitmap[i]`` is True when bucket ``i`` reached ``s``.
        pair_counts: ``{(item1, item2): count}`` of frequent pairs after
            ``pass_two``; each key is a sorted tuple.
    """

    def __init__(self, baskets, support_threshold, confidence_threshold, num_buckets=1000):
        self.baskets = baskets
        self.s = support_threshold
        self.c = confidence_threshold
        self.num_buckets = num_buckets
        self.hash_buckets = [0] * num_buckets
        self.frequent_items = set()
        self.pair_counts = defaultdict(int)
        self.bitmap = []

    @staticmethod
    def _canonical_items(basket):
        """Return the distinct items of ``basket`` in sorted order."""
        return sorted(set(basket))

    def hash_function(self, item1, item2):
        """Map a pair of items to a bucket index in ``[0, num_buckets)``.

        XOR is symmetric, so the bucket does not depend on argument order.
        Python salts ``str`` hashes per interpreter process, so bucket numbers
        can differ between runs; they are stable within one run, which is all
        the two passes need.
        """
        return (hash(item1) ^ hash(item2)) % self.num_buckets

    def pass_one(self):
        """First pass: Count individual items and hash pairs into buckets."""
        item_counts = defaultdict(int)
        # Start from empty buckets so that re-running the pass does not
        # add to the counts left behind by a previous run.
        self.hash_buckets = [0] * self.num_buckets

        for basket in self.baskets:
            items = self._canonical_items(basket)
            for item in items:
                item_counts[item] += 1
            for item1, item2 in combinations(items, 2):
                self.hash_buckets[self.hash_function(item1, item2)] += 1

        # Identify frequent items and frequent buckets
        self.frequent_items = {item for item, count in item_counts.items() if count >= self.s}
        self.bitmap = [count >= self.s for count in self.hash_buckets]

    def pass_two(self):
        """Second pass: count candidate pairs and keep the frequent ones.

        A pair is a candidate when both items are frequent and its bucket is
        set in the bitmap. ``pair_counts`` is rebuilt from scratch on every
        call.
        """
        candidate_counts = defaultdict(int)

        for basket in self.baskets:
            valid_items = [
                item for item in self._canonical_items(basket)
                if item in self.frequent_items
            ]
            # valid_items is sorted, so each pair comes out as a sorted tuple.
            for item1, item2 in combinations(valid_items, 2):
                if self.bitmap[self.hash_function(item1, item2)]:
                    candidate_counts[(item1, item2)] += 1

        self.pair_counts = {
            pair: count for pair, count in candidate_counts.items() if count >= self.s
        }


# ------------------ Step 3: Generate Association Rules ------------------

class AssociationRuleGenerator:
    """Derive association rules between the two items of each frequent pair.

    Args:
        frequent_pairs: Mapping ``{(item1, item2): count}`` of frequent pairs.
        confidence_threshold: Minimum confidence a rule needs to be kept.
        baskets: Sized collection of baskets; each basket is an iterable of
            hashable items.

    Attributes:
        rules: Result of the most recent ``generate_rules`` call, a list of
            ``(antecedent, consequent, support, confidence)`` tuples.
    """

    def __init__(self, frequent_pairs, confidence_threshold, baskets):
        self.frequent_pairs = frequent_pairs  # Dictionary with pair counts
        self.c = confidence_threshold
        self.baskets = baskets
        self.rules = []

    def _count_baskets_per_item(self):
        """Count how many baskets contain each item that occurs in a frequent pair."""
        wanted = {item for pair in self.frequent_pairs for item in pair}
        totals = dict.fromkeys(wanted, 0)
        for basket in self.baskets:
            # intersection() de-duplicates, so a basket counts once per item.
            for item in wanted.intersection(basket):
                totals[item] += 1
        return totals

    def generate_rules(self):
        """Generate association rules that meet the confidence threshold.

        For every frequent pair both directions are evaluated; confidence is
        the pair count divided by the number of baskets containing the
        antecedent, and support is the pair count divided by the number of
        baskets. The result replaces ``self.rules``, so calling the method
        again does not append duplicates.

        Returns:
            List of ``(antecedent, consequent, support, confidence)`` tuples.
        """
        rules = []

        if self.frequent_pairs:
            # One sweep over the baskets instead of two scans per pair.
            item_totals = self._count_baskets_per_item()
            basket_total = len(self.baskets)

            for (item1, item2), count in self.frequent_pairs.items():
                support = count / basket_total

                total_item1 = item_totals[item1]
                total_item2 = item_totals[item2]

                confidence1 = count / total_item1 if total_item1 > 0 else 0
                confidence2 = count / total_item2 if total_item2 > 0 else 0

                if confidence1 >= self.c:
                    rules.append((item1, item2, support, confidence1))
                if confidence2 >= self.c:
                    rules.append((item2, item1, support, confidence2))

        self.rules = rules
        return self.rules


# ------------------ Step 4: Run PCY and Generate Rules ------------------
# Run both PCY passes over the collected baskets with an absolute support
# threshold of 5 baskets.
pcy = PCYAlgorithm(baskets, support_threshold=5, confidence_threshold=0.6)
pcy.pass_one()
pcy.pass_two()

# Turn the frequent pairs into rules, keeping those with confidence >= 0.6.
rule_generator = AssociationRuleGenerator(pcy.pair_counts, confidence_threshold=0.6, baskets=baskets)
association_rules = rule_generator.generate_rules()

print("Frequent Pairs:", pcy.pair_counts)
print("Association Rules:", association_rules)

# The baskets were already collected into Python lists above, so the session
# can be stopped here.
spark.stop()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+-------------+----------+--------------------------------------------------+
|Member_number|Date      |items                                             |
+-------------+----------+--------------------------------------------------+
|1000         |15/03/2015|[whole milk, sausage, yogurt, semi-finished bread]|
|1000         |24/06/2014|[pastry, whole milk, salty snack]                 |
|1000         |24/07/2015|[misc. beverages, canned beer]                    |
|1000         |25/11/2015|[sausage, hygiene articles]                       |
|1000         |27/05/2015|[pickled vegetables, soda]                        |
|1001         |02/07/2014|[whole milk, sausage, rolls/buns]                 |
|1001         |05/02/2015|[frankfurter, curd]                               |
|1001         |12/12/2014|[whole milk, soda]                                |
|1001        