In [1]:
!pip install mlxtend pandas numpy scikit-learn




In [1]:
# Mount Google Drive into the Colab filesystem so files stored there can be
# read through regular paths under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


NameError: name 'pd' is not defined

In [1]:
import pandas as pd
import numpy as np
from mlxtend.frequent_patterns import apriori, association_rules
import time

# Read the ratings file, flag every rating of 4.0 or higher as "liked"
# (True/False), and keep only the three columns the pivot below needs.
df = (
    pd.read_csv('/content/drive/MyDrive/ratings.csv')
    .assign(liked=lambda frame: frame['rating'] >= 4.0)
    .loc[:, ['userId', 'movieId', 'liked']]
)

# Reshape to one row per user and one column per movie. User/movie pairs with
# no rating are filled with False, and the whole table is cast to bool.
basket = (
    df.pivot_table(index='userId', columns='movieId', values='liked', fill_value=False)
    .astype(bool)
)

print("Data preprocessing completed!")


Data preprocessing completed!


# Basic Apriori Implementation

In [5]:
# Time only the mining steps. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed intervals; time.time() follows the wall
# clock and can jump if the system time is adjusted mid-run.
start_time = time.perf_counter()

# Mine frequent itemsets from the boolean user x movie table, using the column
# labels (movie IDs) as item names.
frequent_itemsets = apriori(basket, min_support=0.045, use_colnames=True)

# Derive association rules from the itemsets, thresholding on lift at 1.0.
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

end_time = time.perf_counter()
print(f"Basic Apriori Execution Time: {end_time - start_time:.2f} seconds")

# Show the first 10 rules in the order association_rules() returned them;
# head() does not rank them by any metric.
rules.head(10)


Basic Apriori Execution Time: 23.77 seconds


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(32),(1),0.193443,0.240984,0.083607,0.432203,1.793497,1.0,0.03699,1.336775,0.548541,0.238318,0.251931,0.389571
1,(1),(32),0.240984,0.193443,0.083607,0.346939,1.793497,1.0,0.03699,1.235041,0.582899,0.238318,0.19031,0.389571
2,(1),(34),0.240984,0.122951,0.070492,0.292517,2.379138,1.0,0.040863,1.239675,0.763725,0.240223,0.193337,0.432925
3,(34),(1),0.122951,0.240984,0.070492,0.573333,2.379138,1.0,0.040863,1.778945,0.660943,0.240223,0.437869,0.432925
4,(1),(47),0.240984,0.229508,0.078689,0.326531,1.422741,1.0,0.023381,1.144064,0.391469,0.200837,0.125923,0.334694
5,(47),(1),0.229508,0.240984,0.078689,0.342857,1.422741,1.0,0.023381,1.155025,0.385638,0.200837,0.134218,0.334694
6,(1),(50),0.240984,0.267213,0.093443,0.387755,1.451108,1.0,0.029049,1.196885,0.409571,0.225296,0.164498,0.368724
7,(50),(1),0.267213,0.240984,0.093443,0.349693,1.451108,1.0,0.029049,1.167167,0.424232,0.225296,0.143224,0.368724
8,(1),(110),0.240984,0.272131,0.090164,0.37415,1.374887,1.0,0.024585,1.163008,0.359238,0.213178,0.140161,0.352737
9,(110),(1),0.272131,0.240984,0.090164,0.331325,1.374887,1.0,0.024585,1.135106,0.374611,0.213178,0.119025,0.352737


# Improved Apriori with Optimizations

In [8]:
from collections import defaultdict

# Transaction reduction: keep only users with at least `min_movies` rating rows.
min_movies = 10  # minimum number of rating rows a user must have

# A vectorized per-user row count replaces groupby().filter(lambda ...), which
# invokes a Python function once per user. The same rows are selected, in the
# same order.
ratings_per_user = df.groupby("userId")["movieId"].transform("size")
filtered_users = df[ratings_per_user >= min_movies]

# Rebuild the boolean user x movie table from the retained ratings.
optimized_basket = filtered_users.pivot_table(index='userId', columns='movieId', values='liked', fill_value=False).astype(bool)

# Hash-Based Candidate Generation (Using a simple hash function)
def hash_function(row):
    """Map one row to an integer bucket in the range 0-999.

    The row's ``.values`` are packed into a tuple, hashed with the built-in
    ``hash``, and reduced modulo 1000.
    """
    bucket_count = 1000
    row_key = tuple(row.values)
    return hash(row_key) % bucket_count

# Assign every user's row to a bucket with hash_function.
# NOTE(review): hashed_data is not passed to apriori() below, so this step does
# not influence candidate generation or the measured run time.
hashed_data = optimized_basket.apply(hash_function, axis=1)

# Time only the mining steps. perf_counter() is a monotonic, high-resolution
# clock intended for measuring elapsed intervals; time.time() follows the wall
# clock and can jump if the system time is adjusted mid-run.
start_time2 = time.perf_counter()

# Mine frequent itemsets from the filtered user x movie table, using the
# column labels (movie IDs) as item names.
improved_itemsets = apriori(optimized_basket, min_support=0.045, use_colnames=True)

# Derive association rules from the itemsets, thresholding on lift at 1.0.
improved_rules = association_rules(improved_itemsets, metric="lift", min_threshold=1.0)

end_time2 = time.perf_counter()
print(f"Improved Apriori Execution Time: {end_time2 - start_time2:.2f} seconds")

# Show the first 10 rules in the order association_rules() returned them;
# head() does not rank them by any metric.
improved_rules.head(10)


Improved Apriori Execution Time: 19.73 seconds


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,representativity,leverage,conviction,zhangs_metric,jaccard,certainty,kulczynski
0,(32),(1),0.193443,0.240984,0.083607,0.432203,1.793497,1.0,0.03699,1.336775,0.548541,0.238318,0.251931,0.389571
1,(1),(32),0.240984,0.193443,0.083607,0.346939,1.793497,1.0,0.03699,1.235041,0.582899,0.238318,0.19031,0.389571
2,(1),(34),0.240984,0.122951,0.070492,0.292517,2.379138,1.0,0.040863,1.239675,0.763725,0.240223,0.193337,0.432925
3,(34),(1),0.122951,0.240984,0.070492,0.573333,2.379138,1.0,0.040863,1.778945,0.660943,0.240223,0.437869,0.432925
4,(1),(47),0.240984,0.229508,0.078689,0.326531,1.422741,1.0,0.023381,1.144064,0.391469,0.200837,0.125923,0.334694
5,(47),(1),0.229508,0.240984,0.078689,0.342857,1.422741,1.0,0.023381,1.155025,0.385638,0.200837,0.134218,0.334694
6,(1),(50),0.240984,0.267213,0.093443,0.387755,1.451108,1.0,0.029049,1.196885,0.409571,0.225296,0.164498,0.368724
7,(50),(1),0.267213,0.240984,0.093443,0.349693,1.451108,1.0,0.029049,1.167167,0.424232,0.225296,0.143224,0.368724
8,(1),(110),0.240984,0.272131,0.090164,0.37415,1.374887,1.0,0.024585,1.163008,0.359238,0.213178,0.140161,0.352737
9,(110),(1),0.272131,0.240984,0.090164,0.331325,1.374887,1.0,0.024585,1.135106,0.374611,0.213178,0.119025,0.352737


# Compare Basic vs. Improved Apriori

In [9]:
# Report, for each run, how many rules were produced and how long mining took.
for label, rule_set, elapsed in (
    ("Basic Apriori:", rules, end_time - start_time),
    ("Improved Apriori:", improved_rules, end_time2 - start_time2),
):
    print(label, "Found", len(rule_set), "rules in", f"{elapsed:.2f} seconds")


Basic Apriori: Found 1136944 rules in 23.77 seconds
Improved Apriori: Found 1136944 rules in 19.73 seconds
