Apriori algorithm

In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import numpy as np

In [2]:
# Load the dataset
data = pd.read_csv('data/data.csv')  # Replace with your file path

In [3]:

# Preprocessing: Categorizing continuous features
bins_tempo = [0, 100, 140, np.inf]
labels_tempo = ['Slow', 'Medium', 'Fast']
data['tempo_category'] = pd.cut(data['tempo'], bins=bins_tempo, labels=labels_tempo, right=False)

def categorize_feature(feature, bins, labels):
    return pd.cut(data[feature], bins=bins, labels=labels, right=False)

# Categorize other features
feature_bins_labels = {
    'danceability': ([0, 0.33, 0.66, 1], ['Low', 'Medium', 'High']),
    'energy': ([0, 0.33, 0.66, 1], ['Low', 'Medium', 'High']),
    # Add other features here...
}

for feature, (bins, labels) in feature_bins_labels.items():
    data[f'{feature}_cat'] = categorize_feature(feature, bins, labels)

# Convert to transactions
selected_columns = [col for col in data.columns if '_cat' in col]
transactions = data[selected_columns].values.tolist()
transactions = [[item for item in transaction if str(item) != 'nan'] for transaction in transactions]

# Apply TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Apply Apriori algorithm
frequent_itemsets = apriori(df, min_support=0.01, use_colnames=True)  # Adjust min_support as needed

# Display frequent itemsets
print(frequent_itemsets)

     support              itemsets
0   0.209654                (Fast)
1   0.474249                (High)
2   0.393905                 (Low)
3   0.858245              (Medium)
4   0.333038                (Slow)
5   0.102700          (Fast, High)
6   0.071273           (Fast, Low)
7   0.167181        (Fast, Medium)
8   0.085349           (High, Low)
9   0.398135        (High, Medium)
10  0.117765          (High, Slow)
11  0.286230         (Low, Medium)
12  0.173153           (Low, Slow)
13  0.233755        (Medium, Slow)
14  0.017978     (Fast, Low, High)
15  0.072820  (Fast, Medium, High)
16  0.040702   (Fast, Medium, Low)
17  0.043316   (High, Medium, Low)
18  0.024055     (High, Low, Slow)
19  0.071531  (High, Medium, Slow)
20  0.096049   (Low, Medium, Slow)


In [4]:
from mlxtend.frequent_patterns import association_rules

# Generate the rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)  # Adjust threshold as needed

# Calculate and add the lift metric
rules["lift"] = rules["confidence"] / (rules["consequent support"] / rules["antecedent support"])

# Print out the rules along with their support, confidence, and lift
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

       antecedents     consequents   support  confidence      lift
0           (Fast)          (High)  0.102700    0.489854  0.216552
1           (High)          (Fast)  0.102700    0.216552  0.489854
2           (Fast)           (Low)  0.071273    0.339958  0.180940
3            (Low)          (Fast)  0.071273    0.180940  0.339958
4           (Fast)        (Medium)  0.167181    0.797417  0.194795
5         (Medium)          (Fast)  0.167181    0.194795  0.797417
6           (High)           (Low)  0.085349    0.179966  0.216673
7            (Low)          (High)  0.085349    0.216673  0.179966
8           (High)        (Medium)  0.398135    0.839507  0.463895
9         (Medium)          (High)  0.398135    0.463895  0.839507
10          (High)          (Slow)  0.117765    0.248320  0.353609
11          (Slow)          (High)  0.117765    0.353609  0.248320
12           (Low)        (Medium)  0.286230    0.726648  0.333506
13        (Medium)           (Low)  0.286230    0.333506  0.72