In [2]:
import pandas as pd
import random
from datetime import datetime, timedelta

# Function to generate random date of birth within a specified range
def random_dob(start_date, end_date):
    """Return a random datetime between ``start_date`` and ``end_date``.

    The result is ``start_date`` shifted forward by a whole number of days
    drawn uniformly from ``0`` to the day span between the two arguments,
    so both endpoints can be returned.

    Args:
        start_date: Earliest value that may be returned.
        end_date: Latest value that may be returned.

    Returns:
        ``start_date`` plus the randomly chosen day offset.
    """
    span_in_days = (end_date - start_date).days
    offset = timedelta(days=random.randint(0, span_in_days))
    return start_date + offset

# Spending categories; each contributes a subtotal column and a '<name>_Freq' column
categories = ['Entertainment', 'Food', 'HealthFitness', 'Home', 'KidsPets', 'PersonalCare',
              'Grocery', 'Miscellaneous', 'Shopping', 'Travel']

# Seed the generator so the exported CSV is identical on every Restart & Run All
SEED = 42
random.seed(SEED)

# Generate 1000 random records for the dataset
data = []
for _ in range(1000):
    year = random.randint(2010, 2022)
    month = random.randint(1, 12)
    HOM_tag = random.choice([1, 2])
    HOM_total = random.uniform(100, 500)
    gender = random.choice(['M', 'F'])
    job = random.choice(['Engineer', 'Teacher', 'Doctor', 'Artist', 'Manager'])
    dob = random_dob(datetime(1960, 1, 1), datetime(2005, 12, 31))
    total_spending = random.uniform(500, 2000)
    average_spending = total_spending / random.randint(10, 30)
    most_freq_category = random.choice(categories)

    # Draw one (subtotal, frequency) pair per category, but keep them in two
    # separate lists: the column list below declares ALL subtotal columns first
    # and ALL '<name>_Freq' columns second, so the row must follow that order.
    # Interleaving the pairs would put float subtotals under '_Freq' headers.
    subtotals = []
    freqs = []
    for _category in categories:
        subtotals.append(random.uniform(0, 200))
        freqs.append(random.randint(0, 10))

    row = [year, month, HOM_tag, HOM_total]
    row.extend(subtotals)
    row.extend(freqs)
    # NOTE(review): 'next_HOM_total' is filled with the same value as 'HOM_total'.
    row.extend([HOM_total, gender, job])
    row.extend([random.uniform(0, 200) for _ in range(len(categories) * 2)])
    row.extend([dob, random.randint(1, 10), total_spending, average_spending, most_freq_category])
    data.append(row)

# Column labels, in exactly the order the row values are appended above
columns = ['year', 'month', 'HOM_tag', 'HOM_total']
columns.extend(categories)
columns.extend([f'{category}_Freq' for category in categories])
columns.extend(['next_HOM_total', 'gender', 'job'])
# NOTE(review): the category and '_Freq' names are repeated here (and
# 'Travel_Freq' appears a third time in the final group), so the frame carries
# duplicate column labels; this second group is filled with uniform floats.
# Confirm whether these columns were meant to have distinct names.
columns.extend(categories)
columns.extend([f'{category}_Freq' for category in categories])
columns.extend(['dob', 'Travel_Freq', 'Total_Spending', 'Average_Spending', 'Most_Frequent_Category'])

df = pd.DataFrame(data, columns=columns)

# Save the DataFrame as a CSV file
df.to_csv('sample_dataset.csv', index=False)


In [3]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [4]:
# Load the transactions table that the rest of the notebook mines.
# NOTE(review): the generation cell above writes 'sample_dataset.csv', not
# 'apriori.csv' — confirm this file is produced or supplied separately.
source_csv = 'apriori.csv'
df = pd.read_csv(source_csv)

In [5]:
# Remove every row that has at least one missing value (NaN); the frame is
# modified in place, so `df` keeps referring to the same object.
df.dropna(axis=0, how='any', inplace=True)

In [6]:
# Cast the categorical columns to strings. These columns are not passed to
# apriori below (only the '_Freq' columns are), so this is general clean-up.
for text_col in ('gender', 'job'):
    df[text_col] = df[text_col].astype(str)

In [7]:
# Parse the 'dob' column into pandas datetimes and store it back on the frame
parsed_dob = pd.to_datetime(df['dob'])
df['dob'] = parsed_dob

In [8]:
# Treat 'month' and 'HOM_tag' as labels rather than numbers by casting them to str
for label_col in ('month', 'HOM_tag'):
    df[label_col] = df[label_col].astype(str)

In [9]:
# Collect the names of all columns whose label ends in '_Freq'; these are the
# per-category frequency columns used as items for apriori.
category_columns = list(filter(lambda name: name.endswith('_Freq'), df.columns))


In [10]:
# Convert the item category columns to binary format (1 if frequency > 0, 0 otherwise).
# The comparison is vectorized over the whole column instead of calling a Python
# lambda per element; missing values compare as False and therefore become 0.
for col in category_columns:
    df[col] = df[col].gt(0).astype('int64')

In [11]:
# Mine frequent itemsets from the binarized '_Freq' columns, keeping itemsets
# whose support is at least 0.05 and labelling items by column name.
basket = df[category_columns]
frequent_itemsets = apriori(basket, min_support=0.05, use_colnames=True)



In [12]:
# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence is at least 0.5.
rule_options = {"metric": "confidence", "min_threshold": 0.5}
rules = association_rules(frequent_itemsets, **rule_options)

In [13]:
# Print a heading followed by the plain-text rendering of the itemsets frame
print("Frequent Itemsets:", frequent_itemsets, sep="\n")

Frequent Itemsets:
      support                                           itemsets
0       1.000                               (Entertainment_Freq)
1       0.913                                        (Food_Freq)
2       1.000                               (HealthFitness_Freq)
3       0.918                                        (Home_Freq)
4       1.000                                    (KidsPets_Freq)
...       ...                                                ...
1018    0.669  (Food_Freq, Entertainment_Freq, Grocery_Freq, ...
1019    0.612  (Food_Freq, Entertainment_Freq, Grocery_Freq, ...
1020    0.671  (Entertainment_Freq, Grocery_Freq, Shopping_Fr...
1021    0.612  (Food_Freq, Grocery_Freq, Shopping_Freq, KidsP...
1022    0.612  (Food_Freq, Entertainment_Freq, Grocery_Freq, ...

[1023 rows x 2 columns]


In [14]:
# Bare expression as the cell's last line: the notebook displays the itemsets frame as a table
frequent_itemsets

Unnamed: 0,support,itemsets
0,1.000,(Entertainment_Freq)
1,0.913,(Food_Freq)
2,1.000,(HealthFitness_Freq)
3,0.918,(Home_Freq)
4,1.000,(KidsPets_Freq)
...,...,...
1018,0.669,"(Food_Freq, Entertainment_Freq, Grocery_Freq, ..."
1019,0.612,"(Food_Freq, Entertainment_Freq, Grocery_Freq, ..."
1020,0.671,"(Entertainment_Freq, Grocery_Freq, Shopping_Fr..."
1021,0.612,"(Food_Freq, Grocery_Freq, Shopping_Freq, KidsP..."


In [15]:
# Print a blank line, a heading, and then the plain-text rendering of the rules frame
print("\nAssociation Rules:", rules, sep="\n")


Association Rules:
                antecedents   
0               (Food_Freq)  \
1      (Entertainment_Freq)   
2      (HealthFitness_Freq)   
3      (Entertainment_Freq)   
4               (Home_Freq)   
...                     ...   
56997         (Travel_Freq)   
56998           (Home_Freq)   
56999  (Miscellaneous_Freq)   
57000  (HealthFitness_Freq)   
57001   (PersonalCare_Freq)   

                                             consequents  antecedent support   
0                                   (Entertainment_Freq)               0.913  \
1                                            (Food_Freq)               1.000   
2                                   (Entertainment_Freq)               1.000   
3                                   (HealthFitness_Freq)               1.000   
4                                   (Entertainment_Freq)               0.918   
...                                                  ...                 ...   
56997  (Food_Freq, Entertainment_Freq, Grocery_

In [16]:
# Persist the frequent itemsets to disk, omitting the row index
frequent_itemsets.to_csv(path_or_buf='frequent_itemsets.csv', index=False)

In [17]:
# Persist the association rules to disk, omitting the row index.
# The antecedents/consequents frozensets are written out as their text form.
rules.to_csv(path_or_buf='association_rules.csv', index=False)

In [18]:
import pandas as pd

# Read the association rules back from the CSV file.
# The frame is named `rules_df` rather than `association_rules` so that it does
# not shadow the mlxtend `association_rules` function imported earlier; shadowing
# it makes the rule-generation cell fail if it is re-run in the same kernel.
rules_df = pd.read_csv('association_rules.csv')

# Thresholds a rule must meet (inclusive) to be considered significant
min_confidence = 0.95
min_lift = 1.0
min_conviction = 1.0

# Keep only the rules that satisfy all three thresholds
significant_rules = rules_df[
    (rules_df['confidence'] >= min_confidence) &
    (rules_df['lift'] >= min_lift) &
    (rules_df['conviction'] >= min_conviction)
]

# Display the filtered significant association rules
print(significant_rules)


                                             antecedents   
0                               frozenset({'Food_Freq'})  \
2                      frozenset({'HealthFitness_Freq'})   
3                      frozenset({'Entertainment_Freq'})   
4                               frozenset({'Home_Freq'})   
6                           frozenset({'KidsPets_Freq'})   
...                                                  ...   
56256  frozenset({'Food_Freq', 'Grocery_Freq', 'Trave...   
56271  frozenset({'Food_Freq', 'Shopping_Freq', 'Trav...   
56276  frozenset({'Food_Freq', 'KidsPets_Freq', 'Trav...   
56280  frozenset({'Food_Freq', 'Travel_Freq', 'Home_F...   
56487  frozenset({'Food_Freq', 'Travel_Freq', 'Home_F...   

                                             consequents  antecedent support   
0                      frozenset({'Entertainment_Freq'})               0.913  \
2                      frozenset({'Entertainment_Freq'})               1.000   
3                      frozenset({'Heal

In [33]:
# Persist the significant rules selected above to disk, omitting the row index
significant_rules.to_csv(path_or_buf='significant_rules.csv', index=False)

In [37]:
import pandas as pd

# Read the association rules back from the CSV file.
# The frame is named `rules_df` rather than `association_rules` so that it does
# not shadow the mlxtend `association_rules` function imported earlier; shadowing
# it makes the rule-generation cell fail if it is re-run in the same kernel.
rules_df = pd.read_csv('association_rules.csv')

# Thresholds a rule must meet (inclusive) to be considered significant
min_confidence = 0.90
min_lift = 1.0
min_conviction = 1.0

# Keep only the rules that satisfy all three thresholds
significant_rules = rules_df[
    (rules_df['confidence'] >= min_confidence) &
    (rules_df['lift'] >= min_lift) &
    (rules_df['conviction'] >= min_conviction)
]

# Order the significant rules from highest to lowest confidence
sorted_rules = significant_rules.sort_values(by='confidence', ascending=False)

# Take the 20 highest-confidence rules
top_20_significant_rules = sorted_rules.head(20)

# Save the top 20 significant association rules to a new CSV file
top_20_significant_rules.to_csv('top_20_significant_rules.csv', index=False)


In [42]:
import ast

import pandas as pd


def _itemset_size(text):
    """Return the number of items in an itemset stored as text.

    The CSV holds each itemset in the form ``frozenset({'A', 'B'})``. The
    ``frozenset(...)`` wrapper is stripped and the remaining set literal is
    parsed with ``ast.literal_eval``, which only accepts literals, instead of
    running ``eval`` on file contents.
    """
    inner = text.strip()
    wrapper = 'frozenset('
    if inner.startswith(wrapper) and inner.endswith(')'):
        inner = inner[len(wrapper):-1]
    if not inner:
        # 'frozenset()' — an empty itemset
        return 0
    return len(ast.literal_eval(inner))


# Read the association rules back from the CSV file.
# The frame is named `rules_df` rather than `association_rules` so that it does
# not shadow the mlxtend `association_rules` function imported earlier; shadowing
# it makes the rule-generation cell fail if it is re-run in the same kernel.
rules_df = pd.read_csv('association_rules.csv')

# Thresholds a rule must meet (inclusive) to be considered significant
min_confidence = 0.9
min_lift = 1.0
min_conviction = 1.0

# Item counts per rule side, computed once and reused in the filter below
antecedent_sizes = rules_df['antecedents'].apply(_itemset_size)
consequent_sizes = rules_df['consequents'].apply(_itemset_size)

# Keep rules that meet the metric thresholds and have 1 or 2 items on each side
filtered_rules = rules_df[
    (rules_df['confidence'] >= min_confidence) &
    (rules_df['lift'] >= min_lift) &
    (rules_df['conviction'] >= min_conviction) &
    antecedent_sizes.between(1, 2) &
    consequent_sizes.between(1, 2)
]

# Order the filtered rules from highest to lowest confidence
sorted_rules = filtered_rules.sort_values(by='confidence', ascending=False)

# Take the 20 highest-confidence rules.
# NOTE(review): with 1-2 antecedents and 1-2 consequents a rule spans 2 to 4
# categories in total, while the output file name says "2to3" — confirm intent.
top_20_significant_rules = sorted_rules.head(20)

# Save the top 20 significant association rules to a new CSV file
top_20_significant_rules.to_csv('top_20_significant_rules_2to3_categories.csv', index=False)
