In [2]:
# Import necessary libraries
import pandas as pd

# Load the dataset
df = pd.read_csv('store.csv',encoding='latin1')

# Data Familiarization and Quality Check
print(df.head(1))  # Display first few rows to understand the data
print(df.info())  # Check data types and missing values

# Data Cleaning
# Handle missing values
df.dropna(inplace=True)  # Remove rows with missing values
# Remove duplicates
df.drop_duplicates(inplace=True)

# Data Transformation for Association Rule Mining
# Convert to transactional format
transactions = df.groupby('Order.ID')['Product.Name'].apply(list).reset_index(name='Products')

# Save the transformed data to a new CSV file for association rule mining
transactions.to_csv('transactions.csv', index=False)


   Row.ID      Order.ID Order.Date Ship.Date  Ship.Mode Customer.ID  \
0   42433  AG-2011-2040   1/1/2011  6/1/2011          4    TB-11280   

     Customer.Name  Segment         City        State  ... Category  \
0  Toby Braunhardt        1  Constantine  Constantine  ...        2   

   Sub.Category         Product.Name  Sales Quantity  Discount  Profit  \
0       Storage  Tenex Lockers, Blue  408.3        2       0.0  106.14   

  Shipping.Cost  Status  Order.Priority  
0         35.46       1               4  

[1 rows x 25 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51290 entries, 0 to 51289
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Row.ID          51290 non-null  int64  
 1   Order.ID        51290 non-null  object 
 2   Order.Date      51290 non-null  object 
 3   Ship.Date       51290 non-null  object 
 4   Ship.Mode       51290 non-null  int64  
 5   Customer.ID     51290 non-nul

In [4]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load the dataset
# df = pd.read_csv('your_dataset.csv')

# Data Cleaning and Preprocessing
# Convert datetime columns to datetime objects
df['Order.Date'] = pd.to_datetime(df['Order.Date'])
df['Ship.Date'] = pd.to_datetime(df['Ship.Date'])

# Select Relevant Columns
relevant_cols = ['Customer.ID', 'Product.ID', 'Category', 'Sub.Category']
df_relevant = df[relevant_cols]

# Convert categorical variables to string type
df_relevant['Product.ID'] = df_relevant['Product.ID'].astype(str)
df_relevant['Category'] = df_relevant['Category'].astype(str)
df_relevant['Sub.Category'] = df_relevant['Sub.Category'].astype(str)

# Transaction Encoding using One-Hot Encoding
te = TransactionEncoder()
te_ary = te.fit_transform(df_relevant.values)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Frequent Itemset Mining using Apriori
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)

# Generate Association Rules
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.3)

# Display frequent itemsets and association rules
print("Frequent Itemsets:\n", frequent_itemsets)
print("\nAssociation Rules:\n", rules)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['Product.ID'] = df_relevant['Product.ID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['Category'] = df_relevant['Category'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_relevant['Sub.Category'] = df_relevant['Sub.Category'].astype(str)


Frequent Itemsets:
      support          itemsets
0   0.212227               (1)
1   0.602962               (2)
2   0.184811               (3)
3   0.077547     (Accessories)
4   0.046628      (Appliances)
5   0.079648             (Art)
6   0.152391         (Binders)
7   0.022814       (Bookcases)
8   0.061737          (Chairs)
9   0.025415       (Envelopes)
10  0.021713       (Fasteners)
11  0.095757     (Furnishings)
12  0.036422          (Labels)
13  0.011507        (Machines)
14  0.137082           (Paper)
15  0.088953          (Phones)
16  0.084651         (Storage)
17  0.019011        (Supplies)
18  0.031919          (Tables)
19  0.022814    (1, Bookcases)
20  0.061737       (1, Chairs)
21  0.095757  (1, Furnishings)
22  0.031919       (1, Tables)
23  0.046628   (2, Appliances)
24  0.079648          (2, Art)
25  0.152391      (2, Binders)
26  0.025415    (2, Envelopes)
27  0.021713    (2, Fasteners)
28  0.036422       (2, Labels)
29  0.137082        (Paper, 2)
30  0.084651      (

In [5]:
rules = association_rules(frequent_itemsets, metric='confidence', min_threshold=0.3)

# Compute additional metrics for rules
rules['lift'] = rules['support'] / (rules['antecedent support'] * rules['consequent support'])
rules['kulczynski'] = (rules['confidence'] + rules['support']) / 2
rules['imbalance_ratio'] = abs(rules['antecedent support'] - rules['consequent support']) / (rules['antecedent support'] + rules['consequent support'] - rules['support'])

print("Association Rules with Additional Metrics:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'kulczynski', 'imbalance_ratio']])


Association Rules with Additional Metrics:
      antecedents    consequents   support  confidence      lift  kulczynski  \
0     (Bookcases)            (1)  0.022814    1.000000  4.711928    0.511407   
1        (Chairs)            (1)  0.061737    1.000000  4.711928    0.530869   
2             (1)  (Furnishings)  0.095757    0.451202  4.711928    0.273480   
3   (Furnishings)            (1)  0.095757    1.000000  4.711928    0.547879   
4        (Tables)            (1)  0.031919    1.000000  4.711928    0.515960   
5    (Appliances)            (2)  0.046628    1.000000  1.658480    0.523314   
6           (Art)            (2)  0.079648    1.000000  1.658480    0.539824   
7       (Binders)            (2)  0.152391    1.000000  1.658480    0.576196   
8     (Envelopes)            (2)  0.025415    1.000000  1.658480    0.512708   
9     (Fasteners)            (2)  0.021713    1.000000  1.658480    0.510857   
10       (Labels)            (2)  0.036422    1.000000  1.658480    0.518211 

In [6]:
# Assuming '1', '2', '3', etc. are product IDs representing columns in df_encoded
multilevel_frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True)
multilevel_rules = association_rules(multilevel_frequent_itemsets, metric='confidence', min_threshold=0.3)
print("Multilevel Frequent Itemsets:")
print(multilevel_frequent_itemsets)
print("\nMultilevel Association Rules:")
print(multilevel_rules)


Multilevel Frequent Itemsets:
     support          itemsets
0   0.212227               (1)
1   0.602962               (2)
2   0.184811               (3)
3   0.077547     (Accessories)
4   0.046628      (Appliances)
5   0.079648             (Art)
6   0.152391         (Binders)
7   0.022814       (Bookcases)
8   0.061737          (Chairs)
9   0.025415       (Envelopes)
10  0.021713       (Fasteners)
11  0.095757     (Furnishings)
12  0.036422          (Labels)
13  0.011507        (Machines)
14  0.137082           (Paper)
15  0.088953          (Phones)
16  0.084651         (Storage)
17  0.019011        (Supplies)
18  0.031919          (Tables)
19  0.022814    (1, Bookcases)
20  0.061737       (1, Chairs)
21  0.095757  (1, Furnishings)
22  0.031919       (1, Tables)
23  0.046628   (2, Appliances)
24  0.079648          (2, Art)
25  0.152391      (2, Binders)
26  0.025415    (2, Envelopes)
27  0.021713    (2, Fasteners)
28  0.036422       (2, Labels)
29  0.137082        (Paper, 2)
30  0.084