In [None]:
!pip install "kagglehub[pandas-datasets]"


In [None]:
# Import dependencies (install them first with the cell above if needed):
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the file inside the Kaggle dataset that should be loaded
file_path = "train.csv"

# Load the latest version of the dataset into a pandas DataFrame.
# `dataset_load` replaces the deprecated `load_dataset` entry point, which
# triggers a deprecation warning when called.
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "rohitsahoo/sales-forecasting",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# Leave the frame as the cell's last expression so the notebook renders it
# as a table instead of wrapped plain text.
print("First 5 records:")
df.head()

  df = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/rohitsahoo/sales-forecasting?dataset_version_number=2&file_name=train.csv...


100%|██████████| 480k/480k [00:00<00:00, 605kB/s]

Extracting zip of train.csv...
First 5 records:    Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
1       2  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
2       3  CA-2017-138688  12/06/2017  16/06/2017    Second Class    DV-13045   
3       4  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   
4       5  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   

     Customer Name    Segment        Country             City       State  \
0      Claire Gute   Consumer  United States        Henderson    Kentucky   
1      Claire Gute   Consumer  United States        Henderson    Kentucky   
2  Darrin Van Huff  Corporate  United States      Los Angeles  California   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   

  




In [None]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Keep only the two columns the Apriori step needs and discard rows where
# either the order identifier or the sub-category is missing.
basket_columns = ['Order ID', 'Sub-Category']
apriori_df = df[basket_columns].dropna(subset=basket_columns)

In [None]:
# Count how many times each Sub-Category appears within every Order ID
order_item_groups = apriori_df.groupby(['Order ID', 'Sub-Category'])['Sub-Category']
transaction_counts = order_item_groups.count().reset_index(name='Count')

# Reshape into basket format: one row per order, one column per Sub-Category.
# Order/sub-category combinations that never occur come out as NaN and are
# replaced by 0.
basket_pivot = transaction_counts.pivot_table(
    values='Count',
    index='Order ID',
    columns='Sub-Category',
    aggfunc='sum',
)
basket = basket_pivot.fillna(0)

# collapse a count into a presence flag: any value >= 1 becomes 1, everything else 0
def encode_units(x):
    """Return 1 if ``x`` is at least 1, otherwise 0."""
    if x >= 1:
        return 1
    return 0

# Binarise the counts element-wise. Series.map is applied column by column
# because DataFrame.applymap is deprecated in recent pandas releases.
basket_sets = basket.apply(lambda column: column.map(encode_units))

# Notebook-native output: `st` (Streamlit) is never imported in this notebook,
# so the former st.subheader / st.caption / st.dataframe calls raised NameError.
print("Transaction Basket (One-Hot Encoded)")
print("Rows are Orders, Columns are Sub-Categories (1 = Present, 0 = Absent)")

# Last expression of the cell -> rendered as a table by the notebook
basket_sets.head()

In [None]:
# NOTE: `st` (Streamlit) is never imported in this notebook, so the original
# st.* widget calls raised NameError. The thresholds are plain variables
# initialised to the former slider defaults, which also keeps the result
# reproducible from code alone after "Restart Kernel -> Run All".
print("Apriori Algorithm: Interactive Rules")

# Tunable thresholds (former slider ranges noted for orientation)
min_support = 0.005     # Min Support (Frequency of itemset), 0.001 - 0.1
min_confidence = 0.7    # Min Confidence (Likelihood of Consequent), 0.1 - 1.0
min_lift = 1.2          # Min Lift (Strength of Rule), 0.5 - 5.0

# Optional filter on the consequent, e.g. "Chairs"; empty string = no filter
item_filter = ""

# generate Frequent Itemsets
# The 0/1 frame is cast to bool because mlxtend warns about non-boolean input.
frequent_itemsets = apriori(basket_sets.astype(bool), min_support=min_support, use_colnames=True)
frequent_itemsets['length'] = frequent_itemsets['itemsets'].apply(len)

# generate Association Rules
rule_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
if frequent_itemsets.empty:
    # association_rules rejects an empty itemset table, so fall back to an
    # empty result with the columns the rest of the cell expects.
    rules = pd.DataFrame(columns=rule_columns)
else:
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=min_lift)

# filter by confidence
rules = rules[rules['confidence'] >= min_confidence]

print("Discovered Association Rules")
print(f"Found **{len(rules)}** rules with the current parameters.")

# Keep the key metrics, strongest rules (highest lift) first
display_rules = rules[rule_columns].sort_values('lift', ascending=False)

# Convert frozensets to strings for display; sorting the items makes the text
# deterministic (frozenset iteration order is arbitrary).
display_rules['antecedents'] = display_rules['antecedents'].apply(lambda items: ', '.join(sorted(items)))
display_rules['consequents'] = display_rules['consequents'].apply(lambda items: ', '.join(sorted(items)))

# Case-insensitive substring filter on the consequent. regex=False treats the
# filter text literally, so characters such as "(" or "+" cannot raise a regex error.
if item_filter:
    consequent_matches = display_rules['consequents'].str.contains(
        item_filter, case=False, na=False, regex=False
    )
    display_rules = display_rules[consequent_matches]

# Last expression of the cell -> rendered as a table by the notebook
display_rules