In [None]:
# Бернуллийн Наив Байесын загварын хэрэглээ: Хэрэглэгчийн худалдан авах төлөвийг урьдчилан таамаглах нь

In [None]:
%pip install "kagglehub[pandas-datasets]"

In [39]:
# ============================================
# 1. Load dataset (raw text, safe)
# ============================================

import kagglehub
import pandas as pd
import numpy as np
from itertools import chain

# Download dataset directory
path = kagglehub.dataset_download("irfanasrullah/groceries")

# Load using raw text (avoids CSV parser errors)
transactions = []
with open(path + "/groceries.csv", "r") as f:
    for line in f:
        transactions.append(line.strip().split(","))

df = pd.DataFrame({"items": transactions})
df.head()


Unnamed: 0,items
0,"[citrus fruit, semi-finished bread, margarine,..."
1,"[tropical fruit, yogurt, coffee]"
2,[whole milk]
3,"[pip fruit, yogurt, cream cheese, meat spreads]"
4,"[other vegetables, whole milk, condensed milk,..."


In [40]:
# ============================================
# 2. Build vocabulary (unique items)
# ============================================

vocabulary = sorted(set(chain.from_iterable(df["items"])))
vocab_index = {item: i for i, item in enumerate(vocabulary)}

len(vocabulary), vocabulary[:10]

(169,
 ['Instant food products',
  'UHT-milk',
  'abrasive cleaner',
  'artif. sweetener',
  'baby cosmetics',
  'baby food',
  'bags',
  'baking powder',
  'bathroom cleaner',
  'beef'])

In [41]:
# ============================================
# 3. Convert transactions into Bernoulli matrix (X_df)
# ============================================

def one_hot_encode(items):
    row = np.zeros(len(vocabulary), dtype=int)
    for item in items:
        if item in vocab_index:
            row[vocab_index[item]] = 1
    return row

X = np.array([one_hot_encode(items) for items in df["items"]])
X_df = pd.DataFrame(X, columns=vocabulary)

X_df.head()


Unnamed: 0,Instant food products,UHT-milk,abrasive cleaner,artif. sweetener,baby cosmetics,baby food,bags,baking powder,bathroom cleaner,beef,...,turkey,vinegar,waffles,whipped/sour cream,whisky,white bread,white wine,whole milk,yogurt,zwieback
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [42]:
# ============================================
# 4. Train a Bernoulli Naive Bayes model for one item
# ============================================

from sklearn.naive_bayes import BernoulliNB

def train_model_for_item(target_item):
    y = X_df[target_item]                 # binary label
    model = BernoulliNB()
    model.fit(X_df, y)
    return model

In [43]:
# ============================================
# 5. Convert basket → model input vector (with feature names)
# ============================================

def basket_to_vector(basket_items):
    row = np.zeros(len(vocabulary), dtype=int)
    for item in basket_items:
        if item in vocab_index:
            row[vocab_index[item]] = 1
    return pd.DataFrame([row], columns=vocabulary)


In [44]:
# ============================================
# 6. Predict probability for a specific item
# ============================================

def predict_item_probability(basket_items, target_item):
    model = train_model_for_item(target_item)
    vector_df = basket_to_vector(basket_items)
    prob = model.predict_proba(vector_df)[0][1]   # probability of presence
    return prob

# Example
predict_item_probability(["whole milk", "yogurt"], "other vegetables")

2.6235988872054888e-05

In [48]:
# ============================================
# 7. Predict probabilities for ALL items
# ============================================

def predict_all_probabilities(basket_items):
    vector_df = basket_to_vector(basket_items)
    results = []
    
    for item in vocabulary:
        y = X_df[item]
        model = BernoulliNB()
        model.fit(X_df, y)
        prob = model.predict_proba(vector_df)[0][1]
        results.append((item, prob))
    
    # Sort by probability
    return sorted(results, key=lambda x: x[1], reverse=True)

In [49]:
# ============================================
# 8. Next-item Recommendation (FIXED)
#    - Removes items already in basket
# ============================================

def recommend_next_items(basket_items, top_n=10):
    all_probs = predict_all_probabilities(basket_items)

    # Remove items already purchased
    filtered = [(item, prob) for item, prob in all_probs if item not in basket_items]
    
    return filtered[:top_n]

# Example:
recommend_next_items(["whole milk", "yogurt"], top_n=10)

[('rolls/buns', 8.634629253134407e-05),
 ('canned beer', 7.43601756168794e-05),
 ('newspapers', 6.833871619946316e-05),
 ('bottled beer', 6.488521456883784e-05),
 ('beverages', 6.147490295906677e-05),
 ('photo/film', 5.995996821312183e-05),
 ('bottled water', 5.6532264872071996e-05),
 ('soda', 4.925262993790484e-05),
 ('pastry', 4.397669579555303e-05),
 ('brown bread', 3.9248359553000384e-05)]