In [None]:
import pandas as pd
from mlxtend.frequent_patterns import association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder

from fswe_demo.infra.db.get_conn import get_db_connection

In [None]:
# Open the database connection that the data-loading cell reads from.
# NOTE(review): nothing in this notebook closes `conn` — confirm whether the
# object returned by get_db_connection() needs explicit cleanup.
conn = get_db_connection()

In [None]:
# Get data from the db: read the whole "int_product_baskets" table into a DataFrame.
# NOTE(review): pd.read_sql_table expects a SQLAlchemy connectable — confirm that
# get_db_connection() returns one.

transaction_df = pd.read_sql_table("int_product_baskets", conn)

In [None]:
# Show the loaded table as the cell output for a quick visual check.
transaction_df

In [None]:
# Basic EDA: print the column names, dtypes, non-null counts and memory usage.
transaction_df.info()

In [None]:
# Collect the "product_basket" column into a plain Python list (one entry per
# row); this list is what gets fed to the TransactionEncoder.
product_baskets_list = transaction_df["product_basket"].tolist()

# Preview only the first few baskets: echoing the full list would write one
# entry per transaction into the notebook output.
product_baskets_list[:5]

In [None]:
# Encoding the transactions: learn the item vocabulary from the baskets, then
# turn every basket into one row of a one-hot matrix (one column per item).
te = TransactionEncoder()
te.fit(product_baskets_list)
encoded_matrix = te.transform(product_baskets_list)
encoded_transaction_df = pd.DataFrame(encoded_matrix, columns=te.columns_)

In [None]:
# Show the one-hot encoded basket matrix (rows = transactions, columns = items).
encoded_transaction_df

In [None]:
# Sanity-check the encoded matrix before mining it.
# pandas is already imported in the notebook's first cell, so it is not
# re-imported here.
df = encoded_transaction_df

print(df.shape)  # (n_transactions, n_items)
print(df.dtypes.unique())  # should be only bool (or 0/1 numeric)
print(df.sum().sum())  # total True/1s; should be > 0
df.head()  # last expression -> rich table display instead of a plain-text print

In [None]:
# Per-item support: the column means of the encoded matrix, highest first.
n_tx = len(df)  # number of rows, i.e. transactions
per_item_support = df.mean().sort_values(ascending=False)  # support = fraction of txns
print(per_item_support.head(10))
# Number of transactions that a 20% support threshold corresponds to.
# NOTE(review): 0.2 is far above the 0.0005-0.01 thresholds explored in the
# parameter-tweaking cell — confirm this value is intentional.
print("Support threshold count =", 0.2 * n_tx)

In [None]:
# Step 1: Parameter Tweaking

# Test different support thresholds to maximize item pairs while maintaining quality
support_candidates = [0.0005, 0.001, 0.002, 0.005, 0.01]
optimization_results = []

for min_support in support_candidates:
    print(f"Testing support threshold: {min_support:.4f}")

    # Generate frequent itemsets
    freq_itemsets = fpgrowth(
        encoded_transaction_df, min_support=min_support, use_colnames=True,
    )

    if freq_itemsets.empty:
        print("No frequent itemsets found")
        optimization_results.append(
            {
                "min_support": min_support,
                "total_itemsets": 0,
                "single_items": 0,
                "item_pairs": 0,
                "larger_sets": 0,
                "association_rules": 0,
                "coverage_ratio": 0,
            },
        )
        continue

    # Add length column
    freq_itemsets["length"] = freq_itemsets["itemsets"].apply(len)

    # Count itemsets by length
    length_counts = freq_itemsets["length"].value_counts().sort_index()
    single_items = int(length_counts.get(1, 0))
    item_pairs = int(length_counts.get(2, 0))
    # Sum every length >= 3 rather than a hard-coded range, so itemsets of any
    # size are counted.
    larger_sets = int(length_counts[length_counts.index >= 3].sum())

    # Try to generate association rules (only meaningful once pairs exist)
    rules_count = 0
    if item_pairs > 0:
        try:
            rules = association_rules(
                freq_itemsets, metric="confidence", min_threshold=0.1,
            )
            rules_count = len(rules)
        except Exception as exc:
            # Keep the sweep going, but surface the failure instead of silently
            # recording "0 rules" (a bare except would also hide API/signature
            # errors and swallow KeyboardInterrupt).
            print(f"Rule generation failed: {exc!r}")

    optimization_results.append(
        {
            "min_support": min_support,
            "total_itemsets": len(freq_itemsets),
            "single_items": single_items,
            "item_pairs": item_pairs,
            "larger_sets": larger_sets,
            "association_rules": rules_count,
            # pairs per single item (guarded against division by zero)
            "coverage_ratio": item_pairs / max(single_items, 1),
        },
    )

    print(
        f"{len(freq_itemsets)} itemsets, {item_pairs} pairs, {rules_count} rules",
    )

# Display optimization results
opt_df = pd.DataFrame(optimization_results)
print(opt_df.to_string(index=False))

# Select optimal parameters: the threshold that yields the most item pairs.
# idxmax returns the first row on ties, so with ascending candidates this picks
# the same row as "first threshold with any pairs", but it no longer depends on
# the order of `support_candidates`.
if opt_df["item_pairs"].gt(0).any():
    optimal_row = opt_df.loc[opt_df["item_pairs"].idxmax()]
else:
    # No threshold produced pairs: fall back to the first candidate.
    optimal_row = opt_df.iloc[0]
optimal_support = float(optimal_row["min_support"])

In [None]:
# Step 2: Generate Comprehensive Itemsets and Rules for Recommendations
# Use optimal support threshold
print(f"Using optimal support threshold: {optimal_support:.4f}")

# Generate frequent itemsets
frequent_itemsets = fpgrowth(
    encoded_transaction_df, min_support=optimal_support, use_colnames=True,
)
frequent_itemsets["length"] = frequent_itemsets["itemsets"].apply(len)

print(f"Generated {len(frequent_itemsets)} frequent itemsets")

# Analyze itemset distribution
length_distribution = frequent_itemsets["length"].value_counts().sort_index()
print("Itemset distribution:")
for length, count in length_distribution.items():
    print(f"   Length {length}: {count} itemsets")

# Generate association rules with multiple confidence thresholds and keep the
# rule set with the most rules.
confidence_thresholds = [0.1, 0.2, 0.3, 0.4, 0.5]
best_rules = None
best_threshold = None

print("Finding optimal confidence threshold:")
for conf_threshold in confidence_thresholds:
    try:
        rules = association_rules(
            frequent_itemsets, metric="confidence", min_threshold=conf_threshold,
        )
    except Exception as exc:
        # Report real failures as failures; a bare except used to label every
        # error "No rules generated" and also swallowed KeyboardInterrupt.
        print(
            f"   Confidence {conf_threshold:.1f}: rule generation failed ({exc!r})",
        )
        continue

    if len(rules) == 0:
        # Previously a threshold with zero rules printed nothing at all.
        print(f"   Confidence {conf_threshold:.1f}: No rules generated")
        continue

    print(
        f"   Confidence {conf_threshold:.1f}: {len(rules)} rules (avg lift: {rules['lift'].mean():.2f})",
    )
    if best_rules is None or len(rules) > len(best_rules):
        best_rules = rules
        best_threshold = conf_threshold

if best_rules is None:
    # Later cells treat best_rules=None as "no rules"; make that visible here.
    print("No association rules were generated at any confidence threshold")

In [None]:
# Step 3: Build Item-to-Item Recommendation Matrix


def build_item_recommendation_matrix(rules_df, itemsets_df, top_k=10, item_supports=None):
    """Build a per-item list of recommended items from rules and frequent pairs.

    Two sources are merged:

    1. Association rules: every antecedent item recommends every consequent
       item, carrying the rule's confidence, lift and support.
    2. Frequent 2-itemsets: each pair recommends in both directions, with
       confidence and lift computed from the pair and single-item supports.

    For each source item, duplicate targets are collapsed to the entry with the
    highest lift (first one wins on ties), entries are sorted by lift then
    confidence (both descending), and the list is cut to ``top_k``.

    Args:
        rules_df: DataFrame with ``antecedents``, ``consequents``,
            ``confidence``, ``lift`` and ``support`` columns. An empty
            DataFrame (with or without columns) disables source 1.
        itemsets_df: DataFrame with an ``itemsets`` column (iterables of items)
            and a ``support`` column. A ``length`` column is not required;
            itemset sizes are computed here.
        top_k: Maximum number of recommendations kept per item.
        item_supports: Optional mapping ``item -> support``. When omitted, the
            supports are taken from the 1-itemsets in ``itemsets_df``. An item
            with no known support is treated as support 0, which yields
            confidence/lift 0 for the affected direction.

    Returns:
        Tuple ``(recommendations, all_items)`` where ``recommendations`` maps
        each source item to a list of dicts with keys ``item``, ``confidence``,
        ``lift``, ``support`` and ``method``, and ``all_items`` is the sorted
        list of every item appearing in ``itemsets_df``.
    """
    # Get all unique items from frequent itemsets
    all_items = sorted({item for itemset in itemsets_df["itemsets"] for item in itemset})
    print(f"Total unique items in catalog: {len(all_items)}")

    # Itemset sizes are derived locally so the caller does not have to add a
    # "length" column first.
    lengths = itemsets_df["itemsets"].apply(len)

    if item_supports is None:
        # Single-item supports come from the 1-itemsets instead of a
        # module-level DataFrame, so the function only depends on its arguments.
        singles = itemsets_df[lengths == 1]
        item_supports = {
            next(iter(itemset)): support
            for itemset, support in zip(singles["itemsets"], singles["support"])
        }

    recommendations = {}

    # Method 1: Use association rules (antecedent -> consequent)
    # NOTE(review): for rules with several antecedent or consequent items, the
    # rule-level confidence/lift is attributed to each single item pair as-is —
    # confirm this approximation is intended.
    if len(rules_df) > 0:
        rule_columns = zip(
            rules_df["antecedents"],
            rules_df["consequents"],
            rules_df["confidence"],
            rules_df["lift"],
            rules_df["support"],
        )
        for antecedents, consequents, confidence, lift, support in rule_columns:
            for ant_item in antecedents:
                bucket = recommendations.setdefault(ant_item, [])
                for cons_item in consequents:
                    bucket.append(
                        {
                            "item": cons_item,
                            "confidence": confidence,
                            "lift": lift,
                            "support": support,
                            "method": "association_rule",
                        },
                    )

    # Method 2: Use frequent item pairs (co-occurrence based)
    item_pairs = itemsets_df[lengths == 2]
    print(f"Processing {len(item_pairs)} frequent item pairs")

    for pair, pair_support in zip(item_pairs["itemsets"], item_pairs["support"]):
        item1, item2 = pair
        item1_support = item_supports.get(item1, 0)
        item2_support = item_supports.get(item2, 0)

        # Confidence and lift for both directions; zero support short-circuits
        # to 0 to avoid division by zero.
        confidence_1_to_2 = pair_support / item1_support if item1_support > 0 else 0
        confidence_2_to_1 = pair_support / item2_support if item2_support > 0 else 0
        lift_1_to_2 = confidence_1_to_2 / item2_support if item2_support > 0 else 0
        lift_2_to_1 = confidence_2_to_1 / item1_support if item1_support > 0 else 0

        # Add bidirectional recommendations
        for source_item, target_item, conf, lift_val in (
            (item1, item2, confidence_1_to_2, lift_1_to_2),
            (item2, item1, confidence_2_to_1, lift_2_to_1),
        ):
            recommendations.setdefault(source_item, []).append(
                {
                    "item": target_item,
                    "confidence": conf,
                    "lift": lift_val,
                    "support": pair_support,
                    "method": "frequent_pair",
                },
            )

    # Sort and limit recommendations for each item
    final_recommendations = {}
    for item, recs in recommendations.items():
        # Keep one entry per target: the one with the strictly highest lift.
        best_by_target = {}
        for rec in recs:
            current = best_by_target.get(rec["item"])
            if current is None or rec["lift"] > current["lift"]:
                best_by_target[rec["item"]] = rec

        # Sort by lift (descending) then confidence (descending)
        ranked = sorted(
            best_by_target.values(),
            key=lambda rec: (rec["lift"], rec["confidence"]),
            reverse=True,
        )

        # Keep top K recommendations
        final_recommendations[item] = ranked[:top_k]

    return final_recommendations, all_items


# Build the recommendation matrix; an empty DataFrame stands in for the rules
# when the previous step found none.
rules_for_matrix = pd.DataFrame() if best_rules is None else best_rules
item_recommendations, catalog_items = build_item_recommendation_matrix(
    rules_for_matrix,
    frequent_itemsets,
    top_k=10,
)

# Summarise how many items received recommendations and how many in total.
n_items_with_recs = len(item_recommendations)
print(f"Items with recommendations: {n_items_with_recs}")
total_recommendations = sum(map(len, item_recommendations.values()))
print(f"Total recommendation pairs: {total_recommendations}")
if n_items_with_recs > 0:
    avg_recs_per_item = total_recommendations / n_items_with_recs
    print(f"Average recommendations per item: {avg_recs_per_item:.1f}")

In [None]:
class ItemRecommendationEngine:
    """Serve item-to-item recommendations from a precomputed matrix.

    The matrix maps an item id to a list of recommendation dicts, each with at
    least ``item``, ``confidence`` and ``lift`` keys. Items missing from the
    matrix are answered from ``fallback_items``.
    """

    def __init__(self, recommendation_matrix, fallback_items=None, total_items=None):
        """
        Args:
            recommendation_matrix: Mapping ``item_id -> list of recommendation dicts``.
            fallback_items: Item ids returned for items absent from the matrix.
            total_items: Size of the full item catalog, used as the denominator
                of ``coverage_percentage``. Defaults to the number of items in
                the matrix, in which case coverage is 100% for any non-empty
                matrix.
        """
        self.recommendation_matrix = recommendation_matrix
        self.fallback_items = fallback_items or []
        self.total_items = (
            len(recommendation_matrix) if total_items is None else total_items
        )

    def get_recommendations(
        self, item_id, num_recommendations=5, min_lift=1.0, min_confidence=0.1,
    ):
        """
        Get recommendations for a specific item

        Args:
            item_id: Product ID to get recommendations for
            num_recommendations: Number of recommendations to return
            min_lift: Minimum lift threshold for recommendations
            min_confidence: Minimum confidence threshold

        Returns:
            Dict describing the result. For items in the matrix,
            ``recommendations`` is a list of recommendation dicts (possibly
            empty after filtering) and ``method`` is ``"fpgrowth"``. For
            unknown items, ``recommendations`` is a list of fallback item ids
            and ``method`` is ``"fallback"``.

        """
        if item_id not in self.recommendation_matrix:
            # Never recommend the queried item to itself: a popular item that
            # has no entry in the matrix would otherwise appear in its own
            # fallback list.
            fallback = [item for item in self.fallback_items if item != item_id]
            return {
                "item_id": item_id,
                "recommendations": fallback[:num_recommendations],
                "method": "fallback",
                "message": "No specific recommendations found, using popular items",
            }

        # Filter recommendations by quality thresholds
        candidates = self.recommendation_matrix[item_id]
        filtered_recs = [
            rec
            for rec in candidates
            if rec["lift"] >= min_lift and rec["confidence"] >= min_confidence
        ]

        # Limit to requested number
        final_recs = filtered_recs[:num_recommendations]

        return {
            "item_id": item_id,
            "recommendations": final_recs,
            "method": "fpgrowth",
            "total_candidates": len(candidates),
            "after_filtering": len(filtered_recs),
            "returned": len(final_recs),
        }

    def get_batch_recommendations(
        self, item_ids, num_recommendations=5, min_lift=1.0, min_confidence=0.1,
    ):
        """Get recommendations for multiple items at once.

        The thresholds are forwarded to ``get_recommendations``; their defaults
        match that method's defaults.

        Returns:
            Dict mapping each item id to its ``get_recommendations`` result.
        """
        return {
            item_id: self.get_recommendations(
                item_id,
                num_recommendations,
                min_lift=min_lift,
                min_confidence=min_confidence,
            )
            for item_id in item_ids
        }

    def get_similar_items(self, item_id, similarity_threshold=2.0):
        """Get items similar to given item (high lift values).

        Returns the item's recommendations whose lift is at least
        ``similarity_threshold``, sorted by lift descending; an empty list for
        unknown items.
        """
        if item_id not in self.recommendation_matrix:
            return []

        similar_items = [
            rec
            for rec in self.recommendation_matrix[item_id]
            if rec["lift"] >= similarity_threshold
        ]

        return sorted(similar_items, key=lambda x: x["lift"], reverse=True)

    def get_stats(self):
        """Get engine statistics.

        Returns:
            Dict with ``total_items_with_recs``, ``total_recommendation_pairs``,
            ``average_recs_per_item`` and ``coverage_percentage``. For an empty
            matrix all four are 0 and the legacy keys ``total_items`` and
            ``coverage`` are also included.
        """
        if not self.recommendation_matrix:
            return {
                "total_items_with_recs": 0,
                "total_recommendation_pairs": 0,
                "average_recs_per_item": 0,
                "coverage_percentage": 0,
                # Legacy keys of the former empty-matrix result, kept so
                # existing readers of them keep working.
                "total_items": 0,
                "coverage": 0,
            }

        items_with_recs = len(self.recommendation_matrix)
        total_recs = sum(len(recs) for recs in self.recommendation_matrix.values())
        avg_recs = total_recs / items_with_recs

        return {
            "total_items_with_recs": items_with_recs,
            "total_recommendation_pairs": total_recs,
            "average_recs_per_item": round(avg_recs, 2),
            "coverage_percentage": round(
                items_with_recs / max(self.total_items, 1) * 100, 1,
            ),
        }


# Create fallback recommendations (most popular items): the 20 items that
# appear in the most transactions.
item_purchase_counts = encoded_transaction_df.sum()
popular_items = (
    item_purchase_counts.sort_values(ascending=False).head(20).index.tolist()
)

# Initialize the recommendation engine
rec_engine = ItemRecommendationEngine(
    recommendation_matrix=item_recommendations,
    fallback_items=popular_items,
)

# Print each statistic with a human-readable label.
stats = rec_engine.get_stats()
for stat_name, stat_value in stats.items():
    label = stat_name.replace("_", " ").title()
    print(f"{label}: {stat_value}")

# Test the recommendation engine on up to three items that have recommendations
test_items = list(item_recommendations)[:3] if item_recommendations else []

for test_item in test_items:
    print(f"\nRecommendations for {test_item}")
    result = rec_engine.get_recommendations(test_item, num_recommendations=5)

    print(f"Method: {result['method']}")
    print(f"Candidates: {result.get('total_candidates', 0)}")

    for rank, rec in enumerate(result["recommendations"], start=1):
        if not isinstance(rec, dict):
            # Fallback results are bare item ids rather than scored dicts.
            print(f"{rank}. {rec} (fallback)")
            continue
        print(f"{rank}. {rec['item']}")
        print(f"Confidence: {rec['confidence']:.3f}, Lift: {rec['lift']:.2f}")

In [None]:
# Step 5: Recommendation Analysis and Visualization

import matplotlib.pyplot as plt
import numpy as np


# Analyze recommendation quality distribution
def analyze_recommendation_quality(rec_matrix):
    """Flatten a recommendation matrix into per-metric value lists.

    Args:
        rec_matrix: Mapping of item -> list of recommendation dicts, each with
            ``confidence``, ``lift`` and ``support`` keys.

    Returns:
        Dict with the lists ``confidences``, ``lifts`` and ``supports`` (one
        value per recommendation, in matrix iteration order) and
        ``total_pairs``, the number of recommendations.
    """
    flat_recs = [rec for recs in rec_matrix.values() for rec in recs]

    confidences = [rec["confidence"] for rec in flat_recs]
    lifts = [rec["lift"] for rec in flat_recs]
    supports = [rec["support"] for rec in flat_recs]

    return {
        "confidences": confidences,
        "lifts": lifts,
        "supports": supports,
        "total_pairs": len(confidences),
    }


# Flatten every recommendation's metrics, then print summary statistics.
quality_metrics = analyze_recommendation_quality(item_recommendations)
n_pairs = quality_metrics["total_pairs"]

print(f"Total recommendation pairs: {n_pairs}")
if n_pairs > 0:
    mean_confidence = np.mean(quality_metrics["confidences"])
    mean_lift = np.mean(quality_metrics["lifts"])
    mean_support = np.mean(quality_metrics["supports"])
    high_lift_count = len([lift for lift in quality_metrics["lifts"] if lift > 2])

    print(f"Average confidence: {mean_confidence:.3f}")
    print(f"Average lift: {mean_lift:.3f}")
    print(f"Average support: {mean_support:.4f}")
    print(
        f"High-quality pairs (lift > 2): {high_lift_count}",
    )

# Create comprehensive visualizations: a 2x3 grid with three metric histograms,
# a confidence/lift scatter, and two per-item recommendation-count charts.
# Titles use a real newline ("\n"); the previous "\\n" rendered a literal
# backslash-n in the figure instead of wrapping the title.
if quality_metrics["total_pairs"] > 0:
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))
    fig.suptitle(
        "FP Growth Item-to-Item Recommendation Analysis", fontsize=16, fontweight="bold",
    )

    # 1. Confidence distribution
    axes[0, 0].hist(
        quality_metrics["confidences"], bins=20, alpha=0.7, edgecolor="black",
    )
    axes[0, 0].set_xlabel("Confidence")
    axes[0, 0].set_ylabel("Frequency")
    axes[0, 0].set_title("Distribution of Recommendation\nConfidence Values")
    axes[0, 0].axvline(
        np.mean(quality_metrics["confidences"]),
        color="red",
        linestyle="--",
        label="Mean",
    )
    axes[0, 0].legend()

    # 2. Lift distribution (lift = 1 marks statistical independence)
    axes[0, 1].hist(quality_metrics["lifts"], bins=20, alpha=0.7, edgecolor="black")
    axes[0, 1].set_xlabel("Lift")
    axes[0, 1].set_ylabel("Frequency")
    axes[0, 1].set_title("Distribution of Recommendation\nLift Values")
    axes[0, 1].axvline(1, color="red", linestyle="--", label="Lift = 1")
    axes[0, 1].axvline(
        np.mean(quality_metrics["lifts"]), color="orange", linestyle="--", label="Mean",
    )
    axes[0, 1].legend()

    # 3. Support distribution
    axes[0, 2].hist(quality_metrics["supports"], bins=20, alpha=0.7, edgecolor="black")
    axes[0, 2].set_xlabel("Support")
    axes[0, 2].set_ylabel("Frequency")
    axes[0, 2].set_title("Distribution of Recommendation\nSupport Values")
    axes[0, 2].axvline(
        np.mean(quality_metrics["supports"]), color="red", linestyle="--", label="Mean",
    )
    axes[0, 2].legend()

    # 4. Confidence vs Lift scatter
    axes[1, 0].scatter(
        quality_metrics["confidences"],
        quality_metrics["lifts"],
        c=quality_metrics["supports"],
        alpha=0.6,
        cmap="viridis",
    )
    axes[1, 0].set_xlabel("Confidence")
    axes[1, 0].set_ylabel("Lift")
    axes[1, 0].set_title("Confidence vs Lift\n(colored by Support)")
    axes[1, 0].axhline(1, color="red", linestyle="--", alpha=0.5)

    # 5. Recommendations per item distribution
    recs_per_item = [len(recs) for recs in item_recommendations.values()]
    axes[1, 1].hist(
        recs_per_item,
        bins=min(20, max(recs_per_item) if recs_per_item else 1),
        alpha=0.7,
        edgecolor="black",
    )
    axes[1, 1].set_xlabel("Number of Recommendations")
    axes[1, 1].set_ylabel("Number of Items")
    axes[1, 1].set_title("Distribution of Recommendations\nper Item")

    # 6. Top items by recommendation count
    item_rec_counts = {item: len(recs) for item, recs in item_recommendations.items()}
    top_items = sorted(item_rec_counts.items(), key=lambda x: x[1], reverse=True)[:10]

    if top_items:
        items, counts = zip(*top_items, strict=False)
        y_pos = np.arange(len(items))
        axes[1, 2].barh(y_pos, counts)
        axes[1, 2].set_yticks(y_pos)
        # Convert ids to str before truncating so non-string item ids (e.g.
        # integers) do not break the slicing/len calls.
        axes[1, 2].set_yticklabels(
            [
                label[:10] + "..." if len(label) > 10 else label
                for label in map(str, items)
            ],
        )
        axes[1, 2].set_xlabel("Number of Recommendations")
        axes[1, 2].set_title("Top 10 Items by\nRecommendation Count")
        axes[1, 2].invert_yaxis()

    plt.tight_layout()
    plt.show()
else:
    print("No recommendations to visualize")

# Coverage analysis: share of catalog items that have at least one recommendation.
# NOTE(review): `catalog_items` holds only items that appear in the frequent
# itemsets, not every column of the encoded matrix — confirm this is the
# intended denominator.
print("COVERAGE ANALYSIS:")
total_catalog_items = len(catalog_items)
items_with_recs = len(item_recommendations)
coverage_percentage = (
    (items_with_recs / total_catalog_items * 100) if total_catalog_items > 0 else 0
)

print(f"Total catalog items: {total_catalog_items}")
print(f"Items with recommendations: {items_with_recs}")
print(f"Coverage: {coverage_percentage:.1f}%")

# Identify items without recommendations (cold start problem)
items_without_recs = set(catalog_items) - set(item_recommendations.keys())
print(f"   Items without recommendations: {len(items_without_recs)}")

# Sort before printing: set iteration order is arbitrary, so the unsorted
# listing (and especially the 10-item sample) changed from run to run.
# key=str keeps the sort valid even if item ids are of mixed types.
items_needing_fallback = sorted(items_without_recs, key=str)
if 0 < len(items_needing_fallback) <= 10:
    print(f"Items needing fallback: {items_needing_fallback}")
elif len(items_needing_fallback) > 10:
    print(f"Sample items needing fallback: {items_needing_fallback[:10]}...")