In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


In [4]:
file_path = "/home/alnd/code/Alanoudis/food-delivery-rec/data/updated_data/full_data100k.csv"
full_data = pd.read_csv(file_path,index_col=0)
full_data.head()

Unnamed: 0,customer_id,customer_geohash,order_id,vendor_id,product_id,day_of_week,order_time,order_day,name,unit_price,chain_id,vendor_geohash,cuisine_origin,order_frequency,product_rating
0,1ba124d4e5,w21z7,0,212753d2,783e85338f1c,0,12:03:29,85 days,japanese garlic karaage don,6.0,66c9978d,w21z7,japanese,1,4
1,1ba124d4e5,w21z7,0,212753d2,084ab73246e6,0,12:03:29,85 days,chicken cutlet don,6.8,66c9978d,w21z7,japanese,1,5
2,1ba124d4e5,w21z7,0,212753d2,30eba3cc2676,0,12:03:29,85 days,beef sukiyaki don,6.8,66c9978d,w21z7,japanese,1,3
3,1ba124d4e5,w21z7,0,212753d2,3910309eea60,0,12:03:29,85 days,japanese beef yakiniku don,6.8,66c9978d,w21z7,japanese,1,5
4,1ba124d4e5,w21z7,0,212753d2,20049fb602cb,0,12:03:29,85 days,teriyaki salmon don,8.0,66c9978d,w21z7,japanese,1,5


In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

class AutomatedRecommendationSystem:
    def __init__(self, data):
        self.full_data = data
        self.model = None
        self.interaction_matrix = None
        self.vendor_similarity = None
        self.content_matrix = None
        self.reverse_user_map = None
        self.vendor_map = None
        self.train_data = None
        self.test_data = None

    def prepare_data(self, data=None):
        """Prepare and preprocess the data"""
        if data is None:
            data = self.full_data

        print("Preparing data...")
        df = data[['customer_id', 'vendor_id', 'order_frequency', 'product_rating']].copy()
        df['score'] = df['order_frequency'] * df['product_rating']

        # Encode users and vendors
        df['user_code'] = df['customer_id'].astype('category').cat.codes
        df['vendor_code'] = df['vendor_id'].astype('category').cat.codes

        # Build interaction matrix
        interaction_matrix = coo_matrix(
            (df['score'], (df['user_code'], df['vendor_code']))
        ).T.tocsr()

        # Build lookup tables
        user_map = dict(enumerate(df['customer_id'].astype('category').cat.categories))
        vendor_map = dict(enumerate(df['vendor_id'].astype('category').cat.categories))
        reverse_user_map = {v: k for k, v in user_map.items()}

        print(f"Data prepared: {len(user_map)} users, {len(vendor_map)} vendors")
        return df, interaction_matrix, user_map, vendor_map, reverse_user_map

    def build_content_features(self, data=None):
        """Build vendor content-based features"""
        if data is None:
            data = self.full_data

        print("Building content features...")
        vendor_features = data.groupby('vendor_id').agg({
            'cuisine_origin': 'first',
            'unit_price': 'mean',
            'product_rating': 'mean'
        }).reset_index()

        # One-hot encode cuisine
        cuisine_encoded = pd.get_dummies(vendor_features['cuisine_origin'])

        # Normalize numerical features
        vendor_features['unit_price_norm'] = (
            vendor_features['unit_price'] - vendor_features['unit_price'].min()
        ) / (vendor_features['unit_price'].max() - vendor_features['unit_price'].min())

        vendor_features['product_rating_norm'] = (
            vendor_features['product_rating'] - vendor_features['product_rating'].min()
        ) / (vendor_features['product_rating'].max() - vendor_features['product_rating'].min())

        # Combine all features
        content_matrix = pd.concat([
            cuisine_encoded,
            vendor_features[['unit_price_norm', 'product_rating_norm']]
        ], axis=1)
        content_matrix.index = vendor_features['vendor_id']

        # Compute vendor similarity
        vendor_similarity = pd.DataFrame(
            cosine_similarity(content_matrix),
            index=content_matrix.index,
            columns=content_matrix.index
        )
        print("Content features built successfully")
        return content_matrix, vendor_similarity

    def train_als_model(self, interaction_matrix, factors=50, regularization=0.1, iterations=30):
        """Train the ALS model"""
        print("Training ALS model...")
        model = AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            random_state=42
        )
        model.fit(interaction_matrix.T)
        print("ALS model trained successfully")
        return model

    def split_train_test_data(self, test_ratio=0.2):
        """Split data into train and test sets for evaluation"""
        print("Splitting data into train and test sets...")

        def split_user_data(df, test_ratio=0.2):
            train_list, test_list = [], []
            for user_id, user_df in df.groupby('customer_id'):
                if len(user_df) < 2:  # Reduced minimum for more users in test
                    continue
                train, test = train_test_split(user_df, test_size=test_ratio, random_state=42)
                train_list.append(train)
                test_list.append(test)
            return pd.concat(train_list), pd.concat(test_list)

        self.train_data, self.test_data = split_user_data(self.full_data, test_ratio)
        print(f"Train data: {len(self.train_data)} records, {self.train_data['customer_id'].nunique()} users")
        print(f"Test data: {len(self.test_data)} records, {self.test_data['customer_id'].nunique()} users")
        return self.train_data, self.test_data

    # ==================== EVALUATION METRICS ====================

    def precision_at_k(self, recommended, actual, k=10):
        """Calculate Precision@K"""
        if len(recommended) == 0:
            return 0.0
        recommended_k = recommended[:k]
        relevant = [1 if item in actual else 0 for item in recommended_k]
        return sum(relevant) / len(recommended_k)

    def ndcg_at_k(self, recommended, actual, k=10):
        """Calculate NDCG@K"""
        if len(recommended) == 0:
            return 0.0

        recommended_k = recommended[:k]
        relevance = [1 if item in actual else 0 for item in recommended_k]

        # Calculate DCG
        dcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(relevance)])

        # Calculate IDCG
        ideal_relevance = sorted(relevance, reverse=True)
        idcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(ideal_relevance)])

        return dcg / idcg if idcg > 0 else 0.0

    def mrr_at_k(self, recommended, actual, k=10):
        """Calculate MRR@K"""
        for idx, item in enumerate(recommended[:k]):
            if item in actual:
                return 1.0 / (idx + 1)
        return 0.0

    def evaluate_metrics(self, N=10):
        """Evaluate all metrics on test data"""
        print(f"\n📊 EVALUATING MODEL PERFORMANCE (N={N})")
        print("=" * 50)

        if self.train_data is None or self.test_data is None:
            print("Splitting data first...")
            self.split_train_test_data()

        # Prepare training data for evaluation model
        print("Training evaluation model on training data...")
        df_train, interaction_matrix_train, user_map_train, vendor_map_train, reverse_user_map_train = self.prepare_data(self.train_data)

        # Train ALS model on training data
        eval_model = self.train_als_model(interaction_matrix_train)

        # Build content features on training data
        content_matrix_train, vendor_similarity_train = self.build_content_features(self.train_data)

        scores = []
        test_users = self.test_data['customer_id'].unique()

        print(f"Evaluating on {len(test_users)} test users...")

        processed_count = 0
        for user_id in test_users:
            actual = self.test_data[self.test_data['customer_id'] == user_id]['vendor_id'].unique().tolist()

            try:
                # Skip if user not in training data
                if user_id not in reverse_user_map_train:
                    continue

                # Get user index
                user_idx = reverse_user_map_train[user_id]
                user_items = interaction_matrix_train.T.tocsr()

                # Get ALS recommendations
                recommended = eval_model.recommend(user_idx, user_items[user_idx], N=N)
                recommended_vendors = [vendor_map_train[int(i[0])] for i in recommended]

                # Calculate metrics
                prec = self.precision_at_k(recommended_vendors, actual, k=N)
                ndcg = self.ndcg_at_k(recommended_vendors, actual, k=N)
                mrr = self.mrr_at_k(recommended_vendors, actual, k=N)

                scores.append((prec, ndcg, mrr))
                processed_count += 1

                if processed_count % 100 == 0:
                    print(f"Processed {processed_count}/{len(test_users)} users...")

            except Exception as e:
                continue

        if not scores:
            print("❌ No valid scores computed! Check your data.")
            return None

        scores = np.array(scores)

        results = {
            'Precision@10': scores[:, 0].mean(),
            'NDCG@10': scores[:, 1].mean(),
            'MRR@10': scores[:, 2].mean(),
            'Users_Evaluated': len(scores)
        }

        # Print results
        print("\n📈 EVALUATION RESULTS:")
        print("-" * 40)
        for metric, value in results.items():
            if metric == 'Users_Evaluated':
                print(f"{metric}: {value}")
            else:
                print(f"{metric}: {value:.4f}")

        return results

    def get_customer_order_history(self, customer_id):
        """Get detailed order history for a customer"""
        customer_orders = self.full_data[self.full_data['customer_id'] == customer_id]

        if customer_orders.empty:
            return f"No order history found for customer: {customer_id}"

        # Aggregate order history
        order_summary = customer_orders.groupby('vendor_id').agg({
            'name': 'first',
            'cuisine_origin': 'first',
            'order_frequency': 'sum',
            'product_rating': 'mean',
            'unit_price': 'mean'
        }).reset_index()

        return order_summary

    def get_customer_taste_profile(self, customer_id):
        """Analyze customer taste preferences"""
        customer_orders = self.full_data[self.full_data['customer_id'] == customer_id]

        if customer_orders.empty:
            return "No taste profile available (new customer)"

        taste_profile = {
            'total_orders': len(customer_orders),
            'unique_vendors': customer_orders['vendor_id'].nunique(),
            'preferred_cuisines': customer_orders['cuisine_origin'].value_counts().head(3).to_dict(),
            'avg_rating_given': customer_orders['product_rating'].mean(),
            'avg_spending': customer_orders['unit_price'].mean(),
            'favorite_vendors': customer_orders.groupby('vendor_id')['order_frequency']
                                                .sum().sort_values(ascending=False).head(3).to_dict()
        }

        return taste_profile

    def recommend_vendors(self, customer_id, N=10):
        """Get ALS-based recommendations"""
        if customer_id not in self.reverse_user_map:
            return "Cold-start user. Recommend popular restaurants."

        user_idx = self.reverse_user_map[customer_id]
        user_items = self.interaction_matrix.T.tocsr()

        recommended = self.model.recommend(user_idx, user_items[user_idx], N=N)
        recommended_vendors = [self.vendor_map[int(i[0])] for i in recommended]

        return recommended_vendors

    def hybrid_recommend(self, customer_id, N=10, als_weight=0.5, content_weight=0.5):
        """Get hybrid recommendations combining ALS and content-based"""
        # Get user's ordered vendors from full data
        user_vendors = self.full_data[self.full_data['customer_id'] == customer_id]['vendor_id'].unique()

        if len(user_vendors) == 0:
            return "Cold-start user. Recommend popular restaurants."

        # Content-based similarity scores
        content_scores = self.vendor_similarity[user_vendors].mean(axis=1)
        content_scores = content_scores.drop(user_vendors, errors='ignore')

        # ALS recommendations
        als_recs = self.recommend_vendors(customer_id, N=100)

        if isinstance(als_recs, str):
            return als_recs

        als_scores = pd.Series([1 / (i + 1) for i in range(len(als_recs))], index=als_recs)

        # Combine scores
        hybrid_scores = pd.concat([als_scores, content_scores], axis=1).fillna(0)
        hybrid_scores.columns = ['als', 'content']
        hybrid_scores['hybrid'] = (als_weight * hybrid_scores['als'] +
                                 content_weight * hybrid_scores['content'])

        # Return top N
        top_hybrid = hybrid_scores['hybrid'].sort_values(ascending=False).head(N).index.tolist()
        return top_hybrid

    def get_recommendation_details(self, vendor_ids):
        """Get detailed information about recommended vendors"""
        if isinstance(vendor_ids, str):
            return vendor_ids

        vendor_details = self.full_data[self.full_data['vendor_id'].isin(vendor_ids)]

        summary = vendor_details.groupby('vendor_id').agg({
            'name': 'first',
            'cuisine_origin': 'first',
            'unit_price': 'mean',
            'product_rating': 'mean',
            'order_frequency': 'mean'
        }).reset_index()

        return summary

    def automated_customer_report(self, customer_id, N=10):
        """Generate complete automated report for a customer"""
        print(f"🔍 CUSTOMER ANALYSIS REPORT: {customer_id}")
        print("=" * 60)

        # 1. Order History
        print("\n📊 ORDER HISTORY:")
        print("-" * 30)
        order_history = self.get_customer_order_history(customer_id)
        if isinstance(order_history, pd.DataFrame):
            print(order_history.to_string(index=False))
        else:
            print(order_history)

        # 2. Taste Profile
        print("\n👤 TASTE PROFILE:")
        print("-" * 30)
        taste_profile = self.get_customer_taste_profile(customer_id)
        if isinstance(taste_profile, dict):
            for key, value in taste_profile.items():
                print(f"{key.replace('_', ' ').title()}: {value}")
        else:
            print(taste_profile)

        # 3. Recommendations
        print(f"\n🎯 HYBRID RECOMMENDATIONS (Top {N}):")
        print("-" * 40)
        recommendations = self.hybrid_recommend(customer_id, N=N)
        rec_details = self.get_recommendation_details(recommendations)

        if isinstance(rec_details, pd.DataFrame):
            print(rec_details.to_string(index=False))
        else:
            print(rec_details)

        return {
            'customer_id': customer_id,
            'order_history': order_history,
            'taste_profile': taste_profile,
            'recommendations': rec_details
        }

# ==================== FIXED MAIN FUNCTION ====================

def main():
    # Assuming full_data is your dataset
    print("🚀 INITIALIZING AUTOMATED RECOMMENDATION SYSTEM")
    print("=" * 60)

    # Initialize the system
    recommender = AutomatedRecommendationSystem(full_data)

    # Step 1: Prepare data and train main model
    print("\n1. 🔄 TRAINING MAIN MODEL...")
    df, interaction_matrix, user_map, vendor_map, reverse_user_map = recommender.prepare_data()
    recommender.interaction_matrix = interaction_matrix
    recommender.vendor_map = vendor_map
    recommender.reverse_user_map = reverse_user_map

    content_matrix, vendor_similarity = recommender.build_content_features()
    recommender.content_matrix = content_matrix
    recommender.vendor_similarity = vendor_similarity

    recommender.model = recommender.train_als_model(interaction_matrix)

    # Step 2: Evaluate model performance
    print("\n2. 📊 EVALUATING MODEL PERFORMANCE...")
    evaluation_results = recommender.evaluate_metrics(N=10)

    # Step 3: Generate customer reports
    print("\n3. 👥 GENERATING CUSTOMER REPORTS...")

    # Test with specific customers or random ones
    test_customers = ["2e7276ad3a", "f374c8c54c"]  # Replace with actual customer IDs

    reports = {}
    for customer_id in test_customers:
        try:
            print(f"\n{'='*80}")
            report = recommender.automated_customer_report(customer_id, N=10)
            reports[customer_id] = report
            print(f"✅ Report completed for {customer_id}")
        except Exception as e:
            print(f"❌ Error processing {customer_id}: {str(e)}")

    print("\n🎉 ALL TASKS COMPLETED!")
    return recommender, reports, evaluation_results


In [None]:
# ==================== COMPLETE RUNNABLE CODE ====================

def main():
    # Assuming full_data is your dataset
    print("🚀 INITIALIZING AUTOMATED RECOMMENDATION SYSTEM")
    print("=" * 60)

    # Initialize the system
    recommender = AutomatedRecommendationSystem(full_data)

    # Step 1: Prepare data and train main model
    print("\n1. 🔄 TRAINING MAIN MODEL...")
    recommender.prepare_data()
    recommender.build_content_features()
    recommender.train_als_model()

    # Step 2: Evaluate model performance
    print("\n2. 📊 EVALUATING MODEL PERFORMANCE...")
    evaluation_results = recommender.evaluate_metrics(N=10)

    # Step 3: Generate customer reports
    print("\n3. 👥 GENERATING CUSTOMER REPORTS...")

    # Test with specific customers or random ones
    test_customers = ["2e7276ad3a", "f374c8c54c"]  # Replace with actual customer IDs

    reports = {}
    for customer_id in test_customers:
        try:
            print(f"\n{'='*80}")
            report = recommender.automated_customer_report(customer_id, N=10)
            reports[customer_id] = report
            print(f"✅ Report completed for {customer_id}")
        except Exception as e:
            print(f"❌ Error processing {customer_id}: {str(e)}")

    print("\n🎉 ALL TASKS COMPLETED!")
    return recommender, reports, evaluation_results

# Run the complete system
# Run the complete system
print("Starting the recommendation system...")
recommender_system, customer_reports, model_metrics = main()

# Quick evaluation only (if you want to run just metrics)
def quick_evaluation():
    """Run just the evaluation metrics"""
    print("📊 RUNNING QUICK MODEL EVALUATION...")
    recommender = AutomatedRecommendationSystem(full_data)
    results = recommender.evaluate_metrics(N=10)
    return results

# Uncomment to run evaluation only:
# eval_results = quick_evaluation()
# print("\nQuick Evaluation Results:")
# for metric, value in eval_results.items():
#     print(f"{metric}: {value}")

Starting the recommendation system...
🚀 INITIALIZING AUTOMATED RECOMMENDATION SYSTEM

1. 🔄 TRAINING MAIN MODEL...
Preparing data...
Data prepared: 11174 users, 5818 vendors
Building content features...
Content features built successfully


TypeError: AutomatedRecommendationSystem.train_als_model() missing 1 required positional argument: 'interaction_matrix'

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

class AutomatedRecommendationSystem:
    def __init__(self, data):
        self.full_data = data
        self.model = None
        self.interaction_matrix = None
        self.vendor_similarity = None
        self.content_matrix = None
        self.reverse_user_map = None
        self.vendor_map = None
        self.train_data = None
        self.test_data = None

    def prepare_data(self, data=None):
        """Prepare and preprocess the data"""
        if data is None:
            data = self.full_data

        print("Preparing data...")
        df = data[['customer_id', 'vendor_id', 'order_frequency', 'product_rating']].copy()
        df['score'] = df['order_frequency'] * df['product_rating']

        # Encode users and vendors
        df['user_code'] = df['customer_id'].astype('category').cat.codes
        df['vendor_code'] = df['vendor_id'].astype('category').cat.codes

        # Build interaction matrix
        interaction_matrix = coo_matrix(
            (df['score'], (df['user_code'], df['vendor_code']))
        ).T.tocsr()

        # Build lookup tables
        user_map = dict(enumerate(df['customer_id'].astype('category').cat.categories))
        vendor_map = dict(enumerate(df['vendor_id'].astype('category').cat.categories))
        reverse_user_map = {v: k for k, v in user_map.items()}

        print(f"Data prepared: {len(user_map)} users, {len(vendor_map)} vendors")
        return df, interaction_matrix, user_map, vendor_map, reverse_user_map

    def build_content_features(self, data=None):
        """Build vendor content-based features"""
        if data is None:
            data = self.full_data

        print("Building content features...")
        vendor_features = data.groupby('vendor_id').agg({
            'cuisine_origin': 'first',
            'unit_price': 'mean',
            'product_rating': 'mean'
        }).reset_index()

        # One-hot encode cuisine
        cuisine_encoded = pd.get_dummies(vendor_features['cuisine_origin'])

        # Normalize numerical features
        vendor_features['unit_price_norm'] = (
            vendor_features['unit_price'] - vendor_features['unit_price'].min()
        ) / (vendor_features['unit_price'].max() - vendor_features['unit_price'].min())

        vendor_features['product_rating_norm'] = (
            vendor_features['product_rating'] - vendor_features['product_rating'].min()
        ) / (vendor_features['product_rating'].max() - vendor_features['product_rating'].min())

        # Combine all features
        content_matrix = pd.concat([
            cuisine_encoded,
            vendor_features[['unit_price_norm', 'product_rating_norm']]
        ], axis=1)
        content_matrix.index = vendor_features['vendor_id']

        # Compute vendor similarity
        vendor_similarity = pd.DataFrame(
            cosine_similarity(content_matrix),
            index=content_matrix.index,
            columns=content_matrix.index
        )
        print("Content features built successfully")
        return content_matrix, vendor_similarity

    def train_als_model(self, interaction_matrix, factors=50, regularization=0.1, iterations=30):
        """Train the ALS model"""
        print("Training ALS model...")
        model = AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            random_state=42
        )
        model.fit(interaction_matrix.T)
        print("ALS model trained successfully")
        return model

    def split_train_test_data(self, test_ratio=0.2):
        """Split data into train and test sets for evaluation"""
        print("Splitting data into train and test sets...")

        def split_user_data(df, test_ratio=0.2):
            train_list, test_list = [], []
            for user_id, user_df in df.groupby('customer_id'):
                if len(user_df) < 2:  # Reduced minimum for more users in test
                    continue
                train, test = train_test_split(user_df, test_size=test_ratio, random_state=42)
                train_list.append(train)
                test_list.append(test)
            return pd.concat(train_list), pd.concat(test_list)

        self.train_data, self.test_data = split_user_data(self.full_data, test_ratio)
        print(f"Train data: {len(self.train_data)} records, {self.train_data['customer_id'].nunique()} users")
        print(f"Test data: {len(self.test_data)} records, {self.test_data['customer_id'].nunique()} users")
        return self.train_data, self.test_data

    # ==================== EVALUATION METRICS ====================

    def precision_at_k(self, recommended, actual, k=10):
        """Calculate Precision@K"""
        if len(recommended) == 0:
            return 0.0
        recommended_k = recommended[:k]
        relevant = [1 if item in actual else 0 for item in recommended_k]
        return sum(relevant) / len(recommended_k)

    def ndcg_at_k(self, recommended, actual, k=10):
        """Calculate NDCG@K"""
        if len(recommended) == 0:
            return 0.0

        recommended_k = recommended[:k]
        relevance = [1 if item in actual else 0 for item in recommended_k]

        # Calculate DCG
        dcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(relevance)])

        # Calculate IDCG
        ideal_relevance = sorted(relevance, reverse=True)
        idcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(ideal_relevance)])

        return dcg / idcg if idcg > 0 else 0.0

    def mrr_at_k(self, recommended, actual, k=10):
        """Calculate MRR@K"""
        for idx, item in enumerate(recommended[:k]):
            if item in actual:
                return 1.0 / (idx + 1)
        return 0.0

    def evaluate_metrics(self, N=10):
        """Evaluate all metrics on test data"""
        print(f"\n📊 EVALUATING MODEL PERFORMANCE (N={N})")
        print("=" * 50)

        if self.train_data is None or self.test_data is None:
            print("Splitting data first...")
            self.split_train_test_data()

        # Prepare training data for evaluation model
        print("Training evaluation model on training data...")
        df_train, interaction_matrix_train, user_map_train, vendor_map_train, reverse_user_map_train = self.prepare_data(self.train_data)

        # Train ALS model on training data
        eval_model = self.train_als_model(interaction_matrix_train)

        # Build content features on training data
        content_matrix_train, vendor_similarity_train = self.build_content_features(self.train_data)

        scores = []
        test_users = self.test_data['customer_id'].unique()

        print(f"Evaluating on {len(test_users)} test users...")

        processed_count = 0
        for user_id in test_users:
            actual = self.test_data[self.test_data['customer_id'] == user_id]['vendor_id'].unique().tolist()

            try:
                # Skip if user not in training data
                if user_id not in reverse_user_map_train:
                    continue

                # Get user index
                user_idx = reverse_user_map_train[user_id]
                user_items = interaction_matrix_train.T.tocsr()

                # Get ALS recommendations
                recommended = eval_model.recommend(user_idx, user_items[user_idx], N=N)
                recommended_vendors = [vendor_map_train[int(i[0])] for i in recommended]

                # Calculate metrics
                prec = self.precision_at_k(recommended_vendors, actual, k=N)
                ndcg = self.ndcg_at_k(recommended_vendors, actual, k=N)
                mrr = self.mrr_at_k(recommended_vendors, actual, k=N)

                scores.append((prec, ndcg, mrr))
                processed_count += 1

                if processed_count % 100 == 0:
                    print(f"Processed {processed_count}/{len(test_users)} users...")

            except Exception as e:
                continue

        if not scores:
            print("❌ No valid scores computed! Check your data.")
            return None

        scores = np.array(scores)

        results = {
            'Precision@10': scores[:, 0].mean(),
            'NDCG@10': scores[:, 1].mean(),
            'MRR@10': scores[:, 2].mean(),
            'Users_Evaluated': len(scores)
        }

        # Print results
        print("\n📈 EVALUATION RESULTS:")
        print("-" * 40)
        for metric, value in results.items():
            if metric == 'Users_Evaluated':
                print(f"{metric}: {value}")
            else:
                print(f"{metric}: {value:.4f}")

        return results

    def get_customer_order_history(self, customer_id):
        """Get detailed order history for a customer"""
        customer_orders = self.full_data[self.full_data['customer_id'] == customer_id]

        if customer_orders.empty:
            return f"No order history found for customer: {customer_id}"

        # Aggregate order history
        order_summary = customer_orders.groupby('vendor_id').agg({
            'name': 'first',
            'cuisine_origin': 'first',
            'order_frequency': 'sum',
            'product_rating': 'mean',
            'unit_price': 'mean'
        }).reset_index()

        return order_summary

    def get_customer_taste_profile(self, customer_id):
        """Analyze customer taste preferences"""
        customer_orders = self.full_data[self.full_data['customer_id'] == customer_id]

        if customer_orders.empty:
            return "No taste profile available (new customer)"

        taste_profile = {
            'total_orders': len(customer_orders),
            'unique_vendors': customer_orders['vendor_id'].nunique(),
            'preferred_cuisines': customer_orders['cuisine_origin'].value_counts().head(3).to_dict(),
            'avg_rating_given': customer_orders['product_rating'].mean(),
            'avg_spending': customer_orders['unit_price'].mean(),
            'favorite_vendors': customer_orders.groupby('vendor_id')['order_frequency']
                                                .sum().sort_values(ascending=False).head(3).to_dict()
        }

        return taste_profile

    def recommend_vendors(self, customer_id, N=10):
        """Get ALS-based recommendations"""
        if customer_id not in self.reverse_user_map:
            return "Cold-start user. Recommend popular restaurants."

        user_idx = self.reverse_user_map[customer_id]
        user_items = self.interaction_matrix.T.tocsr()

        recommended = self.model.recommend(user_idx, user_items[user_idx], N=N)
        recommended_vendors = [self.vendor_map[int(i[0])] for i in recommended]

        return recommended_vendors

    def hybrid_recommend(self, customer_id, N=10, als_weight=0.5, content_weight=0.5):
        """Get hybrid recommendations combining ALS and content-based"""
        # Get user's ordered vendors from full data
        user_vendors = self.full_data[self.full_data['customer_id'] == customer_id]['vendor_id'].unique()

        if len(user_vendors) == 0:
            return "Cold-start user. Recommend popular restaurants."

        # Content-based similarity scores
        content_scores = self.vendor_similarity[user_vendors].mean(axis=1)
        content_scores = content_scores.drop(user_vendors, errors='ignore')

        # ALS recommendations
        als_recs = self.recommend_vendors(customer_id, N=100)

        if isinstance(als_recs, str):
            return als_recs

        als_scores = pd.Series([1 / (i + 1) for i in range(len(als_recs))], index=als_recs)

        # Combine scores
        hybrid_scores = pd.concat([als_scores, content_scores], axis=1).fillna(0)
        hybrid_scores.columns = ['als', 'content']
        hybrid_scores['hybrid'] = (als_weight * hybrid_scores['als'] +
                                 content_weight * hybrid_scores['content'])

        # Return top N
        top_hybrid = hybrid_scores['hybrid'].sort_values(ascending=False).head(N).index.tolist()
        return top_hybrid

    def get_recommendation_details(self, vendor_ids):
        """Get detailed information about recommended vendors"""
        if isinstance(vendor_ids, str):
            return vendor_ids

        vendor_details = self.full_data[self.full_data['vendor_id'].isin(vendor_ids)]

        summary = vendor_details.groupby('vendor_id').agg({
            'name': 'first',
            'cuisine_origin': 'first',
            'unit_price': 'mean',
            'product_rating': 'mean',
            'order_frequency': 'mean'
        }).reset_index()

        return summary

    def automated_customer_report(self, customer_id, N=10):
        """Generate complete automated report for a customer"""
        print(f"🔍 CUSTOMER ANALYSIS REPORT: {customer_id}")
        print("=" * 60)

        # 1. Order History
        print("\n📊 ORDER HISTORY:")
        print("-" * 30)
        order_history = self.get_customer_order_history(customer_id)
        if isinstance(order_history, pd.DataFrame):
            print(order_history.to_string(index=False))
        else:
            print(order_history)

        # 2. Taste Profile
        print("\n👤 TASTE PROFILE:")
        print("-" * 30)
        taste_profile = self.get_customer_taste_profile(customer_id)
        if isinstance(taste_profile, dict):
            for key, value in taste_profile.items():
                print(f"{key.replace('_', ' ').title()}: {value}")
        else:
            print(taste_profile)

        # 3. Recommendations
        print(f"\n🎯 HYBRID RECOMMENDATIONS (Top {N}):")
        print("-" * 40)
        recommendations = self.hybrid_recommend(customer_id, N=N)
        rec_details = self.get_recommendation_details(recommendations)

        if isinstance(rec_details, pd.DataFrame):
            print(rec_details.to_string(index=False))
        else:
            print(rec_details)

        return {
            'customer_id': customer_id,
            'order_history': order_history,
            'taste_profile': taste_profile,
            'recommendations': rec_details
        }

# ==================== FIXED MAIN FUNCTION ====================

def main():
    # Assuming full_data is your dataset
    print("🚀 INITIALIZING AUTOMATED RECOMMENDATION SYSTEM")
    print("=" * 60)

    # Initialize the system
    recommender = AutomatedRecommendationSystem(full_data)

    # Step 1: Prepare data and train main model
    print("\n1. 🔄 TRAINING MAIN MODEL...")
    df, interaction_matrix, user_map, vendor_map, reverse_user_map = recommender.prepare_data()
    recommender.interaction_matrix = interaction_matrix
    recommender.vendor_map = vendor_map
    recommender.reverse_user_map = reverse_user_map

    content_matrix, vendor_similarity = recommender.build_content_features()
    recommender.content_matrix = content_matrix
    recommender.vendor_similarity = vendor_similarity

    recommender.model = recommender.train_als_model(interaction_matrix)

    # Step 2: Evaluate model performance
    print("\n2. 📊 EVALUATING MODEL PERFORMANCE...")
    evaluation_results = recommender.evaluate_metrics(N=10)

    # Step 3: Generate customer reports
    print("\n3. 👥 GENERATING CUSTOMER REPORTS...")

    # Test with specific customers or random ones
    test_customers = ["2e7276ad3a", "f374c8c54c"]  # Replace with actual customer IDs

    reports = {}
    for customer_id in test_customers:
        try:
            print(f"\n{'='*80}")
            report = recommender.automated_customer_report(customer_id, N=10)
            reports[customer_id] = report
            print(f"✅ Report completed for {customer_id}")
        except Exception as e:
            print(f"❌ Error processing {customer_id}: {str(e)}")

    print("\n🎉 ALL TASKS COMPLETED!")
    return recommender, reports, evaluation_results

# Run the complete system
print("Starting the recommendation system...")
recommender_system, customer_reports, model_metrics = main()

# Quick evaluation only (if you want to run just metrics)
def quick_evaluation():
    """Run just the evaluation metrics"""
    print("📊 RUNNING QUICK MODEL EVALUATION...")
    recommender = AutomatedRecommendationSystem(full_data)
    results = recommender.evaluate_metrics(N=10)
    return results

# Uncomment to run evaluation only:
# eval_results = quick_evaluation()
# print("\nQuick Evaluation Results:")
# for metric, value in eval_results.items():
#     print(f"{metric}: {value}")

Starting the recommendation system...
🚀 INITIALIZING AUTOMATED RECOMMENDATION SYSTEM

1. 🔄 TRAINING MAIN MODEL...
Preparing data...
Data prepared: 11174 users, 5818 vendors
Building content features...
Content features built successfully
Training ALS model...




  0%|          | 0/30 [00:00<?, ?it/s]

ALS model trained successfully

2. 📊 EVALUATING MODEL PERFORMANCE...

📊 EVALUATING MODEL PERFORMANCE (N=10)
Splitting data first...
Splitting data into train and test sets...
Train data: 74671 records, 9306 users
Test data: 23461 records, 9306 users
Training evaluation model on training data...
Preparing data...
Data prepared: 9306 users, 5632 vendors
Training ALS model...




  0%|          | 0/30 [00:00<?, ?it/s]

ALS model trained successfully
Building content features...
Content features built successfully
Evaluating on 9306 test users...
Processed 100/9306 users...
Processed 200/9306 users...
Processed 300/9306 users...
Processed 400/9306 users...
Processed 500/9306 users...
Processed 600/9306 users...
Processed 700/9306 users...
Processed 800/9306 users...
Processed 900/9306 users...
Processed 1000/9306 users...
Processed 1100/9306 users...
Processed 1200/9306 users...
Processed 1300/9306 users...
Processed 1400/9306 users...
Processed 1500/9306 users...
Processed 1600/9306 users...
Processed 1700/9306 users...
Processed 1800/9306 users...
Processed 1900/9306 users...
Processed 2000/9306 users...
Processed 2100/9306 users...
Processed 2200/9306 users...
Processed 2300/9306 users...
Processed 2400/9306 users...
Processed 2500/9306 users...
Processed 2600/9306 users...
Processed 2700/9306 users...
Processed 2800/9306 users...
Processed 2900/9306 users...
Processed 3000/9306 users...
Processed 

# NEW CODE TEST

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

class RecommendationModelTrainer:
    def __init__(self, data):
        self.full_data = data
        self.model = None
        self.interaction_matrix = None
        self.vendor_similarity = None
        self.content_matrix = None
        self.reverse_user_map = None
        self.vendor_map = None
        self.train_data = None
        self.test_data = None

    def prepare_data(self, data=None):
        """Prepare and preprocess the data"""
        if data is None:
            data = self.full_data

        print("Preparing data...")
        df = data[['customer_id', 'vendor_id', 'order_frequency', 'product_rating']].copy()
        df['score'] = df['order_frequency'] * df['product_rating']

        # Encode users and vendors
        df['user_code'] = df['customer_id'].astype('category').cat.codes
        df['vendor_code'] = df['vendor_id'].astype('category').cat.codes

        # Build interaction matrix
        interaction_matrix = coo_matrix(
            (df['score'], (df['user_code'], df['vendor_code']))
        ).T.tocsr()

        # Build lookup tables
        user_map = dict(enumerate(df['customer_id'].astype('category').cat.categories))
        vendor_map = dict(enumerate(df['vendor_id'].astype('category').cat.categories))
        reverse_user_map = {v: k for k, v in user_map.items()}

        print(f"Data prepared: {len(user_map)} users, {len(vendor_map)} vendors")
        return df, interaction_matrix, user_map, vendor_map, reverse_user_map

    def build_content_features(self, data=None):
        """Build vendor content-based features"""
        if data is None:
            data = self.full_data

        print("Building content features...")
        vendor_features = data.groupby('vendor_id').agg({
            'cuisine_origin': 'first',
            'unit_price': 'mean',
            'product_rating': 'mean'
        }).reset_index()

        # One-hot encode cuisine
        cuisine_encoded = pd.get_dummies(vendor_features['cuisine_origin'])

        # Normalize numerical features
        vendor_features['unit_price_norm'] = (
            vendor_features['unit_price'] - vendor_features['unit_price'].min()
        ) / (vendor_features['unit_price'].max() - vendor_features['unit_price'].min())

        vendor_features['product_rating_norm'] = (
            vendor_features['product_rating'] - vendor_features['product_rating'].min()
        ) / (vendor_features['product_rating'].max() - vendor_features['product_rating'].min())

        # Combine all features
        content_matrix = pd.concat([
            cuisine_encoded,
            vendor_features[['unit_price_norm', 'product_rating_norm']]
        ], axis=1)
        content_matrix.index = vendor_features['vendor_id']

        # Compute vendor similarity
        vendor_similarity = pd.DataFrame(
            cosine_similarity(content_matrix),
            index=content_matrix.index,
            columns=content_matrix.index
        )
        print("Content features built successfully")
        return content_matrix, vendor_similarity

    def train_als_model(self, interaction_matrix, factors=50, regularization=0.1, iterations=30):
        """Train the ALS model"""
        print("Training ALS model...")
        model = AlternatingLeastSquares(
            factors=factors,
            regularization=regularization,
            iterations=iterations,
            random_state=42
        )
        model.fit(interaction_matrix.T)
        print("ALS model trained successfully")
        return model

    def split_train_test_data(self, test_ratio=0.2):
        """Split data into train and test sets for evaluation"""
        print("Splitting data into train and test sets...")

        def split_user_data(df, test_ratio=0.2):
            train_list, test_list = [], []
            for user_id, user_df in df.groupby('customer_id'):
                if len(user_df) < 2:  # Reduced minimum for more users in test
                    continue
                train, test = train_test_split(user_df, test_size=test_ratio, random_state=42)
                train_list.append(train)
                test_list.append(test)
            return pd.concat(train_list), pd.concat(test_list)

        self.train_data, self.test_data = split_user_data(self.full_data, test_ratio)
        print(f"Train data: {len(self.train_data)} records, {self.train_data['customer_id'].nunique()} users")
        print(f"Test data: {len(self.test_data)} records, {self.test_data['customer_id'].nunique()} users")
        return self.train_data, self.test_data

    # ==================== EVALUATION METRICS ====================

    def precision_at_k(self, recommended, actual, k=10):
        """Calculate Precision@K"""
        if len(recommended) == 0:
            return 0.0
        recommended_k = recommended[:k]
        relevant = [1 if item in actual else 0 for item in recommended_k]
        return sum(relevant) / len(recommended_k)

    def ndcg_at_k(self, recommended, actual, k=10):
        """Calculate NDCG@K"""
        if len(recommended) == 0:
            return 0.0

        recommended_k = recommended[:k]
        relevance = [1 if item in actual else 0 for item in recommended_k]

        # Calculate DCG
        dcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(relevance)])

        # Calculate IDCG
        ideal_relevance = sorted(relevance, reverse=True)
        idcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(ideal_relevance)])

        return dcg / idcg if idcg > 0 else 0.0

    def mrr_at_k(self, recommended, actual, k=10):
        """Calculate MRR@K"""
        for idx, item in enumerate(recommended[:k]):
            if item in actual:
                return 1.0 / (idx + 1)
        return 0.0

    def evaluate_model_performance(self, N=10):
        """Comprehensive model evaluation with train/test split"""
        print(f"📊 EVALUATING MODEL PERFORMANCE (N={N})")
        print("=" * 50)

        # Split data
        self.split_train_test_data()

        # Prepare training data for evaluation model
        print("Training evaluation model on training data...")
        df_train, interaction_matrix_train, user_map_train, vendor_map_train, reverse_user_map_train = self.prepare_data(self.train_data)

        # Train ALS model on training data
        eval_model = self.train_als_model(interaction_matrix_train)

        scores = []
        test_users = self.test_data['customer_id'].unique()

        print(f"Evaluating on {len(test_users)} test users...")

        processed_count = 0
        for user_id in test_users:
            actual = self.test_data[self.test_data['customer_id'] == user_id]['vendor_id'].unique().tolist()

            try:
                # Skip if user not in training data
                if user_id not in reverse_user_map_train:
                    continue

                # Get user index
                user_idx = reverse_user_map_train[user_id]
                user_items = interaction_matrix_train.T.tocsr()

                # Get ALS recommendations
                recommended = eval_model.recommend(user_idx, user_items[user_idx], N=N)
                recommended_vendors = [vendor_map_train[int(i[0])] for i in recommended]

                # Calculate metrics
                prec = self.precision_at_k(recommended_vendors, actual, k=N)
                ndcg = self.ndcg_at_k(recommended_vendors, actual, k=N)
                mrr = self.mrr_at_k(recommended_vendors, actual, k=N)

                scores.append((prec, ndcg, mrr))
                processed_count += 1

                if processed_count % 100 == 0:
                    print(f"Processed {processed_count}/{len(test_users)} users...")

            except Exception as e:
                continue

        if not scores:
            print("❌ No valid scores computed! Check your data.")
            return None

        scores = np.array(scores)

        results = {
            'Precision@10': scores[:, 0].mean(),
            'NDCG@10': scores[:, 1].mean(),
            'MRR@10': scores[:, 2].mean(),
            'Users_Evaluated': len(scores)
        }

        # Print results
        print("\n📈 EVALUATION RESULTS:")
        print("-" * 40)
        for metric, value in results.items():
            if metric == 'Users_Evaluated':
                print(f"{metric}: {value}")
            else:
                print(f"{metric}: {value:.4f}")

        return results

# ==================== TRAINING AND TESTING EXECUTION ====================

def train_and_test_model():
    """Train the model and run comprehensive evaluation"""
    print("🚀 MODEL TRAINING AND EVALUATION")
    print("=" * 50)

    # Initialize trainer
    trainer = RecommendationModelTrainer(full_data)

    # Run evaluation
    results = trainer.evaluate_model_performance(N=10)

    # Train final model on full data for production use
    print("\n🔧 TRAINING FINAL PRODUCTION MODEL...")
    df_full, interaction_matrix_full, user_map_full, vendor_map_full, reverse_user_map_full = trainer.prepare_data()
    content_matrix_full, vendor_similarity_full = trainer.build_content_features()
    final_model = trainer.train_als_model(interaction_matrix_full)

    # Save the trained components for the recommendation system
    model_components = {
        'model': final_model,
        'interaction_matrix': interaction_matrix_full,
        'vendor_map': vendor_map_full,
        'reverse_user_map': reverse_user_map_full,
        'content_matrix': content_matrix_full,
        'vendor_similarity': vendor_similarity_full
    }

    print("✅ Training completed! Model ready for recommendations.")
    return trainer, model_components, results

# Run training and testing
print("Starting model training and evaluation...")
trainer, model_components, evaluation_results = train_and_test_model()

Starting model training and evaluation...
🚀 MODEL TRAINING AND EVALUATION
📊 EVALUATING MODEL PERFORMANCE (N=10)
Splitting data into train and test sets...
Train data: 74671 records, 9306 users
Test data: 23461 records, 9306 users
Training evaluation model on training data...
Preparing data...
Data prepared: 9306 users, 5632 vendors
Training ALS model...




  0%|          | 0/30 [00:00<?, ?it/s]

ALS model trained successfully
Evaluating on 9306 test users...
Processed 100/9306 users...
Processed 200/9306 users...
Processed 300/9306 users...
Processed 400/9306 users...
Processed 500/9306 users...
Processed 600/9306 users...
Processed 700/9306 users...
Processed 800/9306 users...
Processed 900/9306 users...
Processed 1000/9306 users...
Processed 1100/9306 users...
Processed 1200/9306 users...
Processed 1300/9306 users...
Processed 1400/9306 users...
Processed 1500/9306 users...
Processed 1600/9306 users...
Processed 1700/9306 users...
Processed 1800/9306 users...
Processed 1900/9306 users...
Processed 2000/9306 users...
Processed 2100/9306 users...
Processed 2200/9306 users...
Processed 2300/9306 users...
Processed 2400/9306 users...
Processed 2500/9306 users...
Processed 2600/9306 users...
Processed 2700/9306 users...
Processed 2800/9306 users...
Processed 2900/9306 users...
Processed 3000/9306 users...
Processed 3100/9306 users...
Processed 3200/9306 users...
Processed 3300/93



  0%|          | 0/30 [00:00<?, ?it/s]

ALS model trained successfully
✅ Training completed! Model ready for recommendations.


In [None]:
class RecommendationSystem:
    def __init__(self, full_data, model_components):
        self.full_data = full_data
        self.model = model_components['model']
        self.interaction_matrix = model_components['interaction_matrix']
        self.vendor_map = model_components['vendor_map']
        self.reverse_user_map = model_components['reverse_user_map']
        self.vendor_similarity = model_components['vendor_similarity']

    def get_customer_order_history(self, customer_id):
        """Get detailed order history for a customer"""
        customer_orders = self.full_data[self.full_data['customer_id'] == customer_id]

        if customer_orders.empty:
            return f"No order history found for customer: {customer_id}"

        # Aggregate order history
        order_summary = customer_orders.groupby('vendor_id').agg({
            'name': 'first',
            'cuisine_origin': 'first',
            'order_frequency': 'sum',
            'product_rating': 'mean',
            'unit_price': 'mean'
        }).reset_index()

        return order_summary

    def get_customer_taste_profile(self, customer_id):
        """Analyze customer taste preferences"""
        customer_orders = self.full_data[self.full_data['customer_id'] == customer_id]

        if customer_orders.empty:
            return "No taste profile available (new customer)"

        taste_profile = {
            'total_orders': len(customer_orders),
            'unique_vendors': customer_orders['vendor_id'].nunique(),
            'preferred_cuisines': customer_orders['cuisine_origin'].value_counts().head(3).to_dict(),
            'avg_rating_given': customer_orders['product_rating'].mean(),
            'avg_spending': customer_orders['unit_price'].mean(),
            'favorite_vendors': customer_orders.groupby('vendor_id')['order_frequency']
                                                .sum().sort_values(ascending=False).head(3).to_dict()
        }

        return taste_profile

    def recommend_vendors(self, customer_id, N=10):
        """Get ALS-based recommendations"""
        if customer_id not in self.reverse_user_map:
            return "Cold-start user. Recommend popular restaurants."

        user_idx = self.reverse_user_map[customer_id]
        user_items = self.interaction_matrix.T.tocsr()

        recommended = self.model.recommend(user_idx, user_items[user_idx], N=N)
        recommended_vendors = [self.vendor_map[int(i[0])] for i in recommended]

        return recommended_vendors

    def hybrid_recommend(self, customer_id, N=10, als_weight=0.5, content_weight=0.5):
        """Get hybrid recommendations combining ALS and content-based"""
        # Get user's ordered vendors from full data
        user_vendors = self.full_data[self.full_data['customer_id'] == customer_id]['vendor_id'].unique()

        if len(user_vendors) == 0:
            return "Cold-start user. Recommend popular restaurants."

        # Content-based similarity scores
        content_scores = self.vendor_similarity[user_vendors].mean(axis=1)
        content_scores = content_scores.drop(user_vendors, errors='ignore')

        # ALS recommendations
        als_recs = self.recommend_vendors(customer_id, N=100)

        if isinstance(als_recs, str):
            return als_recs

        als_scores = pd.Series([1 / (i + 1) for i in range(len(als_recs))], index=als_recs)

        # Combine scores
        hybrid_scores = pd.concat([als_scores, content_scores], axis=1).fillna(0)
        hybrid_scores.columns = ['als', 'content']
        hybrid_scores['hybrid'] = (als_weight * hybrid_scores['als'] +
                                 content_weight * hybrid_scores['content'])

        # Return top N
        top_hybrid = hybrid_scores['hybrid'].sort_values(ascending=False).head(N).index.tolist()
        return top_hybrid

    def get_recommendation_details(self, vendor_ids):
        """Get detailed information about recommended vendors"""
        if isinstance(vendor_ids, str):
            return vendor_ids

        vendor_details = self.full_data[self.full_data['vendor_id'].isin(vendor_ids)]

        summary = vendor_details.groupby('vendor_id').agg({
            'name': 'first',
            'cuisine_origin': 'first',
            'unit_price': 'mean',
            'product_rating': 'mean',
            'order_frequency': 'mean'
        }).reset_index()

        return summary

    def generate_customer_report(self, customer_id, N=10):
        """Generate complete automated report for a customer"""
        print(f"🔍 CUSTOMER ANALYSIS REPORT: {customer_id}")
        print("=" * 60)

        # 1. Order History
        print("\n📊 ORDER HISTORY:")
        print("-" * 30)
        order_history = self.get_customer_order_history(customer_id)
        if isinstance(order_history, pd.DataFrame):
            print(order_history.to_string(index=False))
        else:
            print(order_history)

        # 2. Taste Profile
        print("\n👤 TASTE PROFILE:")
        print("-" * 30)
        taste_profile = self.get_customer_taste_profile(customer_id)
        if isinstance(taste_profile, dict):
            for key, value in taste_profile.items():
                print(f"{key.replace('_', ' ').title()}: {value}")
        else:
            print(taste_profile)

        # 3. Recommendations
        print(f"\n🎯 HYBRID RECOMMENDATIONS (Top {N}):")
        print("-" * 40)
        recommendations = self.hybrid_recommend(customer_id, N=N)
        rec_details = self.get_recommendation_details(recommendations)

        if isinstance(rec_details, pd.DataFrame):
            print(rec_details.to_string(index=False))
        else:
            print(rec_details)

        return {
            'customer_id': customer_id,
            'order_history': order_history,
            'taste_profile': taste_profile,
            'recommendations': rec_details
        }

    def batch_recommendations(self, customer_ids, N=10):
        """Generate recommendations for multiple customers"""
        reports = {}
        for customer_id in customer_ids:
            try:
                print(f"\n{'='*80}")
                report = self.generate_customer_report(customer_id, N=N)
                reports[customer_id] = report
                print(f"✅ Report completed for {customer_id}")
            except Exception as e:
                print(f"❌ Error processing {customer_id}: {str(e)}")
        return reports

# ==================== RECOMMENDATION SYSTEM EXECUTION ====================

def run_recommendation_system():
    """Run the recommendation system with the trained model"""
    print("🎯 RECOMMENDATION SYSTEM")
    print("=" * 40)

    # Initialize recommendation system with trained components
    recommender = RecommendationSystem(full_data, model_components)

    # Test with specific customers
    test_customers = ["2e7276ad3a", "f374c8c54c"]  # Replace with actual customer IDs

    print(f"Generating recommendations for {len(test_customers)} customers...")

    # Generate reports
    reports = recommender.batch_recommendations(test_customers, N=10)

    print("\n🎉 RECOMMENDATION SYSTEM COMPLETED!")
    return recommender, reports

# Run the recommendation system
print("Starting recommendation system...")
recommender_system, customer_reports = run_recommendation_system()

# ==================== QUERY INDIVIDUAL CUSTOMERS ====================

def query_specific_customer(customer_id):
    """Query a specific customer on demand"""
    print(f"\n🔍 QUERYING CUSTOMER: {customer_id}")
    report = recommender_system.generate_customer_report(customer_id, N=10)
    return report

# Example: Query a specific customer
# customer_report = query_specific_customer("2e7276ad3a")

# ==================== GET ALL CUSTOMER IDs ====================

def get_all_customer_ids():
    """Get all customer IDs for selection"""
    customer_ids = full_data['customer_id'].unique()
    print(f"Total customers in dataset: {len(customer_ids)}")
    print("Sample customer IDs:", customer_ids[:10])  # Show first 10
    return customer_ids

# View all available customers
# all_customers = get_all_customer_ids()

Starting recommendation system...
🎯 RECOMMENDATION SYSTEM
Generating recommendations for 2 customers...

🔍 CUSTOMER ANALYSIS REPORT: 2e7276ad3a

📊 ORDER HISTORY:
------------------------------
vendor_id                             name cuisine_origin  order_frequency  product_rating  unit_price
 23c3cbb7                   chocolate pint         snacks               30        4.000000    5.600000
 24f02f22            roasted herb potatoes        italian               30        4.000000    9.800000
 2a89ea8c le parisien ham  butter baguette         snacks               15        4.000000    4.800000
 31883abc           green curry mild spicy           thai               60        3.500000    5.100000
 389d8451                             coke         snacks               15        3.000000    1.200000
 3c8b6666                    fungi risotto        italian               45        2.666667    8.266667
 42112a93                          brisket     vietnamese               30        3.50

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

class RecommendationEvaluator:
    def __init__(self, data):
        self.full_data = data
        self.train_data = None
        self.test_data = None

    def split_data_stratified(self, test_ratio=0.2, min_orders=2):
        """Split data with stratification"""
        print("Splitting data for evaluation...")

        train_list, test_list = [], []
        for user_id, user_df in self.full_data.groupby('customer_id'):
            if len(user_df) < min_orders:
                continue  # Skip users with too few interactions
            train, test = train_test_split(user_df, test_size=test_ratio, random_state=42)
            train_list.append(train)
            test_list.append(test)

        self.train_data = pd.concat(train_list)
        self.test_data = pd.concat(test_list)

        print(f"Train: {len(self.train_data)} records, {self.train_data['customer_id'].nunique()} users")
        print(f"Test: {len(self.test_data)} records, {self.test_data['customer_id'].nunique()} users")
        return self.train_data, self.test_data

    def prepare_evaluation_model(self, train_data):
        """Prepare model specifically for evaluation"""
        df = train_data[['customer_id', 'vendor_id', 'order_frequency', 'product_rating']].copy()
        df['score'] = df['order_frequency'] * df['product_rating']

        # Encode users and vendors
        df['user_code'] = df['customer_id'].astype('category').cat.codes
        df['vendor_code'] = df['vendor_id'].astype('category').cat.codes

        # Build interaction matrix
        interaction_matrix = coo_matrix(
            (df['score'], (df['user_code'], df['vendor_code']))
        ).T.tocsr()

        # Build lookup tables
        user_map = dict(enumerate(df['customer_id'].astype('category').cat.categories))
        vendor_map = dict(enumerate(df['vendor_id'].astype('category').cat.categories))
        reverse_user_map = {v: k for k, v in user_map.items()}

        # Train model
        model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=30, random_state=42)
        model.fit(interaction_matrix.T)

        return model, interaction_matrix, user_map, vendor_map, reverse_user_map

    def precision_at_k(self, recommended, actual, k=10):
        """Precision@K: Percentage of relevant recommendations in top K"""
        if len(recommended) == 0:
            return 0.0
        recommended_k = recommended[:k]
        hits = len(set(recommended_k) & set(actual))
        return hits / len(recommended_k)

    def recall_at_k(self, recommended, actual, k=10):
        """Recall@K: Percentage of actual items found in top K"""
        if len(actual) == 0:
            return 0.0
        recommended_k = recommended[:k]
        hits = len(set(recommended_k) & set(actual))
        return hits / len(actual)

    def ndcg_at_k(self, recommended, actual, k=10):
        """Normalized Discounted Cumulative Gain@K"""
        if len(recommended) == 0:
            return 0.0

        recommended_k = recommended[:k]
        relevance = [1 if item in actual else 0 for item in recommended_k]

        # Calculate DCG
        dcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(relevance)])

        # Calculate IDCG
        ideal_relevance = sorted(relevance, reverse=True)
        idcg = sum([rel / np.log2(idx + 2) for idx, rel in enumerate(ideal_relevance)])

        return dcg / idcg if idcg > 0 else 0.0

    def mrr_at_k(self, recommended, actual, k=10):
        """Mean Reciprocal Rank@K"""
        for idx, item in enumerate(recommended[:k]):
            if item in actual:
                return 1.0 / (idx + 1)
        return 0.0

    def map_at_k(self, recommended, actual, k=10):
        """Mean Average Precision@K"""
        if len(actual) == 0:
            return 0.0

        precision_scores = []
        hits = 0
        for i, item in enumerate(recommended[:k]):
            if item in actual:
                hits += 1
                precision_scores.append(hits / (i + 1))

        if not precision_scores:
            return 0.0

        return sum(precision_scores) / min(len(actual), k)

    def evaluate_recommendations(self, k_values=[5, 10, 20]):
        """Comprehensive evaluation at different K values"""
        print("🚀 RUNNING COMPREHENSIVE EVALUATION")
        print("=" * 60)

        # Split data
        self.split_data_stratified()

        # Train model on training data
        model, interaction_matrix, user_map, vendor_map, reverse_user_map = self.prepare_evaluation_model(self.train_data)

        results = {}
        all_user_metrics = []

        test_users = self.test_data['customer_id'].unique()
        print(f"Evaluating on {len(test_users)} test users...")

        for user_id in test_users:
            if user_id not in reverse_user_map:
                continue

            # Get actual interactions from test set
            actual = self.test_data[self.test_data['customer_id'] == user_id]['vendor_id'].unique().tolist()

            if not actual:  # Skip if no actual interactions
                continue

            try:
                # Get recommendations
                user_idx = reverse_user_map[user_id]
                user_items = interaction_matrix.T.tocsr()
                recommended = model.recommend(user_idx, user_items[user_idx], N=max(k_values))
                recommended_vendors = [vendor_map[int(i[0])] for i in recommended]

                user_metrics = {'user_id': user_id}
                for k in k_values:
                    user_metrics[f'precision@{k}'] = self.precision_at_k(recommended_vendors, actual, k)
                    user_metrics[f'recall@{k}'] = self.recall_at_k(recommended_vendors, actual, k)
                    user_metrics[f'ndcg@{k}'] = self.ndcg_at_k(recommended_vendors, actual, k)
                    user_metrics[f'mrr@{k}'] = self.mrr_at_k(recommended_vendors, actual, k)
                    user_metrics[f'map@{k}'] = self.map_at_k(recommended_vendors, actual, k)

                all_user_metrics.append(user_metrics)

            except Exception as e:
                continue

        # Aggregate results
        metrics_df = pd.DataFrame(all_user_metrics)

        for k in k_values:
            results[k] = {
                'Precision': metrics_df[f'precision@{k}'].mean(),
                'Recall': metrics_df[f'recall@{k}'].mean(),
                'NDCG': metrics_df[f'ndcg@{k}'].mean(),
                'MRR': metrics_df[f'mrr@{k}'].mean(),
                'MAP': metrics_df[f'map@{k}'].mean(),
                'Users_Evaluated': len(metrics_df)
            }

        return results, metrics_df

    def print_evaluation_results(self, results):
        """Print formatted evaluation results"""
        print("\n📊 COMPREHENSIVE EVALUATION RESULTS")
        print("=" * 70)

        for k, metrics in results.items():
            print(f"\n🎯 Top-{k} Recommendations:")
            print("-" * 40)
            for metric, value in metrics.items():
                if metric == 'Users_Evaluated':
                    print(f"  {metric}: {value}")
                else:
                    print(f"  {metric}: {value:.4f}")

    def plot_evaluation_results(self, results):
        """Plot evaluation metrics"""
        k_values = list(results.keys())
        metrics = ['Precision', 'Recall', 'NDCG', 'MRR']

        fig, axes = plt.subplots(2, 2, figsize=(12, 10))
        axes = axes.ravel()

        for i, metric in enumerate(metrics):
            values = [results[k][metric] for k in k_values]
            axes[i].bar([str(k) for k in k_values], values, color='skyblue', alpha=0.7)
            axes[i].set_title(f'{metric}@K')
            axes[i].set_xlabel('K')
            axes[i].set_ylabel(metric)

            # Add value labels on bars
            for j, v in enumerate(values):
                axes[i].text(j, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

        plt.tight_layout()
        plt.show()

    def benchmark_against_baselines(self, k=10):
        """Compare against simple baselines"""
        print("\n🔍 BENCHMARKING AGAINST BASELINES")
        print("=" * 50)

        # Popularity baseline (most ordered vendors)
        popular_vendors = self.full_data['vendor_id'].value_counts().head(100).index.tolist()

        # Random baseline
        all_vendors = self.full_data['vendor_id'].unique().tolist()

        baseline_results = {}

        # Evaluate popularity baseline
        pop_scores = []
        for user_id in self.test_data['customer_id'].unique():
            actual = self.test_data[self.test_data['customer_id'] == user_id]['vendor_id'].unique().tolist()
            if actual:
                pop_precision = self.precision_at_k(popular_vendors, actual, k)
                pop_scores.append(pop_precision)

        baseline_results['Popularity'] = np.mean(pop_scores) if pop_scores else 0

        # Evaluate random baseline
        random_scores = []
        for user_id in self.test_data['customer_id'].unique():
            actual = self.test_data[self.test_data['customer_id'] == user_id]['vendor_id'].unique().tolist()
            if actual:
                random_recs = np.random.choice(all_vendors, k, replace=False).tolist()
                random_precision = self.precision_at_k(random_recs, actual, k)
                random_scores.append(random_precision)

        baseline_results['Random'] = np.mean(random_scores) if random_scores else 0

        print("Baseline Performance (Precision@10):")
        for baseline, score in baseline_results.items():
            print(f"  {baseline}: {score:.4f}")

        return baseline_results

# ==================== RUN COMPREHENSIVE EVALUATION ====================

def run_complete_evaluation():
    """Run full evaluation pipeline"""
    print("🚀 STARTING COMPLETE RECOMMENDATION SYSTEM EVALUATION")
    print("=" * 70)

    # Initialize evaluator
    evaluator = RecommendationEvaluator(full_data)

    # Run main evaluation
    results, detailed_metrics = evaluator.evaluate_recommendations(k_values=[5, 10, 20])

    # Print results
    evaluator.print_evaluation_results(results)

    # Plot results
    evaluator.plot_evaluation_results(results)

    # Benchmark against baselines
    baselines = evaluator.benchmark_against_baselines(k=10)

    # Interpret results
    interpret_evaluation_results(results, baselines)

    return evaluator, results, detailed_metrics, baselines

def interpret_evaluation_results(results, baselines):
    """Help interpret what the metrics mean"""
    print("\n💡 HOW TO INTERPRET THESE RESULTS:")
    print("=" * 50)

    main_metrics = results[10]  # Focus on Top-10

    print("\n📈 METRIC INTERPRETATION:")
    print(f"Precision@10: {main_metrics['Precision']:.3f}")
    print("  → Percentage of recommendations that are relevant")
    print("  → Good if > 0.1, Excellent if > 0.3")

    print(f"\nRecall@10: {main_metrics['Recall']:.3f}")
    print("  → Percentage of user's actual preferences found in recommendations")
    print("  → Good if > 0.05, Excellent if > 0.15")

    print(f"\nNDCG@10: {main_metrics['NDCG']:.3f}")
    print("  → Measures ranking quality (0-1 scale)")
    print("  → Good if > 0.1, Excellent if > 0.3")

    print(f"\nMRR@10: {main_metrics['MRR']:.3f}")
    print("  → How quickly you find the first relevant recommendation")
    print("  → Good if > 0.1, Excellent if > 0.3")

    print(f"\nMAP@10: {main_metrics['MAP']:.3f}")
    print("  → Overall ranking quality considering all relevant items")
    print("  → Good if > 0.05, Excellent if > 0.2")

    print(f"\n🎯 PERFORMANCE ASSESSMENT:")

    # Overall assessment
    precision_score = main_metrics['Precision']
    ndcg_score = main_metrics['NDCG']

    if precision_score > 0.3 and ndcg_score > 0.3:
        print("✅ EXCELLENT - Your system is performing very well!")
    elif precision_score > 0.15 and ndcg_score > 0.15:
        print("✅ GOOD - Your system is performing well")
    elif precision_score > 0.05 and ndcg_score > 0.05:
        print("⚠️  FAIR - There's room for improvement")
    else:
        print("❌ POOR - Consider revising your approach")

    # Compare with baselines
    your_precision = main_metrics['Precision']
    pop_precision = baselines.get('Popularity', 0)
    random_precision = baselines.get('Random', 0)

    print(f"\n📊 COMPARED TO BASELINES:")
    print(f"Your model: {your_precision:.3f}")
    print(f"Popularity: {pop_precision:.3f}")
    print(f"Random: {random_precision:.3f}")

    if your_precision > pop_precision * 1.5:
        print("✅ You're significantly better than simple popularity!")
    elif your_precision > pop_precision:
        print("⚠️  You're better than popularity, but could improve more")
    else:
        print("❌ You're not beating simple popularity - needs work")

# ==================== QUICK EVALUATION ====================

def quick_evaluation():
    """Run a quick evaluation for fast feedback"""
    print("⚡ RUNNING QUICK EVALUATION")

    evaluator = RecommendationEvaluator(full_data)
    results, detailed_metrics = evaluator.evaluate_recommendations(k_values=[10])

    print("\n📊 QUICK RESULTS (Top-10):")
    metrics = results[10]
    for metric, value in metrics.items():
        if metric != 'Users_Evaluated':
            print(f"  {metric}: {value:.3f}")

    return results[10]

# ==================== RUN THE EVALUATION ====================

print("Starting recommendation system evaluation...")

# Option 1: Quick evaluation (fast)
quick_results = quick_evaluation()

# Option 2: Complete evaluation (comprehensive)
# evaluator, full_results, detailed_metrics, baselines = run_complete_evaluation()

Starting recommendation system evaluation...
⚡ RUNNING QUICK EVALUATION
🚀 RUNNING COMPREHENSIVE EVALUATION
Splitting data for evaluation...
Train: 74671 records, 9306 users
Test: 23461 records, 9306 users




  0%|          | 0/30 [00:00<?, ?it/s]

Evaluating on 9306 test users...

📊 QUICK RESULTS (Top-10):
  Precision: 0.001
  Recall: 0.001
  NDCG: 0.002
  MRR: 0.002
  MAP: 0.001


In [None]:
# Run this to see if your system is good
def check_system_quality():
    """Simple function to check if your recommendation system is good"""
    results = quick_evaluation()

    precision = results['Precision']
    ndcg = results['NDCG']

    print("\n🎯 SYSTEM QUALITY CHECK:")
    print("=" * 40)

    if precision > 0.3:
        print("✅ EXCELLENT! Your Precision@10 is great!")
        print("   Users will find many relevant recommendations")
    elif precision > 0.15:
        print("✅ GOOD! Your system is working well")
        print("   Most recommendations are relevant to users")
    elif precision > 0.05:
        print("⚠️  FAIR - There's room for improvement")
        print("   Some recommendations are relevant, but many aren't")
    else:
        print("❌ NEEDS WORK - The system isn't capturing user preferences well")
        print("   Consider feature engineering or algorithm tuning")

    if ndcg > 0.3:
        print("✅ EXCELLENT ranking quality!")
        print("   You're putting the most relevant items first")
    elif ndcg > 0.15:
        print("✅ GOOD ranking - relevant items appear early")
    else:
        print("⚠️  Ranking could be improved")
        print("   Relevant items aren't appearing early enough")

    return precision, ndcg

# Get your answer!
precision_score, ndcg_score = check_system_quality()

⚡ RUNNING QUICK EVALUATION
🚀 RUNNING COMPREHENSIVE EVALUATION
Splitting data for evaluation...
Train: 74671 records, 9306 users
Test: 23461 records, 9306 users




  0%|          | 0/30 [00:00<?, ?it/s]

Evaluating on 9306 test users...

📊 QUICK RESULTS (Top-10):
  Precision: 0.001
  Recall: 0.001
  NDCG: 0.002
  MRR: 0.002
  MAP: 0.001

🎯 SYSTEM QUALITY CHECK:
❌ NEEDS WORK - The system isn't capturing user preferences well
   Consider feature engineering or algorithm tuning
⚠️  Ranking could be improved
   Relevant items aren't appearing early enough


In [20]:
!pip install -r requirements.txt

[31mERROR: Invalid requirement: 'Package                   Version': Expected end or semicolon (after name and no valid version specifier)
    Package                   Version
                              ^ (from line 1 of requirements.txt)[0m[31m
[0m