In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from implicit.bpr import BayesianPersonalizedRanking
import warnings
warnings.filterwarnings('ignore')


In [None]:
file_path1 =

In [None]:

def robust_temporal_split(df, user_col='customer_id', time_col='order_day',
                         test_size=0.2, min_test_orders=2, val_size=0.1):
    """Split every user's orders chronologically into train / val / test.

    Rows are sorted by ``(user_col, time_col)`` and each user's history is
    cut independently, so a user's test rows are always later than that
    user's validation rows, which are later than the training rows.

    Args:
        df: Order-level frame containing ``user_col`` and ``time_col``.
        user_col: Column identifying the user.
        time_col: Column used for chronological ordering.
        test_size: Fraction of a user's orders placed after the test cut.
        min_test_orders: Lower bound for the index at which the test part
            starts (i.e. the minimum number of train+val rows for users on
            the proportional path). Users with at most
            ``min_test_orders + 2`` orders take the "small user" path.
        val_size: Fraction of the pre-test rows used for validation.

    Returns:
        dict with keys ``'train'``, ``'val'`` and ``'test'``; each value is a
        DataFrame with the columns of ``df`` and a fresh 0..n-1 index.

    Notes:
        On the small-user path the last order goes to test, the one before
        it to validation and the remainder to train. A user with one or two
        orders therefore contributes no training rows.
        Rows whose ``user_col`` is missing are not assigned to any split.
    """
    df_sorted = df.sort_values([user_col, time_col])
    splits = {'train': [], 'val': [], 'test': []}

    # A single groupby pass visits each user's rows once, instead of
    # re-filtering the whole frame for every user id.
    for _, user_orders in df_sorted.groupby(user_col, sort=False, observed=True):
        n_orders = len(user_orders)

        if n_orders <= min_test_orders + 2:
            # Small user: last order -> test, previous one -> val, rest -> train
            train = user_orders.iloc[:-2]
            val = user_orders.iloc[-2:-1]
            test = user_orders.iloc[-1:]
        else:
            # Proportional split with a validation slice taken from the
            # tail of the pre-test rows
            split_idx = max(min_test_orders, int(n_orders * (1 - test_size)))
            val_idx = max(1, int(split_idx * (1 - val_size)))

            train = user_orders.iloc[:val_idx]
            val = user_orders.iloc[val_idx:split_idx]
            test = user_orders.iloc[split_idx:]

        splits['train'].append(train)
        splits['val'].append(val)
        splits['test'].append(test)

    if not splits['train']:
        # No users at all: pd.concat([]) would raise, so hand back empty
        # frames that still carry the input's columns.
        empty = df_sorted.iloc[0:0].reset_index(drop=True)
        return {name: empty.copy() for name in splits}

    return {name: pd.concat(parts).reset_index(drop=True) for name, parts in splits.items()}

def create_interaction_matrix(df, user_col='customer_id', item_col='vendor_id'):
    """Build a sparse user x item interaction-count matrix plus id mappings.

    Args:
        df: Interaction frame with one row per (user, item) event.
        user_col: Column holding user ids.
        item_col: Column holding item ids.

    Returns:
        Tuple ``(interaction_matrix, user_map, item_map, reverse_user_map,
        reverse_item_map)`` where

        * ``interaction_matrix`` is a ``csr_matrix`` of shape
          ``(n_users, n_items)``; repeated (user, item) pairs are summed, so
          each cell holds the number of interactions,
        * ``user_map`` / ``item_map`` map integer row / column codes to ids,
        * ``reverse_user_map`` / ``reverse_item_map`` map ids back to codes.

    Rows with a missing user or item id are ignored: their category code
    would be -1, which is not a valid matrix index.
    """
    interactions = df[[user_col, item_col]].dropna()

    # Build each categorical once. Dropping unused categories keeps the
    # codes dense, so the matrix shape always matches the largest code even
    # when the input column is already categorical.
    users = interactions[user_col].astype('category').cat.remove_unused_categories()
    items = interactions[item_col].astype('category').cat.remove_unused_categories()

    # code -> id
    user_map = dict(enumerate(users.cat.categories))
    item_map = dict(enumerate(items.cat.categories))

    # id -> code
    reverse_user_map = {user_id: code for code, user_id in user_map.items()}
    reverse_item_map = {item_id: code for code, item_id in item_map.items()}

    # Shape is derived from the same categories that produced the codes
    interaction_matrix = csr_matrix(
        (
            np.ones(len(interactions)),
            (users.cat.codes.to_numpy(), items.cat.codes.to_numpy()),
        ),
        shape=(len(user_map), len(item_map))
    )

    return interaction_matrix, user_map, item_map, reverse_user_map, reverse_item_map

class SimpleContentBased:
    """Content-based vendor similarity built from whichever vendor columns exist.

    ``prepare_features`` turns a vendor frame into a standardised feature
    matrix; ``get_similar_vendors`` ranks vendors by cosine similarity to a
    query vendor; ``get_user_content_scores`` merges those rankings for a
    user's preferred vendors.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoders = {}
        self.tfidf_encoder = TfidfVectorizer(max_features=50, stop_words='english')
        self.vendor_features = None
        self.vendor_ids = None
        self.is_fitted = False

    def preprocess_text(self, text_series):
        """Return ``text_series`` as lower-cased strings without punctuation.

        Missing values become empty strings; every character that is neither
        a word character nor whitespace is removed.
        """
        processed = text_series.fillna('').astype(str)
        processed = processed.str.lower()
        processed = processed.str.replace(r'[^\w\s]', '', regex=True)
        return processed

    def prepare_features(self, vendor_data):
        """Fit the encoders/scaler on ``vendor_data`` and store vendor features.

        Feature blocks, concatenated column-wise in this order:

        * TF-IDF of ``cuisine_origin`` (or 50 zero columns if absent),
        * the numerical columns present among ``vendor_rating``,
          ``num_products``, ``total_order_value`` with NaN filled by 0
          (or 2 zero columns if none is present),
        * one label-encoded column for each of ``vendor_geohash`` and
          ``chain_id`` that is present.

        Args:
            vendor_data: DataFrame with a ``vendor_id`` column; one feature
                row is produced per input row.

        Returns:
            The standardised feature matrix, also stored on
            ``self.vendor_features`` (row i belongs to ``self.vendor_ids[i]``).
        """
        print("Preparing content-based features...")
        vendor_data = vendor_data.copy()

        # Check which columns are available
        available_columns = vendor_data.columns.tolist()
        print(f"Available columns in vendor_data: {available_columns}")

        feature_components = []

        # Handle text features - cuisine (if available)
        if 'cuisine_origin' in vendor_data.columns:
            cuisine_text = self.preprocess_text(vendor_data['cuisine_origin'])
            cuisine_emb = self.tfidf_encoder.fit_transform(cuisine_text).toarray()
            feature_components.append(cuisine_emb)
            print(f"Added cuisine embeddings: {cuisine_emb.shape}")
        else:
            # Create dummy cuisine feature
            dummy_cuisine = np.zeros((len(vendor_data), 50))
            feature_components.append(dummy_cuisine)
            print("Added dummy cuisine features")

        # Handle numerical features (keep only the ones that exist)
        potential_numerical = ['vendor_rating', 'num_products', 'total_order_value']
        numerical_features = [
            feature for feature in potential_numerical if feature in vendor_data.columns
        ]

        if numerical_features:
            numerical_data = vendor_data[numerical_features].fillna(0).values
            feature_components.append(numerical_data)
            print(f"Added numerical features {numerical_features}: {numerical_data.shape}")
        else:
            # Add dummy numerical features
            dummy_numerical = np.zeros((len(vendor_data), 2))
            feature_components.append(dummy_numerical)
            print("Added dummy numerical features")

        # Handle categorical features
        categorical_features = []
        potential_categorical = ['vendor_geohash', 'chain_id']

        for feature in potential_categorical:
            if feature in vendor_data.columns:
                column = vendor_data[feature]
                if column.isna().any():
                    # Filling a numeric column with the 'unknown' placeholder
                    # would mix strings and numbers, which LabelEncoder cannot
                    # sort; cast everything to str in that case.
                    column = column.astype(object).where(column.notna(), 'unknown').astype(str)

                self.label_encoders[feature] = LabelEncoder()
                encoded = self.label_encoders[feature].fit_transform(column).reshape(-1, 1)
                feature_components.append(encoded)
                categorical_features.append(feature)

        if categorical_features:
            print(f"Added categorical features {categorical_features}")

        # feature_components always holds at least the cuisine and numerical
        # blocks (real or dummy), so it can be concatenated unconditionally.
        combined_features = np.concatenate(feature_components, axis=1)

        # Scale features
        self.vendor_features = self.scaler.fit_transform(combined_features)
        self.vendor_ids = vendor_data['vendor_id'].values
        self.is_fitted = True

        print(f"Created {combined_features.shape[1]} features for {len(vendor_data)} vendors")
        return self.vendor_features

    def _rank_distinct_vendors(self, similarities, query_vendor_id, top_k):
        """Pick the ``top_k`` most similar distinct vendors, never the query vendor."""
        recommendations = []
        if top_k <= 0:
            return recommendations

        seen = {query_vendor_id}
        # Stable sort on the negated scores: descending similarity, ties
        # resolved by row position so the result is deterministic.
        for idx in np.argsort(-np.asarray(similarities), kind='stable'):
            candidate = self.vendor_ids[idx]
            if candidate in seen:
                continue
            seen.add(candidate)
            recommendations.append({
                'vendor_id': candidate,
                'similarity_score': similarities[idx]
            })
            if len(recommendations) == top_k:
                break

        return recommendations

    def get_similar_vendors(self, vendor_id, top_k=10):
        """Return up to ``top_k`` vendors most similar to ``vendor_id``.

        Args:
            vendor_id: Query vendor; the first matching feature row is used.
            top_k: Maximum number of vendors to return.

        Returns:
            List of ``{'vendor_id', 'similarity_score'}`` dicts in descending
            cosine similarity. The query vendor is never included and each
            vendor id appears at most once, even when the feature matrix
            contains several rows for the same vendor. Empty if ``vendor_id``
            is unknown.

        Raises:
            ValueError: if ``prepare_features`` has not been called.
        """
        if not self.is_fitted:
            raise ValueError("Model not fitted yet. Call prepare_features first.")

        matches = np.flatnonzero(self.vendor_ids == vendor_id)
        if matches.size == 0:
            return []

        vendor_vector = self.vendor_features[matches[0]].reshape(1, -1)

        # Calculate similarities to all vendors
        similarities = cosine_similarity(vendor_vector, self.vendor_features)[0]

        # Exclude the query vendor by id rather than by assuming it sorts first
        return self._rank_distinct_vendors(similarities, vendor_id, top_k)

    def get_user_content_scores(self, user_preferred_vendors, top_k=20):
        """Score candidate vendors from a user's preferred vendors.

        Only the first three entries of ``user_preferred_vendors`` are used.
        For each of them the ``top_k`` similar vendors are looked up, and a
        candidate keeps the highest similarity it received from any of them.

        Returns:
            dict ``vendor_id -> similarity``; empty when the model is not
            fitted.
        """
        if not self.is_fitted:
            return {}

        scores = {}

        for vendor_id in user_preferred_vendors[:3]:  # Use the first 3 preferred vendors
            try:
                similar_vendors = self.get_similar_vendors(vendor_id, top_k=top_k)
            except Exception as exc:
                # Stay best-effort (unknown vendors already yield []), but
                # report the problem instead of hiding it.
                warnings.warn(f"Content scoring skipped for vendor {vendor_id!r}: {exc}")
                continue

            for rec in similar_vendors:
                vendor_id_rec = rec['vendor_id']
                similarity = rec['similarity_score']

                # Take the maximum similarity score from any preferred vendor
                if vendor_id_rec not in scores or similarity > scores[vendor_id_rec]:
                    scores[vendor_id_rec] = similarity

        return scores

class HybridRecommendationModel:
    """Hybrid recommender: weighted blend of BPR collaborative filtering and
    content-based vendor similarity, with a popularity fallback for users
    that have no profile."""

    def __init__(self, cf_weight=0.7, content_weight=0.3):
        self.cf_model = None
        self.content_model = SimpleContentBased()
        self.user_profiles = {}
        self.cf_weight = cf_weight
        self.content_weight = content_weight
        self.user_map = None
        self.item_map = None
        self.reverse_user_map = None
        self.reverse_item_map = None
        # Vendor ids ordered by training-set order count (cold-start fallback)
        self.popular_vendors = []

    def fit(self, train_data, vendor_data, val_data=None):
        """Train all components of the hybrid model.

        Args:
            train_data: Order-level frame with ``customer_id`` and
                ``vendor_id`` columns.
            vendor_data: Frame passed to the content model's
                ``prepare_features``.
            val_data: Accepted for interface compatibility; currently unused.
        """
        print("Training Hybrid Recommendation Model...")

        # Popularity ranking for the cold-start fallback, so the model does
        # not depend on a caller attaching ``train_data`` afterwards.
        self.popular_vendors = train_data['vendor_id'].value_counts().index.tolist()

        # 1. Build user profiles (with robust column handling)
        self._build_user_profiles(train_data)

        # 2. Train Collaborative Filtering
        print("Training Collaborative Filtering (BPR)...")
        self._train_collaborative_filtering(train_data)

        # 3. Train Content-Based Model
        print("Training Content-Based Model...")
        self.content_model.prepare_features(vendor_data)

        print("Training completed!")

    def _build_user_profiles(self, train_data):
        """Rebuild ``self.user_profiles`` from the training orders.

        Each profile holds:

        * ``preferred_vendors``: the user's vendors, most-ordered first,
        * ``order_count``: number of orders with a non-null ``vendor_id``,
        * ``preferred_cuisines``: up to three most frequent
          ``cuisine_origin`` values (empty if the column is absent),
        * ``avg_rating``: mean ``vendor_rating`` over the user's orders
          (0 if the column is absent or entirely missing).
        """
        print("Building user profiles...")

        # Check available columns
        available_columns = train_data.columns.tolist()
        print(f"Available columns in train_data: {available_columns}")

        has_rating = 'vendor_rating' in available_columns

        # Top-3 cuisines per user, computed once for all users
        cuisines_by_user = {}
        if 'cuisine_origin' in available_columns:
            cuisine_counts = (
                train_data
                .groupby(['customer_id', 'cuisine_origin'], observed=True)
                .size()
                .reset_index(name='count')
            )
            top_cuisines = (
                cuisine_counts
                .sort_values(['customer_id', 'count'], ascending=[True, False])
                .groupby('customer_id', observed=True)
                .head(3)
            )
            cuisines_by_user = (
                top_cuisines.groupby('customer_id', observed=True)['cuisine_origin']
                .agg(list)
                .to_dict()
            )

        profiles = {}
        # One pass over the per-user groups instead of filtering the whole
        # frame for every user id.
        for user_id, user_orders in train_data.groupby('customer_id', sort=False, observed=True):
            # Most-ordered vendors first: the content model only looks at the
            # first few entries, so they must be the user's favourites.
            vendor_counts = user_orders['vendor_id'].value_counts(sort=False)
            preferred_vendors = (
                vendor_counts.sort_values(ascending=False, kind='mergesort').index.tolist()
            )

            avg_rating = 0
            if has_rating:
                mean_rating = user_orders['vendor_rating'].mean()
                if pd.notna(mean_rating):
                    avg_rating = float(mean_rating)

            profiles[user_id] = {
                'preferred_vendors': preferred_vendors,
                'order_count': int(user_orders['vendor_id'].count()),
                'preferred_cuisines': cuisines_by_user.get(user_id, []),
                'avg_rating': avg_rating
            }

        self.user_profiles = profiles
        print(f"Built profiles for {len(self.user_profiles)} users")

    def _train_collaborative_filtering(self, train_data):
        """Fit a BPR model on the user x vendor interaction matrix."""
        # Create interaction matrix with proper mappings
        (train_matrix, self.user_map, self.item_map,
         self.reverse_user_map, self.reverse_item_map) = create_interaction_matrix(train_data)

        # Train BPR model
        self.cf_model = BayesianPersonalizedRanking(
            factors=64,
            learning_rate=0.01,
            regularization=0.1,
            iterations=30,  # Reduced for faster training
            random_state=42
        )

        self.cf_model.fit(train_matrix)
        print(f"Trained CF model on {train_matrix.shape[0]} users and {train_matrix.shape[1]} items")

    def _get_cf_scores(self, user_id):
        """Return ``vendor_id -> score`` from the CF factors for ``user_id``.

        The score is the dot product of the user's factor vector with each
        item's factor vector. Returns an empty dict when the CF model is not
        trained or the user has no factor row.
        """
        if self.cf_model is None or not self.reverse_user_map:
            return {}
        if user_id not in self.reverse_user_map:
            return {}

        user_idx = self.reverse_user_map[user_id]
        user_factors = np.asarray(self.cf_model.user_factors)
        item_factors = np.asarray(self.cf_model.item_factors)
        if user_idx >= len(user_factors):
            return {}

        # One matrix-vector product scores every item at once
        item_scores = item_factors @ user_factors[user_idx]

        return {
            vendor_id: float(item_scores[item_code])
            for item_code, vendor_id in self.item_map.items()
            if item_code < len(item_scores)
        }

    @staticmethod
    def _min_max_normalize(scores):
        """Rescale a ``key -> score`` dict to [0, 1]; all zeros if the scores do not vary."""
        if not scores:
            return {}
        lowest = min(scores.values())
        span = max(scores.values()) - lowest
        if span <= 0:
            return {key: 0.0 for key in scores}
        return {key: (value - lowest) / span for key, value in scores.items()}

    def recommend(self, user_id, top_k=10):
        """Return up to ``top_k`` vendor ids for ``user_id``, best first.

        Users without a profile get the most popular vendors. Otherwise CF
        scores are min-max scaled to [0, 1] (dot products may be negative,
        so dividing by the maximum is not a safe normalisation), content
        scores are divided by their maximum, and the two are blended with
        ``cf_weight`` / ``content_weight``.
        """
        if user_id not in self.user_profiles:
            # Cold start: return popular items
            return self._get_popular_items(top_k)

        # Get CF scores
        cf_scores = self._get_cf_scores(user_id)

        # Get content-based scores
        preferred_vendors = self.user_profiles[user_id]['preferred_vendors']
        content_scores = self.content_model.get_user_content_scores(preferred_vendors, top_k=30)

        # Normalisation constants are computed once, not once per vendor
        cf_normalized = self._min_max_normalize(cf_scores)
        max_content = max(content_scores.values()) if content_scores else 0

        # dict.fromkeys keeps a deterministic candidate order, so ties are
        # broken the same way on every run.
        candidates = dict.fromkeys(list(cf_scores) + list(content_scores))

        combined_scores = {}
        for vendor in candidates:
            cf_score_norm = cf_normalized.get(vendor, 0)
            content_score_norm = content_scores.get(vendor, 0) / max_content if max_content > 0 else 0
            combined_scores[vendor] = (
                (self.cf_weight * cf_score_norm) + (self.content_weight * content_score_norm)
            )

        # Return top-k vendors
        top_vendors = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
        return [vendor for vendor, score in top_vendors]

    def _get_popular_items(self, top_k):
        """Return the ``top_k`` most-ordered vendors for cold-start users."""
        popular = getattr(self, 'popular_vendors', None)
        if popular:
            return list(popular[:top_k])
        if hasattr(self, 'train_data'):
            # Backward compatibility: callers may attach ``train_data`` directly
            return self.train_data['vendor_id'].value_counts().head(top_k).index.tolist()
        return []

    def evaluate(self, test_data, k=10):
        """Compute mean NDCG / precision / recall / MRR at ``k`` on ``test_data``.

        Only users that have a profile are evaluated. A user's relevant set
        is the set of distinct vendors in their test rows, so repeat orders
        do not inflate the recall or NDCG denominators.

        Returns:
            dict with keys ``'ndcg'``, ``'precision'``, ``'recall'``,
            ``'mrr'`` (0 for every metric when no user could be evaluated).
        """
        print("Evaluating model...")
        ndcg_scores = []
        precision_scores = []
        recall_scores = []
        mrr_scores = []

        evaluated_users = 0
        failed_users = 0
        first_error = None

        for user_id, user_test_data in test_data.groupby('customer_id', sort=False, observed=True):
            if user_id not in self.user_profiles:
                continue

            true_vendors = set(user_test_data['vendor_id'].dropna().tolist())

            try:
                pred_vendors = self.recommend(user_id, top_k=k)
            except Exception as exc:
                # Keep evaluating the remaining users, but remember what
                # went wrong so it can be reported below.
                failed_users += 1
                if first_error is None:
                    first_error = exc
                continue

            if not true_vendors or not pred_vendors:
                continue

            hits = true_vendors.intersection(pred_vendors)

            ndcg_scores.append(self._calculate_ndcg(true_vendors, pred_vendors, k))
            precision_scores.append(len(hits) / len(pred_vendors))
            recall_scores.append(len(hits) / len(true_vendors))

            # Reciprocal rank of the first relevant recommendation
            mrr = 0
            for rank, vendor in enumerate(pred_vendors, start=1):
                if vendor in true_vendors:
                    mrr = 1 / rank
                    break
            mrr_scores.append(mrr)

            evaluated_users += 1

        print(f"Evaluated on {evaluated_users} users")
        if failed_users:
            print(f"Skipped {failed_users} users because of errors (first: {first_error!r})")

        results = {
            'ndcg': np.mean(ndcg_scores) if ndcg_scores else 0,
            'precision': np.mean(precision_scores) if precision_scores else 0,
            'recall': np.mean(recall_scores) if recall_scores else 0,
            'mrr': np.mean(mrr_scores) if mrr_scores else 0
        }

        return results

    def _calculate_ndcg(self, true_list, pred_list, k=10):
        """Binary-relevance NDCG@k of ``pred_list`` against ``true_list``.

        Duplicates in ``true_list`` are collapsed before the ideal DCG is
        computed, so a vendor ordered several times counts as one relevant
        item.
        """
        relevant = set(true_list)

        # Binary relevance (1 if vendor is relevant, 0 otherwise)
        relevance = [1 if item in relevant else 0 for item in pred_list[:k]]

        # Calculate DCG
        dcg = sum(rel / np.log2(i + 2) for i, rel in enumerate(relevance))

        # Ideal DCG: every relevant item (at most k) ranked at the top
        idcg = sum(1 / np.log2(i + 2) for i in range(min(len(relevant), k)))

        return dcg / idcg if idcg > 0 else 0

def run_complete_pipeline(order_data, vendor_data, test_size=0.2):
    """Split the orders, train the hybrid model, evaluate it and print a report.

    Args:
        order_data: Order-level frame with ``customer_id`` and ``vendor_id``.
        vendor_data: Vendor frame handed to the model's ``fit``.
        test_size: Test fraction forwarded to ``robust_temporal_split``.

    Returns:
        Tuple ``(hybrid_model, test_metrics)``: the trained model and the
        metric dict produced by its ``evaluate`` method with ``k=10``.
    """

    def _banner(title, leading_blank_line):
        # Three-line header; later sections start with an empty line
        top_rule = "=" * 50
        if leading_blank_line:
            top_rule = "\n" + "=" * 50
        print(top_rule)
        print(title)
        print("=" * 50)

    _banner("FOOD DELIVERY RECOMMENDATION SYSTEM", leading_blank_line=False)

    # Step 1: chronological per-user split
    print("\n1. Splitting data...")
    splits = robust_temporal_split(order_data, test_size=test_size)
    train_data = splits['train']
    val_data = splits['val']
    test_data = splits['test']

    print(f"Training set: {len(train_data)} orders")
    print(f"Validation set: {len(val_data)} orders")
    print(f"Test set: {len(test_data)} orders")
    print(f"Unique users: {order_data['customer_id'].nunique()}")
    print(f"Unique vendors: {order_data['vendor_id'].nunique()}")

    # Step 2: fit the hybrid recommender
    print("\n2. Training hybrid model...")
    hybrid_model = HybridRecommendationModel(cf_weight=0.7, content_weight=0.3)
    # The popularity fallback reads the training orders from this attribute
    hybrid_model.train_data = train_data

    hybrid_model.fit(train_data=train_data, vendor_data=vendor_data)

    # Step 3: score the held-out orders
    print("\n3. Evaluating on test set...")
    test_metrics = hybrid_model.evaluate(test_data, k=10)

    _banner("FINAL RESULTS", leading_blank_line=True)
    for metric in test_metrics:
        print(f"{metric.upper()}@10: {test_metrics[metric]:.4f}")

    # Step 4: compare against the fixed reference numbers
    benchmark = {
        'ndcg': 0.6620,
        'precision': 0.0662,
        'mrr': 0.6620
    }

    _banner("BENCHMARK COMPARISON", leading_blank_line=True)
    for metric, value in test_metrics.items():
        if metric not in benchmark:
            continue  # no reference value for this metric

        bench_value = benchmark[metric]
        improvement = ((value - bench_value) / bench_value) * 100
        if value >= bench_value:
            status = "✅ ABOVE"
        else:
            status = "❌ BELOW"
        print(f"{metric.upper()}@10: {value:.4f} vs {bench_value:.4f} | {status} | {improvement:+.1f}%")

    return hybrid_model, test_metrics

# Train and evaluate end to end.
# NOTE(review): order_level_data and full_data2 are not defined in the visible
# cells - presumably an earlier cell loads them; confirm before Run All.
print("Starting the corrected pipeline...")
model, metrics = run_complete_pipeline(order_level_data, full_data2)

# Show recommendations for the customer of the first order row
if model:
    sample_user = order_level_data['customer_id'].iloc[0]
    if sample_user not in model.user_profiles:
        # No profile was built for this customer during training
        print(f"\nSample user {sample_user} not in training data (cold start)")
    else:
        recommendations = model.recommend(sample_user, top_k=5)
        print(f"\nSample recommendations for user {sample_user}:")
        print(recommendations)