In [23]:
# Enhanced benchmarking evaluation and training pipeline with complete metrics and weather features included
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from surprise import Dataset, Reader, SVD
from scipy.sparse import hstack
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle
import os

# Create directory for saved models if it doesn't exist
os.makedirs("saved_models", exist_ok=True)

# --- Shared Evaluation ---
def compute_ranking_metrics(y_true, y_score, k=10):
    # Sort indices by descending score
    sorted_indices = np.argsort(y_score)[::-1]
    # Get top-k true labels
    top_k = np.array(y_true)[sorted_indices][:k]
    # Calculate precision
    precision = np.mean(top_k)
    # Calculate recall
    recall = np.sum(top_k) / np.sum(y_true) if np.sum(y_true) > 0 else 0
    # Calculate DCG
    dcg = np.sum(top_k / np.log2(np.arange(2, len(top_k) + 2)))
    # Calculate ideal DCG
    ideal_k = min(int(np.sum(y_true)), k)
    idcg = np.sum([1 / np.log2(i + 2) for i in range(ideal_k)])
    # Calculate NDCG
    ndcg = dcg / idcg if idcg > 0 else 0
    return precision, recall, ndcg

def mean_reciprocal_rank(y_true, y_score, k=10):
    # Sort indices by descending score
    sorted_indices = np.argsort(y_score)[::-1][:k]
    top_k = np.array(y_true)[sorted_indices]
    ranks = np.where(top_k == 1)[0]
    if len(ranks) == 0:
        return 0.0
    return 1.0 / (ranks[0] + 1)

def compute_all_metrics(y_true, y_score):
    # Ensure there are enough positive examples for AUC calculation
    if len(np.unique(y_true)) < 2:
        metrics = {
            "AUC": np.nan,
            "MAP": np.nan
        }
    else:
        metrics = {
            "AUC": roc_auc_score(y_true, y_score),
            "MAP": average_precision_score(y_true, y_score)
        }
    
    # Add MRR metric
    metrics["MRR"] = mean_reciprocal_rank(y_true, y_score)
    
    for k in [5, 10]:
        p, r, n = compute_ranking_metrics(y_true, y_score, k)
        metrics[f"Precision@{k}"] = p
        metrics[f"Recall@{k}"] = r
        metrics[f"NDCG@{k}"] = n
    return metrics

def evaluate_per_user(df, model, tfidf_title, tfidf_interests, scaler, encoder, k=10):
    """Evaluate model performance per user and average the metrics"""
    results = {
        f"Precision@{k}": [], 
        f"Recall@{k}": [], 
        f"NDCG@{k}": [], 
        f"MRR@{k}": []
    }
    
    users = df["user_id"].unique()
    for user in users:
        user_df = df[df["user_id"] == user]
        if len(user_df) < 2 or user_df["interaction_label"].sum() == 0:
            continue
            
        # Prepare features
        user_df = user_df.copy()
        user_df["title"] = user_df["title"].fillna("")
        user_df["user_interests"] = user_df["user_interests"].fillna("")
        
        # TF-IDF features
        tfidf_title_mat = tfidf_title.transform(user_df["title"])
        tfidf_interests_mat = tfidf_interests.transform(user_df["user_interests"])
        
        # Numeric features
        numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate"]
        user_df[numeric_cols] = user_df[numeric_cols].fillna(0)
        X_numeric = scaler.transform(user_df[numeric_cols])
        
        # Categorical features
        categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
        for col in categorical_cols:
            user_df[col] = user_df[col].fillna('unknown')
        X_cat = encoder.transform(user_df[categorical_cols])
        
        # Combine features
        X = hstack([tfidf_title_mat, tfidf_interests_mat, X_numeric, X_cat]).toarray()
        
        # Get predictions
        y_true = user_df["interaction_label"].values
        y_score = model.predict_proba(X)[:, 1]
        
        # Calculate metrics
        p, r, n = compute_ranking_metrics(y_true, y_score, k)
        mrr = mean_reciprocal_rank(y_true, y_score, k)
        
        results[f"Precision@{k}"].append(p)
        results[f"Recall@{k}"].append(r)
        results[f"NDCG@{k}"].append(n)
        results[f"MRR@{k}"].append(mrr)
    
    # Average metrics across users
    return {metric: np.mean(values) for metric, values in results.items() if values}

def train_content_model(train_df, val_df):
    # Clean and prepare training data
    train_clean = train_df.copy()
    train_clean["title"] = train_clean["title"].fillna("")
    train_clean["user_interests"] = train_clean["user_interests"].fillna("")
    
    # Clean and prepare validation data
    val_clean = val_df.copy()
    val_clean["title"] = val_clean["title"].fillna("")
    val_clean["user_interests"] = val_clean["user_interests"].fillna("")

    # TF-IDF features - fit only on training data
    tfidf_title = TfidfVectorizer(max_features=100)
    tfidf_interests = TfidfVectorizer(max_features=100)
    tfidf_title_mat_train = tfidf_title.fit_transform(train_clean["title"])
    tfidf_interests_mat_train = tfidf_interests.fit_transform(train_clean["user_interests"])
    
    # Transform validation data using fitted vectorizers
    tfidf_title_mat_val = tfidf_title.transform(val_clean["title"])
    tfidf_interests_mat_val = tfidf_interests.transform(val_clean["user_interests"])

    # Numeric features
    numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate"]
    train_clean[numeric_cols] = train_clean[numeric_cols].fillna(0)
    val_clean[numeric_cols] = val_clean[numeric_cols].fillna(0)
    
    # Fit scaler on training data only
    scaler = StandardScaler().fit(train_clean[numeric_cols])
    X_numeric_train = scaler.transform(train_clean[numeric_cols])
    X_numeric_val = scaler.transform(val_clean[numeric_cols])
    
    # Categorical features with OneHotEncoder
    categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
    
    # Fill missing values in categorical columns
    for col in categorical_cols:
        train_clean[col] = train_clean[col].fillna('unknown')
        val_clean[col] = val_clean[col].fillna('unknown')
    
    # Create OneHotEncoder for categorical features
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_cat_train = encoder.fit_transform(train_clean[categorical_cols])
    X_cat_val = encoder.transform(val_clean[categorical_cols])

    # Combine all features
    X_train = hstack([tfidf_title_mat_train, tfidf_interests_mat_train, X_numeric_train, X_cat_train]).toarray()
    X_val = hstack([tfidf_title_mat_val, tfidf_interests_mat_val, X_numeric_val, X_cat_val]).toarray()
    
    y_train = train_clean["interaction_label"].astype(int)
    y_val = val_clean["interaction_label"].astype(int)

    # Train CatBoost on training data only
    model = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        loss_function='Logloss',
        verbose=False
    )
    model.fit(X_train, y_train)
    
    # Save model and preprocessors
    model.save_model("saved_models/content_model.cbm")
    with open("saved_models/content_tfidf_title.pkl", "wb") as f:
        pickle.dump(tfidf_title, f)
    with open("saved_models/content_tfidf_interests.pkl", "wb") as f:
        pickle.dump(tfidf_interests, f)
    with open("saved_models/content_scaler.pkl", "wb") as f:
        pickle.dump(scaler, f)
    with open("saved_models/content_encoder.pkl", "wb") as f:
        pickle.dump(encoder, f)
    
    # Predict on validation data
    val_scores = model.predict_proba(X_val)[:, 1]

    return model, compute_all_metrics(y_val, val_scores), tfidf_title, tfidf_interests, scaler, encoder

def train_svd(train_df, val_df):
    # Prepare training data for SVD
    reader = Reader(rating_scale=(0, 1))
    train_svd = train_df[["user_id", "event_id", "interaction_label"]].copy()
    
    # Convert all to strings to ensure compatibility
    train_svd["user_id"] = train_svd["user_id"].astype(str)
    train_svd["event_id"] = train_svd["event_id"].astype(str)
    train_svd["interaction_label"] = train_svd["interaction_label"].astype(float)
    
    data = Dataset.load_from_df(train_svd, reader)
    trainset = data.build_full_trainset()
    
    # Train SVD model on training data only
    svd = SVD(n_epochs=50).fit(trainset)
    
    # Save the SVD model
    with open("saved_models/svd_model.pkl", "wb") as f:
        pickle.dump(svd, f)
    
    # Generate predictions for validation data
    val_copy = val_df.copy()
    val_copy["svd_score"] = val_copy.apply(
        lambda row: svd.predict(str(row["user_id"]), str(row["event_id"])).est, 
        axis=1
    )
    
    return svd, compute_all_metrics(val_copy["interaction_label"].astype(int), val_copy["svd_score"])

def hybrid_model(train_df, val_df):
    from sklearn.preprocessing import OneHotEncoder
    
    # Step 1: Train SVD model on train_df
    reader = Reader(rating_scale=(0, 1))
    train_svd_data = Dataset.load_from_df(train_df[["user_id", "event_id", "interaction_label"]].astype(str), reader)
    trainset = train_svd_data.build_full_trainset()
    svd_model = SVD(n_epochs=20).fit(trainset)
    
    # Save SVD component of hybrid model
    with open("saved_models/hybrid_svd_component.pkl", "wb") as f:
        pickle.dump(svd_model, f)

    # Step 2: Compute SVD scores for both train and val
    train_df = train_df.copy()
    val_df = val_df.copy()
    train_df["svd_score"] = train_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    val_df["svd_score"] = val_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )

    # Step 3: TF-IDF + numeric features
    tfidf_title = TfidfVectorizer(max_features=50)
    tfidf_interests = TfidfVectorizer(max_features=50)
    tfidf_title.fit(train_df["title"].fillna(""))
    tfidf_interests.fit(train_df["user_interests"].fillna(""))

    X_train_text = hstack([
        tfidf_title.transform(train_df["title"].fillna("")),
        tfidf_interests.transform(train_df["user_interests"].fillna(""))
    ])
    X_val_text = hstack([
        tfidf_title.transform(val_df["title"].fillna("")),
        tfidf_interests.transform(val_df["user_interests"].fillna(""))
    ])

    # Numeric features including SVD score
    numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate", "svd_score"]
    train_df[numeric_cols] = train_df[numeric_cols].fillna(0)
    val_df[numeric_cols] = val_df[numeric_cols].fillna(0)

    scaler = StandardScaler().fit(train_df[numeric_cols])
    X_train_numeric = scaler.transform(train_df[numeric_cols])
    X_val_numeric = scaler.transform(val_df[numeric_cols])
    
    # Categorical features with OneHotEncoder
    categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
    
    # Fill missing values in categorical columns
    for col in categorical_cols:
        train_df[col] = train_df[col].fillna('unknown')
        val_df[col] = val_df[col].fillna('unknown')
    
    # Create OneHotEncoder for categorical features
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_cat_train = encoder.fit_transform(train_df[categorical_cols])
    X_cat_val = encoder.transform(val_df[categorical_cols])

    # Combine all features
    X_train = hstack([X_train_text, X_train_numeric, X_cat_train]).toarray()
    X_val = hstack([X_val_text, X_val_numeric, X_cat_val]).toarray()
    
    y_train = train_df["interaction_label"].astype(int)
    y_val = val_df["interaction_label"].astype(int)

    model = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        loss_function='Logloss',
        verbose=False
    )
    model.fit(X_train, y_train)
    
    # Save hybrid model and preprocessors
    model.save_model("saved_models/hybrid_model.cbm")
    with open("saved_models/hybrid_tfidf_title.pkl", "wb") as f:
        pickle.dump(tfidf_title, f)
    with open("saved_models/hybrid_tfidf_interests.pkl", "wb") as f:
        pickle.dump(tfidf_interests, f)
    with open("saved_models/hybrid_scaler.pkl", "wb") as f:
        pickle.dump(scaler, f)
    with open("saved_models/hybrid_encoder.pkl", "wb") as f:
        pickle.dump(encoder, f)

    scores = model.predict_proba(X_val)[:, 1]
    return model, compute_all_metrics(y_val, scores), tfidf_title, tfidf_interests, scaler, encoder

def compare_all_models(train_df, val_df):
    train_df = train_df.copy()
    val_df = val_df.copy()

    # Ensure correct types and fill missing
    for df in [train_df, val_df]:
        df["user_id"] = df["user_id"].astype(str)
        df["event_id"] = df["event_id"].astype(str)
        df["interaction_label"] = df["interaction_label"].astype(int)
        df["title"] = df["title"].fillna("").astype(str)
        df["user_interests"] = df["user_interests"].fillna("").astype(str)
        
        # Ensure categorical columns exist
        for col in ['weather_condition', 'event_indoor_capability', 'user_weather_preference']:
            if col not in df.columns:
                df[col] = "unknown"

    print("Training Content-Based model...")
    content_model, content_scores, content_tfidf_title, content_tfidf_interests, content_scaler, content_encoder = train_content_model(train_df, val_df)

    print("Training SVD model...")
    svd_model, svd_scores = train_svd(train_df, val_df)

    print("Training Hybrid (SVD + Content + Weather) model...")
    hybrid_model_obj, hybrid_scores, hybrid_tfidf_title, hybrid_tfidf_interests, hybrid_scaler, hybrid_encoder = hybrid_model(train_df, val_df)

    # Assemble final results
    results_df = pd.DataFrame({
        "Content-Based": content_scores,
        "SVD": svd_scores,
        "Hybrid": hybrid_scores,
    })

    return results_df.T, content_model, svd_model, hybrid_model_obj, content_tfidf_title, content_tfidf_interests, content_scaler, content_encoder, hybrid_tfidf_title, hybrid_tfidf_interests, hybrid_scaler, hybrid_encoder

def evaluate_on_real_world_data(test_df, model_type="hybrid"):
    """
    Evaluate the specified model on real-world test data
    
    Args:
        test_df: DataFrame containing real-world test data
        model_type: Type of model to evaluate ('content', 'svd', or 'hybrid')
    
    Returns:
        Dictionary of evaluation metrics
    """
    if model_type == "content":
        model_path = "saved_models/content_model.cbm"
        tfidf_title_path = "saved_models/content_tfidf_title.pkl"
        tfidf_interests_path = "saved_models/content_tfidf_interests.pkl"
        scaler_path = "saved_models/content_scaler.pkl"
        encoder_path = "saved_models/content_encoder.pkl"
    elif model_type == "hybrid":
        model_path = "saved_models/hybrid_model.cbm"
        tfidf_title_path = "saved_models/hybrid_tfidf_title.pkl"
        tfidf_interests_path = "saved_models/hybrid_tfidf_interests.pkl"
        scaler_path = "saved_models/hybrid_scaler.pkl"
        encoder_path = "saved_models/hybrid_encoder.pkl"
    else:
        # For SVD, use a different approach
        with open("saved_models/svd_model.pkl", "rb") as f:
            svd_model = pickle.load(f)
        
        test_df = test_df.copy()
        test_df["svd_score"] = test_df.apply(
            lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, 
            axis=1
        )
        
        # Calculate overall metrics
        overall_metrics = compute_all_metrics(test_df["interaction_label"].astype(int), test_df["svd_score"])
        
        # Calculate per-user metrics
        user_metrics = {}
        for k in [5, 10]:
            user_results = {f"Precision@{k}": [], f"Recall@{k}": [], f"NDCG@{k}": [], f"MRR@{k}": []}
            for user in test_df["user_id"].unique():
                user_data = test_df[test_df["user_id"] == user]
                if len(user_data) < 2 or user_data["interaction_label"].sum() == 0:
                    continue
                
                y_true = user_data["interaction_label"].values
                y_score = user_data["svd_score"].values
                
                p, r, n = compute_ranking_metrics(y_true, y_score, k)
                mrr = mean_reciprocal_rank(y_true, y_score, k)
                
                user_results[f"Precision@{k}"].append(p)
                user_results[f"Recall@{k}"].append(r)
                user_results[f"NDCG@{k}"].append(n)
                user_results[f"MRR@{k}"].append(mrr)
            
            for metric, values in user_results.items():
                if values:
                    user_metrics[metric] = np.mean(values)
        
        return {**overall_metrics, **user_metrics}
    
    # Load model and preprocessors
    model = CatBoostClassifier()
    model.load_model(model_path)
    with open(tfidf_title_path, "rb") as f:
        tfidf_title = pickle.load(f)
    with open(tfidf_interests_path, "rb") as f:
        tfidf_interests = pickle.load(f)
    with open(scaler_path, "rb") as f:
        scaler = pickle.load(f)
    with open(encoder_path, "rb") as f:
        encoder = pickle.load(f)
    
    # Prepare test data
    test_df = test_df.copy()
    test_df["title"] = test_df["title"].fillna("")
    test_df["user_interests"] = test_df["user_interests"].fillna("")
    
    # Calculate overall metrics
    # TF-IDF features
    tfidf_title_mat = tfidf_title.transform(test_df["title"])
    tfidf_interests_mat = tfidf_interests.transform(test_df["user_interests"])
    
    # Numeric features
    numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate"]
    if model_type == "hybrid":
        # For hybrid model, we need to compute SVD scores
        with open("saved_models/hybrid_svd_component.pkl", "rb") as f:
            svd_model = pickle.load(f)
        
        test_df["svd_score"] = test_df.apply(
            lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, 
            axis=1
        )
        numeric_cols.append("svd_score")
    
    test_df[numeric_cols] = test_df[numeric_cols].fillna(0)
    X_numeric = scaler.transform(test_df[numeric_cols])
    
    # Categorical features
    categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
    for col in categorical_cols:
        test_df[col] = test_df[col].fillna('unknown')
    X_cat = encoder.transform(test_df[categorical_cols])
    
    # Combine features
    X = hstack([tfidf_title_mat, tfidf_interests_mat, X_numeric, X_cat]).toarray()
    
    # Get predictions
    y_true = test_df["interaction_label"].values
    y_score = model.predict_proba(X)[:, 1]
    
    # Calculate overall metrics
    overall_metrics = compute_all_metrics(y_true, y_score)
    
    # Calculate per-user metrics
    user_metrics = evaluate_per_user(test_df, model, tfidf_title, tfidf_interests, scaler, encoder)
    
    return {**overall_metrics, **user_metrics}

# Main execution
if __name__ == "__main__":
    # Load data
    import pandas as pd
    # Load data for training and validation
    users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/users.csv')
    events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/events.csv")
    interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/interactions.csv')
    
    # Load real-world test data
    test_users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_users_data.csv')
    test_events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_events_data.csv")
    test_interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_interactions_data.csv')
    
    # Rename columns in test data
    test_users_df.rename(columns={
        'lat': 'user_lat',
        'lng': 'user_lon',
        'location': 'user_city',
        'indoor_outdoor_preference': 'user_weather_preference',
        'joinedAt': 'signup_date'
    }, inplace=True)

    test_events_df.rename(columns={
        'category': 'event_type',
        'lat': 'event_lat',
        'lng': 'event_lon',
        'city': 'event_city',
        'weather_description': 'weather_condition',
        'temperature_2m_mean': 'temperature'
    }, inplace=True)

    test_interactions_df.rename(columns={
        'distance_to_event': 'interaction_distance_to_event'
    }, inplace=True)
    
    # Preprocess data
    def preprocess_common(interactions_df, users_df, events_df):
        # Create copies of the input dataframes
        interactions_df = interactions_df.copy()
        users_df = users_df.copy()
        events_df = events_df.copy()

        # Drop rows with missing user_id or event_id
        interactions_df = interactions_df.dropna(subset=["user_id", "event_id", "interaction_label"])
        
        # Convert distance_to_event to float
        interactions_df["interaction_distance_to_event"] = interactions_df["interaction_distance_to_event"].fillna(0).astype(float)
        
        # Ensure correct types
        for df in [interactions_df, users_df]:
            df["user_id"] = df["user_id"].astype(str)
        
        for df in [interactions_df, events_df]:
            df["event_id"] = df["event_id"].astype(str)

        interactions_df["interaction_label"] = interactions_df["interaction_label"].astype(int)

        # Convert to string type for TF-IDF fields
        events_df["title"] = events_df["title"].fillna("").astype(str)
        users_df["user_interests"] = users_df["user_interests"].fillna("").astype(str)
        users_df["age"] = users_df["age"].fillna(0).astype(float)

        # Ensure all numeric fields are float
        numeric_cols = ["temperature", "attendance_rate"]
        for col in numeric_cols:
            events_df[col] = pd.to_numeric(events_df[col], errors="coerce").fillna(0).astype(float)

        # Return all three dataframes
        return interactions_df, users_df, events_df
    
    # Preprocess training/validation data
    interactions_df, users_df, events_df = preprocess_common(interactions_df, users_df, events_df)
    merged_df = interactions_df.merge(events_df, on="event_id").merge(users_df, on="user_id")
    #(merged_df.columns)
    # Split into train and validation
    from sklearn.model_selection import train_test_split
    train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    
    # Preprocess test data
    test_interactions_df, test_users_df, test_events_df = preprocess_common(test_interactions_df, test_users_df, test_events_df)
    test_df = test_interactions_df.merge(test_events_df, on="event_id").merge(test_users_df, on="user_id")
    #print(test_df.columns)
    # Train and compare models
    print("Training and comparing models...")
    results_df, content_model, svd_model, hybrid_model_obj, content_tfidf_title, content_tfidf_interests, content_scaler, content_encoder, hybrid_tfidf_title, hybrid_tfidf_interests, hybrid_scaler, hybrid_encoder = compare_all_models(train_df, val_df)
    
    print("\nValidation Results:")
    print(results_df)
    
    # Evaluate on real-world test data
    print("\nEvaluating models on real-world test data...")
    
    content_test_metrics = evaluate_on_real_world_data(test_df, "content")
    svd_test_metrics = evaluate_on_real_world_data(test_df, "svd")
    hybrid_test_metrics = evaluate_on_real_world_data(test_df, "hybrid")
    
    # Create a DataFrame to display the results
    test_results_df = pd.DataFrame({
        "Content-Based": content_test_metrics,
        "SVD": svd_test_metrics,
        "Hybrid": hybrid_test_metrics,
    })
    
    print("\nReal-World Test Results:")
    print(test_results_df.T)
    
    # # Save the test results
    # test_results_df.to_csv("real_world_test_results.csv")
    
    print("\nEvaluation complete. Models and results saved.")


Training and comparing models...
Training Content-Based model...
Training SVD model...
Training Hybrid (SVD + Content + Weather) model...

Validation Results:
                    AUC       MAP  MRR  Precision@5  Recall@5    NDCG@5  \
Content-Based  0.703533  0.670664  1.0          0.6  0.000197  0.654809   
SVD            0.618879  0.574715  1.0          0.8  0.000263  0.868795   
Hybrid         0.591918  0.572668  1.0          1.0  0.000329  1.000000   

               Precision@10  Recall@10   NDCG@10  
Content-Based           0.8   0.000526  0.775994  
SVD                     0.8   0.000526  0.851236  
Hybrid                  1.0   0.000657  1.000000  

Evaluating models on real-world test data...


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- svd_score


In [25]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from surprise import Dataset, Reader, SVD
from scipy.sparse import hstack
from catboost import CatBoostClassifier

# --- Metrics ---
def compute_ranking_metrics(y_true, y_score, k=10):
    sorted_indices = np.argsort(y_score)[::-1]
    top_k = np.array(y_true)[sorted_indices][:k]
    precision = np.mean(top_k)
    recall = np.sum(top_k) / np.sum(y_true) if np.sum(y_true) > 0 else 0
    dcg = np.sum(top_k / np.log2(np.arange(2, len(top_k) + 2)))
    ideal_k = min(int(np.sum(y_true)), k)
    idcg = np.sum([1 / np.log2(i + 2) for i in range(ideal_k)])
    ndcg = dcg / idcg if idcg > 0 else 0
    return precision, recall, ndcg

def mean_reciprocal_rank(y_true, y_score, k=10):
    sorted_indices = np.argsort(y_score)[::-1][:k]
    top_k = np.array(y_true)[sorted_indices]
    ranks = np.where(top_k == 1)[0]
    if len(ranks) == 0:
        return 0.0
    return 1.0 / (ranks[0] + 1)

def compute_all_metrics(y_true, y_score):
    metrics = {}
    if len(np.unique(y_true)) < 2:
        metrics["AUC"] = np.nan
        metrics["MAP"] = np.nan
    else:
        metrics["AUC"] = roc_auc_score(y_true, y_score)
        metrics["MAP"] = average_precision_score(y_true, y_score)
    for k in [5, 10]:
        p, r, n = compute_ranking_metrics(y_true, y_score, k)
        metrics[f"Precision@{k}"] = p
        metrics[f"Recall@{k}"] = r
        metrics[f"NDCG@{k}"] = n
    metrics["MRR@10"] = mean_reciprocal_rank(y_true, y_score, k=10)
    return metrics

# --- Model Training ---
def train_content_model(train_df, val_df):
    train_clean = train_df.copy()
    val_clean = val_df.copy()
    for df in [train_clean, val_clean]:
        df["title"] = df["title"].fillna("")
        df["user_interests"] = df["user_interests"].fillna("")
    tfidf_title = TfidfVectorizer(max_features=100)
    tfidf_interests = TfidfVectorizer(max_features=100)
    tfidf_title_mat_train = tfidf_title.fit_transform(train_clean["title"])
    tfidf_interests_mat_train = tfidf_interests.fit_transform(train_clean["user_interests"])
    tfidf_title_mat_val = tfidf_title.transform(val_clean["title"])
    tfidf_interests_mat_val = tfidf_interests.transform(val_clean["user_interests"])
    numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate"]
    for df in [train_clean, val_clean]:
        df[numeric_cols] = df[numeric_cols].fillna(0)
    scaler = StandardScaler().fit(train_clean[numeric_cols])
    X_numeric_train = scaler.transform(train_clean[numeric_cols])
    X_numeric_val = scaler.transform(val_clean[numeric_cols])
    categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
    for df in [train_clean, val_clean]:
        for col in categorical_cols:
            df[col] = df[col].fillna('unknown')
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_cat_train = encoder.fit_transform(train_clean[categorical_cols])
    X_cat_val = encoder.transform(val_clean[categorical_cols])
    X_train = hstack([tfidf_title_mat_train, tfidf_interests_mat_train, X_numeric_train, X_cat_train]).toarray()
    X_val = hstack([tfidf_title_mat_val, tfidf_interests_mat_val, X_numeric_val, X_cat_val]).toarray()
    y_train = train_clean["interaction_label"].astype(int)
    y_val = val_clean["interaction_label"].astype(int)
    model = CatBoostClassifier(iterations=200, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=False)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return model, compute_all_metrics(y_val, val_scores), tfidf_title, tfidf_interests, scaler, encoder

def train_svd(train_df, val_df):
    reader = Reader(rating_scale=(0, 1))
    train_svd = train_df[["user_id", "event_id", "interaction_label"]].copy()
    train_svd["user_id"] = train_svd["user_id"].astype(str)
    train_svd["event_id"] = train_svd["event_id"].astype(str)
    train_svd["interaction_label"] = train_svd["interaction_label"].astype(float)
    data = Dataset.load_from_df(train_svd, reader)
    trainset = data.build_full_trainset()
    svd = SVD(n_epochs=50).fit(trainset)
    val_copy = val_df.copy()
    val_copy["svd_score"] = val_copy.apply(
        lambda row: svd.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    y_val = val_copy["interaction_label"].astype(int)
    val_scores = val_copy["svd_score"]
    return svd, compute_all_metrics(y_val, val_scores)

def train_hybrid_model(train_df, val_df):
    reader = Reader(rating_scale=(0, 1))
    train_svd_data = Dataset.load_from_df(train_df[["user_id", "event_id", "interaction_label"]].astype(str), reader)
    trainset = train_svd_data.build_full_trainset()
    svd_model = SVD(n_epochs=20).fit(trainset)
    train_df = train_df.copy()
    val_df = val_df.copy()
    train_df["svd_score"] = train_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    val_df["svd_score"] = val_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    tfidf_title = TfidfVectorizer(max_features=50)
    tfidf_interests = TfidfVectorizer(max_features=50)
    tfidf_title.fit(train_df["title"].fillna(""))
    tfidf_interests.fit(train_df["user_interests"].fillna(""))
    X_train_text = hstack([
        tfidf_title.transform(train_df["title"].fillna("")),
        tfidf_interests.transform(train_df["user_interests"].fillna(""))
    ])
    X_val_text = hstack([
        tfidf_title.transform(val_df["title"].fillna("")),
        tfidf_interests.transform(val_df["user_interests"].fillna(""))
    ])
    numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate", "svd_score"]
    for df in [train_df, val_df]:
        df[numeric_cols] = df[numeric_cols].fillna(0)
    scaler = StandardScaler().fit(train_df[numeric_cols])
    X_train_numeric = scaler.transform(train_df[numeric_cols])
    X_val_numeric = scaler.transform(val_df[numeric_cols])
    categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
    for df in [train_df, val_df]:
        for col in categorical_cols:
            df[col] = df[col].fillna('unknown')
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_cat_train = encoder.fit_transform(train_df[categorical_cols])
    X_cat_val = encoder.transform(val_df[categorical_cols])
    X_train = hstack([X_train_text, X_train_numeric, X_cat_train]).toarray()
    X_val = hstack([X_val_text, X_val_numeric, X_cat_val]).toarray()
    y_train = train_df["interaction_label"].astype(int)
    y_val = val_df["interaction_label"].astype(int)
    model = CatBoostClassifier(iterations=200, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=False)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return model, compute_all_metrics(y_val, val_scores), tfidf_title, tfidf_interests, scaler, encoder, svd_model

# --- Test Set Evaluation ---
def evaluate_content_or_hybrid_on_test(test_df, model, tfidf_title, tfidf_interests, scaler, encoder, model_type="content", svd_model=None):
    df = test_df.copy()
    df["title"] = df["title"].fillna("")
    df["user_interests"] = df["user_interests"].fillna("")
    if model_type == "hybrid":
        df["svd_score"] = df.apply(
            lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
        )
        numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate", "svd_score"]
    else:
        numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate"]
    df[numeric_cols] = df[numeric_cols].fillna(0)
    tfidf_title_mat = tfidf_title.transform(df["title"])
    tfidf_interests_mat = tfidf_interests.transform(df["user_interests"])
    X_numeric = scaler.transform(df[numeric_cols])
    categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
    for col in categorical_cols:
        df[col] = df[col].fillna('unknown')
    X_cat = encoder.transform(df[categorical_cols])
    X = hstack([tfidf_title_mat, tfidf_interests_mat, X_numeric, X_cat]).toarray()
    y_true = df["interaction_label"].astype(int)
    y_score = model.predict_proba(X)[:, 1]
    return compute_all_metrics(y_true, y_score)

def evaluate_svd_on_test(test_df, svd_model):
    df = test_df.copy()
    df["svd_score"] = df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    y_true = df["interaction_label"].astype(int)
    y_score = df["svd_score"]
    return compute_all_metrics(y_true, y_score)

import pandas as pd
# Load data for training and validation
users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/users.csv')
events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/events.csv")
interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/interactions.csv')

# Load real-world test data
test_users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_users_data.csv')
test_events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_events_data.csv")
test_interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_interactions_data.csv')

# Rename columns in test data
test_users_df.rename(columns={
    'lat': 'user_lat',
    'lng': 'user_lon',
    'location': 'user_city',
    'indoor_outdoor_preference': 'user_weather_preference',
    'joinedAt': 'signup_date'
}, inplace=True)

test_events_df.rename(columns={
    'category': 'event_type',
    'lat': 'event_lat',
    'lng': 'event_lon',
    'city': 'event_city',
    'weather_description': 'weather_condition',
    'temperature_2m_mean': 'temperature'
}, inplace=True)

test_interactions_df.rename(columns={
    'distance_to_event': 'interaction_distance_to_event'
}, inplace=True)

# Preprocess data
def preprocess_common(interactions_df, users_df, events_df):
    # Create copies of the input dataframes
    interactions_df = interactions_df.copy()
    users_df = users_df.copy()
    events_df = events_df.copy()

    # Drop rows with missing user_id or event_id
    interactions_df = interactions_df.dropna(subset=["user_id", "event_id", "interaction_label"])
    
    # Convert distance_to_event to float
    interactions_df["interaction_distance_to_event"] = interactions_df["interaction_distance_to_event"].fillna(0).astype(float)
    
    # Ensure correct types
    for df in [interactions_df, users_df]:
        df["user_id"] = df["user_id"].astype(str)
    
    for df in [interactions_df, events_df]:
        df["event_id"] = df["event_id"].astype(str)

    interactions_df["interaction_label"] = interactions_df["interaction_label"].astype(int)

    # Convert to string type for TF-IDF fields
    events_df["title"] = events_df["title"].fillna("").astype(str)
    users_df["user_interests"] = users_df["user_interests"].fillna("").astype(str)
    users_df["age"] = users_df["age"].fillna(0).astype(float)

    # Ensure all numeric fields are float
    numeric_cols = ["temperature", "attendance_rate"]
    for col in numeric_cols:
        events_df[col] = pd.to_numeric(events_df[col], errors="coerce").fillna(0).astype(float)

    # Return all three dataframes
    return interactions_df, users_df, events_df

# Preprocess training/validation data
interactions_df, users_df, events_df = preprocess_common(interactions_df, users_df, events_df)
merged_df = interactions_df.merge(events_df, on="event_id").merge(users_df, on="user_id")
#(merged_df.columns)
# Split into train and validation
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Preprocess test data
test_interactions_df, test_users_df, test_events_df = preprocess_common(test_interactions_df, test_users_df, test_events_df)
test_df = test_interactions_df.merge(test_events_df, on="event_id").merge(test_users_df, on="user_id")
#print(test_df.columns)

# Train models
content_model, content_val_metrics, content_tfidf_title, content_tfidf_interests, content_scaler, content_encoder = train_content_model(train_df, val_df)
svd_model, svd_val_metrics = train_svd(train_df, val_df)
hybrid_model_obj, hybrid_val_metrics, hybrid_tfidf_title, hybrid_tfidf_interests, hybrid_scaler, hybrid_encoder, hybrid_svd_model = train_hybrid_model(train_df, val_df)

# Evaluate on test set
content_test_metrics = evaluate_content_or_hybrid_on_test(
    test_df, content_model, content_tfidf_title, content_tfidf_interests, content_scaler, content_encoder, model_type="content"
)
svd_test_metrics = evaluate_svd_on_test(test_df, svd_model)
hybrid_test_metrics = evaluate_content_or_hybrid_on_test(
    test_df, hybrid_model_obj, hybrid_tfidf_title, hybrid_tfidf_interests, hybrid_scaler, hybrid_encoder, model_type="hybrid", svd_model=hybrid_svd_model
)

# Display results
print("Validation Metrics:")
print("Content-Based:", content_val_metrics)
print("SVD:", svd_val_metrics)
print("Hybrid:", hybrid_val_metrics)
print("\nTest Metrics:")
print("Content-Based:", content_test_metrics)
print("SVD:", svd_test_metrics)
print("Hybrid:", hybrid_test_metrics)


Validation Metrics:
Content-Based: {'AUC': 0.7035326174166626, 'MAP': 0.6706636721021786, 'Precision@5': 0.6, 'Recall@5': 0.00019712201852946975, 'NDCG@5': 0.6548086577531307, 'Precision@10': 0.8, 'Recall@10': 0.000525658716078586, 'NDCG@10': 0.7759944384848245, 'MRR@10': 1.0}
SVD: {'AUC': 0.6229846172682247, 'MAP': 0.5791060550566649, 'Precision@5': 1.0, 'Recall@5': 0.0003285366975491162, 'NDCG@5': 1.0, 'Precision@10': 0.7, 'Recall@10': 0.0004599513765687627, 'NDCG@10': 0.8006937664098821, 'MRR@10': 1.0}
Hybrid: {'AUC': 0.5943294731766849, 'MAP': 0.5725222554606872, 'Precision@5': 0.8, 'Recall@5': 0.000262829358039293, 'NDCG@5': 0.6608397947263839, 'Precision@10': 0.9, 'Recall@10': 0.0005913660555884093, 'NDCG@10': 0.77990823370192, 'MRR@10': 0.5}

Test Metrics:
Content-Based: {'AUC': 0.5210915237187295, 'MAP': 0.9460097660367329, 'Precision@5': 1.0, 'Recall@5': 0.00018755392175250386, 'NDCG@5': 1.0, 'Precision@10': 1.0, 'Recall@10': 0.0003751078435050077, 'NDCG@10': 1.0, 'MRR@10': 1.

last implimentation

In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from surprise import Dataset, Reader, SVD
from scipy.sparse import hstack
from catboost import CatBoostClassifier

# --- Metrics ---
K_LIST = [1, 5, 10, 50, 100]

def compute_ranking_metrics(y_true, y_score, k=10):
    sorted_indices = np.argsort(y_score)[::-1]
    top_k = np.array(y_true)[sorted_indices][:k]
    precision = np.mean(top_k)
    recall = np.sum(top_k) / np.sum(y_true) if np.sum(y_true) > 0 else 0
    dcg = np.sum(top_k / np.log2(np.arange(2, len(top_k) + 2)))
    ideal_k = min(int(np.sum(y_true)), k)
    idcg = np.sum([1 / np.log2(i + 2) for i in range(ideal_k)])
    ndcg = dcg / idcg if idcg > 0 else 0
    return precision, recall, ndcg

def mean_reciprocal_rank(y_true, y_score, k=100):
    sorted_indices = np.argsort(y_score)[::-1][:k]
    top_k = np.array(y_true)[sorted_indices]
    ranks = np.where(top_k == 1)[0]
    if len(ranks) == 0:
        return 0.0
    return 1.0 / (ranks[0] + 1)

def compute_all_metrics(y_true, y_score, k_list=K_LIST):
    metrics = {}
    for k in k_list:
        p, r, n = compute_ranking_metrics(y_true, y_score, k)
        metrics[f"Precision@{k}"] = p
        metrics[f"Recall@{k}"] = r
        metrics[f"NDCG@{k}"] = n
        metrics[f"MRR@{k}"] = mean_reciprocal_rank(y_true, y_score, k)
    return metrics

# --- Model Training ---
def train_content_model(train_df, val_df):
    train_clean = train_df.copy()
    val_clean = val_df.copy()
    for df in [train_clean, val_clean]:
        df["title"] = df["title"].fillna("")
        df["user_interests"] = df["user_interests"].fillna("")
    tfidf_title = TfidfVectorizer(max_features=100)
    tfidf_interests = TfidfVectorizer(max_features=100)
    tfidf_title_mat_train = tfidf_title.fit_transform(train_clean["title"])
    tfidf_interests_mat_train = tfidf_interests.fit_transform(train_clean["user_interests"])
    tfidf_title_mat_val = tfidf_title.transform(val_clean["title"])
    tfidf_interests_mat_val = tfidf_interests.transform(val_clean["user_interests"])
    numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate"]
    for df in [train_clean, val_clean]:
        df[numeric_cols] = df[numeric_cols].fillna(0)
    scaler = StandardScaler().fit(train_clean[numeric_cols])
    X_numeric_train = scaler.transform(train_clean[numeric_cols])
    X_numeric_val = scaler.transform(val_clean[numeric_cols])
    categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
    for df in [train_clean, val_clean]:
        for col in categorical_cols:
            df[col] = df[col].fillna('unknown')
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_cat_train = encoder.fit_transform(train_clean[categorical_cols])
    X_cat_val = encoder.transform(val_clean[categorical_cols])
    X_train = hstack([tfidf_title_mat_train, tfidf_interests_mat_train, X_numeric_train, X_cat_train]).toarray()
    X_val = hstack([tfidf_title_mat_val, tfidf_interests_mat_val, X_numeric_val, X_cat_val]).toarray()
    y_train = train_clean["interaction_label"].astype(int)
    y_val = val_clean["interaction_label"].astype(int)
    model = CatBoostClassifier(iterations=200, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=False)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return model, compute_all_metrics(y_val, val_scores), tfidf_title, tfidf_interests, scaler, encoder

def train_svd(train_df, val_df):
    reader = Reader(rating_scale=(0, 1))
    train_svd = train_df[["user_id", "event_id", "interaction_label"]].copy()
    train_svd["user_id"] = train_svd["user_id"].astype(str)
    train_svd["event_id"] = train_svd["event_id"].astype(str)
    train_svd["interaction_label"] = train_svd["interaction_label"].astype(float)
    data = Dataset.load_from_df(train_svd, reader)
    trainset = data.build_full_trainset()
    svd = SVD(n_epochs=50).fit(trainset)
    val_copy = val_df.copy()
    val_copy["svd_score"] = val_copy.apply(
        lambda row: svd.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    y_val = val_copy["interaction_label"].astype(int)
    val_scores = val_copy["svd_score"]
    return svd, compute_all_metrics(y_val, val_scores)

def train_hybrid_model(train_df, val_df):
    reader = Reader(rating_scale=(0, 1))
    train_svd_data = Dataset.load_from_df(train_df[["user_id", "event_id", "interaction_label"]].astype(str), reader)
    trainset = train_svd_data.build_full_trainset()
    svd_model = SVD(n_epochs=20).fit(trainset)
    train_df = train_df.copy()
    val_df = val_df.copy()
    train_df["svd_score"] = train_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    val_df["svd_score"] = val_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    tfidf_title = TfidfVectorizer(max_features=50)
    tfidf_interests = TfidfVectorizer(max_features=50)
    tfidf_title.fit(train_df["title"].fillna(""))
    tfidf_interests.fit(train_df["user_interests"].fillna(""))
    X_train_text = hstack([
        tfidf_title.transform(train_df["title"].fillna("")),
        tfidf_interests.transform(train_df["user_interests"].fillna(""))
    ])
    X_val_text = hstack([
        tfidf_title.transform(val_df["title"].fillna("")),
        tfidf_interests.transform(val_df["user_interests"].fillna(""))
    ])
    numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate", "svd_score"]
    for df in [train_df, val_df]:
        df[numeric_cols] = df[numeric_cols].fillna(0)
    scaler = StandardScaler().fit(train_df[numeric_cols])
    X_train_numeric = scaler.transform(train_df[numeric_cols])
    X_val_numeric = scaler.transform(val_df[numeric_cols])
    categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
    for df in [train_df, val_df]:
        for col in categorical_cols:
            df[col] = df[col].fillna('unknown')
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_cat_train = encoder.fit_transform(train_df[categorical_cols])
    X_cat_val = encoder.transform(val_df[categorical_cols])
    X_train = hstack([X_train_text, X_train_numeric, X_cat_train]).toarray()
    X_val = hstack([X_val_text, X_val_numeric, X_cat_val]).toarray()
    y_train = train_df["interaction_label"].astype(int)
    y_val = val_df["interaction_label"].astype(int)
    model = CatBoostClassifier(iterations=200, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=False)
    model.fit(X_train, y_train)
    val_scores = model.predict_proba(X_val)[:, 1]
    return model, compute_all_metrics(y_val, val_scores), tfidf_title, tfidf_interests, scaler, encoder, svd_model

# --- Test Set Evaluation ---
def evaluate_content_or_hybrid_on_test(test_df, model, tfidf_title, tfidf_interests, scaler, encoder, model_type="content", svd_model=None, k_list=K_LIST):
    df = test_df.copy()
    df["title"] = df["title"].fillna("")
    df["user_interests"] = df["user_interests"].fillna("")
    if model_type == "hybrid":
        df["svd_score"] = df.apply(
            lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
        )
        numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate", "svd_score"]
    else:
        numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate"]
    df[numeric_cols] = df[numeric_cols].fillna(0)
    tfidf_title_mat = tfidf_title.transform(df["title"])
    tfidf_interests_mat = tfidf_interests.transform(df["user_interests"])
    X_numeric = scaler.transform(df[numeric_cols])
    categorical_cols = ['weather_condition', 'event_indoor_capability', 'user_weather_preference']
    for col in categorical_cols:
        df[col] = df[col].fillna('unknown')
    X_cat = encoder.transform(df[categorical_cols])
    X = hstack([tfidf_title_mat, tfidf_interests_mat, X_numeric, X_cat]).toarray()
    y_true = df["interaction_label"].astype(int)
    y_score = model.predict_proba(X)[:, 1]
    return compute_all_metrics(y_true, y_score, k_list=k_list)

def evaluate_svd_on_test(test_df, svd_model, k_list=K_LIST):
    df = test_df.copy()
    df["svd_score"] = df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    y_true = df["interaction_label"].astype(int)
    y_score = df["svd_score"]
    return compute_all_metrics(y_true, y_score, k_list=k_list)

# --- Example Training/Evaluation Pipeline ---
import pandas as pd
# Load data for training and validation
users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/users.csv')
events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/events.csv")
interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/interactions.csv')

# Load real-world test data
test_users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_users_data.csv')
test_events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_events_data.csv")
test_interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_interactions_data.csv')

# Rename columns in test data
test_users_df.rename(columns={
    'lat': 'user_lat',
    'lng': 'user_lon',
    'location': 'user_city',
    'indoor_outdoor_preference': 'user_weather_preference',
    'joinedAt': 'signup_date'
}, inplace=True)

test_events_df.rename(columns={
    'category': 'event_type',
    'lat': 'event_lat',
    'lng': 'event_lon',
    'city': 'event_city',
    'weather_description': 'weather_condition',
    'temperature_2m_mean': 'temperature'
}, inplace=True)

test_interactions_df.rename(columns={
    'distance_to_event': 'interaction_distance_to_event'
}, inplace=True)

# Preprocess data
def preprocess_common(interactions_df, users_df, events_df):
    # Create copies of the input dataframes
    interactions_df = interactions_df.copy()
    users_df = users_df.copy()
    events_df = events_df.copy()

    # Drop rows with missing user_id or event_id
    interactions_df = interactions_df.dropna(subset=["user_id", "event_id", "interaction_label"])
    
    # Convert distance_to_event to float
    interactions_df["interaction_distance_to_event"] = interactions_df["interaction_distance_to_event"].fillna(0).astype(float)
    
    # Ensure correct types
    for df in [interactions_df, users_df]:
        df["user_id"] = df["user_id"].astype(str)
    
    for df in [interactions_df, events_df]:
        df["event_id"] = df["event_id"].astype(str)

    interactions_df["interaction_label"] = interactions_df["interaction_label"].astype(int)

    # Convert to string type for TF-IDF fields
    events_df["title"] = events_df["title"].fillna("").astype(str)
    users_df["user_interests"] = users_df["user_interests"].fillna("").astype(str)
    users_df["age"] = users_df["age"].fillna(0).astype(float)

    # Ensure all numeric fields are float
    numeric_cols = ["temperature", "attendance_rate"]
    for col in numeric_cols:
        events_df[col] = pd.to_numeric(events_df[col], errors="coerce").fillna(0).astype(float)

    # Return all three dataframes
    return interactions_df, users_df, events_df

# Preprocess training/validation data
interactions_df, users_df, events_df = preprocess_common(interactions_df, users_df, events_df)
merged_df = interactions_df.merge(events_df, on="event_id").merge(users_df, on="user_id")
#(merged_df.columns)
# Split into train and validation
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Preprocess test data
test_interactions_df, test_users_df, test_events_df = preprocess_common(test_interactions_df, test_users_df, test_events_df)
test_df = test_interactions_df.merge(test_events_df, on="event_id").merge(test_users_df, on="user_id")
#print(test_df.columns)

# Train models
content_model, content_val_metrics, content_tfidf_title, content_tfidf_interests, content_scaler, content_encoder = train_content_model(train_df, val_df)
svd_model, svd_val_metrics = train_svd(train_df, val_df)
hybrid_model_obj, hybrid_val_metrics, hybrid_tfidf_title, hybrid_tfidf_interests, hybrid_scaler, hybrid_encoder, hybrid_svd_model = train_hybrid_model(train_df, val_df)

# Evaluate on test set
content_test_metrics = evaluate_content_or_hybrid_on_test(
    test_df, content_model, content_tfidf_title, content_tfidf_interests, content_scaler, content_encoder, model_type="content", k_list=K_LIST
)
svd_test_metrics = evaluate_svd_on_test(test_df, svd_model, k_list=K_LIST)
hybrid_test_metrics = evaluate_content_or_hybrid_on_test(
    test_df, hybrid_model_obj, hybrid_tfidf_title, hybrid_tfidf_interests, hybrid_scaler, hybrid_encoder, model_type="hybrid", svd_model=hybrid_svd_model, k_list=K_LIST
)

# --- Save results in a DataFrame (excluding AUC and MAP) ---
def filter_metrics(metrics_dict):
    return {k: v for k, v in metrics_dict.items() if not (k.startswith("AUC") or k.startswith("MAP"))}

results_df = pd.DataFrame({
    "Content-Based": filter_metrics(content_test_metrics),
    "SVD": filter_metrics(svd_test_metrics),
    "Hybrid": filter_metrics(hybrid_test_metrics),
}).T

results_df.to_csv("test_metrics_k_results.csv")
print(results_df)


               Precision@1  Recall@1  NDCG@1  MRR@1  Precision@5  Recall@5  \
Content-Based          1.0  0.000038     1.0    1.0          1.0  0.000188   
SVD                    1.0  0.000038     1.0    1.0          1.0  0.000188   
Hybrid                 1.0  0.000038     1.0    1.0          1.0  0.000188   

               NDCG@5  MRR@5  Precision@10  Recall@10  NDCG@10  MRR@10  \
Content-Based     1.0    1.0           1.0   0.000375      1.0     1.0   
SVD               1.0    1.0           1.0   0.000375      1.0     1.0   
Hybrid            1.0    1.0           1.0   0.000375      1.0     1.0   

               Precision@50  Recall@50   NDCG@50  MRR@50  Precision@100  \
Content-Based          0.98   0.001838  0.981407     1.0           0.93   
SVD                    0.96   0.001801  0.966914     1.0           0.86   
Hybrid                 0.88   0.001650  0.899723     1.0           0.91   

               Recall@100  NDCG@100  MRR@100  
Content-Based    0.003489  0.943707      1

In [28]:
results_df

Unnamed: 0,Precision@1,Recall@1,NDCG@1,MRR@1,Precision@5,Recall@5,NDCG@5,MRR@5,Precision@10,Recall@10,NDCG@10,MRR@10,Precision@50,Recall@50,NDCG@50,MRR@50,Precision@100,Recall@100,NDCG@100,MRR@100
Content-Based,1.0,3.8e-05,1.0,1.0,1.0,0.000188,1.0,1.0,1.0,0.000375,1.0,1.0,0.98,0.001838,0.981407,1.0,0.93,0.003489,0.943707,1.0
SVD,1.0,3.8e-05,1.0,1.0,1.0,0.000188,1.0,1.0,1.0,0.000375,1.0,1.0,0.96,0.001801,0.966914,1.0,0.86,0.003226,0.886381,1.0
Hybrid,1.0,3.8e-05,1.0,1.0,1.0,0.000188,1.0,1.0,1.0,0.000375,1.0,1.0,0.88,0.00165,0.899723,1.0,0.91,0.003413,0.915821,1.0


In [5]:
def preprocess_common(interactions_df, users_df, events_df):
 
    # Create copies of the input dataframes
    interactions_df = interactions_df.copy()
    users_df = users_df.copy()
    events_df = events_df.copy()

    # Drop rows with missing user_id or event_id
    interactions_df = interactions_df.dropna(subset=["user_id", "event_id", "interaction_label"])
    
    # Convert distance_to_event to float
    interactions_df["interaction_distance_to_event"] = interactions_df["interaction_distance_to_event"].fillna(0).astype(float)
    
    # Ensure correct types
    for df in [interactions_df, users_df]:
        df["user_id"] = df["user_id"].astype(str)
    
    for df in [interactions_df, events_df]:
        df["event_id"] = df["event_id"].astype(str)

    interactions_df["interaction_label"] = interactions_df["interaction_label"].astype(int)

    # Convert to string type for TF-IDF fields
    events_df["title"] = events_df["title"].fillna("").astype(str)
    users_df["user_interests"] = users_df["user_interests"].fillna("").astype(str)
    users_df["age"] = users_df["age"].fillna(0).astype(float)

    # Ensure all numeric fields are float - fixed to reference events_df instead of df
    numeric_cols = ["temperature", "attendance_rate"]
    for col in numeric_cols:
        events_df[col] = pd.to_numeric(events_df[col], errors="coerce").fillna(0).astype(float)

    # Return all three dataframes
    return interactions_df, users_df, events_df

interactions_df, users_df, events_df = preprocess_common(interactions_df, users_df, events_df)
len(interactions_df) + len(users_df) + len(events_df)
merged_df = interactions_df.merge(events_df,on="event_id")\
    .merge(users_df, on="user_id")

from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

test_interactions_df, test_users_df, test_events_df = preprocess_common(test_interactions_df, test_users_df, test_events_df)
len(interactions_df) + len(users_df) + len(events_df)
test_df = test_interactions_df.merge(test_events_df,on="event_id")\
    .merge(test_users_df, on="user_id")


Unnamed: 0.1,Unnamed: 0_x,interaction_id,user_id,event_id,interaction_type,interaction_time,interaction_distance_to_event,interaction_label,Unnamed: 0_y,title,...,event_indoor_capability,Unnamed: 0,user_lat,user_lon,user_city,user_weather_preference,age,user_interests,signup_date,social_connectedness
0,0,CJ011J,PG158Y,PP391M,invited & maybe,2025-08-14 19:16:49.777185,10.675627,1,16693,persistent fullrange encoding community causes...,...,False,11299,40.732831,-73.951693,New York,outdoor,25.0,music sports fashion,2023-12-02 02:45:09.610317,16
1,51136,WB346E,PG158Y,AM416J,invited & no,2025-07-08 04:57:54.696089,5.022271,0,12872,openarchitected secondary local area network i...,...,True,11299,40.732831,-73.951693,New York,outdoor,25.0,music sports fashion,2023-12-02 02:45:09.610317,16
2,39875,YL968R,PG158Y,VT464H,invited & maybe,2025-07-23 17:55:09.485708,11.276411,1,15364,phased didactic array business networking in n...,...,True,11299,40.732831,-73.951693,New York,outdoor,25.0,music sports fashion,2023-12-02 02:45:09.610317,16
3,2742,VT157H,PG158Y,CU455G,invited & maybe,2025-08-23 11:48:59.945844,2.537383,1,9647,standalone needsbased orchestration entertainm...,...,True,11299,40.732831,-73.951693,New York,outdoor,25.0,music sports fashion,2023-12-02 02:45:09.610317,16
4,50458,QA779Q,PG158Y,FJ447J,maybe,2025-05-02 10:28:30.708459,1.274298,1,15857,synergized 24hour emulation arts culture in ne...,...,True,11299,40.732831,-73.951693,New York,outdoor,25.0,music sports fashion,2023-12-02 02:45:09.610317,16


In [68]:
test_interactions_df, test_users_df, test_events_df = preprocess_common(test_interactions_df, test_users_df, test_events_df)
len(interactions_df) + len(users_df) + len(events_df)
test_df = test_interactions_df.merge(test_events_df,on="event_id")\
    .merge(test_users_df, on="user_id")

test_df.head()

Unnamed: 0.1,Unnamed: 0_x,user_id,event_id,interaction_type,interaction_distance_to_event,interaction_label,event_weather_condition,event_temperature,event_precipitation_sum,user_weather_condition,...,event_indoor_capability,precipitation_sum,Unnamed: 0,user_lat,user_lon,user_city,user_weather_preference,age,user_interests,signup_date
0,0,3468617687,702719295,maybe,522.753486,1,Cloudy,3.7455,0.0,Cloudy,...,True,0.0,366,43.158,-79.244,Saint Catharines Ontario,indoor,20,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z
1,4,3468617687,284003894,invited & yes,55.942166,1,Cloudy,11.129666,0.0,Partly Cloudy,...,True,0.0,366,43.158,-79.244,Saint Catharines Ontario,indoor,20,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z
2,45,3468617687,2951450859,invited & yes,55.67033,1,Cloudy,1.859583,0.0,Cloudy,...,True,0.0,366,43.158,-79.244,Saint Catharines Ontario,indoor,20,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z
3,76,3468617687,3961159311,invited & yes,55.045346,1,Cloudy,6.20475,0.0,Cloudy,...,True,0.0,366,43.158,-79.244,Saint Catharines Ontario,indoor,20,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z
4,87,3468617687,2017343689,invited & yes,56.013814,1,Light Rain,8.367833,16.800001,Light Rain,...,True,16.800001,366,43.158,-79.244,Saint Catharines Ontario,indoor,20,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z


In [11]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

first working code

In [14]:

# # Enhanced benchmarking evaluation and training pipeline with complete metrics and weather features included
# from sklearn.metrics import roc_auc_score, average_precision_score
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import classification_report
# from surprise import Dataset, Reader, SVD
# from scipy.sparse import hstack
# import pandas as pd
# import numpy as np
# from catboost import CatBoostClassifier
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# # --- Shared Evaluation ---
# def compute_ranking_metrics(y_true, y_score, k=10):
#     # Sort indices by descending score
#     sorted_indices = np.argsort(y_score)[::-1]
#     # Get top-k true labels
#     top_k = np.array(y_true)[sorted_indices][:k]
#     # Calculate precision
#     precision = np.mean(top_k)
#     # Calculate recall
#     recall = np.sum(top_k) / np.sum(y_true) if np.sum(y_true) > 0 else 0
#     # Calculate DCG
#     dcg = np.sum(top_k / np.log2(np.arange(2, len(top_k) + 2)))
#     # Calculate ideal DCG
#     ideal_k = min(int(np.sum(y_true)), k)
#     idcg = np.sum([1 / np.log2(i + 2) for i in range(ideal_k)])
#     # Calculate NDCG
#     ndcg = dcg / idcg if idcg > 0 else 0
#     return precision, recall, ndcg

# def compute_all_metrics(y_true, y_score):
#     # Ensure there are enough positive examples for AUC calculation
#     if len(np.unique(y_true)) < 2:
#         metrics = {
#             "AUC": np.nan,
#             "MAP": np.nan
#         }
#     else:
#         metrics = {
#             "AUC": roc_auc_score(y_true, y_score),
#             "MAP": average_precision_score(y_true, y_score)
#         }
    
#     for k in [5, 10]:
#         p, r, n = compute_ranking_metrics(y_true, y_score, k)
#         metrics[f"Precision@{k}"] = p
#         metrics[f"Recall@{k}"] = r
#         metrics[f"NDCG@{k}"] = n
#     return metrics


Training Content-Based model...
Training SVD model...
Training Hybrid (SVD + Content + Weather) model with strict validation...


Unnamed: 0,AUC,MAP,Precision@5,Recall@5,NDCG@5,Precision@10,Recall@10,NDCG@10
Content-Based,0.893437,0.991727,1.0,0.000235,1.0,1.0,0.000469,1.0
SVD,0.990446,0.999411,1.0,0.000235,1.0,1.0,0.000469,1.0
Hybrid,0.712929,0.966202,1.0,0.000935,1.0,1.0,0.001871,1.0


In [24]:

# # Enhanced benchmarking evaluation and training pipeline with complete metrics and weather features included
# from sklearn.metrics import roc_auc_score, average_precision_score
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import classification_report
# from surprise import Dataset, Reader, SVD
# from scipy.sparse import hstack
# import pandas as pd
# import numpy as np
# from catboost import CatBoostClassifier
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# # --- Shared Evaluation ---
# def compute_ranking_metrics(y_true, y_score, k=10):
#     # Sort indices by descending score
#     sorted_indices = np.argsort(y_score)[::-1]
#     # Get top-k true labels
#     top_k = np.array(y_true)[sorted_indices][:k]
#     # Calculate precision
#     precision = np.mean(top_k)
#     # Calculate recall
#     recall = np.sum(top_k) / np.sum(y_true) if np.sum(y_true) > 0 else 0
#     # Calculate DCG
#     dcg = np.sum(top_k / np.log2(np.arange(2, len(top_k) + 2)))
#     # Calculate ideal DCG
#     ideal_k = min(int(np.sum(y_true)), k)
#     idcg = np.sum([1 / np.log2(i + 2) for i in range(ideal_k)])
#     # Calculate NDCG
#     ndcg = dcg / idcg if idcg > 0 else 0
#     return precision, recall, ndcg

# def compute_all_metrics(y_true, y_score):
#     # Ensure there are enough positive examples for AUC calculation
#     if len(np.unique(y_true)) < 2:
#         metrics = {
#             "AUC": np.nan,
#             "MAP": np.nan
#         }
#     else:
#         metrics = {
#             "AUC": roc_auc_score(y_true, y_score),
#             "MAP": average_precision_score(y_true, y_score)
#         }
    
#     for k in [5, 10]:
#         p, r, n = compute_ranking_metrics(y_true, y_score, k)
#         metrics[f"Precision@{k}"] = p
#         metrics[f"Recall@{k}"] = r
#         metrics[f"NDCG@{k}"] = n
#     return metrics


# def train_content_model(train_df, val_df):
#     # Clean and prepare training data
#     train_clean = train_df.copy()
#     train_clean["title"] = train_clean["title"].fillna("")
#     train_clean["user_interests"] = train_clean["user_interests"].fillna("")
    
#     # Clean and prepare validation data
#     val_clean = val_df.copy()
#     val_clean["title"] = val_clean["title"].fillna("")
#     val_clean["user_interests"] = val_clean["user_interests"].fillna("")

#     # TF-IDF features - fit only on training data
#     tfidf_title = TfidfVectorizer(max_features=100)
#     tfidf_interests = TfidfVectorizer(max_features=100)
#     tfidf_title_mat_train = tfidf_title.fit_transform(train_clean["title"])
#     tfidf_interests_mat_train = tfidf_interests.fit_transform(train_clean["user_interests"])
    
#     # Transform validation data using fitted vectorizers
#     tfidf_title_mat_val = tfidf_title.transform(val_clean["title"])
#     tfidf_interests_mat_val = tfidf_interests.transform(val_clean["user_interests"])

#     # Numeric features
#     numeric_cols = ["distance_to_event", "temperature", "age", "attendance_rate"]
#     train_clean[numeric_cols] = train_clean[numeric_cols].fillna(0)
#     val_clean[numeric_cols] = val_clean[numeric_cols].fillna(0)
    
#     # Fit scaler on training data only
#     scaler = StandardScaler().fit(train_clean[numeric_cols])
#     X_numeric_train = scaler.transform(train_clean[numeric_cols])
#     X_numeric_val = scaler.transform(val_clean[numeric_cols])
    
#     # Categorical features with OneHotEncoder
#     categorical_cols = ['events_weather_condition', 'events_event_indoor_capability', 'users_user_weather_preference']
    
#     # Fill missing values in categorical columns
#     for col in categorical_cols:
#         train_clean[col] = train_clean[col].fillna('unknown')
#         val_clean[col] = val_clean[col].fillna('unknown')
    
#     # Create OneHotEncoder for categorical features
#     encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
#     X_cat_train = encoder.fit_transform(train_clean[categorical_cols])
#     X_cat_val = encoder.transform(val_clean[categorical_cols])

#     # Combine all features
#     X_train = hstack([tfidf_title_mat_train, tfidf_interests_mat_train, X_numeric_train, X_cat_train]).toarray()
#     X_val = hstack([tfidf_title_mat_val, tfidf_interests_mat_val, X_numeric_val, X_cat_val]).toarray()
    
#     y_train = train_clean["interaction_label"].astype(int)
#     y_val = val_clean["interaction_label"].astype(int)

#     # Train CatBoost on training data only
#     model = CatBoostClassifier(
#         iterations=200,
#         depth=6,
#         learning_rate=0.1,
#         loss_function='Logloss',
#         verbose=False
#     )
#     model.fit(X_train, y_train)
    
#     # Predict on validation data
#     val_scores = model.predict_proba(X_val)[:, 1]

#     return model, compute_all_metrics(y_val, val_scores)

# def train_svd(train_df, val_df):
#     # Prepare training data for SVD
#     reader = Reader(rating_scale=(0, 1))
#     train_svd = train_df[["user_id", "event_id", "interaction_label"]].copy()
    
#     # Convert all to strings to ensure compatibility
#     train_svd["user_id"] = train_svd["user_id"].astype(str)
#     train_svd["event_id"] = train_svd["event_id"].astype(str)
#     train_svd["interaction_label"] = train_svd["interaction_label"].astype(float)
    
#     data = Dataset.load_from_df(train_svd, reader)
#     trainset = data.build_full_trainset()
    
#     # Train SVD model on training data only
#     svd = SVD(n_epochs=50).fit(trainset)
    
#     # Generate predictions for validation data
#     val_copy = val_df.copy()
#     val_copy["svd_score"] = val_copy.apply(
#         lambda row: svd.predict(str(row["user_id"]), str(row["event_id"])).est, 
#         axis=1
#     )
    
#     return svd, compute_all_metrics(val_copy["interaction_label"].astype(int), val_copy["svd_score"])

# def hybrid_model(train_df, val_df):
#     from sklearn.preprocessing import OneHotEncoder
    
#     # Step 1: Train SVD model on train_df
#     reader = Reader(rating_scale=(0, 1))
#     train_svd_data = Dataset.load_from_df(train_df[["user_id", "event_id", "interaction_label"]].astype(str), reader)
#     trainset = train_svd_data.build_full_trainset()
#     svd_model = SVD(n_epochs=20).fit(trainset)

#     # Step 2: Compute SVD scores for both train and val
#     train_df = train_df.copy()
#     val_df = val_df.copy()
#     train_df["svd_score"] = train_df.apply(
#         lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
#     )
#     val_df["svd_score"] = val_df.apply(
#         lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
#     )

#     # Step 3: TF-IDF + numeric features
#     tfidf_title = TfidfVectorizer(max_features=50)
#     tfidf_interests = TfidfVectorizer(max_features=50)
#     tfidf_title.fit(train_df["title"].fillna(""))
#     tfidf_interests.fit(train_df["user_interests"].fillna(""))

#     X_train_text = hstack([
#         tfidf_title.transform(train_df["title"].fillna("")),
#         tfidf_interests.transform(train_df["user_interests"].fillna(""))
#     ])
#     X_val_text = hstack([
#         tfidf_title.transform(val_df["title"].fillna("")),
#         tfidf_interests.transform(val_df["user_interests"].fillna(""))
#     ])

#     # Numeric features including SVD score
#     numeric_cols = ["distance_to_event", "temperature", "age", "attendance_rate", "svd_score"]
#     train_df[numeric_cols] = train_df[numeric_cols].fillna(0)
#     val_df[numeric_cols] = val_df[numeric_cols].fillna(0)

#     scaler = StandardScaler().fit(train_df[numeric_cols])
#     X_train_numeric = scaler.transform(train_df[numeric_cols])
#     X_val_numeric = scaler.transform(val_df[numeric_cols])
    
#     # Categorical features with OneHotEncoder
#     categorical_cols = ['events_weather_condition', 'events_event_indoor_capability', 'users_user_weather_preference']
    
#     # Fill missing values in categorical columns
#     for col in categorical_cols:
#         train_df[col] = train_df[col].fillna('unknown')
#         val_df[col] = val_df[col].fillna('unknown')
    
#     # Create OneHotEncoder for categorical features
#     encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
#     X_cat_train = encoder.fit_transform(train_df[categorical_cols])
#     X_cat_val = encoder.transform(val_df[categorical_cols])

#     # Combine all features
#     X_train = hstack([X_train_text, X_train_numeric, X_cat_train]).toarray()
#     X_val = hstack([X_val_text, X_val_numeric, X_cat_val]).toarray()
    
#     y_train = train_df["interaction_label"].astype(int)
#     y_val = val_df["interaction_label"].astype(int)

#     model = CatBoostClassifier(
#         iterations=200,
#         depth=6,
#         learning_rate=0.1,
#         loss_function='Logloss',
#         verbose=False
#     )
#     model.fit(X_train, y_train)

#     scores = model.predict_proba(X_val)[:, 1]
#     return model, compute_all_metrics(y_val, scores)
# '✅ Hybrid model updated to use CatBoost for better non-linear modeling.'


# def compare_all_models(train_df, val_df):
#     train_df = train_df.copy()
#     val_df = val_df.copy()

#     # Ensure correct types and fill missing
#     for df in [train_df, val_df]:
#         df["user_id"] = df["user_id"].astype(str)
#         df["event_id"] = df["event_id"].astype(str)
#         df["interaction_label"] = df["interaction_label"].astype(int)
#         df["title"] = df["title"].fillna("").astype(str)
#         df["user_interests"] = df["user_interests"].fillna("").astype(str)
        
#         # Ensure categorical columns exist
#         for col in ['events_weather_condition', 'events_event_indoor_capability', 'users_user_weather_preference']:
#             if col not in df.columns:
#                 df[col] = "unknown"

#     print("Training Content-Based model...")
#     _, content_scores = train_content_model(train_df, val_df)

#     print("Training SVD model...")
#     _, svd_scores = train_svd(train_df, val_df)

#     print("Training Hybrid (SVD + Content + Weather) model with strict validation...")
#     _, hybrid_scores = hybrid_model(train_df, val_df)

#     # Assemble final results
#     results_df = pd.DataFrame({
#         "Content-Based": content_scores,
#         "SVD": svd_scores,
#         "Hybrid": hybrid_scores,
#     })

#     return results_df.T

# compare_all_models(train_df, val_df)

Training Content-Based model...
Training SVD model...
Training Hybrid (SVD + Content + Weather) model with strict validation...


Unnamed: 0,AUC,MAP,Precision@5,Recall@5,NDCG@5,Precision@10,Recall@10,NDCG@10
Content-Based,0.775522,0.980886,1.0,0.000935,1.0,1.0,0.001871,1.0
SVD,0.79764,0.980718,1.0,0.000935,1.0,1.0,0.001871,1.0
Hybrid,0.708555,0.964895,0.8,0.000748,0.853932,0.8,0.001497,0.831848


In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score
from scipy.sparse import hstack
from tqdm import tqdm
from surprise import Dataset, Reader, SVD
# ====================== COMMON EVALUATION FRAMEWORK ======================
class RecommenderEvaluator:
    def __init__(self, events_df, max_distance_km=300, weather_threshold=0.3):
        self.events_df = events_df
        self.max_distance = max_distance_km
        self.weather_thresh = weather_threshold
        
    def compute_weather_match(self, user_weather, event_weather, user_pref, is_indoor, 
                            user_temp, event_temp, user_precip, event_precip):
        """Identical weather scoring to target model"""
        score = 0.0
        if user_weather == event_weather:
            score += 0.4
        elif (user_weather in ['Sunny','Cloudy','Clear','Partly Cloudy'] and 
              event_weather in ['Sunny','Cloudy','Clear','Partly Cloudy']):
            score += 0.2
        elif (user_weather in ['Rain','Light Drizzle','Heavy Rain','Thunderstorm'] and 
              event_weather in ['Rain','Light Drizzle','Heavy Rain','Thunderstorm']):
            score += 0.2
            
        if user_pref == 'any':
            score += 0.3
        elif (user_pref == 'indoor' and is_indoor) or (user_pref == 'outdoor' and not is_indoor):
            score += 0.3
            
        temp_diff = abs(user_temp - event_temp)
        precip_diff = abs(user_precip - event_precip)
        temp_score = max(0, 1 - (temp_diff / 20))
        precip_score = max(0, 1 - (precip_diff / 10))
        score += 0.3 * (0.6 * temp_score + 0.4 * precip_score)
        return round(score, 2)
    
    def filter_candidates(self, user_data, candidates):
        """Apply weather/distance filtering"""
        if 'interaction_distance_to_event' not in candidates:
            candidates = candidates.merge(
                user_data[['event_id', 'interaction_distance_to_event']].drop_duplicates(),
                on='event_id', how='left'
            )
            
        # Distance filter
        candidates = candidates[candidates['interaction_distance_to_event'] <= self.max_distance]
        
        # Weather scoring
        weather_scores = []
        for _, event in candidates.iterrows():
            score = self.compute_weather_match(
                user_data['user_weather_preference'].iloc[0],
                event.get('weather_condition', 'any'),
                user_data['user_weather_preference'].iloc[0],
                event['event_indoor_capability'],
                user_data.get('temperature', 20),
                event.get('temperature', 20),
                user_data.get('precipitation', 0),
                event.get('precipitation', 0)
            )
            weather_scores.append(score)
            
        candidates = candidates.assign(weather_score=weather_scores)
        return candidates[candidates['weather_score'] >= self.weather_thresh]
    
    def evaluate(self, model, test_df, model_type='content', k_list=[1,5,10]):
        """Unified evaluation for all model types"""
        metrics = {f'{m}@{k}':[] for m in ['P','R','NDCG'] for k in k_list}
        metrics['MRR'] = []
        
        for user_id, user_data in tqdm(test_df.groupby('user_id'), desc=f"Evaluating {model_type}"):
            # Generate candidates (positives + sampled negatives)
            positives = user_data[user_data['interaction_label'] == 1]
            if len(positives) == 0:
                continue
                
            # Get all possible negatives
            negative_pool = self.events_df[
                ~self.events_df['event_id'].isin(positives['event_id'])
            ]
            
            # Sample negatives (handle case where pool is smaller than 100)
            n_sample = min(100, len(negative_pool))
            negatives = negative_pool.sample(n_sample) if n_sample > 0 else pd.DataFrame()
            
            if len(negatives) == 0:
                candidates = positives[['event_id','interaction_label']].merge(
                    self.events_df, on='event_id')
            else:
                candidates = pd.concat([
                    positives[['event_id','interaction_label']],
                    negatives[['event_id']].assign(interaction_label=0)
                ]).merge(self.events_df, on='event_id')
            
            # Apply filters
            candidates = self.filter_candidates(user_data, candidates)
            if len(candidates) == 0:
                continue
                
            # Get predictions based on model type
            if model_type == 'content':
                scores = self._predict_content(model, user_data, candidates)
            elif model_type == 'svd':
                scores = self._predict_svd(model, user_data, candidates)
            else:  # hybrid
                scores = self._predict_hybrid(model, user_data, candidates)
                
            # Compute metrics
            y_true = candidates['interaction_label'].values
            ranked_idx = np.argsort(scores)[::-1]
            y_true_sorted = y_true[ranked_idx]
            
            for k in k_list:
                rel = y_true_sorted[:k]
                metrics[f'P@{k}'].append(np.sum(rel) / k)
                metrics[f'R@{k}'].append(np.sum(rel) / np.sum(y_true))
                metrics[f'NDCG@{k}'].append(ndcg_score([y_true], [scores], k=k))
                
            # MRR
            pos_ranks = np.where(y_true_sorted == 1)[0]
            metrics['MRR'].append(1/(pos_ranks[0]+1) if len(pos_ranks) > 0 else 0)
        
        return {k:np.mean(v) for k,v in metrics.items()}
    
    def _predict_content(self, model, user_data, candidates):
        """Content-based prediction"""
        # Assume model contains: tfidf_title, tfidf_interests, scaler, encoder, cb_model
        X_text = hstack([
            model['tfidf_title'].transform(candidates['title'].fillna("")),
            model['tfidf_interests'].transform(
                [user_data['user_interests'].iloc[0]] * len(candidates))
        ])
        
        X_num = model['scaler'].transform(candidates[
            ['interaction_distance_to_event', 'temperature', 'age', 'attendance_rate']
        ])
        X_cat = model['encoder'].transform(candidates[
            ['weather_condition', 'events_event_indoor_capability', 
             'user_weather_preference']
        ])
        
        return model['cb_model'].predict_proba(
            hstack([X_text, X_num, X_cat]).toarray()
        )[:, 1]
    
    def _predict_svd(self, model, user_data, candidates):
        """SVD prediction"""
        return candidates.apply(
            lambda x: model.predict(str(user_data['user_id'].iloc[0]), 
                                  str(x['event_id'])).est,
            axis=1
        )
    
    def _predict_hybrid(self, model, user_data, candidates):
        """Hybrid prediction"""
        # Get SVD scores
        svd_scores = candidates.apply(
            lambda x: model['svd'].predict(str(user_data['user_id'].iloc[0]), 
                                         str(x['event_id'])).est,
            axis=1
        )
        
        # Get content features
        X_text = hstack([
            model['tfidf_title'].transform(candidates['title'].fillna("")),
            model['tfidf_interests'].transform(
                [user_data['user_interests'].iloc[0]] * len(candidates))
        ])
        X_num = model['scaler'].transform(
            candidates[['distance_to_event', 'temperature', 'age', 'attendance_rate']]
            .assign(svd_score=svd_scores)  # Include SVD score as feature
        )
        X_cat = model['encoder'].transform(candidates[
            ['weather_condition', 'events_event_indoor_capability', 
             'user_weather_preference']
        ])
        
        return model['cb_model'].predict_proba(
            hstack([X_text, X_num, X_cat]).toarray()
        )[:, 1]

# ====================== EVALUATION EXECUTION ======================
def evaluate_all_models(test_df, events_df, models):
    """Evaluate all baseline models on real-world data"""
    evaluator = RecommenderEvaluator(events_df)
    
    results = {}
    for name, model in models.items():
        model_type = 'content' if 'cb_model' in model else \
                    'svd' if isinstance(model, SVD) else 'hybrid'
        results[name] = evaluator.evaluate(model, test_df, model_type)
    
    return pd.DataFrame(results).T.round(4)

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from surprise import Dataset, Reader, SVD
from scipy.sparse import hstack
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack
import pickle
import os

model_save_path="/home/nkama/masters_thesis_project/thesis/models"

def train_and_save_models(train_df, model_save_path=model_save_path):
    """Train all baseline models and return them in evaluation-ready format"""
    models = {}
    
    # ===== 1. Train Content-Based Model =====
    print("Training content-based model...")
    # Feature engineering
    tfidf_title = TfidfVectorizer(max_features=100)
    tfidf_interests = TfidfVectorizer(max_features=100)
    tfidf_title_mat = tfidf_title.fit_transform(train_df["title"].fillna(""))
    tfidf_interests_mat = tfidf_interests.fit_transform(train_df["user_interests"].fillna(""))
    
    # Numeric and categorical
    numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate"]
    scaler = StandardScaler().fit(train_df[numeric_cols].fillna(0))
    
    categorical_cols = ['weather_condition', 'event_indoor_capability', 
                       'user_weather_preference']
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
    encoder.fit(train_df[categorical_cols].fillna('unknown'))
    
    # Combine features
    X_train = hstack([
        tfidf_title_mat,
        tfidf_interests_mat,
        scaler.transform(train_df[numeric_cols].fillna(0)),
        encoder.transform(train_df[categorical_cols].fillna('unknown'))
    ]).toarray()
    
    # Train CatBoost
    cb_model = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        verbose=False
    )
    cb_model.fit(X_train, train_df["interaction_label"].astype(int))
    
    # Package content model
    models['content'] = {
        'tfidf_title': tfidf_title,
        'tfidf_interests': tfidf_interests,
        'scaler': scaler,
        'encoder': encoder,
        'cb_model': cb_model
    }
    
    # ===== 2. Train SVD Model =====
    print("Training SVD model...")
    reader = Reader(rating_scale=(0, 1))
    train_svd = train_df[["user_id", "event_id", "interaction_label"]].copy()
    train_svd["user_id"] = train_svd["user_id"].astype(str)
    train_svd["event_id"] = train_svd["event_id"].astype(str)
    
    data = Dataset.load_from_df(train_svd, reader)
    trainset = data.build_full_trainset()
    
    svd_model = SVD(n_epochs=50)
    svd_model.fit(trainset)
    models['svd'] = svd_model
    
        # ===== 3. Train Hybrid Model =====
    print("Training hybrid model...")
    # First get SVD scores for training data
    train_df = train_df.copy()
    train_df["svd_score"] = train_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est,
        axis=1
    )
    
    # Create new scaler for hybrid model that includes svd_score
    numeric_cols_hybrid = numeric_cols + ["svd_score"]
    scaler_hybrid = StandardScaler().fit(train_df[numeric_cols_hybrid].fillna(0))
    
    # TF-IDF features (reuse from content model)
    X_train_text = hstack([
        tfidf_title.transform(train_df["title"].fillna("")),
        tfidf_interests.transform(train_df["user_interests"].fillna(""))
    ])
    
    # Numeric features with new scaler
    X_train_numeric = scaler_hybrid.transform(train_df[numeric_cols_hybrid].fillna(0))
    
    # Categorical features (reuse encoder)
    X_cat_train = encoder.transform(train_df[categorical_cols].fillna('unknown'))
    
    # Combine all features
    X_train_hybrid = hstack([X_train_text, X_train_numeric, X_cat_train]).toarray()
    
    # Train hybrid CatBoost
    hybrid_cb = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        verbose=False
    )
    hybrid_cb.fit(X_train_hybrid, train_df["interaction_label"].astype(int))
    
    # Package hybrid model
    models['hybrid'] = {
        'svd': svd_model,
        'tfidf_title': tfidf_title,
        'tfidf_interests': tfidf_interests,
        'scaler': scaler_hybrid,  # Use the new scaler
        'encoder': encoder,
        'cb_model': hybrid_cb
    }
    
    # ===== Save Models =====
    if model_save_path:
        os.makedirs(model_save_path, exist_ok=True)
        for name, model in models.items():
            with open(f"{model_save_path}/{name}_model.pkl", "wb") as f:
                pickle.dump(model, f)
    
    return models

# Example Usage:
models = train_and_save_models(train_df)
# Then evaluate on real-world data:
results = evaluate_all_models(test_df, events_df, models)
print(results)

Training content-based model...
Training SVD model...
Training hybrid model...


In [39]:
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score
from scipy.sparse import hstack
from tqdm import tqdm
from surprise import Dataset, Reader, SVD
# ====================== COMMON EVALUATION FRAMEWORK ======================
class RecommenderEvaluator:
    def __init__(self, events_df, max_distance_km=2000, weather_threshold=0.2):
        self.events_df = events_df
        self.max_distance = max_distance_km
        self.weather_thresh = weather_threshold
        
    def compute_weather_match(self, user_weather, event_weather, user_pref, is_indoor, 
                            user_temp, event_temp, user_precip, event_precip):
        """Identical weather scoring to target model"""
        score = 0.0
        if user_weather == event_weather:
            score += 0.4
        elif (user_weather in ['Sunny','Cloudy','Clear','Partly Cloudy'] and 
              event_weather in ['Sunny','Cloudy','Clear','Partly Cloudy']):
            score += 0.2
        elif (user_weather in ['Rain','Light Drizzle','Heavy Rain','Thunderstorm'] and 
              event_weather in ['Rain','Light Drizzle','Heavy Rain','Thunderstorm']):
            score += 0.2
            
        if user_pref == 'any':
            score += 0.3
        elif (user_pref == 'indoor' and is_indoor) or (user_pref == 'outdoor' and not is_indoor):
            score += 0.3
            
        temp_diff = abs(user_temp - event_temp)
        precip_diff = abs(user_precip - event_precip)
        temp_score = max(0, 1 - (temp_diff / 20))
        precip_score = max(0, 1 - (precip_diff / 10))
        score += 0.3 * (0.6 * temp_score + 0.4 * precip_score)
        return round(score, 2)
    
    def filter_candidates(self, user_data, candidates):
        """Apply weather/distance filtering"""
        if 'interaction_distance_to_event' not in candidates:
            candidates = candidates.merge(
                user_data[['event_id', 'interaction_distance_to_event']].drop_duplicates(),
                on='event_id', how='left'
            )
            
        # Distance filter
        candidates = candidates[candidates['interaction_distance_to_event'] <= self.max_distance]
        
        # Weather scoring
        weather_scores = []
        for _, event in candidates.iterrows():
            score = self.compute_weather_match(
                user_data['user_weather_preference'].iloc[0],
                event.get('weather_condition', 'any'),
                user_data['user_weather_preference'].iloc[0],
                event['event_indoor_capability'],
                user_data.get('temperature', 20),
                event.get('temperature', 20),
                user_data.get('precipitation', 0),
                event.get('precipitation', 0)
            )
            weather_scores.append(score)
        candidates = candidates.assign(weather_score=weather_scores)
        return candidates[candidates['weather_score'] >= self.weather_thresh]
    
    def evaluate(self, model, test_df, model_type='content', k_list=[1,5,10]):
        """Unified evaluation for all model types"""
        metrics = {f'{m}@{k}':[] for m in ['P','R','NDCG'] for k in k_list}
        metrics['MRR'] = []
        
        for user_id, user_data in tqdm(test_df.groupby('user_id'), desc=f"Evaluating {model_type}"):
            # Generate candidates (positives + sampled negatives)
            positives = user_data[user_data['interaction_label'] == 1]
            if len(positives) == 0:
                continue
                
            # Get all possible negatives
            negative_pool = self.events_df[
                ~self.events_df['event_id'].isin(positives['event_id'])
            ]
            
            # Sample negatives (handle case where pool is smaller than 100)
            n_sample = min(100, len(negative_pool))
            negatives = negative_pool.sample(n_sample) if n_sample > 0 else pd.DataFrame()
            
            if len(negatives) == 0:
                candidates = positives[['event_id','interaction_label']].merge(
                    self.events_df, on='event_id')
            else:
                candidates = pd.concat([
                    positives[['event_id','interaction_label']],
                    negatives[['event_id']].assign(interaction_label=0)
                ]).merge(self.events_df, on='event_id')
            
            # Apply filters
            candidates = self.filter_candidates(user_data, candidates)
            # Add this inside evaluate() after filtering candidates

            if len(candidates) == 0:
                continue
                
            # Get predictions based on model type
            if model_type == 'content':
                scores = self._predict_content(model, user_data, candidates)
            elif model_type == 'svd':
                scores = self._predict_svd(model, user_data, candidates)
            else:  # hybrid
                scores = self._predict_hybrid(model, user_data, candidates)
                
            # Compute metrics
            y_true = candidates['interaction_label'].values
            if np.sum(y_true) == 0:
                print(f"User {user_id} has no positive examples after filtering")
                continue
            ranked_idx = np.argsort(scores)[::-1]
            y_true_sorted = y_true[ranked_idx]
            
            for k in k_list:
                rel = y_true_sorted[:k]
                metrics[f'P@{k}'].append(np.sum(rel) / k)
                metrics[f'R@{k}'].append(np.sum(rel) / np.sum(y_true))
                metrics[f'NDCG@{k}'].append(ndcg_score([y_true], [scores], k=k))
                
            # MRR
            pos_ranks = np.where(y_true_sorted == 1)[0]
            metrics['MRR'].append(1/(pos_ranks[0]+1) if len(pos_ranks) > 0 else 0)
        
        return {k:np.mean(v) for k,v in metrics.items()}
    
    def _predict_content(self, model, user_data, candidates):
        """Content-based prediction"""
        # Assume model contains: tfidf_title, tfidf_interests, scaler, encoder, cb_model
        X_text = hstack([
            model['tfidf_title'].transform(candidates['title'].fillna("")),
            model['tfidf_interests'].transform(
                [user_data['user_interests'].iloc[0]] * len(candidates))
        ])
        
        X_num = model['scaler'].transform(candidates[
            ['interaction_distance_to_event', 'temperature', 'age', 'attendance_rate']
        ].fillna(0))
        
        X_cat = model['encoder'].transform(candidates[
            ['weather_condition', 'event_indoor_capability', 
             'user_weather_preference']
        ].fillna('unknown'))
        
        return model['cb_model'].predict_proba(
            hstack([X_text, X_num, X_cat]).toarray()
        )[:, 1]
    
    def _predict_svd(self, model, user_data, candidates):
        """SVD prediction"""
        return candidates.apply(
            lambda x: model.predict(str(user_data['user_id'].iloc[0]), 
                                  str(x['event_id'])).est,
            axis=1
        )
    
    def _predict_hybrid(self, model, user_data, candidates):
        """Hybrid prediction"""
        # Get SVD scores
        svd_scores = candidates.apply(
            lambda x: model['svd'].predict(str(user_data['user_id'].iloc[0]), 
                                         str(x['event_id'])).est,
            axis=1
        )
        
        # Get content features
        X_text = hstack([
            model['tfidf_title'].transform(candidates['title'].fillna("")),
            model['tfidf_interests'].transform(
                [user_data['user_interests'].iloc[0]] * len(candidates))
        ])
        
        # Correct column name from distance_to_event to interaction_distance_to_event
        X_num = model['scaler'].transform(
            candidates[['interaction_distance_to_event', 'temperature', 'age', 'attendance_rate']]
            .assign(svd_score=svd_scores)  # Include SVD score as feature
            .fillna(0)
        )
        
        X_cat = model['encoder'].transform(candidates[
            ['weather_condition', 'event_indoor_capability', 
             'user_weather_preference']
        ].fillna('unknown'))
        
        return model['cb_model'].predict_proba(
            hstack([X_text, X_num, X_cat]).toarray()
        )[:, 1]

# ====================== EVALUATION EXECUTION ======================
def evaluate_all_models(test_df, events_df, models):
    """Evaluate all baseline models on real-world data"""
    evaluator = RecommenderEvaluator(events_df)
    
    results = {}
    for name, model in models.items():
        model_type = 'svd' if isinstance(model, SVD) else \
            'content' if isinstance(model, dict) and 'cb_model' in model and 'svd' not in model else 'hybrid'

        results[name] = evaluator.evaluate(model, test_df, model_type, k_list=[1, 5, 10, 50, 100])
    
    # Create DataFrame and exclude AUC/MAP scores
    results_df = pd.DataFrame(results).T.round(4)
    return results_df

from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from surprise import Dataset, Reader, SVD
from scipy.sparse import hstack
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy.sparse import hstack
import pickle
import os

model_save_path="/home/nkama/masters_thesis_project/thesis/models"

def train_and_save_models(train_df, model_save_path=model_save_path):
    """Train all baseline models and return them in evaluation-ready format"""
    models = {}
    
    # ===== 1. Train Content-Based Model =====
    print("Training content-based model...")
    # Feature engineering
    tfidf_title = TfidfVectorizer(max_features=100)
    tfidf_interests = TfidfVectorizer(max_features=100)
    tfidf_title_mat = tfidf_title.fit_transform(train_df["title"].fillna(""))
    tfidf_interests_mat = tfidf_interests.fit_transform(train_df["user_interests"].fillna(""))
    
    # Numeric and categorical
    numeric_cols = ["interaction_distance_to_event", "temperature", "age", "attendance_rate"]
    scaler = StandardScaler().fit(train_df[numeric_cols].fillna(0))
    
    categorical_cols = ['weather_condition', 'event_indoor_capability', 
                       'user_weather_preference']
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
    encoder.fit(train_df[categorical_cols].fillna('unknown'))
    
    # Combine features
    X_train = hstack([
        tfidf_title_mat,
        tfidf_interests_mat,
        scaler.transform(train_df[numeric_cols].fillna(0)),
        encoder.transform(train_df[categorical_cols].fillna('unknown'))
    ]).toarray()
    
    # Train CatBoost
    cb_model = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        verbose=False
    )
    cb_model.fit(X_train, train_df["interaction_label"].astype(int))
    
    # Package content model
    models['content'] = {
        'tfidf_title': tfidf_title,
        'tfidf_interests': tfidf_interests,
        'scaler': scaler,
        'encoder': encoder,
        'cb_model': cb_model
    }
    
    # ===== 2. Train SVD Model =====
    print("Training SVD model...")
    reader = Reader(rating_scale=(0, 1))
    train_svd = train_df[["user_id", "event_id", "interaction_label"]].copy()
    train_svd["user_id"] = train_svd["user_id"].astype(str)
    train_svd["event_id"] = train_svd["event_id"].astype(str)
    
    data = Dataset.load_from_df(train_svd, reader)
    trainset = data.build_full_trainset()
    
    svd_model = SVD(n_epochs=50)
    svd_model.fit(trainset)
    models['svd'] = svd_model
    
    # ===== 3. Train Hybrid Model =====
    print("Training hybrid model...")
    # First get SVD scores for training data
    train_df = train_df.copy()
    train_df["svd_score"] = train_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est,
        axis=1
    )
    
    # Create new scaler for hybrid model that includes svd_score
    numeric_cols_hybrid = numeric_cols + ["svd_score"]
    scaler_hybrid = StandardScaler().fit(train_df[numeric_cols_hybrid].fillna(0))
    
    # TF-IDF features (reuse from content model)
    X_train_text = hstack([
        tfidf_title.transform(train_df["title"].fillna("")),
        tfidf_interests.transform(train_df["user_interests"].fillna(""))
    ])
    
    # Numeric features with new scaler
    X_train_numeric = scaler_hybrid.transform(train_df[numeric_cols_hybrid].fillna(0))
    
    # Categorical features (reuse encoder)
    X_cat_train = encoder.transform(train_df[categorical_cols].fillna('unknown'))
    
    # Combine all features
    X_train_hybrid = hstack([X_train_text, X_train_numeric, X_cat_train]).toarray()
    
    # Train hybrid CatBoost
    hybrid_cb = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        verbose=False
    )
    hybrid_cb.fit(X_train_hybrid, train_df["interaction_label"].astype(int))
    
    # Package hybrid model
    models['hybrid'] = {
        'svd': svd_model,
        'tfidf_title': tfidf_title,
        'tfidf_interests': tfidf_interests,
        'scaler': scaler_hybrid,  # Use the new scaler
        'encoder': encoder,
        'cb_model': hybrid_cb
    }
    
    # ===== Save Models =====
    if model_save_path:
        os.makedirs(model_save_path, exist_ok=True)
        for name, model in models.items():
            with open(f"{model_save_path}/{name}_model.pkl", "wb") as f:
                pickle.dump(model, f)
    
    return models

# Example Usage:
models = train_and_save_models(train_df)
#Then evaluate on real-world data:
results = evaluate_all_models(test_df, events_df, models)
print(results)


Training content-based model...
Training SVD model...
Training hybrid model...


Evaluating content: 100%|██████████| 5315/5315 [02:23<00:00, 37.14it/s] 
Evaluating svd: 100%|██████████| 5315/5315 [03:56<00:00, 22.44it/s] 
Evaluating hybrid: 100%|██████████| 5315/5315 [04:10<00:00, 21.23it/s] 


         P@1  P@5  P@10  P@50  P@100  R@1  R@5  R@10  R@50  R@100  NDCG@1  \
content  NaN  NaN   NaN   NaN    NaN  NaN  NaN   NaN   NaN    NaN     NaN   
svd      NaN  NaN   NaN   NaN    NaN  NaN  NaN   NaN   NaN    NaN     NaN   
hybrid   NaN  NaN   NaN   NaN    NaN  NaN  NaN   NaN   NaN    NaN     NaN   

         NDCG@5  NDCG@10  NDCG@50  NDCG@100  MRR  
content     NaN      NaN      NaN       NaN  NaN  
svd         NaN      NaN      NaN       NaN  NaN  
hybrid      NaN      NaN      NaN       NaN  NaN  


In [49]:
import hopsworks

# Connect to Hopsworks
project = hopsworks.login()
ms = project.get_model_serving()
mr = project.get_model_registry()  # Get the model registry
ranking_model = mr.get_model("weather_ranking_model", version=1) # Get the model 


from hsml.deployment_config import DeploymentConfig

# Create deployment configuration with environment variables
deployment_config = DeploymentConfig(
    environment={"SKLEARN_SERVER_EXTRA_REQUIREMENTS": "catboost==1.2.1"}
)

# Deploy with configuration
ranking_deployment = ranking_model.deploy(
    name="weathermodel",
    deployment_config=deployment_config
)


2025-05-18 01:57:38,184 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-05-18 01:57:38,189 INFO: Initializing external client
2025-05-18 01:57:38,190 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-18 01:57:39,385 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1220788


ModuleNotFoundError: No module named 'hsml.deployment_config'

: 