In [7]:
import pandas as pd
# Load data
users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_users_data.csv')
events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_events_data.csv")
interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/notebooks/test_interactions_data.csv')

# Renaming columns in users_df
users_df.rename(columns={
    'lat': 'user_lat',
    'lng': 'user_lon',
    'location': 'user_city',
    'indoor_outdoor_preference': 'user_weather_preference',
    'joinedAt': 'signup_date'
}, inplace=True)

# Renaming columns in events_df
events_df.rename(columns={
    'category': 'event_type',
    'lat': 'event_lat',
    'lng': 'event_lon',
    'city': 'event_city',
    'weather_description': 'weather_condition',
    'temperature_2m_mean': 'temperature'
}, inplace=True)

# Renaming columns in interactions_df
interactions_df.rename(columns={
    'distance_to_event': 'interaction_distance_to_event'
}, inplace=True)
print(interactions_df.isnull().sum())

Unnamed: 0                       0
user_id                          0
event_id                         0
interaction_type                 0
interaction_distance_to_event    0
interaction_label                0
event_weather_condition          0
event_temperature                0
event_precipitation_sum          0
user_weather_condition           0
user_temperature                 0
user_precipitation               0
dtype: int64


In [3]:
interactions_df.head(2)

Unnamed: 0,user_id,event_id,interaction_type,distance_to_event,interaction_label,event_weather_condition,event_temperature,event_precipitation_sum,user_weather_condition,user_temperature,user_precipitation
0,3468617687,702719295,maybe,522.753486,1,Cloudy,3.7455,0.0,Cloudy,5.463667,0.0
1,487910947,702719295,maybe,506.036813,1,Cloudy,3.7455,0.0,Light Drizzle,4.505249,0.1


In [9]:
def preprocess_common(interactions_df, users_df, events_df):
 
    # Create copies of the input dataframes
    interactions_df = interactions_df.copy()
    users_df = users_df.copy()
    events_df = events_df.copy()

    # Drop rows with missing user_id or event_id
    interactions_df = interactions_df.dropna(subset=["user_id", "event_id", "interaction_label"])
    
    # Convert distance_to_event to float
    interactions_df["interaction_distance_to_event"] = interactions_df["interaction_distance_to_event"].fillna(0).astype(float)
    
    # Ensure correct types
    for df in [interactions_df, users_df]:
        df["user_id"] = df["user_id"].astype(str)
    
    for df in [interactions_df, events_df]:
        df["event_id"] = df["event_id"].astype(str)

    interactions_df["interaction_label"] = interactions_df["interaction_label"].astype(int)

    # Convert to string type for TF-IDF fields
    events_df["title"] = events_df["title"].fillna("").astype(str)
    users_df["user_interests"] = users_df["user_interests"].fillna("").astype(str)
    users_df["age"] = users_df["age"].fillna(0).astype(float)

    # Ensure all numeric fields are float - fixed to reference events_df instead of df
    numeric_cols = ["temperature", "attendance_rate"]
    for col in numeric_cols:
        events_df[col] = pd.to_numeric(events_df[col], errors="coerce").fillna(0).astype(float)

    # Return all three dataframes
    return interactions_df, users_df, events_df

interactions_df, users_df, events_df = preprocess_common(interactions_df, users_df, events_df)
len(interactions_df) + len(users_df) + len(events_df)
merged_df = interactions_df.merge(events_df,on="event_id")\
    .merge(users_df, on="user_id")

merged_df.head()


Unnamed: 0.1,Unnamed: 0_x,user_id,event_id,interaction_type,interaction_distance_to_event,interaction_label,event_weather_condition,event_temperature,event_precipitation_sum,user_weather_condition,...,event_indoor_capability,precipitation_sum,Unnamed: 0,user_lat,user_lon,user_city,user_weather_preference,age,user_interests,signup_date
0,0,3468617687,702719295,maybe,522.753486,1,Cloudy,3.7455,0.0,Cloudy,...,True,0.0,366,43.158,-79.244,Saint Catharines Ontario,indoor,20.0,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z
1,4,3468617687,284003894,invited & yes,55.942166,1,Cloudy,11.129666,0.0,Partly Cloudy,...,True,0.0,366,43.158,-79.244,Saint Catharines Ontario,indoor,20.0,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z
2,45,3468617687,2951450859,invited & yes,55.67033,1,Cloudy,1.859583,0.0,Cloudy,...,True,0.0,366,43.158,-79.244,Saint Catharines Ontario,indoor,20.0,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z
3,76,3468617687,3961159311,invited & yes,55.045346,1,Cloudy,6.20475,0.0,Cloudy,...,True,0.0,366,43.158,-79.244,Saint Catharines Ontario,indoor,20.0,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z
4,87,3468617687,2017343689,invited & yes,56.013814,1,Light Rain,8.367833,16.800001,Light Rain,...,True,16.800001,366,43.158,-79.244,Saint Catharines Ontario,indoor,20.0,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z


In [10]:
                                  

merged_df = merged_df.drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "Unnamed: 0"])



merged_df = merged_df.rename(columns={"interaction_distance_to_event": "distance_to_event"})
merged_df.head(2)
#merged_df.to_csv("merged_interaction_df.csv")


Unnamed: 0,user_id,event_id,interaction_type,distance_to_event,interaction_label,event_weather_condition,event_temperature,event_precipitation_sum,user_weather_condition,user_temperature,...,attendance_rate,event_indoor_capability,precipitation_sum,user_lat,user_lon,user_city,user_weather_preference,age,user_interests,signup_date
0,3468617687,702719295,maybe,522.753486,1,Cloudy,3.7455,0.0,Cloudy,5.463667,...,0.458333,True,0.0,43.158,-79.244,Saint Catharines Ontario,indoor,20.0,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z
1,3468617687,284003894,invited & yes,55.942166,1,Cloudy,11.129666,0.0,Partly Cloudy,11.06575,...,0.022042,True,0.0,43.158,-79.244,Saint Catharines Ontario,indoor,20.0,drink technology business seasonal food entert...,2012-09-25T17:48:37.804Z


In [11]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

In [14]:

# Enhanced benchmarking evaluation and training pipeline with complete metrics and weather features included
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from surprise import Dataset, Reader, SVD
from scipy.sparse import hstack
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Shared Evaluation ---
def compute_ranking_metrics(y_true, y_score, k=10):
    # Sort indices by descending score
    sorted_indices = np.argsort(y_score)[::-1]
    # Get top-k true labels
    top_k = np.array(y_true)[sorted_indices][:k]
    # Calculate precision
    precision = np.mean(top_k)
    # Calculate recall
    recall = np.sum(top_k) / np.sum(y_true) if np.sum(y_true) > 0 else 0
    # Calculate DCG
    dcg = np.sum(top_k / np.log2(np.arange(2, len(top_k) + 2)))
    # Calculate ideal DCG
    ideal_k = min(int(np.sum(y_true)), k)
    idcg = np.sum([1 / np.log2(i + 2) for i in range(ideal_k)])
    # Calculate NDCG
    ndcg = dcg / idcg if idcg > 0 else 0
    return precision, recall, ndcg

def compute_all_metrics(y_true, y_score):
    # Ensure there are enough positive examples for AUC calculation
    if len(np.unique(y_true)) < 2:
        metrics = {
            "AUC": np.nan,
            "MAP": np.nan
        }
    else:
        metrics = {
            "AUC": roc_auc_score(y_true, y_score),
            "MAP": average_precision_score(y_true, y_score)
        }
    
    for k in [5, 10]:
        p, r, n = compute_ranking_metrics(y_true, y_score, k)
        metrics[f"Precision@{k}"] = p
        metrics[f"Recall@{k}"] = r
        metrics[f"NDCG@{k}"] = n
    return metrics


Training Content-Based model...
Training SVD model...
Training Hybrid (SVD + Content + Weather) model with strict validation...


Unnamed: 0,AUC,MAP,Precision@5,Recall@5,NDCG@5,Precision@10,Recall@10,NDCG@10
Content-Based,0.893437,0.991727,1.0,0.000235,1.0,1.0,0.000469,1.0
SVD,0.990446,0.999411,1.0,0.000235,1.0,1.0,0.000469,1.0
Hybrid,0.712929,0.966202,1.0,0.000935,1.0,1.0,0.001871,1.0


In [24]:
def train_content_model(train_df, val_df):
    # Clean and prepare training data
    train_clean = train_df.copy()
    train_clean["title"] = train_clean["title"].fillna("")
    train_clean["user_interests"] = train_clean["user_interests"].fillna("")
    
    # Clean and prepare validation data
    val_clean = val_df.copy()
    val_clean["title"] = val_clean["title"].fillna("")
    val_clean["user_interests"] = val_clean["user_interests"].fillna("")

    # TF-IDF features - fit only on training data
    tfidf_title = TfidfVectorizer(max_features=100)
    tfidf_interests = TfidfVectorizer(max_features=100)
    tfidf_title_mat_train = tfidf_title.fit_transform(train_clean["title"])
    tfidf_interests_mat_train = tfidf_interests.fit_transform(train_clean["user_interests"])
    
    # Transform validation data using fitted vectorizers
    tfidf_title_mat_val = tfidf_title.transform(val_clean["title"])
    tfidf_interests_mat_val = tfidf_interests.transform(val_clean["user_interests"])

    # Numeric features
    numeric_cols = ["distance_to_event", "temperature", "age", "attendance_rate"]
    train_clean[numeric_cols] = train_clean[numeric_cols].fillna(0)
    val_clean[numeric_cols] = val_clean[numeric_cols].fillna(0)
    
    # Fit scaler on training data only
    scaler = StandardScaler().fit(train_clean[numeric_cols])
    X_numeric_train = scaler.transform(train_clean[numeric_cols])
    X_numeric_val = scaler.transform(val_clean[numeric_cols])
    
    # Categorical features with OneHotEncoder
    categorical_cols = ['events_weather_condition', 'events_event_indoor_capability', 'users_user_weather_preference']
    
    # Fill missing values in categorical columns
    for col in categorical_cols:
        train_clean[col] = train_clean[col].fillna('unknown')
        val_clean[col] = val_clean[col].fillna('unknown')
    
    # Create OneHotEncoder for categorical features
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_cat_train = encoder.fit_transform(train_clean[categorical_cols])
    X_cat_val = encoder.transform(val_clean[categorical_cols])

    # Combine all features
    X_train = hstack([tfidf_title_mat_train, tfidf_interests_mat_train, X_numeric_train, X_cat_train]).toarray()
    X_val = hstack([tfidf_title_mat_val, tfidf_interests_mat_val, X_numeric_val, X_cat_val]).toarray()
    
    y_train = train_clean["interaction_label"].astype(int)
    y_val = val_clean["interaction_label"].astype(int)

    # Train CatBoost on training data only
    model = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        loss_function='Logloss',
        verbose=False
    )
    model.fit(X_train, y_train)
    
    # Predict on validation data
    val_scores = model.predict_proba(X_val)[:, 1]

    return model, compute_all_metrics(y_val, val_scores)

def train_svd(train_df, val_df):
    # Prepare training data for SVD
    reader = Reader(rating_scale=(0, 1))
    train_svd = train_df[["user_id", "event_id", "interaction_label"]].copy()
    
    # Convert all to strings to ensure compatibility
    train_svd["user_id"] = train_svd["user_id"].astype(str)
    train_svd["event_id"] = train_svd["event_id"].astype(str)
    train_svd["interaction_label"] = train_svd["interaction_label"].astype(float)
    
    data = Dataset.load_from_df(train_svd, reader)
    trainset = data.build_full_trainset()
    
    # Train SVD model on training data only
    svd = SVD(n_epochs=50).fit(trainset)
    
    # Generate predictions for validation data
    val_copy = val_df.copy()
    val_copy["svd_score"] = val_copy.apply(
        lambda row: svd.predict(str(row["user_id"]), str(row["event_id"])).est, 
        axis=1
    )
    
    return svd, compute_all_metrics(val_copy["interaction_label"].astype(int), val_copy["svd_score"])

def hybrid_model(train_df, val_df):
    from sklearn.preprocessing import OneHotEncoder
    
    # Step 1: Train SVD model on train_df
    reader = Reader(rating_scale=(0, 1))
    train_svd_data = Dataset.load_from_df(train_df[["user_id", "event_id", "interaction_label"]].astype(str), reader)
    trainset = train_svd_data.build_full_trainset()
    svd_model = SVD(n_epochs=20).fit(trainset)

    # Step 2: Compute SVD scores for both train and val
    train_df = train_df.copy()
    val_df = val_df.copy()
    train_df["svd_score"] = train_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )
    val_df["svd_score"] = val_df.apply(
        lambda row: svd_model.predict(str(row["user_id"]), str(row["event_id"])).est, axis=1
    )

    # Step 3: TF-IDF + numeric features
    tfidf_title = TfidfVectorizer(max_features=50)
    tfidf_interests = TfidfVectorizer(max_features=50)
    tfidf_title.fit(train_df["title"].fillna(""))
    tfidf_interests.fit(train_df["user_interests"].fillna(""))

    X_train_text = hstack([
        tfidf_title.transform(train_df["title"].fillna("")),
        tfidf_interests.transform(train_df["user_interests"].fillna(""))
    ])
    X_val_text = hstack([
        tfidf_title.transform(val_df["title"].fillna("")),
        tfidf_interests.transform(val_df["user_interests"].fillna(""))
    ])

    # Numeric features including SVD score
    numeric_cols = ["distance_to_event", "temperature", "age", "attendance_rate", "svd_score"]
    train_df[numeric_cols] = train_df[numeric_cols].fillna(0)
    val_df[numeric_cols] = val_df[numeric_cols].fillna(0)

    scaler = StandardScaler().fit(train_df[numeric_cols])
    X_train_numeric = scaler.transform(train_df[numeric_cols])
    X_val_numeric = scaler.transform(val_df[numeric_cols])
    
    # Categorical features with OneHotEncoder
    categorical_cols = ['events_weather_condition', 'events_event_indoor_capability', 'users_user_weather_preference']
    
    # Fill missing values in categorical columns
    for col in categorical_cols:
        train_df[col] = train_df[col].fillna('unknown')
        val_df[col] = val_df[col].fillna('unknown')
    
    # Create OneHotEncoder for categorical features
    encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
    X_cat_train = encoder.fit_transform(train_df[categorical_cols])
    X_cat_val = encoder.transform(val_df[categorical_cols])

    # Combine all features
    X_train = hstack([X_train_text, X_train_numeric, X_cat_train]).toarray()
    X_val = hstack([X_val_text, X_val_numeric, X_cat_val]).toarray()
    
    y_train = train_df["interaction_label"].astype(int)
    y_val = val_df["interaction_label"].astype(int)

    model = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        loss_function='Logloss',
        verbose=False
    )
    model.fit(X_train, y_train)

    scores = model.predict_proba(X_val)[:, 1]
    return model, compute_all_metrics(y_val, scores)
'✅ Hybrid model updated to use CatBoost for better non-linear modeling.'


def compare_all_models(train_df, val_df):
    train_df = train_df.copy()
    val_df = val_df.copy()

    # Ensure correct types and fill missing
    for df in [train_df, val_df]:
        df["user_id"] = df["user_id"].astype(str)
        df["event_id"] = df["event_id"].astype(str)
        df["interaction_label"] = df["interaction_label"].astype(int)
        df["title"] = df["title"].fillna("").astype(str)
        df["user_interests"] = df["user_interests"].fillna("").astype(str)
        
        # Ensure categorical columns exist
        for col in ['events_weather_condition', 'events_event_indoor_capability', 'users_user_weather_preference']:
            if col not in df.columns:
                df[col] = "unknown"

    print("Training Content-Based model...")
    _, content_scores = train_content_model(train_df, val_df)

    print("Training SVD model...")
    _, svd_scores = train_svd(train_df, val_df)

    print("Training Hybrid (SVD + Content + Weather) model with strict validation...")
    _, hybrid_scores = hybrid_model(train_df, val_df)

    # Assemble final results
    results_df = pd.DataFrame({
        "Content-Based": content_scores,
        "SVD": svd_scores,
        "Hybrid": hybrid_scores,
    })

    return results_df.T

compare_all_models(train_df, val_df)

Training Content-Based model...
Training SVD model...
Training Hybrid (SVD + Content + Weather) model with strict validation...


Unnamed: 0,AUC,MAP,Precision@5,Recall@5,NDCG@5,Precision@10,Recall@10,NDCG@10
Content-Based,0.775522,0.980886,1.0,0.000935,1.0,1.0,0.001871,1.0
SVD,0.79764,0.980718,1.0,0.000935,1.0,1.0,0.001871,1.0
Hybrid,0.708555,0.964895,0.8,0.000748,0.853932,0.8,0.001497,0.831848
