In [None]:
import pandas as pd
# Load the three raw tables of the project: interactions, users and events.
interactions_df = pd.read_csv("/home/nkama/masters_thesis_project/interactions.csv")
users_df = pd.read_csv("/home/nkama/masters_thesis_project/users.csv")
events_df = pd.read_csv("/home/nkama/masters_thesis_project/events.csv")
interactions_df.head(2)

# Binary target: a "maybe" or "yes" answer (with or without an invitation)
# counts as a positive interaction (1); every other interaction type is 0.
interactions_df["interaction_label"] = (
    interactions_df["interaction_type"]
    .isin(['maybe', 'invited & maybe', 'yes', 'invited & yes'])
    .astype(int)
)


In [None]:
def preprocess_common(interactions_df, users_df, events_df):
    """Normalise dtypes and missing values of the three raw tables.

    The function works on copies, so the frames passed in are not modified.

    Args:
        interactions_df: Interaction rows with ``user_id``, ``event_id``,
            ``interaction_label`` and ``interaction_distance_to_event``.
        users_df: User rows with ``user_id``, ``user_interests`` and ``age``.
        events_df: Event rows with ``event_id``, ``title``, ``duration``,
            ``temperature`` and ``attendance_rate``.

    Returns:
        Tuple ``(interactions, users, events)`` of cleaned copies.
    """
    def _zero_filled_float(series):
        # Missing values become 0 before the cast to float.
        return series.fillna(0).astype(float)

    def _blank_filled_text(series):
        # Missing values become the empty string so text vectorisers accept them.
        return series.fillna("").astype(str)

    interactions = interactions_df.copy()
    users = users_df.copy()
    events = events_df.copy()

    # An interaction row is dropped when its user, event or label is missing.
    interactions = interactions.dropna(subset=["user_id", "event_id", "interaction_label"])
    interactions["interaction_distance_to_event"] = _zero_filled_float(
        interactions["interaction_distance_to_event"]
    )

    # Join keys are stored as strings in every table that carries them.
    key_owners = (
        ("user_id", (interactions, users)),
        ("event_id", (interactions, events)),
    )
    for key, frames in key_owners:
        for frame in frames:
            frame[key] = frame[key].astype(str)

    interactions["interaction_label"] = interactions["interaction_label"].astype(int)

    # Free-text fields used for TF-IDF, plus the user's age.
    events["title"] = _blank_filled_text(events["title"])
    users["user_interests"] = _blank_filled_text(users["user_interests"])
    users["age"] = _zero_filled_float(users["age"])

    # Event numerics: values that cannot be parsed are coerced to NaN, then to 0.
    for column in ("duration", "temperature", "attendance_rate"):
        events[column] = _zero_filled_float(pd.to_numeric(events[column], errors="coerce"))

    return interactions, users, events

interactions_df, users_df, events_df = preprocess_common(interactions_df, users_df, events_df)

# Attach event and user metadata to every interaction. Both merges use the
# pandas default (inner join), so interactions without a matching event or
# user are dropped.
merged_df = (
    interactions_df
    .merge(events_df, on="event_id")
    .merge(users_df, on="user_id")
    # "Unnamed: 0*" columns are presumably row indexes saved into the source
    # CSVs (TODO confirm). errors="ignore" keeps the cell runnable when an
    # input file is regenerated without such a column.
    .drop(columns=["Unnamed: 0_x", "Unnamed: 0_y", "Unnamed: 0"], errors="ignore")
    .rename(columns={"interaction_distance_to_event": "distance_to_event"})
)

# index=False: writing the row index would make it reappear as another
# "Unnamed: 0" column when this file is read back with pd.read_csv later on.
merged_df.to_csv("merged_interaction_df.csv", index=False)

merged_df.head(2)


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the merged interactions as a validation set; the fixed
# random_state makes the split repeatable across runs.
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)


In [None]:

from lightfm import LightFM
from lightfm.data import Dataset as LFMData
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from scipy.sparse import hstack
from sklearn.metrics import roc_auc_score, average_precision_score
import numpy as np

# Evaluation metric helpers
def evaluate_ranking_scores(y_true, scores, k=10):
    """Compute precision@k and recall@k over one globally ranked list.

    All rows are ranked together by descending score; the metrics are not
    computed per user.

    Args:
        y_true: Binary relevance labels (list, NumPy array or pandas Series).
        scores: Predicted scores, same length and order as ``y_true``.
        k: Number of top-ranked rows to inspect.

    Returns:
        Tuple ``(precision_at_k, recall_at_k)`` of floats. Both are 0.0 when
        there is nothing to rank; recall is 0.0 when ``y_true`` has no positives.
    """
    # Convert up front: argsort yields positions, and indexing a list or a
    # pandas Series with them would fail or perform a label-based lookup.
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)

    ranking = np.argsort(scores)[::-1]
    top_k = y_true[ranking][:k]
    if top_k.size == 0:
        return 0.0, 0.0

    precision_at_k = float(np.mean(top_k))
    n_positive = np.sum(y_true)
    # Guard the division so a sample without positives yields 0 instead of NaN.
    recall_at_k = float(np.sum(top_k) / n_positive) if n_positive > 0 else 0.0
    return precision_at_k, recall_at_k

def base_metrics(y_true, scores, k=10):
    """Collect AUC, average precision and precision/recall at ``k``.

    Args:
        y_true: Binary ground-truth labels.
        scores: Predicted scores aligned with ``y_true``.
        k: Cut-off passed on to ``evaluate_ranking_scores``.

    Returns:
        Dict with the keys ``"AUC"``, ``"MAP"``, ``"Precision@<k>"`` and
        ``"Recall@<k>"``.
    """
    metrics = {
        "AUC": roc_auc_score(y_true, scores),
        "MAP": average_precision_score(y_true, scores),
    }
    # The ranking helper receives NumPy arrays rather than the raw inputs.
    precision, recall = evaluate_ranking_scores(np.array(y_true), np.array(scores), k)
    metrics[f"Precision@{k}"] = precision
    metrics[f"Recall@{k}"] = recall
    return metrics

# Prepare metadata-enriched LightFM model
def train_lightfm_with_metadata(df):
    """Train a WARP LightFM model with user/item metadata and score a hold-out.

    The frame is split 80/20 (stratified on the label). The model is fitted on
    the positive interactions of the training part and evaluated on all rows
    of the test part.

    Args:
        df: Merged interaction frame with the columns ``user_id``,
            ``event_id``, ``interaction_label``, ``user_weather_preference``,
            ``user_interests``, ``event_type`` and ``weather_condition``.

    Returns:
        Tuple ``(model, metrics)``: the fitted ``LightFM`` instance and the
        dict produced by ``base_metrics`` on the test rows.
    """
    df = df.copy()
    df["interaction_label"] = df["interaction_label"].astype(float)
    # Fill once so fitting the feature vocabulary and building the per-user
    # features see the same text (a NaN would break ``.split()`` below).
    df["user_interests"] = df["user_interests"].fillna("")

    # Split into train/test sets
    train_df, test_df = train_test_split(
        df, test_size=0.2, stratify=df["interaction_label"], random_state=42
    )

    # Register every user, item and feature name with the dataset.
    dataset = LFMData()
    dataset.fit(
        users=df["user_id"].unique(),
        items=df["event_id"].unique(),
        user_features=df["user_weather_preference"].unique().tolist()
                      + df["user_interests"].str.split().explode().unique().tolist(),
        item_features=df["event_type"].unique().tolist()
                      + df["weather_condition"].unique().tolist(),
    )

    # build_interactions records every listed (user, item) pair as an
    # interaction; the optional third element only goes to the separate
    # weights matrix. Rows with label 0 must therefore be left out, otherwise
    # WARP is trained on them as positives.
    positive_train = train_df.loc[train_df["interaction_label"] > 0, ["user_id", "event_id"]]
    train_interactions, _ = dataset.build_interactions(
        positive_train.itertuples(index=False, name=None)
    )

    # One feature record per distinct (user, metadata) combination instead of
    # one per interaction row: less work, and repeated identical records may
    # be accumulated by LightFM's feature builder (TODO confirm), which would
    # tie feature weights to how many interactions a user has.
    user_meta = df[["user_id", "user_weather_preference", "user_interests"]].drop_duplicates()
    user_features = [
        (user_id, [weather_pref] + interests.split())
        for user_id, weather_pref, interests in user_meta.itertuples(index=False, name=None)
    ]
    user_features_mat = dataset.build_user_features(user_features)

    item_meta = df[["event_id", "event_type", "weather_condition"]].drop_duplicates()
    item_features = [
        (event_id, [event_type, weather_condition])
        for event_id, event_type, weather_condition in item_meta.itertuples(index=False, name=None)
    ]
    item_features_mat = dataset.build_item_features(item_features)

    # Train LightFM model
    model = LightFM(loss="warp", random_state=42)
    model.fit(
        train_interactions,
        user_features=user_features_mat,
        item_features=item_features_mat,
        epochs=10,
        num_threads=2,
    )

    # Map user/item ids to internal LightFM ids. ``.copy()`` makes the
    # filtered frame independent so the column assignments below are safe.
    user_mapping, _, item_mapping, _ = dataset.mapping()
    known = test_df["user_id"].isin(user_mapping) & test_df["event_id"].isin(item_mapping)
    test_df = test_df[known].copy()
    test_df["user_idx"] = test_df["user_id"].map(user_mapping)
    test_df["item_idx"] = test_df["event_id"].map(item_mapping)

    # Score all test pairs in a single call instead of one call per row.
    test_df["lightfm_score"] = model.predict(
        test_df["user_idx"].to_numpy(dtype=np.int32),
        test_df["item_idx"].to_numpy(dtype=np.int32),
        user_features=user_features_mat,
        item_features=item_features_mat,
    )

    # Compute metrics
    return model, base_metrics(test_df["interaction_label"], test_df["lightfm_score"])


In [None]:

# Train and evaluate the LightFM model with metadata.
# The full merged frame is passed in because the function performs its own
# train/test split internally.
lightfm_model, lightfm_metrics = train_lightfm_with_metadata(merged_df)
# Trailing expression: the notebook renders the returned metrics dict.
lightfm_metrics


In [None]:

#Second version

import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset as LFMData
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, average_precision_score, ndcg_score

# Read back the merged interaction table written earlier in the notebook and
# make sure the label column is an integer.
df = pd.read_csv("merged_interaction_df.csv").astype({"interaction_label": int})

# 80/20 split, stratified on the label, with a fixed seed.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["interaction_label"],
    random_state=42,
)

# --------------------------
# Build user/item metadata as feature dicts
# --------------------------

def extract_user_features(df):
    """Build a ``user_id -> list of feature tokens`` mapping for LightFM.

    Each user gets bucketed age (decades), weather preference, bucketed social
    connectedness (steps of 5) and one ``interest:<term>`` token per TF-IDF
    vocabulary term (at most 50 terms) present in ``user_interests``.

    Args:
        df: Frame with ``user_id``, ``age``, ``user_weather_preference``,
            ``social_connectedness`` and ``user_interests``. A user may appear
            on several rows; the last row seen for a user wins.

    Returns:
        Dict mapping each ``user_id`` to its list of feature strings.
    """
    tfidf = TfidfVectorizer(max_features=50)
    tfidf_matrix = tfidf.fit_transform(df["user_interests"].fillna(""))
    interest_features = tfidf.get_feature_names_out()

    user_features = {}
    # ``pos`` is the positional row number, which is what indexes
    # ``tfidf_matrix``. The index label from ``iterrows`` only coincides with
    # it when the frame has a pristine 0..n-1 index, so it is ignored.
    for pos, (_, row) in enumerate(df.iterrows()):
        feats = [
            f"age:{int(row['age']//10)*10}",
            f"weather_pref:{row['user_weather_preference']}",
            f"social:{int(row['social_connectedness']//5)*5}",
        ]
        tfidf_feats = tfidf_matrix[pos].toarray().flatten()
        feats += [f"interest:{interest_features[i]}" for i in tfidf_feats.nonzero()[0]]
        user_features[row["user_id"]] = feats
    return user_features

def extract_item_features(df):
    """Build an ``event_id -> list of feature tokens`` mapping for LightFM.

    Each event gets its type, indoor capability, bucketed temperature (steps
    of 5), duration in whole units of 60 and one ``title:<term>`` token per
    TF-IDF vocabulary term (at most 50 terms) present in ``title``.

    Args:
        df: Frame with ``event_id``, ``event_type``,
            ``event_indoor_capability``, ``temperature``, ``duration`` and
            ``title``. An event may appear on several rows; the last row seen
            for an event wins.

    Returns:
        Dict mapping each ``event_id`` to its list of feature strings.
    """
    tfidf = TfidfVectorizer(max_features=50)
    tfidf_matrix = tfidf.fit_transform(df["title"].fillna(""))
    title_features = tfidf.get_feature_names_out()

    item_features = {}
    # ``pos`` is the positional row number, which is what indexes
    # ``tfidf_matrix``. The index label from ``iterrows`` only coincides with
    # it when the frame has a pristine 0..n-1 index, so it is ignored.
    for pos, (_, row) in enumerate(df.iterrows()):
        feats = [
            f"event_type:{row['event_type']}",
            f"indoor:{row['event_indoor_capability']}",
            f"temperature:{int(row['temperature']//5)*5}",
            f"duration:{int(row['duration']//60)}h",
        ]
        tfidf_feats = tfidf_matrix[pos].toarray().flatten()
        feats += [f"title:{title_features[i]}" for i in tfidf_feats.nonzero()[0]]
        item_features[row["event_id"]] = feats
    return item_features

user_features_dict = extract_user_features(df)
item_features_dict = extract_item_features(df)

# --------------------------
# Build LightFM Dataset
# --------------------------
# Every user, item and feature token has to be registered before the
# interaction and feature matrices can be built.
dataset = LFMData()
dataset.fit(
    users=df["user_id"].unique(),
    items=df["event_id"].unique(),
    user_features={f for feats in user_features_dict.values() for f in feats},
    item_features={f for feats in item_features_dict.values() for f in feats}
)

# Interactions
# build_interactions records every listed (user, item) pair as an interaction;
# an optional third element only goes to the separate weights matrix. Rows
# with label 0 are therefore excluded, otherwise WARP would be trained on
# them as positives.
positive_train = train_df.loc[train_df["interaction_label"] > 0, ["user_id", "event_id"]]
interactions, _ = dataset.build_interactions(
    positive_train.itertuples(index=False, name=None)
)

# Feature matrices
user_features = dataset.build_user_features(user_features_dict.items())
item_features = dataset.build_item_features(item_features_dict.items())

# --------------------------
# Train LightFM
# --------------------------
model = LightFM(loss='warp', random_state=42)
model.fit(interactions, user_features=user_features, item_features=item_features, epochs=10, num_threads=4)

# --------------------------
# Evaluate
# --------------------------
# Map IDs. ``.copy()`` makes the filtered frame independent so the column
# assignments below do not write into a slice of the original split.
user_map, _, item_map, _ = dataset.mapping()
test_df = test_df[test_df["user_id"].isin(user_map) & test_df["event_id"].isin(item_map)].copy()
test_df["user_idx"] = test_df["user_id"].map(user_map)
test_df["item_idx"] = test_df["event_id"].map(item_map)

# Predict
user_ids = test_df["user_idx"].values
item_ids = test_df["item_idx"].values

test_df["lightfm_score"] = model.predict(
    user_ids, item_ids,
    user_features=user_features,
    item_features=item_features
)


# --------------------------
# Ranking Evaluation
# --------------------------
def evaluate_all(y_true, y_score, k_values=[5, 10]):
    """Compute AUC, average precision and top-k ranking metrics.

    All rows are ranked together by descending score; nothing is grouped per
    user.

    Args:
        y_true: Binary ground-truth labels.
        y_score: Predicted scores aligned with ``y_true``.
        k_values: Cut-offs for which precision, recall and NDCG are reported.

    Returns:
        Dict with ``"AUC"``, ``"MAP"`` and, for every ``k``,
        ``"Precision@k"``, ``"Recall@k"`` and ``"NDCG@k"``.
    """
    results = {}
    results["AUC"] = roc_auc_score(y_true, y_score)
    results["MAP"] = average_precision_score(y_true, y_score)

    # Positional indexing below requires plain arrays.
    y_true = np.array(y_true)
    y_score = np.array(y_score)
    for k in k_values:
        best_first = np.argsort(y_score)[::-1]
        hits = y_true[best_first[:k]]
        results.update({
            f"Precision@{k}": np.mean(hits),
            f"Recall@{k}": np.sum(hits) / np.sum(y_true),
            f"NDCG@{k}": ndcg_score([y_true], [y_score], k=k),
        })
    return results

# Score the held-out rows and print one "name: value" line per metric.
metrics = evaluate_all(test_df["interaction_label"], test_df["lightfm_score"])
print("📊 LightFM with Metadata Results:")
print("\n".join(f"{name}: {value:.4f}" for name, value in metrics.items()))


In [None]:

# Enhanced benchmarking evaluation and training pipeline with complete metrics and weather features included
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD
from lightfm import LightFM
from lightfm.data import Dataset as LFMData
from lightfm.evaluation import precision_at_k, auc_score
from scipy.sparse import hstack
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier

# --- Shared Evaluation ---
def compute_ranking_metrics(y_true, y_score, k=10):
    """Return precision@k, recall@k and NDCG@k for one globally ranked list.

    Args:
        y_true: Binary relevance labels.
        y_score: Predicted scores aligned with ``y_true``.
        k: Number of top-ranked rows to inspect.

    Returns:
        Tuple ``(precision, recall, ndcg)``. Recall and NDCG are 0 when
        ``y_true`` contains no positives.
    """
    # Labels of the k best-scored rows, best first.
    best_first = np.argsort(y_score)[::-1]
    hits = np.array(y_true)[best_first][:k]

    precision = np.mean(hits)

    n_positive = np.sum(y_true)
    if n_positive > 0:
        recall = np.sum(hits) / n_positive
    else:
        recall = 0

    # DCG discounts the hit at rank r (1-based) by log2(r + 1).
    dcg = np.sum(hits / np.log2(np.arange(2, len(hits) + 2)))

    # Ideal DCG: every available positive (capped at k) sits at the top.
    n_ideal = min(int(n_positive), k)
    idcg = np.sum(1 / np.log2(np.arange(2, n_ideal + 2)))

    ndcg = dcg / idcg if idcg > 0 else 0
    return precision, recall, ndcg

def compute_all_metrics(y_true, y_score):
    """Assemble AUC, MAP and the top-5 / top-10 ranking metrics in one dict.

    Args:
        y_true: Binary ground-truth labels.
        y_score: Predicted scores aligned with ``y_true``.

    Returns:
        Dict with ``"AUC"``, ``"MAP"`` and ``"Precision@k"``, ``"Recall@k"``,
        ``"NDCG@k"`` for k in (5, 10). AUC and MAP are NaN when ``y_true``
        holds fewer than two distinct values.
    """
    # The threshold metrics are only computed when both classes are present.
    both_classes_present = len(np.unique(y_true)) >= 2
    if both_classes_present:
        metrics = {
            "AUC": roc_auc_score(y_true, y_score),
            "MAP": average_precision_score(y_true, y_score),
        }
    else:
        metrics = {"AUC": np.nan, "MAP": np.nan}

    for k in (5, 10):
        precision, recall, ndcg = compute_ranking_metrics(y_true, y_score, k)
        metrics.update({
            f"Precision@{k}": precision,
            f"Recall@{k}": recall,
            f"NDCG@{k}": ndcg,
        })
    return metrics

# --- Content-Based Model ---
def train_content_model(df, val_df=None):
    """Fit a TF-IDF + numeric logistic-regression baseline and score it.

    Args:
        df: Training frame with ``title``, ``user_interests``,
            ``distance_to_event``, ``duration``, ``temperature``, ``age``,
            ``attendance_rate`` and ``interaction_label``.
        val_df: Optional held-out frame with the same columns. When given, the
            metrics are computed on it with vectorisers and scaler fitted on
            ``df`` only. When omitted, the metrics are computed on ``df``
            itself, i.e. on the training data.

    Returns:
        Tuple ``(model, metrics)``: the fitted ``LogisticRegression`` and the
        dict returned by ``compute_all_metrics``.
    """
    numeric_cols = ["distance_to_event", "duration", "temperature", "age", "attendance_rate"]

    def _clean(frame):
        # Work on a copy; blank out missing text and zero out missing numerics.
        frame = frame.copy()
        frame["title"] = frame["title"].fillna("")
        frame["user_interests"] = frame["user_interests"].fillna("")
        frame[numeric_cols] = frame[numeric_cols].fillna(0)
        return frame

    train = _clean(df)

    # All transformers are fitted on the training frame only.
    title_vectorizer = TfidfVectorizer(max_features=100).fit(train["title"])
    interests_vectorizer = TfidfVectorizer(max_features=100).fit(train["user_interests"])
    scaler = StandardScaler().fit(train[numeric_cols])

    def _design_matrix(frame):
        # Column layout: title TF-IDF | interests TF-IDF | scaled numerics.
        return hstack([
            title_vectorizer.transform(frame["title"]),
            interests_vectorizer.transform(frame["user_interests"]),
            scaler.transform(frame[numeric_cols]),
        ])

    y_train = train["interaction_label"].astype(int)
    model = LogisticRegression(max_iter=1000, solver='liblinear').fit(_design_matrix(train), y_train)

    eval_frame = train if val_df is None else _clean(val_df)
    scores = model.predict_proba(_design_matrix(eval_frame))[:, 1]
    return model, compute_all_metrics(eval_frame["interaction_label"].astype(int), scores)


def _fit_svd(df):
    """Fit a Surprise SVD on the (user_id, event_id, interaction_label) triples of ``df``."""
    ratings = df[["user_id", "event_id", "interaction_label"]].copy()
    # Ids as strings and the label as float, so the raw ids used at
    # prediction time (also cast with ``str``) match the ones seen in training.
    ratings["user_id"] = ratings["user_id"].astype(str)
    ratings["event_id"] = ratings["event_id"].astype(str)
    ratings["interaction_label"] = ratings["interaction_label"].astype(float)

    data = Dataset.load_from_df(ratings, Reader(rating_scale=(0, 1)))
    # Fixed seed, in line with the random_state=42 used elsewhere in the notebook.
    return SVD(n_epochs=20, random_state=42).fit(data.build_full_trainset())


def _svd_scores(svd, df):
    """Return the SVD estimate for every (user_id, event_id) row of ``df``, in row order."""
    return [
        svd.predict(str(user_id), str(event_id)).est
        for user_id, event_id in zip(df["user_id"], df["event_id"])
    ]


# --- SVD Model ---
def train_svd(df, val_df=None):
    """Fit a Surprise SVD on ``df`` and score it.

    Args:
        df: Training frame with ``user_id``, ``event_id`` and
            ``interaction_label``.
        val_df: Optional held-out frame with the same columns. When given, the
            metrics are computed on it; when omitted, they are computed on
            ``df`` itself, i.e. on the training data.

    Returns:
        Tuple ``(svd, metrics)``: the fitted ``SVD`` and the dict returned by
        ``compute_all_metrics``.
    """
    svd = _fit_svd(df)

    eval_frame = (df if val_df is None else val_df).copy()
    eval_frame["svd_score"] = _svd_scores(svd, eval_frame)

    return svd, compute_all_metrics(
        eval_frame["interaction_label"].astype(int), eval_frame["svd_score"]
    )


# --- Final Hybrid Training with Train/Test Separation ---
def hybrid_model(train_df, val_df):
    """Train CatBoost on TF-IDF text, numeric columns and an SVD score.

    All preprocessing (SVD, vectorisers, scaler) is fitted on ``train_df``;
    the metrics are computed on ``val_df``.

    Args:
        train_df: Training frame with ``user_id``, ``event_id``, ``title``,
            ``user_interests``, ``distance_to_event``, ``duration``,
            ``temperature``, ``age``, ``attendance_rate`` and
            ``interaction_label``.
        val_df: Validation frame with the same columns.

    Returns:
        Tuple ``(model, metrics)``: the fitted ``CatBoostClassifier`` and the
        dict returned by ``compute_all_metrics`` on ``val_df``.
    """
    # Step 1: SVD on the training interactions only.
    svd_model = _fit_svd(train_df)

    # Step 2: SVD scores for both frames.
    # NOTE(review): the training rows are scored by an SVD fitted on those
    # same rows, so ``svd_score`` is likely more informative on train than on
    # validation; out-of-fold scores would avoid that. Confirm before relying
    # on the hybrid's numbers.
    train_df = train_df.copy()
    val_df = val_df.copy()
    train_df["svd_score"] = _svd_scores(svd_model, train_df)
    val_df["svd_score"] = _svd_scores(svd_model, val_df)

    # Step 3: TF-IDF + numeric features, fitted on the training frame.
    tfidf_title = TfidfVectorizer(max_features=50).fit(train_df["title"].fillna(""))
    tfidf_interests = TfidfVectorizer(max_features=50).fit(train_df["user_interests"].fillna(""))

    def _text_features(frame):
        return hstack([
            tfidf_title.transform(frame["title"].fillna("")),
            tfidf_interests.transform(frame["user_interests"].fillna("")),
        ])

    numeric_cols = ["distance_to_event", "duration", "temperature", "age", "attendance_rate", "svd_score"]
    train_df[numeric_cols] = train_df[numeric_cols].fillna(0)
    val_df[numeric_cols] = val_df[numeric_cols].fillna(0)
    scaler = StandardScaler().fit(train_df[numeric_cols])

    # Combine all features; the matrices are densified before being passed to CatBoost.
    X_train = hstack([_text_features(train_df), scaler.transform(train_df[numeric_cols])]).toarray()
    X_val = hstack([_text_features(val_df), scaler.transform(val_df[numeric_cols])]).toarray()
    y_train = train_df["interaction_label"].astype(int)
    y_val = val_df["interaction_label"].astype(int)

    model = CatBoostClassifier(
        iterations=200,
        depth=6,
        learning_rate=0.1,
        loss_function='Logloss',
        verbose=False
    )
    model.fit(X_train, y_train)

    scores = model.predict_proba(X_val)[:, 1]
    return model, compute_all_metrics(y_val, scores)


# Final all-model benchmark comparison with strict train/validation split and consistent evaluation
def compare_all_models_strict(train_df, val_df):
    """Train every benchmark model on ``train_df`` and score each on ``val_df``.

    All three models are evaluated on the same held-out frame, so the rows of
    the returned table are directly comparable.

    Args:
        train_df: Training split of the merged interaction frame.
        val_df: Validation split of the merged interaction frame.

    Returns:
        DataFrame with one row per model (``Content-Based``, ``SVD``,
        ``Hybrid``) and one column per metric.
    """
    train_df = train_df.copy()
    val_df = val_df.copy()

    # Ensure correct types and fill missing text. The frames are copies, so
    # assigning columns inside the loop does not touch the caller's data.
    for frame in (train_df, val_df):
        frame["user_id"] = frame["user_id"].astype(str)
        frame["event_id"] = frame["event_id"].astype(str)
        frame["interaction_label"] = frame["interaction_label"].astype(int)
        frame["title"] = frame["title"].fillna("").astype(str)
        frame["user_interests"] = frame["user_interests"].fillna("").astype(str)

    print("Training Content-Based model...")
    _, content_metrics = train_content_model(train_df, val_df)

    print("Training SVD model...")
    _, svd_metrics = train_svd(train_df, val_df)

    print("Training Hybrid (SVD + Content + Weather) model with strict validation...")
    _, hybrid_metrics = hybrid_model(train_df, val_df)

    # Assemble final results: dict of {model: metrics}, transposed so that
    # models become rows.
    results_df = pd.DataFrame({
        "Content-Based": content_metrics,
        "SVD": svd_metrics,
        "Hybrid": hybrid_metrics,
    })

    return results_df.T

# Bare string expression; it is not the last statement of the cell, so
# nothing is displayed for it.
"✅ All-model benchmark comparison function finalized with strict train/val split and metric evaluation."

from sklearn.model_selection import train_test_split

# 80/20 split of the merged interactions with a fixed seed (not stratified).
train_df, val_df = train_test_split(merged_df, test_size=0.2, random_state=42)

# Run the benchmark and render the resulting table (one row per model).
benchmark_results = compare_all_models_strict(train_df, val_df)
display(benchmark_results)
