## <span style="color:#ff5f27">👨🏻‍🏫 Train Ranking Model </span>

In this notebook, we will train a ranking model using gradient boosted trees. 

In [3]:
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, precision_recall_fscore_support
import joblib

In [4]:
import hopsworks

project = hopsworks.login()

fs = project.get_feature_store()

2025-05-31 19:39:32,159 INFO: Initializing external client
2025-05-31 19:39:32,162 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-31 19:39:33,949 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1220788


### get feature groups

In [3]:

users_fg = fs.get_feature_group(
    name="users",
    version=1
)

events_fg = fs.get_feature_group(
    name="events",
    version=1
)

weather_rank_fg = fs.get_feature_group(
    name="weather_ranking",
    version=1
)

no_weather_rank_fg = fs.get_feature_group(
    name="no_weather_ranking",
    version=1
)

## <span style="color:#ff5f27">⚙️ Feature View Creation </span>

In [38]:
# Select features
selected_features_customers = users_fg.select_all()

fs.get_or_create_feature_view( 
    name='users',
    query=selected_features_customers,
    version=1,
)

<hsfs.feature_view.FeatureView at 0x7fd3a81830d0>

In [39]:
# Select features
selected_features_articles = events_fg.select_all()

fs.get_or_create_feature_view(
    name='events',
    query=selected_features_articles,
    version=1,
)

<hsfs.feature_view.FeatureView at 0x7fd3a876ab30>

In [5]:
NO_WEATHER_SELECTED_FEATURES =['interaction_distance_to_event', 'event_type', 'event_city','title',
       'attendance_rate', 'event_indoor_capability', 'user_city', 'age',
       'user_interests','interaction_label']

WEATHER_SELECTED_FEATURES =['interaction_distance_to_event', 'event_type', 'event_city','title', 
       'weather_condition', 'temperature','precipitation', 'attendance_rate',
       'event_indoor_capability', 'user_city', 'indoor_outdoor_preference',
       'age', 'user_interests','user_weather_condition', 'user_temperature',	'user_precipitation','interaction_label']


In [40]:
# Select weather features
features_weather_ranking = weather_rank_fg.select(WEATHER_SELECTED_FEATURES)
# Select no weather features
features_no_weather_ranking = no_weather_rank_fg.select(NO_WEATHER_SELECTED_FEATURES)

In [41]:
# Create feature view for weather ranking
feature_view_ranking_weather = fs.get_or_create_feature_view(
    name='weather_ranking_2',
    query=features_weather_ranking,
    labels=['interaction_label'],
    version=1,
)
# Create feature view for no weather ranking
feature_view_ranking_no_weather = fs.get_or_create_feature_view(
    name='no_weather_ranking_2',
    query=features_no_weather_ranking,
    labels=["interaction_label"],
    version=1,
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fv/weather_ranking_2/version/1
Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fv/no_weather_ranking_2/version/1


---

---

---

---

## <span style="color:#ff5f27">🗄️ Train Data loading </span>

In [6]:
# Get feature views weather ranking
feature_view_ranking_weather = fs.get_feature_view(name='weather_ranking_2', version=1)


In [7]:
# Get feature views no weather ranking
feature_view_ranking_no_weather = fs.get_feature_view(name='no_weather_ranking_2', version=1)


In [7]:
# NO_WEATHER_SELECTED_FEATURES =['interaction_distance_to_event', 'event_type', 'event_city',
#        'attendance_rate', 'event_indoor_capability', 'user_city', 'age',
#        'user_interests']

# WEATHER_SELECTED_FEATURES =['interaction_distance_to_event', 'event_type', 'event_city', 
#        'weather_condition', 'temperature', 'attendance_rate',
#        'event_indoor_capability', 'user_city', 'indoor_outdoor_preference',
#        'age', 'user_interests']


In [None]:
# from sklearn.model_selection import train_test_split
# users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/notebooks/users.csv')
# events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/notebooks/events.csv")
# interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/notebooks/interactions.csv')

# merged_df = pd.merge(interactions_df, users_df, on='user_id')
# merged_df = pd.merge(merged_df, events_df, on='event_id') 


# X_weather = merged_df[WEATHER_SELECTED_FEATURES]
# y_weather = merged_df["interaction_label"]

# X_no_weather = merged_df[NO_WEATHER_SELECTED_FEATURES]
# y_no_weather = merged_df["interaction_label"]

# # Split merged_df into train and test using train_test_split
# weather_X_train, weather_X_val, weather_y_train, weather_y_val = train_test_split(X_weather,y_weather, test_size=0.2, random_state=42)
# no_weather_X_train, no_weather_X_val, no_weather_y_train, no_weather_y_val = train_test_split(X_no_weather,y_no_weather, test_size=0.2, random_state=42)
# X_weather.head()

In [8]:
# Get training and validation data directly from feature views for weather ranking
weather_X_train, weather_X_val, weather_y_train, weather_y_val = \
    feature_view_ranking_weather.train_test_split(
    test_size=0.1,
    description='Weather ranking training dataset',
)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (22.92s) 



In [9]:

# Get training and validation data directly from feature views for no weather ranking
no_weather_X_train, no_weather_X_val, no_weather_y_train, no_weather_y_val = \
    feature_view_ranking_no_weather.train_test_split(
    test_size=0.1,
    description='No-weather ranking training dataset',
)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (15.61s) 



In [18]:
weather_X_train.columns


Index(['interaction_distance_to_event', 'event_type', 'event_city',
       'weather_condition', 'temperature', 'precipitation', 'attendance_rate',
       'event_indoor_capability', 'user_city', 'indoor_outdoor_preference',
       'age', 'user_interests', 'user_weather_condition', 'user_temperature',
       'user_precipitation'],
      dtype='object')

In [2]:
# import pandas as pd
# users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/data/main_data/users.csv')
# events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/partially_synthetic/data/main_data/events.csv")
# interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/partially_synthetic/data/main_data/interactions.csv')



# # Merge user/event features into interactions
# interactions_df = interactions_df.merge(users_df, on="user_id")
# interactions_df = interactions_df.merge(events_df, on="event_id", suffixes=('_user', '_event'))
# from sklearn.model_selection import train_test_split

# NO_WEATHER_SELECTED_FEATURES =['interaction_type',
#        'distance_to_event', 'interaction_label',
#         'gender', 'joinedAt', 'location', 'age',
#       'indoor_outdoor_preference', 'user_interests', 
#        'start_time', 'city', 'yes_count',
#        'maybe_count', 'invited_count', 'no_count', 'total_users', 'category', 
#        'title', 'event_type','event_indoor_capability']

# WEATHER_SELECTED_FEATURES =['interaction_type',
#        'distance_to_event', 'interaction_label',
#         'gender', 'joinedAt', 'location', 'age',
#       'indoor_outdoor_preference',
#        'start_time', 'city', 'yes_count',
#        'maybe_count', 'invited_count', 'no_count', 'total_users',
#        'weather_description', 'category','event_type',
#        'event_indoor_capability', 'temperature_2m_mean', 'precipitation_sum']

# # )
# # Splitting the dataset into features and labels
# weather_X = interactions_df[WEATHER_SELECTED_FEATURES]  # Features
# weather_y = interactions_df['interaction_label']   

# no_weather_X = interactions_df[NO_WEATHER_SELECTED_FEATURES]  # Features
# no_weather_y = interactions_df['interaction_label']                   # Labels

# # Splitting the dataset into training and evaluation sets
# weather_X_train, weather_X_val, weather_y_train, weather_y_val = \
#     train_test_split(weather_X, weather_y, test_size=0.2, random_state=42)

# no_weather_X_train, no_weather_X_val, no_weather_y_train, no_weather_y_val = \
#     train_test_split(no_weather_X, no_weather_y, test_size=0.2, random_state=42)

In [63]:
# from catboost import CatBoostClassifier, Pool
# from sklearn.metrics import classification_report, precision_recall_fscore_support
# from sklearn.metrics import confusion_matrix
# import numpy as np

# # Final version without text fields (title, user_interests)
# def train_catboost_without_text_fields(
#     train_df, val_df, train_y, val_y
# ):
#     # Drop the text fields if present
#     text_columns = ["title", "user_interests"]
#     train_df = train_df.drop(columns=[col for col in text_columns if col in train_df.columns])
#     val_df = val_df.drop(columns=[col for col in text_columns if col in val_df.columns])

#     # Identify categorical features
#     cat_features = train_df.select_dtypes(include=["object", "bool"]).columns.tolist()

#     # Create CatBoost Pools
#     train_pool = Pool(train_df, train_y, cat_features=cat_features)
#     val_pool = Pool(val_df, val_y, cat_features=cat_features)
#     # Calculate class weights
#     pos_weight = len(train_y[train_y == 0]) / len(train_y[train_y == 1])


#     # Train the model
#     model = CatBoostClassifier(
#         learning_rate=0.2,
#         iterations=100,
#         depth=10,
#         early_stopping_rounds=5,
#         use_best_model=True,
#         scale_pos_weight=10,
#         verbose=False
#     )


#     model.fit(train_pool, eval_set=val_pool)

#     # Evaluation
#     preds = model.predict(val_pool)
#     precision, recall, fscore, _ = precision_recall_fscore_support(val_y, preds, average="binary")
#     print("\nClassification Report:")
#     print(classification_report(val_y, preds))

#     metrics = {
#         "precision": precision,
#         "recall": recall,
#         "fscore": fscore,
#     }
    
#     preds = model.scores = model.predict_proba(val_pool)[:, 1] 
#     print("Predicted Class Distribution:", np.unique(preds, return_counts=True))

#     # print("\nConfusion Matrix:")
#     # print(confusion_matrix(val_y, preds))

#     return model, metrics, val_pool

# "CatBoost training function excluding title and user_interests."


'CatBoost training function excluding title and user_interests.'

In [10]:
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import classification_report, precision_recall_fscore_support
import numpy as np

# Final version including text fields (title, user_interests)
def train_catboost(
    train_df, val_df, train_y, val_y
):
    # Identify categorical features, including text fields
    cat_features = train_df.select_dtypes(include=["object", "bool"]).columns.tolist()

    # Create CatBoost Pools
    train_pool = Pool(train_df, train_y, cat_features=cat_features)
    val_pool = Pool(val_df, val_y, cat_features=cat_features)

    # Calculate class weights
    pos_weight = len(train_y[train_y == 0]) / len(train_y[train_y == 1])

    # Train the model
    model = CatBoostClassifier(
        learning_rate=0.01,
        iterations=100,
        depth=5,
        early_stopping_rounds=5,
        use_best_model=True,
        scale_pos_weight=pos_weight,  # Use calculated positive weight
        verbose=False
    )

    model.fit(train_pool, eval_set=val_pool)

    # Evaluation
    preds = model.predict(val_pool)
    precision, recall, fscore, _ = precision_recall_fscore_support(val_y, preds, average="binary")
    print("\nClassification Report:")
    print(classification_report(val_y, preds))

    metrics = {
        "precision": precision,
        "recall": recall,
        "fscore": fscore,
    }
    
    preds_proba = model.predict_proba(val_pool)[:, 1] 
    print("Predicted Class Distribution:", np.unique(preds_proba, return_counts=True))

    return model, metrics, val_pool

# Note: Ensure that the DataFrame passed to this function includes 'user_interests' and 'title'.

In [9]:
weather_y_val.value_counts()

interaction_label
1                    35448
0                    14292
Name: count, dtype: int64

In [11]:
# Use this function to train on your weather / no-weather datasets
weather_model, weather_metrics, weather_val_pool = train_catboost(
    train_df=weather_X_train,
    val_df=weather_X_val,
    train_y=weather_y_train,
    val_y=weather_y_val
)

# #Save the models using Joblib
# joblib.dump(weather_model, '/home/nkama/masters_thesis_project/thesis/models/weather_ranking_model.pkl')
# print("\nModels saved successfully!")




Classification Report:



              precision    recall  f1-score   support

           0       0.00      0.00      0.00     14292
           1       0.71      1.00      0.83     35448

    accuracy                           0.71     49740
   macro avg       0.36      0.50      0.42     49740
weighted avg       0.51      0.71      0.59     49740

Predicted Class Distribution: (array([0.5934457 , 0.59348359, 0.59351597, ..., 0.68195727, 0.68196254,
       0.68212717]), array([1, 1, 1, ..., 1, 1, 1]))


In [15]:
feat_to_score = {
    feature: score 
    for feature, score 
    in zip(
        weather_X_train.columns, 
        weather_model.feature_importances_,
    )
}

feat_to_score = dict(
    sorted(
        feat_to_score.items(),
        key=lambda item: item[1],
        reverse=True,
    )
)
feat_to_score

{'interaction_distance_to_event': 95.93204998677713,
 'event_indoor_capability': 0.4346953089621326,
 'title': 0.4240075671567045,
 'user_temperature': 0.40476148658368594,
 'precipitation': 0.36677061158728846,
 'weather_condition': 0.3435573834861786,
 'age': 0.33834087598832935,
 'user_weather_condition': 0.30151470123997165,
 'attendance_rate': 0.30036077578444514,
 'event_city': 0.2893172652883507,
 'temperature': 0.25110583733846803,
 'user_interests': 0.22352187552828293,
 'user_city': 0.14621329280679,
 'indoor_outdoor_preference': 0.14097193571307504,
 'event_type': 0.0540516872020414,
 'user_precipitation': 0.0487594085571333}

In [35]:
weather_X_train.columns

Index(['interaction_distance_to_event', 'event_type', 'event_city',
       'weather_condition', 'temperature', 'precipitation', 'attendance_rate',
       'event_indoor_capability', 'user_city', 'indoor_outdoor_preference',
       'age', 'user_interests', 'user_weather_condition', 'user_temperature',
       'user_precipitation'],
      dtype='object')

In [13]:

# Use this function to train on your weather / no-weather datasets
no_weather_model, no_weather_metrics, no_weather_val_pool = train_catboost(
    train_df=no_weather_X_train,
    val_df=no_weather_X_val,
    train_y=no_weather_y_train,
    val_y=no_weather_y_val
)

# joblib.dump(no_weather_model, '/home/nkama/masters_thesis_project/thesis/models/no_weather_ranking_model.pkl')
# print("\nModels saved successfully!")


Classification Report:



              precision    recall  f1-score   support

           0       0.00      0.00      0.00     14312
           1       0.71      1.00      0.83     35428

    accuracy                           0.71     49740
   macro avg       0.36      0.50      0.42     49740
weighted avg       0.51      0.71      0.59     49740

Predicted Class Distribution: (array([0.59322731, 0.59340502, 0.59341716, ..., 0.68802799, 0.68803421,
       0.68805272]), array([2, 1, 1, ..., 1, 1, 1]))


In [16]:

feat_to_score = {
    feature: score 
    for feature, score 
    in zip(
        no_weather_X_train.columns, 
        no_weather_model.feature_importances_,
    )
}

feat_to_score = dict(
    sorted(
        feat_to_score.items(),
        key=lambda item: item[1],
        reverse=True,
    )
)
feat_to_score

{'interaction_distance_to_event': 98.56858103820176,
 'title': 0.32138396014526593,
 'event_city': 0.28768716105271774,
 'event_type': 0.2739099062313311,
 'event_indoor_capability': 0.1959338405594333,
 'attendance_rate': 0.12532672542750817,
 'age': 0.12158177157819017,
 'user_city': 0.10559559680378885,
 'user_interests': 0.0}

In [8]:
# from sklearn.metrics import roc_auc_score, average_precision_score, ndcg_score, precision_score, recall_score
# import numpy as np


# def evaluate_ranking_model_proba(model, val_pool, val_y, k_list=[5, 10]):
#     """
#     Evaluate a CatBoost ranking model using predicted probabilities, not binary class outputs.
#     """

#     # Predict class probabilities (not class labels)
#     proba = model.predict_proba(val_pool)[:, 1]  # Probability for class 1

#     results = {
#         "AUC": roc_auc_score(val_y, proba),
#         "Average Precision (MAP)": average_precision_score(val_y, proba),
#     }

#     # Convert to numpy arrays
#     true_labels = np.array(val_y)
#     predicted_scores = np.array(proba)

#     # Sort by predicted score
#     sorted_indices = np.argsort(predicted_scores)[::-1]
#     sorted_true = true_labels[sorted_indices]

#     for k in k_list:
#         top_k = sorted_true[:k]
#         precision_at_k = np.mean(top_k)
#         recall_at_k = np.sum(top_k) / np.sum(true_labels)
#         ndcg_at_k = ndcg_score(
#             y_true=true_labels.reshape(1, -1),
#             y_score=predicted_scores.reshape(1, -1),
#             k=k
#         )



#         results[f"Precision@{k}"] = precision_at_k
#         results[f"Recall@{k}"] = recall_at_k
#         results[f"NDCG@{k}"] = ndcg_at_k

#     return results

# "✅ Evaluation function ready: scores ranking model using AUC, MAP, Precision@K, Recall@K, and NDCG@K."


'✅ Evaluation function ready: scores ranking model using AUC, MAP, Precision@K, Recall@K, and NDCG@K.'

In [None]:

# # Evaluate weather-aware model
# weather_scores = evaluate_ranking_model_proba(
#     model=weather_model,
#     val_pool=weather_val_pool,
#     val_y=weather_y_val
# )

# # Evaluate no-weather model
# no_weather_scores = evaluate_ranking_model_proba(
#     model=no_weather_model,
#     val_pool=no_weather_val_pool,
#     val_y=no_weather_y_val
# )

# # Compare results
# print("Weather Model Scores:")
# for k, v in weather_scores.items():
#     print(f"{k}: {v:.4f}")

# print("\nNo-Weather Model Scores:")
# for k, v in no_weather_scores.items():
#     print(f"{k}: {v:.4f}")


In [28]:
# Connect to Hopsworks Model Registry
mr = project.get_model_registry()

In [29]:
from hsml.schema import Schema
from hsml.model_schema import ModelSchema

# Create model schema for weather ranking model
input_example = weather_X_train.sample().to_dict("records")
input_schema = Schema(weather_X_train)
output_schema = Schema(weather_y_train)
model_schema = ModelSchema(input_schema, output_schema)

weather_ranking_model = mr.python.create_model(
    name="weather_ranking_model", 
    metrics=weather_metrics,
    model_schema=model_schema,
    input_example=input_example,
    description="Ranking model that scores item candidates",
)
weather_ranking_model.save("/home/nkama/masters_thesis_project/thesis/models/weather_ranking_model.pkl")

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /home/nkama/masters_thesis_project/thesis/models/weather_ranking_model.pkl: 0.000%|          | 0/880…

Uploading /home/nkama/masters_thesis_project/thesis/notebooks/input_example.json: 0.000%|          | 0/447 ela…

Uploading /home/nkama/masters_thesis_project/thesis/notebooks/model_schema.json: 0.000%|          | 0/1283 ela…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1220788/models/weather_ranking_model/1


Model(name: 'weather_ranking_model', version: 1)

In [30]:
# Create model schema for no weather ranking model  
input_example = no_weather_X_train.sample().to_dict("records")
input_schema = Schema(no_weather_X_train)
output_schema = Schema(no_weather_y_train)
model_schema = ModelSchema(input_schema, output_schema)

no_weather_ranking_model = mr.python.create_model(
    name="no_weather_ranking_model", 
    metrics=no_weather_metrics,
    model_schema=model_schema,
    input_example=input_example,
    description="Ranking model that scores item candidates",
)
no_weather_ranking_model.save("/home/nkama/masters_thesis_project/thesis/models/no_weather_ranking_model.pkl")

  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /home/nkama/masters_thesis_project/thesis/models/no_weather_ranking_model.pkl: 0.000%|          | 0/…

Uploading /home/nkama/masters_thesis_project/thesis/notebooks/input_example.json: 0.000%|          | 0/264 ela…

Uploading /home/nkama/masters_thesis_project/thesis/notebooks/model_schema.json: 0.000%|          | 0/799 elap…

Model created, explore it at https://c.app.hopsworks.ai:443/p/1220788/models/no_weather_ranking_model/1


Model(name: 'no_weather_ranking_model', version: 1)

In [5]:
import pandas as pd
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Load the datasets
events_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/events.csv')
users_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/users.csv')
interactions_df = pd.read_csv('/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/interactions.csv')
events_df.rename(columns={"category": "event_type"}, inplace = True)



# Merge the dataframes
merged_df = pd.merge(interactions_df, events_df, on='event_id', how='left')
merged_df = pd.merge(merged_df, users_df, on='user_id', how='left')

# Rename 'category' column to 'event_type' as requested
if 'category' in merged_df.columns:
    merged_df.rename(columns={'category': 'event_type'}, inplace=True)
else:
    print("Warning: 'category' column not found in merged_df. Assuming 'event_type' is already present or handled.")

# Create a copy of the dataframe for feature engineering
df = merged_df.copy()

# Feature Engineering
# Convert target variable `interaction_label` to numerical
df['interaction_label'] = df['interaction_label'].apply(lambda x: 1 if x == 'yes' else 0)

# Convert datetime columns
df['interaction_time'] = pd.to_datetime(df['interaction_time'])
df['start_time'] = pd.to_datetime(df['start_time'])
df['signup_date'] = pd.to_datetime(df['signup_date']) # Convert signup_date as well

# Extract time-based features
df['interaction_hour'] = df['interaction_time'].dt.hour
df['interaction_dayofweek'] = df['interaction_time'].dt.dayofweek
df['interaction_month'] = df['interaction_time'].dt.month

df['event_hour'] = df['start_time'].dt.hour
df['event_dayofweek'] = df['start_time'].dt.dayofweek
df['event_month'] = df['start_time'].dt.month

# Calculate time difference between interaction and event start
df['time_to_event_hours'] = (df['start_time'] - df['interaction_time']).dt.total_seconds() / 3600

# Calculate event duration in hours
df['event_duration_hours'] = df['duration'] / 60

# Calculate user_age_at_signup
# Assuming 'age' is current age and 'signup_date' is user's signup date.
# A more precise feature would be age at the time of interaction or event,
# but for simplicity, using the 'age' column as a direct feature for now.
# If `age` is the user's current age, and `signup_date` is a past date,
# `user_age_at_signup` might not be directly derivable without knowing the current date the `age` was recorded.
# For now, we will use the existing `age` column directly.

# One-hot encode categorical features
# Include 'indoor_outdoor_preference' in categorical columns
categorical_cols = ['user_city', 'event_type', 'event_city', 'interaction_type', 'weather_condition', 'user_weather_condition', 'indoor_outdoor_preference']
df = pd.get_dummies(df, columns=categorical_cols, dummy_na=False)

# Convert 'event_indoor_capability' to numerical (True/False to 1/0)
df['event_indoor_capability'] = df['event_indoor_capability'].astype(int)

# Text feature engineering for `user_interests` (CountVectorizer)
# Fill NaN values in 'user_interests' with empty string
df['user_interests'] = df['user_interests'].fillna('')
cv = CountVectorizer(max_features=50) # Limit features to avoid sparsity
user_interests_features = cv.fit_transform(df['user_interests']).toarray()
# Ensure consistent indexing after reset_index
user_interests_df = pd.DataFrame(user_interests_features, columns=[f'interest_{i}' for i in range(user_interests_features.shape[1])], index=df.index)
df = pd.concat([df, user_interests_df], axis=1)

# Text feature engineering for `title` (TF-IDF Vectorizer)
# Fill NaN values in 'title' with empty string
df['title'] = df['title'].fillna('')
tfidf = TfidfVectorizer(max_features=100) # Limit features to avoid sparsity
title_features = tfidf.fit_transform(df['title']).toarray()
# Ensure consistent indexing after reset_index
title_df = pd.DataFrame(title_features, columns=[f'title_tfidf_{i}' for i in range(title_features.shape[1])], index=df.index)
df = pd.concat([df, title_df], axis=1)

# Drop original columns that are no longer needed or were transformed
df = df.drop(columns=['interaction_id', 'user_id', 'event_id', 'interaction_time', 'start_time', 'duration',
                      'user_interests', 'title', 'signup_date', 'user_lat', 'user_lon', 'event_lat', 'event_lon'])

# Define features for both models
# Common features
common_features = [
    'interaction_distance_to_event', 'age', 'social_connectedness',
    'attendance_rate', 'event_indoor_capability', 'interaction_hour',
    'interaction_dayofweek', 'interaction_month', 'event_hour',
    'event_dayofweek', 'event_month', 'time_to_event_hours', 'event_duration_hours'
] + [col for col in df.columns if col.startswith('user_city_') or col.startswith('event_type_') or col.startswith('event_city_') or col.startswith('interaction_type_') or col.startswith('indoor_outdoor_preference_') or col.startswith('interest_') or col.startswith('title_tfidf_')]

# Weather features
weather_features_base = ['temperature', 'precipitation', 'user_temperature', 'user_precipitation']
available_weather_condition_cols = [col for col in df.columns if col.startswith('weather_condition_')]
available_user_weather_condition_cols = [col for col in df.columns if col.startswith('user_weather_condition_')]
weather_features_available = weather_features_base + available_weather_condition_cols + available_user_weather_condition_cols

features_with_weather = list(set(common_features + weather_features_available))
features_without_weather = list(set(common_features))

# Remove the target variable if it accidentally got into features
if 'interaction_label' in features_with_weather:
    features_with_weather.remove('interaction_label')
if 'interaction_label' in features_without_weather:
    features_without_weather.remove('interaction_label')

# Filter for existing columns to avoid key errors
features_with_weather = [f for f in features_with_weather if f in df.columns]
features_without_weather = [f for f in features_without_weather if f in df.columns]

# Display the head of the processed dataframe
print("Processed DataFrame Head:")
print(df.head())

# Display the dtypes of the processed dataframe
print("\nProcessed DataFrame dtypes:")
print(df.dtypes)

print("\nFeatures for Model with Weather:")
print(features_with_weather)

print("\nFeatures for Model without Weather:")
print(features_without_weather)

Processed DataFrame Head:
   interaction_distance_to_event  interaction_label  temperature  \
0                      15.442082                  0         19.2   
1                      18.905958                  0         32.0   
2                      58.743016                  0         17.4   
3                      22.280995                  0         25.2   
4                      56.605419                  0         19.6   

   precipitation  attendance_rate  event_indoor_capability  age  \
0           0.00        15.915409                        1   25   
1           0.32        19.572239                        0   46   
2           0.00        32.747489                        0   37   
3           0.00        29.689391                        1   44   
4          16.04        35.509785                        1   24   

   social_connectedness  user_temperature  user_precipitation  ...  \
0                    11              17.2                0.18  ...   
1                    1