In [2]:
import hopsworks
import pandas as pd
import numpy as np
import great_expectations as ge
from datetime import datetime, timedelta
from features.users import generate_users
from features.events import generate_events
from features.interactions import generate_interactions, augment_cold_start


## <span style='color:#ff5f27'> ⚙️ Data Generation</span>


---

### <span style='color:#ff5f27'> 👥 Users Data Generation</span>

In [3]:
# Login to Hopsworks
project = hopsworks.login()
fs = project.get_feature_store()


2025-03-26 17:30:37,720 INFO: Initializing external client
2025-03-26 17:30:37,724 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-03-26 17:30:39,008 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1220788


In [55]:
# Generate data
print("Generating user data...")
num_users = 50000
users = generate_users(num_users)
users_df = pd.DataFrame(users)
print("Done!")

Generating user data...
Done!


In [56]:

print("Generating event data...")
num_events = 20000
events = generate_events(num_events)
events_df = pd.DataFrame(events)
print("Done!")

Generating event data...
Done!


In [57]:
len(events_df)

20000

In [60]:

print("Generating interaction data...")
num_interactions = 50000
interactions = generate_interactions(users, events, num_interactions)
interactions = augment_cold_start(users, events, interactions)
interactions_df = pd.DataFrame(interactions)
print("Done!")


Generating interaction data...
Done!


In [45]:
events_df.to_csv("events_df.csv")
users_df.to_csv("users_df.csv")
interactions_df.to_csv("interactions_df.csv")

In [None]:
import pandas as pd
users_df = pd.read_csv("users_df.csv").drop(columns=(['Unnamed: 0']))
events_df = pd.read_csv("events_df.csv").drop(columns=(['Unnamed: 0']))
interactions_df = pd.read_csv("interactions_df.csv").drop(columns=(['Unnamed: 0']))

In [46]:
# Data validation with Great Expectations
print("Validating user data...")
users_ge_df = ge.from_pandas(users_df)
users_expectations = [
    users_ge_df.expect_column_values_to_not_be_null("user_id"),
    users_ge_df.expect_column_values_to_be_between("age", min_value=18, max_value=100),
    users_ge_df.expect_column_values_to_be_in_set("weather_preference", ["indoor", "outdoor", "any"]),
    users_ge_df.expect_column_values_to_not_be_null("city")
]

print("Validating event data...")
events_ge_df = ge.from_pandas(events_df)
events_expectations = [
    events_ge_df.expect_column_values_to_not_be_null("event_id"),
    events_ge_df.expect_column_values_to_not_be_null("title"),
    events_ge_df.expect_column_values_to_not_be_null("event_type"),
    events_ge_df.expect_column_values_to_be_in_set("weather_condition", 
                                                 ["Clear", "Rain", "Snow", "Cloudy", "Windy"]),
    events_ge_df.expect_column_values_to_be_between("historical_attendance_rate", 
                                                  min_value=0, max_value=100)
]

print("Validating interaction data...")
interactions_ge_df = ge.from_pandas(interactions_df)
interactions_expectations = [
    interactions_ge_df.expect_column_values_to_not_be_null("interaction_id"),
    interactions_ge_df.expect_column_values_to_not_be_null("user_id"),
    interactions_ge_df.expect_column_values_to_not_be_null("event_id"),
    interactions_ge_df.expect_column_values_to_be_in_set("interaction_type", 
                                                       ["maybe", "invited & maybe", "no", "yes", 
                                                        "invited & yes", "invited & no", "invited"])
]


Validating user data...
Validating event data...
Validating interaction data...


In [47]:
# Check if all expectations passed
all_passed = True
for expectation_list in [users_expectations, events_expectations, interactions_expectations]:
    for expectation in expectation_list:
        if not expectation.success:
            print(f"Failed expectation: {expectation}")
            all_passed = False

if not all_passed:
    print("Data validation failed. Please check the data generation process.")
    # You might want to exit here in a production environment
else:
    print("All data validations passed!")


All data validations passed!


## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [48]:
# Create feature groups
print("Creating feature groups...")
users_fg = fs.get_or_create_feature_group(
    name="users",
    version=1,
    primary_key=["user_id"],
    online_enabled=True,
    description="User features for weather-based event recommendation"
)

users_fg.insert(users_df)
print('Done ✅')

Creating feature groups...
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1420950


Uploading Dataframe: 100.00% |██████████| Rows 50000/50000 | Elapsed Time: 00:05 | Remaining Time: 00:00


Launching job: users_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/users_1_offline_fg_materialization/executions
Done ✅


In [49]:

events_fg = fs.get_or_create_feature_group(
    name="events",
    version=1,
    primary_key=["event_id"],
    online_enabled=True,
    description="Event features for weather-based event recommendation"
)

events_fg.insert(events_df)
print('Done ✅')


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1420951


Uploading Dataframe: 100.00% |██████████| Rows 20000/20000 | Elapsed Time: 00:03 | Remaining Time: 00:00


Launching job: events_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/events_1_offline_fg_materialization/executions
Done ✅


In [34]:
users_df.head(2)

Unnamed: 0,user_id,location_lat,location_lon,city,weather_preference,age,declared_interests,signup_date,social_connectedness
0,1853c24c-a59f-467a-ad8c-7de6e7a156a1,51.437964,-0.101358,London,indoor,39,[],2023-12-28 13:22:26.336165,10
1,aa577974-0b43-456b-b8f3-7f4b74559047,51.46264,-0.089395,London,indoor,33,[],2023-10-07 19:56:55.761947,15


In [2]:

events_interactions_df = pd.merge(
    interactions_df, 
    events_df, 
    on='event_id', 
    how='inner',
    suffixes=('', '_event')  # Add suffix for event columns
)

ranking_df = pd.merge(
    events_interactions_df, 
    users_df, 
    on='user_id', 
    how='inner',
    suffixes=('', '_user')  # Add suffix for user columns
)
ranking_df.head(2)

NameError: name 'pd' is not defined

In [1]:
ranking_df.columns

NameError: name 'ranking_df' is not defined

In [None]:

interactions_fg = fs.get_or_create_feature_group(
    name="interactions",
    version=1,
    primary_key=["interaction_id", "user_id", "event_id"],
    online_enabled=True,
    description="User-event interactions for weather-based event recommendation"
)

interactions_fg.insert(interactions_df)
print('Done ✅')

In [None]:
# Create Ranking Feature group
events_interactions_df = pd.merge(
    interactions_df, 
    events_df, 
    on='event_id', 
    how='inner',
    suffixes=('', '_event')  # Add suffix for event columns
)

ranking_df = pd.merge(
    events_interactions_df, 
    users_df, 
    on='user_id', 
    how='inner',
    suffixes=('', '_user')  # Add suffix for user columns
)

ranking_df['label'] = ranking_df['interaction_type'].apply(
    lambda x: 1 if x in ['maybe', 'invited & maybe', 'yes', 'invited & yes'] else 0
)

ranking_df_with_weather = ranking_df.drop(
    ['interaction_id', 'interaction_type','interaction_time',\
     'start_time', 'signup_date','social_connectedness'], 
    axis=1, 
    inplace=True,
)

ranking_fg_weather = fs.get_or_create_feature_group(
    name="ranking-with-weather",
    description="Ranking Data with weather conditions.",
    version=1,
    primary_key=["user_id", "video_id"],
    online_enabled=True,
)

ranking_fg_weather.insert(ranking_df_with_weather)
print('Done ✅')

# create ranking data without weather information
ranking_df_without_weather = ranking_df.drop(['interaction_id', 'interaction_type',
       'interaction_time', 'start_time','weather_condition',
       'temperature', 'weather_preference',
       'signup_date', 'social_connectedness'])

ranking_fg_without_weather = fs.get_or_create_feature_group(
    name="ranking-without-weather",
    description="Ranking Data without weather conditions.",
    version=1,
    primary_key=["user_id", "video_id"],
    online_enabled=True,
)

ranking_fg_without_weather.insert(ranking_df_without_weather)
print('Done ✅')

print("Feature backfill")

In [None]:
# Add ranking features
print("Creating ranking features...")
ranking_df = interactions_df.copy()

# Add label column (1 for positive interactions, 0 for negative)
ranking_df['label'] = ranking_df['interaction_type'].apply(
    lambda x: 1 if x in ['yes', 'invited & yes'] else 0
)

# Add weather match feature
weather_match_df = pd.merge(
    ranking_df, 
    users_df[['user_id', 'event_type_preference']], 
    on='user_id'
)
weather_match_df = pd.merge(
    weather_match_df, 
    events_df[['event_id', 'weather_condition', 'indoor_capability']], 
    on='event_id'
)

# Define weather match logic
def calculate_weather_match(row):
    if row['event_type_preference'] == 'any':
        return 1.0
    elif row['event_type_preference'] == 'indoor' and row['indoor_capability']:
        return 1.0
    elif row['event_type_preference'] == 'outdoor' and row['weather_condition'] in ['Clear', 'Cloudy']:
        return 1.0
    elif row['event_type_preference'] == 'indoor' and row['weather_condition'] in ['Rain', 'Snow']:
        return 0.8
    else:
        return 0.5

weather_match_df['weather_match_score'] = weather_match_df.apply(calculate_weather_match, axis=1)
ranking_df = pd.merge(
    ranking_df, 
    weather_match_df[['interaction_id', 'weather_match_score']], 
    on='interaction_id'
)

# Add distance bin feature
def distance_to_bin(distance):
    if distance < 1:
        return 'very_close'
    elif distance < 5:
        return 'close'
    elif distance < 15:
        return 'moderate'
    elif distance < 50:
        return 'far'
    else:
        return 'very_far'

ranking_df['distance_bin'] = ranking_df['distance_to_event'].apply(distance_to_bin)

# Create ranking feature group
ranking_fg = fs.get_or_create_feature_group(
    name="ranking_features",
    version=1,
    primary_key=["interaction_id"],
    description="Features for training the ranking model"
)

# Insert data into feature groups
print("Inserting data into feature groups...")
users_fg.insert(users_df)
events_fg.insert(events_df)
interactions_fg.insert(interactions_df)
ranking_fg.insert(ranking_df)

# Create feature views
print("Creating feature views...")

# Feature view for retrieval model (two-tower model)
retrieval_view = fs.get_or_create_feature_view(
    name="event_retrieval",
    version=1,
    description="Feature view for event retrieval model",
    query=interactions_fg.select(["interaction_id", "user_id", "event_id"])
        .join(users_fg.select(["user_id", "location_lat", "location_lon", "city", 
                              "weather_preference", "age", "declared_interests",
                              "social_connectedness"]), on="user_id")
        .join(events_fg.select(["event_id", "title", "event_type", "location_lat", 
                               "location_lon", "city", "start_time", "duration",
                               "weather_condition", "indoor_capability"]), on="event_id")
)

# Feature view for ranking model
ranking_view = fs.get_or_create_feature_view(
    name="event_ranking",
    version=1,
    description="Feature view for event ranking model",
    query=ranking_fg.select_all()
)

# Feature view for recommendations (combines all features)
recommendation_view = fs.get_or_create_feature_view(
    name="event_recommendations",
    version=1,
    description="Feature view for complete event recommendations",
    query=interactions_fg.select_all()
        .join(users_fg.select_all(), on="user_id")
        .join(events_fg.select_all(), on="event_id")
)

print("Feature backfill completed successfully!")

# Print some statistics
print(f"Total users: {len(users_df)}")
print(f"Total events: {len(events_df)}")
print(f"Total interactions: {len(interactions_df)}")
print(f"Positive interactions: {ranking_df['label'].sum()}")
print(f"Negative interactions: {len(ranking_df) - ranking_df['label'].sum()}")
