In [55]:
import hopsworks
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import regex as re
from thesis.fully_synthetic_data.src.users import generate_users
from thesis.fully_synthetic_data.src.events import generate_events 
from thesis.fully_synthetic_data.src.interactions import generate_interactions 


## <span style='color:#ff5f27'> ⚙️ Data Generation</span>


---

### <span style='color:#ff5f27'> 👥 Users Data Generation</span>

In [17]:
# Login to Hopsworks
project = hopsworks.login()
fs = project.get_feature_store()


2025-05-26 16:40:05,821 INFO: Initializing external client
2025-05-26 16:40:05,825 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-26 16:40:07,582 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1220788


In [18]:
# clean text columns in the generated data
def clean_text(text):
    if not isinstance(text, str):
        return "unknown"
    text = text.lower().strip()
    text = text.replace(',', ' ')
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)     # collapse multiple spaces
    return text if text else "unknown"

def clean_text_columns(df):
    if 'title' in df.columns:
        df["title"] = df["title"].apply(lambda x: clean_text(x) if isinstance(x, str) and x.strip() else "unknown")
    if 'user_interests' in df.columns:
        df["user_interests"] = df["user_interests"].apply(
            lambda x: clean_text(x) if isinstance(x, str) and x.strip() else "unknown")
        
    return df



In [57]:
# Generate data
print("Generating user data...")
num_users = 50000
users = generate_users(num_users)
users_df = pd.DataFrame(users)

# Clean the dataframes
users_df = clean_text_columns(users_df)
print("Done!")

Generating user data...
Done!


In [58]:

print("Generating event data...")
num_events = 10000
events = generate_events(num_events)
events_df = pd.DataFrame(events)

# Clean the dataframes
events_df = clean_text_columns(events_df)
print("Done!")

Generating event data...
Done!


In [60]:
events_df.columns

Index(['event_id', 'title', 'event_type', 'event_lat', 'event_lon',
       'event_city', 'start_time', 'duration', 'weather_condition',
       'temperature', 'attendance_rate', 'event_indoor_capability'],
      dtype='object')

In [61]:

print("Generating interaction data...")
num_interactions = 500000
interactions = generate_interactions(users_df, events_df, num_interactions)
interactions_df = pd.DataFrame(interactions)

# Add interaction_label for model training
interactions_df['interaction_label'] = interactions_df['interaction_type'].apply(
    lambda x: 1 if x in ['yes', 'invited & yes', 'maybe', 'invited & maybe'] else 0
)
# Clean the dataframes
interactions_df = clean_text_columns(interactions_df)
print("Done!")


Generating interaction data...
Done!


In [74]:
events_df.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/events.csv",index=False )
users_df.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/users.csv",index=False)
interactions_df.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/interactions.csv",index=False)

In [77]:
events_df.to_csv("events.csv",index=False )
users_df.to_csv("users.csv",index=False)
interactions_df.to_csv("interactions.csv",index=False)

In [75]:
events_df.columns

Index(['event_id', 'title', 'event_type', 'event_lat', 'event_lon',
       'event_city', 'start_time', 'duration', 'weather_condition',
       'temperature', 'attendance_rate', 'event_indoor_capability'],
      dtype='object')

In [76]:
users_df.columns


Index(['user_id', 'user_lat', 'user_lon', 'user_city',
       'indoor_outdoor_preference', 'age', 'user_interests', 'signup_date',
       'social_connectedness'],
      dtype='object')

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [66]:
# Create feature groups
print("Creating feature groups...")
users_fg = fs.get_or_create_feature_group(
    name="users",
    version=1,
    primary_key=["user_id"],
    online_enabled=True,
    description="User features for weather-based event recommendation"
)

users_fg.insert(users_df) # Insert data into feature groups
print('Done ✅')

Creating feature groups...
Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1477073


Uploading Dataframe: 100.00% |██████████| Rows 50000/50000 | Elapsed Time: 00:09 | Remaining Time: 00:00


Launching job: users_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/users_1_offline_fg_materialization/executions
Done ✅


In [67]:

events_fg = fs.get_or_create_feature_group(
    name="events",
    version=1,
    primary_key=["event_id"],
    online_enabled=True,
    description="Event features for weather-based event recommendation"
)

events_fg.insert(events_df) # Insert data into feature groups
print('Done ✅')


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1477074


Uploading Dataframe: 100.00% |██████████| Rows 10000/10000 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: events_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/events_1_offline_fg_materialization/executions
Done ✅


In [43]:
users_df.head(2)

Unnamed: 0,user_id,user_lat,user_lon,user_city,indoor_outdoor_preference,age,user_interests,signup_date,social_connectedness
0,OS082V,49.307314,2.584194,Paris,outdoor,40,art food,2024-10-02 17:26:50.584972,11
1,IQ685Q,51.532156,-0.195855,London,outdoor,32,tech sports,2023-09-19 14:53:32.712057,16


In [68]:
interactions_fg = fs.get_or_create_feature_group(
    name="interactions",
    version=1,
    primary_key=["interaction_id","user_id", "event_id"],
    online_enabled=True,
    description="User-event interactions for weather-based event recommendation"
)

interactions_fg.insert(interactions_df) # Insert data into feature groups

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1479116


Uploading Dataframe: 100.00% |██████████| Rows 500000/500000 | Elapsed Time: 00:38 | Remaining Time: 00:00


Launching job: interactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/interactions_1_offline_fg_materialization/executions


(Job('interactions_1_offline_fg_materialization', 'SPARK'), None)

### create ranking feature group

In [69]:
#create ranking feature group
events_interactions_df = pd.merge(
    interactions_df, 
    events_df, 
    on='event_id', 
    how='inner'  
)

ranking_df = pd.merge(
    events_interactions_df, 
    users_df, 
    on='user_id', 
    how='inner',
    #suffixes=('', '_event') 
)

### create weather ranking data with weather information

In [70]:
ranking_df_with_weather = ranking_df.drop(
    ['interaction_id', 'interaction_type','interaction_time',\
     'start_time', 'signup_date','social_connectedness'], 
    axis=1
)

ranking_fg_weather = fs.get_or_create_feature_group(
    name="weather_ranking",
    description="Ranking Data with weather data.",
    version=1,
    primary_key=["user_id", "event_id"],
    online_enabled=True,
    #features=ranking_df_with_weather.columns.to_list()  # ← critical!
)

ranking_fg_weather.insert(ranking_df_with_weather)
print('Done ✅')

#save to local disk
ranking_df_with_weather.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/ranking_df_with_weather2.csv")

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1477075


Uploading Dataframe: 100.00% |██████████| Rows 501807/501807 | Elapsed Time: 01:11 | Remaining Time: 00:00


Launching job: weather_ranking_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/weather_ranking_1_offline_fg_materialization/executions
Done ✅


In [72]:
events_df.head()

Unnamed: 0,event_id,title,event_type,event_lat,event_lon,event_city,start_time,duration,weather_condition,temperature,attendance_rate,event_indoor_capability
0,VH393F,configurable zero tolerance matrices education...,Education & Learning,35.263001,139.44002,Tokyo,2025-10-22 13:44:55.387282,120,Rain,19.4,17.674551,True
1,BX654I,balanced tertiary forecast technology in berlin,Technology,52.313431,13.220363,Berlin,2025-06-20 09:06:31.700477,480,Rain,11.8,10.505273,True
2,GZ338E,preemptive 5thgeneration adapter sports fitnes...,Sports & Fitness,43.955581,-79.723338,Toronto,2025-07-09 18:27:42.564142,180,Rain,7.8,38.330403,False
3,YS594Z,visionary neutral challenge food drink in sydney,Food & Drink,-33.54193,151.252682,Sydney,2025-09-23 18:17:10.431402,240,Clear,24.4,24.458917,False
4,HI986G,implemented impactful open architecture arts c...,Arts & Culture,-33.853951,150.770047,Sydney,2025-08-28 19:41:00.341457,120,Rain,19.4,69.000834,True


### create ranking data without weather information

In [71]:
# create ranking data without weather information
ranking_df_without_weather = ranking_df.drop(['interaction_id', 'interaction_type',
       'interaction_time', 'start_time','weather_condition',
       'temperature', 'indoor_outdoor_preference',
       'signup_date', 'social_connectedness'],axis=1)

ranking_fg_without_weather = fs.get_or_create_feature_group(
    name="no_weather_ranking",
    description="Ranking Data without weather conditions.",
    version=1,
    primary_key=["user_id", "event_id"],
    online_enabled=True,
    #features=ranking_df_without_weather.columns.to_list()
)

ranking_fg_without_weather.insert(ranking_df_without_weather)
print('Done ✅')

#save to local disk
ranking_df_without_weather.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/ranking_df_without_weather2.csv")

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1477076


Uploading Dataframe: 100.00% |██████████| Rows 501807/501807 | Elapsed Time: 01:09 | Remaining Time: 00:00


Launching job: no_weather_ranking_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/no_weather_ranking_1_offline_fg_materialization/executions
Done ✅
