In [20]:
import hopsworks
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import regex as re
from thesis.fully_synthetic_data.src.users import generate_users
from thesis.fully_synthetic_data.src.events import generate_events 
from thesis.fully_synthetic_data.src.interactions import generate_interactions 


## <span style='color:#ff5f27'> ⚙️ Data Generation</span>


---

### <span style='color:#ff5f27'> 👥 Users Data Generation</span>

In [21]:
# Login to Hopsworks
project = hopsworks.login()
fs = project.get_feature_store()


2025-05-27 18:24:45,364 INFO: Initializing external client
2025-05-27 18:24:45,366 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-05-27 18:24:46,788 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1220788


In [18]:
# clean text columns in the generated data
def clean_text(text):
    if not isinstance(text, str):
        return "unknown"
    text = text.lower().strip()
    text = text.replace(',', ' ')
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\s+", " ", text)     # collapse multiple spaces
    return text if text else "unknown"

def clean_text_columns(df):
    if 'title' in df.columns:
        df["title"] = df["title"].apply(lambda x: clean_text(x) if isinstance(x, str) and x.strip() else "unknown")
    if 'user_interests' in df.columns:
        df["user_interests"] = df["user_interests"].apply(
            lambda x: clean_text(x) if isinstance(x, str) and x.strip() else "unknown")
        
    return df



In [None]:
# Generate data
print("Generating user data...")
num_users = 50000
users = generate_users(num_users)
users_df = pd.DataFrame(users)

# Clean the dataframes
users_df = clean_text_columns(users_df)
print("Done!")

In [58]:

print("Generating event data...")
num_events = 10000
events = generate_events(num_events)
events_df = pd.DataFrame(events)

# Clean the dataframes
events_df = clean_text_columns(events_df)
print("Done!")

Generating event data...
Done!


In [60]:
events_df.columns

Index(['event_id', 'title', 'event_type', 'event_lat', 'event_lon',
       'event_city', 'start_time', 'duration', 'weather_condition',
       'temperature', 'attendance_rate', 'event_indoor_capability'],
      dtype='object')

In [61]:

print("Generating interaction data...")
num_interactions = 500000
interactions = generate_interactions(users_df, events_df, num_interactions)
interactions_df = pd.DataFrame(interactions)

# Add interaction_label for model training
interactions_df['interaction_label'] = interactions_df['interaction_type'].apply(
    lambda x: 1 if x in ['yes', 'invited & yes', 'maybe', 'invited & maybe'] else 0
)
# Clean the dataframes
interactions_df = clean_text_columns(interactions_df)
print("Done!")


Generating interaction data...
Done!


In [16]:
events_df.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/events.csv",index=False )
users_df.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/users.csv",index=False)
interactions_df.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/interactions.csv",index=False)

In [4]:
import pandas as pd
events_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/events" )
users_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/users.csv")
interactions_df = pd.read_csv("/home/nkama/masters_thesis_project/thesis/interactions.csv")

In [14]:
users_df.rename(columns={
    "weather_condition" : "user_weather_condition",
    "temperature": "user_temperature",
    "precipitation":"user_precipitation"
}, inplace = True)

In [15]:
users_df.columns

Index(['user_id', 'user_lat', 'user_lon', 'user_city',
       'indoor_outdoor_preference', 'age', 'user_interests', 'signup_date',
       'social_connectedness', 'user_weather_condition', 'user_temperature',
       'user_precipitation'],
      dtype='object')

In [22]:
len(events_df), len(users_df), len(interactions_df)


(10000, 50000, 500000)

In [25]:
interactions_df.head()


Unnamed: 0,interaction_id,user_id,event_id,interaction_type,interaction_time,interaction_distance_to_event,interaction_label
0,GM322S,DK924U,SW955O,invited & no,2025-07-22 02:34:22.544292,15.442082,0
1,IP622F,PQ743E,KK727U,invited & no,2025-08-26 14:43:31.919623,18.905958,0
2,QR516Z,NX067Y,KV778X,invited & maybe,2025-10-12 20:59:39.916672,58.743016,1
3,KM455K,AJ787D,AF604G,yes,2025-08-19 08:32:51.533099,22.280995,1
4,NC809A,TA859C,DB908U,yes,2025-07-18 21:26:23.948365,56.605419,1


In [28]:
events_df.rename(columns={"category":"event_type"},inplace = True)

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [30]:
# Create feature groups
print("Creating feature groups...")
users_fg = fs.get_or_create_feature_group(
    name="users",
    version=1,
    primary_key=["user_id"],
    online_enabled=True,
    description="User features for weather-based event recommendation"
)

users_fg.insert(users_df) # Insert data into feature groups
print('Done ✅')

Creating feature groups...


Uploading Dataframe: 100.00% |██████████| Rows 50000/50000 | Elapsed Time: 00:06 | Remaining Time: 00:00


Launching job: users_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/users_1_offline_fg_materialization/executions
Done ✅


In [37]:

events_fg = fs.get_or_create_feature_group(
    name="events",
    version=1,
    primary_key=["event_id"],
    online_enabled=True,
    description="Event features for weather-based event recommendation"
)

events_fg.insert(events_df) # Insert data into feature groups
print('Done ✅')


Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1479135


Uploading Dataframe: 100.00% |██████████| Rows 10000/10000 | Elapsed Time: 00:02 | Remaining Time: 00:00


Launching job: events_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/events_1_offline_fg_materialization/executions
Done ✅


In [35]:
users_df.head(2)

Unnamed: 0,user_id,user_lat,user_lon,user_city,indoor_outdoor_preference,age,user_interests,signup_date,social_connectedness,user_weather_condition,user_temperature,user_precipitation
0,EJ688Y,49.307314,2.584194,Paris,outdoor,40,"sports,music,art,fashion",2025-03-19 08:38:22.020060,14,Clear,18.5,0.0
1,RA189K,40.727034,-73.913585,New York,outdoor,25,"tech,sports",2023-11-29 02:06:51.906668,15,Clear,19.1,0.0


In [31]:
interactions_fg = fs.get_or_create_feature_group(
    name="interactions",
    version=1,
    primary_key=["interaction_id","user_id", "event_id"],
    online_enabled=True,
    description="User-event interactions for weather-based event recommendation"
)

interactions_fg.insert(interactions_df) # Insert data into feature groups

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1477094


Uploading Dataframe: 100.00% |██████████| Rows 500000/500000 | Elapsed Time: 00:43 | Remaining Time: 00:00


Launching job: interactions_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/interactions_1_offline_fg_materialization/executions


(Job('interactions_1_offline_fg_materialization', 'SPARK'), None)

### create ranking feature group

In [32]:
#create ranking feature group
events_interactions_df = pd.merge(
    interactions_df, 
    events_df, 
    on='event_id', 
    how='inner'  
)

ranking_df = pd.merge(
    events_interactions_df, 
    users_df, 
    on='user_id', 
    how='inner',
    #suffixes=('', '_event') 
)

### create weather ranking data with weather information

In [33]:
ranking_df_with_weather = ranking_df.drop(
    ['interaction_id', 'interaction_type','interaction_time',\
     'start_time', 'signup_date','social_connectedness'], 
    axis=1
)

ranking_fg_weather = fs.get_or_create_feature_group(
    name="weather_ranking",
    description="Ranking Data with weather data.",
    version=1,
    primary_key=["user_id", "event_id"],
    online_enabled=True,
    #features=ranking_df_with_weather.columns.to_list()  # ← critical!
)

ranking_fg_weather.insert(ranking_df_with_weather)
print('Done ✅')

#save to local disk
ranking_df_with_weather.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/ranking_df_with_weather2.csv")

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1479133


Uploading Dataframe: 100.00% |██████████| Rows 501500/501500 | Elapsed Time: 01:20 | Remaining Time: 00:00


Launching job: weather_ranking_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/weather_ranking_1_offline_fg_materialization/executions
Done ✅


In [34]:
events_df.head()

Unnamed: 0,event_id,title,event_type,event_lat,event_lon,event_city,start_time,duration,weather_condition,temperature,precipitation,attendance_rate,event_indoor_capability
0,KI949R,Future-proofed object-oriented superstructure ...,Arts & Culture,48.39139,1.976128,Paris,2025-06-03 09:23:09.030473,120,Clear,20.7,0.0,40.523999,True
1,HI261W,Compatible optimal analyzer Health & Wellness ...,Health & Wellness,51.519833,-0.295177,London,2025-09-01 19:41:01.676911,180,Clear,16.6,0.0,50.663219,False
2,YY535R,Open-source tertiary intranet Sports & Fitness...,Sports & Fitness,40.636058,-74.279254,New York,2025-09-20 14:03:42.135680,120,Clear,18.7,0.0,22.62672,False
3,SC800A,Automated interactive migration Food & Drink i...,Food & Drink,18.664479,73.198802,Mumbai,2025-10-21 18:39:07.866464,120,Cloudy,29.3,0.48,39.114844,False
4,YI400R,Face-to-face real-time framework Arts & Cultur...,Arts & Culture,40.529525,-73.533329,New York,2025-10-29 19:29:52.046278,120,Clear,18.3,0.0,9.704529,True


### create ranking data without weather information

In [36]:
# create ranking data without weather information
ranking_df_without_weather = ranking_df.drop(['interaction_id', 'interaction_type',
       'interaction_time', 'start_time','weather_condition',
       'temperature', 'indoor_outdoor_preference',
       'signup_date', 'social_connectedness'],axis=1)

ranking_fg_without_weather = fs.get_or_create_feature_group(
    name="no_weather_ranking",
    description="Ranking Data without weather conditions.",
    version=1,
    primary_key=["user_id", "event_id"],
    online_enabled=True,
    #features=ranking_df_without_weather.columns.to_list()
)

ranking_fg_without_weather.insert(ranking_df_without_weather)
print('Done ✅')

#save to local disk
ranking_df_without_weather.to_csv("/home/nkama/masters_thesis_project/thesis/fully_synthetic_data/data/ranking_df_without_weather2.csv")

Feature Group created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/1220788/fs/1208418/fg/1479134


Uploading Dataframe: 100.00% |██████████| Rows 501500/501500 | Elapsed Time: 01:13 | Remaining Time: 00:00


Launching job: no_weather_ranking_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://c.app.hopsworks.ai:443/p/1220788/jobs/named/no_weather_ranking_1_offline_fg_materialization/executions
Done ✅
