In [None]:
import numpy as np
import pandas as pd
from faker import Faker
from geopy.distance import geodesic
from datetime import datetime, timedelta
import random
from scipy.stats import skewnorm, dirichlet

# Initialize Faker and set seed for reproducibility
fake = Faker()
np.random.seed(42)
random.seed(42)

# ======================
# 1. User Data Generation
# ======================
def generate_users(n_users=50000):
    users = []
    cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
    
    # Approximate coordinates for each city
    city_coords = {
        'New York': (40.7128, -74.0060),
        'London': (51.5074, -0.1278),
        'Paris': (48.8566, 2.3522),
        'Tokyo': (35.6762, 139.6503),
        'Sydney': (-33.8688, 151.2093),
        'Berlin': (52.5200, 13.4050),
        'Mumbai': (19.0760, 72.8777),
        'São Paulo': (-23.5505, -46.6333),
        'Toronto': (43.6532, -79.3832),
        'Dubai': (25.2048, 55.2708)
    }
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=[0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05])
        
        # Generate lat and lon based on the selected city with some random variation
        base_lat, base_lon = city_coords[city]
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
        
        # Generate weather preferences with Dirichlet distribution
        weather_probs = dirichlet.rvs([0.3, 0.5, 0.2])[0]  # indoor/outdoor/any
        weather_pref = np.random.choice(['indoor', 'outdoor', 'any'], p=weather_probs)
        
        # Generate declared interests for cold-start handling
        interests = ['music', 'sports', 'tech', 'food', 'art', 'literature', 'cinema', 'travel', 'fitness', 'fashion']
        declared_interests = random.sample(interests, k=random.randint(0,4)) if random.random() < 0.7 else []
        
        users.append({
            'user_id': fake.uuid4(),
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'weather_preference': weather_pref,
            'age': int(skewnorm.rvs(5, loc=25, scale=15)),
            'declared_interests': declared_interests,
            'signup_date': fake.date_time_between(start_date='-2y', end_date='now'),
            'social_connectedness': np.random.poisson(lam=15)
        })
    
    return pd.DataFrame(users)



# =======================
# 2. Event Data Generation
# =======================
def generate_events(n_events=10000):
    events = []
    event_types = ['concert', 'sports', 'conference', 'festival', 'workshop', 'exhibition', 'seminar', 'networking']
    weather_conditions = ['Clear', 'Rain', 'Snow', 'Cloudy', 'Windy']
    cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']

    # Approximate coordinates for each city (same as in generate_users)
    city_coords = {
        'New York': (40.7128, -74.0060),
        'London': (51.5074, -0.1278),
        'Paris': (48.8566, 2.3522),
        'Tokyo': (35.6762, 139.6503),
        'Sydney': (-33.8688, 151.2093),
        'Berlin': (52.5200, 13.4050),
        'Mumbai': (19.0760, 72.8777),
        'São Paulo': (-23.5505, -46.6333),
        'Toronto': (43.6532, -79.3832),
        'Dubai': (25.2048, 55.2708)
    }
    
    for _ in range(n_events):
        event_type = np.random.choice(event_types)
        # 20% of events in random cities
        city = np.random.choice(cities) if random.random() < 0.8 else np.random.choice(cities)
        
        # Generate lat and lon based on the selected city with some random variation
        base_lat, base_lon = city_coords[city]
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
        
        # Generate weather-sensitive attributes
        weather_condition = np.random.choice(weather_conditions, p=[0.5, 0.2, 0.05, 0.2, 0.05])
        if event_type in ['sports', 'festival']:
            weather_condition = 'Clear' if random.random() < 0.8 else np.random.choice(['Rain', 'Cloudy'])
        elif event_type in ['conference', 'workshop', 'seminar']:
            weather_condition = np.random.choice(['Clear', 'Cloudy'])  # Indoor events less weather-dependent
            
        # Generate temporal features
        start_time = fake.date_time_between(start_date='now', end_date='+6M')
        if start_time.weekday() >= 5:  # Weekend
            start_time = start_time.replace(hour=np.random.choice([10,14,18]))
        else:
            start_time = start_time.replace(hour=np.random.choice([9,13,18,19]))
            
        events.append({
            'event_id': fake.uuid4(),
            'title': f"{fake.catch_phrase()} {event_type.capitalize()} in {city}",
            'event_type': event_type,
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'start_time': start_time,
            'duration': np.random.choice([120, 180, 240, 360, 480]),
            'weather_condition': weather_condition,
            'historical_attendance_rate': np.random.beta(a=2, b=5) * 100,
            'indoor_capability': event_type in ['conference', 'workshop', 'exhibition', 'seminar']
        })
    
    return pd.DataFrame(events)


# =============================
# 3. Interaction Data Generation
# =============================
def generate_interactions(users, events, n_interactions=500000):
    interactions = []
    event_coords = events[['location_lat', 'location_lon']].values
    event_weather = dict(zip(events['event_id'], events['weather_condition']))
    
    for _ in range(n_interactions):
        user = users.sample(1).iloc[0]
        event = events.sample(1).iloc[0]
        
        # Calculate distance between user and event
        user_coord = (user['location_lat'], user['location_lon'])
        event_coord = (event['location_lat'], event['location_lon'])
        distance = geodesic(user_coord, event_coord).km
        
        # Generate interaction probability components
        weather_score = 1 if (event['weather_condition'] == 'Clear') or \
                          (user['weather_preference'] == 'any') else 0.3


        distance_score = np.exp(-distance/5)  # Faster decay (from 20 to 10 km)
        weather_score = 1.2 if (event['weather_condition'] == 'Clear' and 
                            user['weather_preference'] in ['outdoor', 'any']) else 0.5
        social_score = np.log1p(user['social_connectedness']) / 10
        
        # Combine scores for interaction probability
        #interaction_prob = 0.4*weather_score + 0.3*distance_score + 0.3*social_score
        interaction_prob = 0.7*distance_score + 0.2*weather_score + 0.1*social_score
        # Introduce more dispersed locations
        if random.random() < 0.2:
            user_coords = (base_lat + np.random.uniform(-2,2), 
                        base_lon + np.random.uniform(-2,2))

        # Add a minimum distance threshold for interactions
        if distance < 10 and (random.random() < interaction_prob):
            interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(
                    ['click', 'save', 'attend'],
                    p=[0.7, 0.25, 0.05]
                ),
                'interaction_time': fake.date_time_between(
                    start_date=event['start_time'] - timedelta(days=30),
                    end_date=event['start_time']
                ),
                'weather_at_interaction': event_weather[event['event_id']],
                'distance_to_event': distance
            })
    
    return pd.DataFrame(interactions)

# ======================
# 4. Cold-Start Handling
# ======================
def augment_cold_start(users, events, interactions):
    cold_users = users[users['declared_interests'].str.len() > 0]
    trending_events = events.nlargest(100, 'historical_attendance_rate')
    
    cold_interactions = []
    for _, user in cold_users.iterrows():
        for _ in range(random.randint(1,5)):
            event = trending_events.sample(1).iloc[0]
            cold_interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': 'click',
                'interaction_time': fake.date_time_between(
                    start_date=user['signup_date'],
                    end_date=user['signup_date'] + timedelta(days=7)
                ),
                'weather_at_interaction': event['weather_condition'],
                'distance_to_event': geodesic(
                    (user['location_lat'], user['location_lon']),
                    (event['location_lat'], event['location_lon'])
                ).km
            })
    
    return pd.concat([interactions, pd.DataFrame(cold_interactions)])

# ================
# Execution Pipeline
# ================
if __name__ == "__main__":
    print("Generating users...")
    users_df = generate_users(20000)
    
    print("Generating events...")
    events_df = generate_events(5000)
    
    print("Generating base interactions...")
    interactions_df = generate_interactions(users_df, events_df, 100000)
    
    print("Adding cold-start interactions...")
    full_interactions = augment_cold_start(users_df, events_df, interactions_df)
    
    # Save datasets
    users_df.to_csv('synthetic_users.csv', index=False)
    events_df.to_csv('synthetic_events.csv', index=False)
    full_interactions.to_csv('synthetic_interactions.csv', index=False)

    print(f"Generated {len(users_df)} users, {len(events_df)} events, and {len(full_interactions)} interactions.")


In [None]:
import numpy as np
import pandas as pd
from faker import Faker
from geopy.distance import geodesic
from datetime import datetime, timedelta
import random
from scipy.stats import skewnorm, dirichlet

# Initialize Faker and set seed for reproducibility
fake = Faker()
np.random.seed(42)
random.seed(42)

# City coordinates and probabilities
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
city_probs = [0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05]
city_coords = {
    'New York': (40.7128, -74.0060), 'London': (51.5074, -0.1278), 'Paris': (48.8566, 2.3522),
    'Tokyo': (35.6762, 139.6503), 'Sydney': (-33.8688, 151.2093), 'Berlin': (52.5200, 13.4050),
    'Mumbai': (19.0760, 72.8777), 'São Paulo': (-23.5505, -46.6333), 'Toronto': (43.6532, -79.3832),
    'Dubai': (25.2048, 55.2708)
}

def generate_location(city):
    base_lat, base_lon = city_coords[city]
    if random.random() < 0.8:
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
    else:
        lat = base_lat + np.random.uniform(-2, 2)
        lon = base_lon + np.random.uniform(-2, 2)
    return lat, lon

def generate_users(n_users=20000):
    users = []
    interests = ['music', 'sports', 'tech', 'food', 'art', 'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        weather_probs = dirichlet.rvs([0.3, 0.5, 0.2])[0]
        users.append({
            'user_id': fake.uuid4(),
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'weather_preference': np.random.choice(['indoor', 'outdoor', 'any'], p=weather_probs),
            'age': int(skewnorm.rvs(5, loc=25, scale=15)),
            'declared_interests': random.sample(interests, k=random.randint(0,4)) if random.random() < 0.7 else [],
            'signup_date': fake.date_time_between(start_date='-2y', end_date='now'),
            'social_connectedness': np.random.poisson(lam=15)
        })
    return pd.DataFrame(users)

def generate_events(n_events=5000):
    events = []
    event_types = ['concert', 'sports', 'conference', 'festival', 'workshop', 'exhibition', 'seminar', 'networking']
    weather_conditions = ['Clear', 'Rain', 'Snow', 'Cloudy', 'Windy']
    weather_probs = [0.5, 0.2, 0.05, 0.2, 0.05]
    
    for _ in range(n_events):
        event_type = np.random.choice(event_types)
        city = np.random.choice(cities)
        lat, lon = generate_location(city)
        weather_condition = np.random.choice(weather_conditions, p=weather_probs)
        if event_type in ['sports', 'festival']:
            weather_condition = 'Clear' if random.random() < 0.8 else np.random.choice(['Rain', 'Cloudy'])
        elif event_type in ['conference', 'workshop', 'seminar']:
            weather_condition = np.random.choice(['Clear', 'Cloudy'])
        
        start_time = fake.date_time_between(start_date='now', end_date='+6M')
        start_time = start_time.replace(hour=np.random.choice([10,14,18]) if start_time.weekday() >= 5 else np.random.choice([9,13,18,19]))
        
        events.append({
            'event_id': fake.uuid4(),
            'title': f"{fake.catch_phrase()} {event_type.capitalize()} in {city}",
            'event_type': event_type,
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'start_time': start_time,
            'duration': np.random.choice([120, 180, 240, 360, 480]),
            'weather_condition': weather_condition,
            'historical_attendance_rate': np.random.beta(a=2, b=5) * 100,
            'indoor_capability': event_type in ['conference', 'workshop', 'exhibition', 'seminar']
        })
    return pd.DataFrame(events)

def generate_interactions(users, events, n_interactions=100000):
    interactions = []
    event_weather = dict(zip(events['event_id'], events['weather_condition']))
    
    for _ in range(n_interactions):
        user = users.sample(1).iloc[0]
        event = events.sample(1).iloc[0]
        distance = geodesic((user['location_lat'], user['location_lon']), (event['location_lat'], event['location_lon'])).km
        
        weather_score = 1.2 if (event['weather_condition'] == 'Clear' and user['weather_preference'] in ['outdoor', 'any']) else 0.5
        distance_score = np.exp(-distance/5)
        social_score = np.log1p(user['social_connectedness']) / 10
        
        interaction_prob = 0.7*distance_score + 0.2*weather_score + 0.1*social_score
        
        if distance < 300 and (random.random() < interaction_prob):
            interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(['click', 'save', 'attend'], p=[0.7, 0.25, 0.05]),
                'interaction_time': fake.date_time_between(start_date=event['start_time'] - timedelta(days=30), end_date=event['start_time']),
                'weather_at_interaction': event_weather[event['event_id']],
                'distance_to_event': distance
            })
    return pd.DataFrame(interactions)

def augment_cold_start(users, events, interactions):
    cold_users = users[users['declared_interests'].apply(len) > 0]
    trending_events = events.nlargest(100, 'historical_attendance_rate')
    
    cold_interactions = []
    for _, user in cold_users.iterrows():
        for _ in range(random.randint(1,5)):
            event = trending_events.sample(1).iloc[0]
            cold_interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': 'click',
                'interaction_time': fake.date_time_between(start_date=user['signup_date'], end_date=user['signup_date'] + timedelta(days=7)),
                'weather_at_interaction': event['weather_condition'],
                'distance_to_event': geodesic((user['location_lat'], user['location_lon']), (event['location_lat'], event['location_lon'])).km
            })
    return pd.concat([interactions, pd.DataFrame(cold_interactions)])

if __name__ == "__main__":
    print("Generating users...")
    users_df = generate_users(50000)
    
    print("Generating events...")
    events_df = generate_events(10000)
    
    print("Generating base interactions...")
    interactions_df = generate_interactions(users_df, events_df, 500000)
    
    print("Adding cold-start interactions...")
    full_interactions = augment_cold_start(users_df, events_df, interactions_df)
    
    users_df.to_csv('synthetic_users.csv', index=False)
    events_df.to_csv('synthetic_events.csv', index=False)
    full_interactions.to_csv('synthetic_interactions.csv', index=False)

    print(f"Generated {len(users_df)} users, {len(events_df)} events, and {len(full_interactions)} interactions.")


In [None]:
events = pd.read_csv('/home/nkama/masters_thesis_project/thesis/sythesize_from_scratch/synthetic_events.csv')
events.head()

In [None]:
len(users)

In [None]:
users.isnull().sum()

In [None]:
users["declared_interests"].unique()


In [None]:
users = pd.read_csv('/home/nkama/masters_thesis_project/thesis/sythesize_from_scratch/synthetic_users.csv')
users.head()

In [None]:
events.isnull().sum()

In [None]:
interactions = pd.read_csv('/home/nkama/masters_thesis_project/thesis/sythesize_from_scratch/synthetic_interactions.csv')
interactions.head()

In [None]:
interactions.isnull().sum()

In [None]:
interactions["interaction_type"].unique()

In [None]:
def validate_data(users, events, interactions):
    print(f"Total users: {len(users)}")
    print(f"Total events: {len(events)}")
    print(f"Total interactions: {len(interactions)}")
    
    # Check user-event ratio
    assert len(users) > len(events), "There should be more users than events"
    
    # Check city distribution
    city_distribution = users['city'].value_counts(normalize=True)
    print("City distribution:")
    print(city_distribution)
    assert len(city_distribution) == 10, "Should have 10 cities"
    
    # Check weather-event alignment
    outdoor_events = events[events['event_type'].isin(['sports', 'festival'])]
    assert (outdoor_events['weather_condition'] == 'Clear').mean() > 0.7
    
    # In validate_data()
    distance_bins = pd.cut(interactions['distance_to_event'], 
                        bins=[0, 5, 20, 50, 100, 200],
                        include_lowest=True)
    click_rates = (interactions.groupby(distance_bins, observed=True)['interaction_type']
                   .value_counts(normalize=True))
    
    # Get first and last bin with actual data
    valid_bins = click_rates.index.get_level_values(0).unique()
    if len(valid_bins) >= 2:
        first_bin = valid_bins[0]
        last_bin = valid_bins[-1]
        # assert (click_rates.loc[first_bin, 'click'] > 
        #         click_rates.loc[last_bin, 'click']), "Distance decay pattern not observed"
        # # Instead of strict greater-than
        # Add diagnostic printouts before the assert
        print("\nDistance Decay Pattern Analysis:")
        print(f"First bin ({valid_bins[0]}): {click_rates.loc[valid_bins[0], 'click']:.2%} click rate")
        print(f"Last bin ({valid_bins[-1]}): {click_rates.loc[valid_bins[-1], 'click']:.2%} click rate")
        print(f"Decay ratio: {click_rates.loc[valid_bins[0], 'click']/click_rates.loc[valid_bins[-1], 'click']:.1f}x")

        #assert click_rates.loc[first_bin, 'click'] > click_rates.loc[last_bin, 'click'] * 0.8,  "Distance decay pattern not observed"

    # Check cold-start coverage using value_counts with normalize
    cold_users = users[users['declared_interests'].str.len() > 0]
    cold_interaction_counts = interactions[interactions['user_id'].isin(cold_users['user_id'])].groupby('user_id').size()
    cold_coverage = (cold_interaction_counts >= 3).mean()
    assert cold_coverage > 0.8, f"Cold-start coverage insufficient: {cold_coverage:.1%}"


validate_data(users_df, events_df, full_interactions)


In [None]:
import pandas as pd

In [None]:
import numpy as np
import pandas as pd
from faker import Faker
from geopy.distance import geodesic
from datetime import datetime, timedelta
import random
from scipy.stats import skewnorm, dirichlet
import os

# Initialize Faker and set seed for reproducibility
fake = Faker()
np.random.seed(42)
random.seed(42)

# City coordinates and probabilities
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
city_probs = [0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05]
# Verify that probabilities sum to 1
if not np.isclose(sum(city_probs), 1.0):
    raise ValueError(f"City probabilities must sum to 1.0, but sum to {sum(city_probs)}")

city_coords = {
    'New York': (40.7128, -74.0060), 'London': (51.5074, -0.1278), 'Paris': (48.8566, 2.3522),
    'Tokyo': (35.6762, 139.6503), 'Sydney': (-33.8688, 151.2093), 'Berlin': (52.5200, 13.4050),
    'Mumbai': (19.0760, 72.8777), 'São Paulo': (-23.5505, -46.6333), 'Toronto': (43.6532, -79.3832),
    'Dubai': (25.2048, 55.2708)
}

def generate_location(city):
    """Generate random coordinates near a city center, with occasional outliers."""
    if city not in city_coords:
        raise ValueError(f"Unknown city: {city}")
        
    base_lat, base_lon = city_coords[city]
    if random.random() < 0.8:
        # Most locations are near city center
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
    else:
        # Some locations are farther out (suburbs or neighboring areas)
        lat = base_lat + np.random.uniform(-2, 2)
        lon = base_lon + np.random.uniform(-2, 2)
    return lat, lon

def generate_users(n_users=20000):
    """Generate synthetic user data."""
    if n_users <= 0:
        raise ValueError("Number of users must be positive")
        
    users = []
    interests = ['music', 'sports', 'tech', 'food', 'art', 'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Ensure age is reasonable (avoid negative values from skewnorm)
        age = max(18, min(100, int(skewnorm.rvs(5, loc=25, scale=15))))
        
        # Generate weather preference probabilities
        weather_probs = dirichlet.rvs([0.3, 0.5, 0.2])[0]
        
        users.append({
            'user_id': fake.uuid4(),
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'weather_preference': np.random.choice(['indoor', 'outdoor', 'any'], p=weather_probs),
            'age': age,
            'declared_interests': random.sample(interests, k=random.randint(0, min(4, len(interests)))) if random.random() < 0.7 else [],
            'signup_date': fake.date_time_between(start_date='-2y', end_date='now'),
            'social_connectedness': np.random.poisson(lam=15)
        })
    return pd.DataFrame(users)

def generate_events(n_events=5000):
    """Generate synthetic event data."""
    if n_events <= 0:
        raise ValueError("Number of events must be positive")
        
    events = []
    event_types = ['concert', 'sports', 'conference', 'festival', 'workshop', 'exhibition', 'seminar', 'networking']
    weather_conditions = ['Clear', 'Rain', 'Snow', 'Cloudy', 'Windy']
    weather_probs = [0.5, 0.2, 0.05, 0.2, 0.05]
    
    # Verify weather probabilities sum to 1
    if not np.isclose(sum(weather_probs), 1.0):
        raise ValueError(f"Weather probabilities must sum to 1.0, but sum to {sum(weather_probs)}")
    
    for _ in range(n_events):
        event_type = np.random.choice(event_types)
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Weather conditions with conditional probabilities based on event type
        if event_type in ['sports', 'festival']:
            # Ensure most outdoor events have Clear weather to meet validation requirements
            weather_condition = 'Clear' if random.random() < 0.8 else np.random.choice(['Rain', 'Cloudy'])
        elif event_type in ['conference', 'workshop', 'seminar']:
            weather_condition = np.random.choice(['Clear', 'Cloudy'])
        else:
            weather_condition = np.random.choice(weather_conditions, p=weather_probs)
        
        # Generate start time with reasonable hours based on weekday/weekend
        start_time = fake.date_time_between(start_date='now', end_date='+6M')
        is_weekend = start_time.weekday() >= 5
        hour_choices = [10, 14, 18] if is_weekend else [9, 13, 18, 19]
        start_time = start_time.replace(hour=np.random.choice(hour_choices))
        
        # Generate other event attributes
        events.append({
            'event_id': fake.uuid4(),
            'title': f"{fake.catch_phrase()} {event_type.capitalize()} in {city}",
            'event_type': event_type,
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'start_time': start_time,
            'duration': np.random.choice([120, 180, 240, 360, 480]),  # Duration in minutes
            'weather_condition': weather_condition,
            'historical_attendance_rate': np.random.beta(a=2, b=5) * 100,  # Percentage
            'indoor_capability': event_type in ['conference', 'workshop', 'exhibition', 'seminar']
        })
    return pd.DataFrame(events)

def generate_interactions(users, events, n_interactions=100000):
    """Generate synthetic user-event interactions."""
    if n_interactions <= 0:
        raise ValueError("Number of interactions must be positive")
    if users.empty or events.empty:
        raise ValueError("Users and events DataFrames cannot be empty")
        
    interactions = []
    event_weather = dict(zip(events['event_id'], events['weather_condition']))
    
    for _ in range(n_interactions):
        # Sample a random user and event
        user = users.sample(1).iloc[0]
        event = events.sample(1).iloc[0]
        
        # Calculate distance between user and event
        distance = geodesic((user['location_lat'], user['location_lon']), 
                           (event['location_lat'], event['location_lon'])).km
        
        # Calculate interaction probability based on various factors
        weather_score = 1.2 if (event['weather_condition'] == 'Clear' and 
                               user['weather_preference'] in ['outdoor', 'any']) else 0.5
        # Modified distance decay to ensure validation requirements are met
        distance_score = np.exp(-distance/5)  # Decreases with distance
        
        social_score = np.log1p(user['social_connectedness']) / 10
        
        interaction_prob = 0.7*distance_score + 0.2*weather_score + 0.1*social_score
        
        # Create interaction if probability and distance criteria are met
        if distance < 300 and (random.random() < interaction_prob):
            interaction_time = fake.date_time_between(
                start_date=event['start_time'] - timedelta(days=30), 
                end_date=event['start_time']
            )
            
            # Set interaction type with bias toward clicks at closer distances
            if distance < 20:
                interaction_type_probs = [0.8, 0.15, 0.05]  # Higher click rate for nearby events
            else:
                interaction_type_probs = [0.7, 0.25, 0.05]
                
            interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(['click', 'save', 'attend'], p=interaction_type_probs),
                'interaction_time': interaction_time,
                'weather_at_interaction': event_weather[event['event_id']],
                'distance_to_event': distance
            })
    return pd.DataFrame(interactions)

def augment_cold_start(users, events, interactions):
    """Add interactions for new users to help with cold-start problem."""
    if users.empty or events.empty:
        raise ValueError("Users and events DataFrames cannot be empty")
        
    # Get users with declared interests (potential cold start users)
    cold_users = users[users['declared_interests'].apply(len) > 0]
    if cold_users.empty:
        print("Warning: No users with declared interests found for cold start augmentation")
        return interactions
        
    # Get trending events
    trending_events = events.nlargest(min(100, len(events)), 'historical_attendance_rate')
    
    cold_interactions = []
    for _, user in cold_users.iterrows():
        # Ensure sufficient interactions for cold-start users (at least 3)
        for _ in range(max(3, random.randint(3, 6))):
            event = trending_events.sample(1).iloc[0]
            
            # Calculate distance
            distance = geodesic((user['location_lat'], user['location_lon']), 
                              (event['location_lat'], event['location_lon'])).km
            
            cold_interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': 'click',
                'interaction_time': fake.date_time_between(
                    start_date=user['signup_date'], 
                    end_date=user['signup_date'] + timedelta(days=7)
                ),
                'weather_at_interaction': event['weather_condition'],
                'distance_to_event': distance
            })
    
    # Combine with original interactions
    return pd.concat([interactions, pd.DataFrame(cold_interactions)], ignore_index=True)

def validate_data(users, events, interactions):
    """
    Validate the generated data to ensure it meets quality requirements.
    Raises AssertionError if validation fails.
    """
    print(f"Total users: {len(users)}")
    print(f"Total events: {len(events)}")
    print(f"Total interactions: {len(interactions)}")
    
    # Check user-event ratio
    assert len(users) > len(events), "There should be more users than events"
    
    # Check city distribution
    city_distribution = users['city'].value_counts(normalize=True)
    print("City distribution:")
    print(city_distribution)
    assert len(city_distribution) == 10, "Should have 10 cities"
    
    # Check weather-event alignment
    outdoor_events = events[events['event_type'].isin(['sports', 'festival'])]
    outdoor_clear_rate = (outdoor_events['weather_condition'] == 'Clear').mean()
    print(f"Outdoor events with Clear weather: {outdoor_clear_rate:.1%}")
    assert outdoor_clear_rate > 0.7, f"Too few outdoor events have Clear weather ({outdoor_clear_rate:.1%})"
    
    # Check distance decay pattern in interaction types
    try:
        distance_bins = pd.cut(interactions['distance_to_event'], 
                          bins=[0, 5, 20, 50, 100, 200],
                          include_lowest=True)
        click_rates = (interactions.groupby([distance_bins, 'interaction_type'])
                      .size()
                      .unstack(fill_value=0)
                      .apply(lambda x: x / x.sum(), axis=1))
        
        # Get first and last bin with actual data
        if 'click' in click_rates.columns:
            valid_bins = click_rates.index.dropna()
            if len(valid_bins) >= 2:
                first_bin = valid_bins[0]
                last_bin = valid_bins[-1]
                
                # Add diagnostic printouts
                print("\nDistance Decay Pattern Analysis:")
                print(f"First bin ({first_bin}): {click_rates.loc[first_bin, 'click']:.2%} click rate")
                print(f"Last bin ({last_bin}): {click_rates.loc[last_bin, 'click']:.2%} click rate")
                
                if click_rates.loc[first_bin, 'click'] > 0 and click_rates.loc[last_bin, 'click'] > 0:
                    decay_ratio = click_rates.loc[first_bin, 'click'] / click_rates.loc[last_bin, 'click']
                    print(f"Decay ratio: {decay_ratio:.1f}x")
    except Exception as e:
        print(f"Warning: Could not analyze distance decay pattern: {e}")
    
    # Check cold-start coverage
    try:
        cold_users = users[users['declared_interests'].apply(len) > 0]
        if not cold_users.empty:
            cold_user_ids = set(cold_users['user_id'])
            cold_interaction_counts = (interactions[interactions['user_id'].isin(cold_user_ids)]
                                      .groupby('user_id').size())
            
            cold_coverage = (cold_interaction_counts >= 3).mean() if not cold_interaction_counts.empty else 0
            print(f"Cold-start users with 3+ interactions: {cold_coverage:.1%}")
            assert cold_coverage > 0.8, f"Cold-start coverage insufficient: {cold_coverage:.1%}"
        else:
            print("Warning: No cold-start users identified")
    except Exception as e:
        print(f"Warning: Could not validate cold-start coverage: {e}")

def main(output_dir='.', n_users=10000, n_events=5000, n_interactions=50000, validate=True):
    """Main function to generate and save all synthetic datasets."""
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        print(f"Generating {n_users} users...")
        users_df = generate_users(n_users)
        
        print(f"Generating {n_events} events...")
        events_df = generate_events(n_events)
        
        print(f"Generating {n_interactions} base interactions...")
        interactions_df = generate_interactions(users_df, events_df, n_interactions)
        
        print("Adding cold-start interactions...")
        full_interactions = augment_cold_start(users_df, events_df, interactions_df)
        
        # Validate data if requested
        if validate:
            print("\n=== Data Validation ===")
            validate_data(users_df, events_df, full_interactions)
            print("=== Validation Successful ===\n")
        
        # Save to CSV files
        users_df.to_csv(os.path.join(output_dir, 'synthetic_users.csv'), index=False)
        events_df.to_csv(os.path.join(output_dir, 'synthetic_events.csv'), index=False)
        full_interactions.to_csv(os.path.join(output_dir, 'synthetic_interactions.csv'), index=False)
        
        print(f"Successfully generated:")
        print(f"- {len(users_df)} users")
        print(f"- {len(events_df)} events")
        print(f"- {len(full_interactions)} interactions (including {len(full_interactions) - len(interactions_df)} cold-start interactions)")
        print(f"Files saved to {output_dir}")
        
        return users_df, events_df, full_interactions
        
    except AssertionError as ae:
        print(f"Validation failed: {ae}")
        raise
    except Exception as e:
        print(f"Error during data generation: {e}")
        raise

if __name__ == "__main__":
    main()

In [1]:
import numpy as np
import pandas as pd
from faker import Faker
from geopy.distance import geodesic
from datetime import datetime, timedelta
import random
from scipy.stats import skewnorm, dirichlet
import os

# Initialize Faker and set seed for reproducibility
fake = Faker()
np.random.seed(42)
random.seed(42)

# City coordinates and probabilities
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
city_probs = [0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05]
# Verify that probabilities sum to 1
if not np.isclose(sum(city_probs), 1.0):
    raise ValueError(f"City probabilities must sum to 1.0, but sum to {sum(city_probs)}")

city_coords = {
    'New York': (40.7128, -74.0060), 'London': (51.5074, -0.1278), 'Paris': (48.8566, 2.3522),
    'Tokyo': (35.6762, 139.6503), 'Sydney': (-33.8688, 151.2093), 'Berlin': (52.5200, 13.4050),
    'Mumbai': (19.0760, 72.8777), 'São Paulo': (-23.5505, -46.6333), 'Toronto': (43.6532, -79.3832),
    'Dubai': (25.2048, 55.2708)
}

def generate_location(city):
    """Generate random coordinates near a city center, with occasional outliers."""
    if city not in city_coords:
        raise ValueError(f"Unknown city: {city}")
        
    base_lat, base_lon = city_coords[city]
    if random.random() < 0.8:
        # Most locations are near city center
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
    else:
        # Some locations are farther out (suburbs or neighboring areas)
        lat = base_lat + np.random.uniform(-2, 2)
        lon = base_lon + np.random.uniform(-2, 2)
    return lat, lon

def generate_users(n_users=20000):
    """Generate synthetic user data."""
    if n_users <= 0:
        raise ValueError("Number of users must be positive")
        
    users = []
    interests = ['music', 'sports', 'tech', 'food', 'art', 'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Ensure age is reasonable (avoid negative values from skewnorm)
        age = max(18, min(100, int(skewnorm.rvs(5, loc=25, scale=15))))
        
        # Generate weather preference probabilities
        weather_probs = dirichlet.rvs([0.3, 0.5, 0.2])[0]
        
        users.append({
            'user_id': fake.uuid4(),
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'weather_preference': np.random.choice(['indoor', 'outdoor', 'any'], p=weather_probs),
            'age': age,
            'declared_interests': random.sample(interests, k=random.randint(0, min(4, len(interests)))) if random.random() < 0.7 else [],
            'signup_date': fake.date_time_between(start_date='-2y', end_date='now'),
            'social_connectedness': np.random.poisson(lam=15)
        })
    return pd.DataFrame(users)

def generate_events(n_events=5000):
    """Generate synthetic event data."""
    if n_events <= 0:
        raise ValueError("Number of events must be positive")
        
    events = []
    event_types = ['concert', 'sports', 'conference', 'festival', 'workshop', 'exhibition', 'seminar', 'networking']
    weather_conditions = ['Clear', 'Rain', 'Snow', 'Cloudy', 'Windy']
    weather_probs = [0.5, 0.2, 0.05, 0.2, 0.05]
    
    # Verify weather probabilities sum to 1
    if not np.isclose(sum(weather_probs), 1.0):
        raise ValueError(f"Weather probabilities must sum to 1.0, but sum to {sum(weather_probs)}")
    
    for _ in range(n_events):
        event_type = np.random.choice(event_types)
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Weather conditions with conditional probabilities based on event type
        if event_type in ['sports', 'festival']:
            # Ensure most outdoor events have Clear weather to meet validation requirements
            weather_condition = 'Clear' if random.random() < 0.8 else np.random.choice(['Rain', 'Cloudy'])
        elif event_type in ['conference', 'workshop', 'seminar']:
            weather_condition = np.random.choice(['Clear', 'Cloudy'])
        else:
            weather_condition = np.random.choice(weather_conditions, p=weather_probs)
        
        # Generate start time with reasonable hours based on weekday/weekend
        start_time = fake.date_time_between(start_date='now', end_date='+6M')
        is_weekend = start_time.weekday() >= 5
        hour_choices = [10, 14, 18] if is_weekend else [9, 13, 18, 19]
        start_time = start_time.replace(hour=np.random.choice(hour_choices))
        
        # Generate other event attributes
        events.append({
            'event_id': fake.uuid4(),
            'title': f"{fake.catch_phrase()} {event_type.capitalize()} in {city}",
            'event_type': event_type,
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'start_time': start_time,
            'duration': np.random.choice([120, 180, 240, 360, 480]),  # Duration in minutes
            'weather_condition': weather_condition,
            'historical_attendance_rate': np.random.beta(a=2, b=5) * 100,  # Percentage
            'indoor_capability': event_type in ['conference', 'workshop', 'exhibition', 'seminar']
        })
    return pd.DataFrame(events)


def calculate_time_weight(interaction_time, current_time, half_life=30):
    time_diff = (current_time - interaction_time).days
    return np.exp(np.log(0.5) * time_diff / half_life)


def generate_interactions(users, events, n_interactions=100000):
    """Generate synthetic user-event interactions with strong distance decay pattern."""
    if n_interactions <= 0:
        raise ValueError("Number of interactions must be positive")
    if users.empty or events.empty:
        raise ValueError("Users and events DataFrames cannot be empty")
        
    interactions = []
    event_weather = dict(zip(events['event_id'], events['weather_condition']))
    
    # Track distance distribution for reporting
    all_distances = []
    
    # FIXED: Sample more potential interactions but only keep ones meeting our criteria
    attempts = n_interactions * 5  # Sample more to account for filtering
    
    for _ in range(attempts):
        if len(interactions) >= n_interactions:
            break
            
        # Sample a random user and event
        user = users.sample(1).iloc[0]
        event = events.sample(1).iloc[0]
        
        # Calculate distance between user and event
        distance = geodesic((user['location_lat'], user['location_lon']), 
                           (event['location_lat'], event['location_lon'])).km
        all_distances.append(distance)
        
        # FIXED: Much stronger distance decay formula
        # Use a steeper exponential decay for distance
        distance_score = np.exp(-distance/10)  # Reduced from /5 to /20 for stronger decay
        
        # Calculate other factors
        weather_score = 1.2 if (event['weather_condition'] == 'Clear' and 
                               user['weather_preference'] in ['outdoor', 'any']) else 0.5
        social_score = np.log1p(user['social_connectedness']) / 10
        
        # FIXED: Give much more weight to distance in the interaction probability
        interaction_prob = 0.85*distance_score + 0.1*weather_score + 0.05*social_score

        # Adjust max distance based on interaction probability
        max_distance = 50 if random.random() < 0.7 else 300
        
        # FIXED: Create interaction with probability more strongly influenced by distance
        if distance < max_distance and (random.random() < interaction_prob):
            interaction_time = fake.date_time_between(
                start_date=event['start_time'] - timedelta(days=30), 
                end_date=event['start_time']
            )
            # Add the suggested time weighting
            current_time = datetime(2025, 3, 22, 19, 10, 0)  # Use the provided date and time
            time_weight = calculate_time_weight(interaction_time, current_time)
            interaction_prob *= time_weight
            # FIXED: Strongly differentiate interaction type probabilities based on distance
            # Much higher click rate for nearby events
            if distance <= 5:
                interaction_type_probs = [0.98, 0.01, 0.01]  # Increased click probability for very close events
            elif distance <= 20:
                interaction_type_probs = [0.90, 0.08, 0.02]  # Increased click probability for nearby events
            elif distance <= 50:
                interaction_type_probs = [0.80, 0.15, 0.05]  # Adjusted for medium distance
            elif distance <= 100:
                interaction_type_probs = [0.65, 0.30, 0.05]  # Slightly increased click probability for longer distance
            else:
                interaction_type_probs = [0.45, 0.50, 0.05]  # Slightly increased click probability for distant events

                
            interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(['click', 'save', 'attend'], p=interaction_type_probs),
                'interaction_time': interaction_time,
                'weather_at_interaction': event_weather[event['event_id']],
                'distance_to_event': distance
            })
    
    result_df = pd.DataFrame(interactions)
    
    # Report distance distribution if enough interactions
    if all_distances:
        print("Distance distribution in sampling:")
        dist_bins = [0, 5, 20, 50, 100, 200, np.inf]
        hist, edges = np.histogram(all_distances, bins=dist_bins)
        for i in range(len(hist)):
            print(f"{edges[i]:.1f}-{edges[i+1] if edges[i+1] != np.inf else 'inf'} km: {hist[i]} samples ({hist[i]/len(all_distances):.1%})")
    
    # Limit to requested number of interactions
    if len(result_df) > n_interactions:
        result_df = result_df.sample(n_interactions)
        
    # Report final distance statistics for accepted interactions
    if not result_df.empty:
        print("\nFinal distance distribution in interactions:")
        print(result_df['distance_to_event'].describe())
    
    return result_df

def augment_cold_start(users, events, interactions):
    """Add interactions for new users to help with cold-start problem."""
    if users.empty or events.empty:
        raise ValueError("Users and events DataFrames cannot be empty")
        
    # Get users with declared interests (potential cold start users)
    cold_users = users[users['declared_interests'].apply(len) > 0]
    if cold_users.empty:
        print("Warning: No users with declared interests found for cold start augmentation")
        return interactions
        
    # Get trending events
    trending_events = events.nlargest(min(100, len(events)), 'historical_attendance_rate')
    
    cold_interactions = []
    for _, user in cold_users.iterrows():
        # Ensure sufficient interactions for cold-start users (at least 3)
        for _ in range(max(3, random.randint(3, 6))):
            event = trending_events.sample(1).iloc[0]
            
            # Calculate distance
            distance = geodesic((user['location_lat'], user['location_lon']), 
                              (event['location_lat'], event['location_lon'])).km
            
            # FIXED: Add similar distance-based probabilities for cold start interactions
            if distance <= 5:
                interaction_type_probs = [0.98, 0.01, 0.01]  # Increased click probability for very close events
            elif distance <= 20:
                interaction_type_probs = [0.90, 0.08, 0.02]  # Increased click probability for nearby events
            elif distance <= 50:
                interaction_type_probs = [0.80, 0.15, 0.05]  # Adjusted for medium distance
            elif distance <= 100:
                interaction_type_probs = [0.65, 0.30, 0.05]  # Slightly increased click probability for longer distance
            else:
                interaction_type_probs = [0.45, 0.50, 0.05]  # Slightly increased click probability for distant events

                
            cold_interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(['click', 'save', 'attend'], p=interaction_type_probs),
                'interaction_time': fake.date_time_between(
                    start_date=user['signup_date'], 
                    end_date=user['signup_date'] + timedelta(days=7)
                ),
                'weather_at_interaction': event['weather_condition'],
                'distance_to_event': distance
            })
    
    # Combine with original interactions
    return pd.concat([interactions, pd.DataFrame(cold_interactions)], ignore_index=True)

def validate_data(users, events, interactions):
    """
    Validate the generated data to ensure it meets quality requirements.
    Raises AssertionError if validation fails.
    """
    print(f"Total users: {len(users)}")
    print(f"Total events: {len(events)}")
    print(f"Total interactions: {len(interactions)}")
    
    # Check user-event ratio
    assert len(users) > len(events), "There should be more users than events"
    
    # Check city distribution
    city_distribution = users['city'].value_counts(normalize=True)
    print("City distribution:")
    print(city_distribution)
    assert len(city_distribution) == 10, "Should have 10 cities"
    
    # Check weather-event alignment
    outdoor_events = events[events['event_type'].isin(['sports', 'festival'])]
    outdoor_clear_rate = (outdoor_events['weather_condition'] == 'Clear').mean()
    print(f"Outdoor events with Clear weather: {outdoor_clear_rate:.1%}")
    assert outdoor_clear_rate > 0.7, f"Too few outdoor events have Clear weather ({outdoor_clear_rate:.1%})"
    
    # Check distance decay pattern in interaction types
    try:
        # FIXED: More robust distance binning approach
        distance_bins = [0, 5, 20, 50, 100, 200]
        bin_labels = [f"({distance_bins[i]}, {distance_bins[i+1]}]" for i in range(len(distance_bins)-1)]
        
        # Create a categorical bin column
        interactions['distance_bin'] = pd.cut(
            interactions['distance_to_event'], 
            bins=distance_bins,
            labels=bin_labels,
            include_lowest=True
        )
        
        # Compute aggregated click rates by distance bin
        bin_stats = interactions.groupby(['distance_bin', 'interaction_type']).size().unstack(fill_value=0)
        
        if 'click' in bin_stats.columns:
            bin_stats['total'] = bin_stats.sum(axis=1)
            bin_stats['click_rate'] = bin_stats['click'] / bin_stats['total']
            
            # Display results for all bins
            print("\nDistance Decay Pattern Analysis:")
            print(bin_stats[['click', 'total', 'click_rate']])
            
            # Only compare bins that have data
            valid_bins = bin_stats.index.dropna().tolist()
            if len(valid_bins) >= 2:
                first_bin = valid_bins[0]
                last_bin = valid_bins[-1]
                
                first_bin_rate = bin_stats.loc[first_bin, 'click_rate']
                last_bin_rate = bin_stats.loc[last_bin, 'click_rate']
                
                print(f"\nFirst bin ({first_bin}): {first_bin_rate:.2%} click rate")
                print(f"Last bin ({last_bin}): {last_bin_rate:.2%} click rate")
                
                if first_bin_rate > 0 and last_bin_rate > 0:
                    decay_ratio = first_bin_rate / last_bin_rate
                    print(f"Decay ratio: {decay_ratio:.2f}x")
                    
                    # FIXED: Better validation criterion
                    #assert decay_ratio > 1.5, f"Distance decay ratio ({decay_ratio:.2f}) is too low"
                    assert decay_ratio > 1.75, f"Distance decay ratio ({decay_ratio:.2f}) is too low"

                else:
                    print("Warning: Cannot calculate decay ratio due to zero values")
        else:
            print("Warning: No 'click' interactions found")
    except Exception as e:
        print(f"Warning: Could not analyze distance decay pattern: {e}")
        raise  # Re-raise to ensure validation fails if this check fails
    
    # Check cold-start coverage
    try:
        cold_users = users[users['declared_interests'].apply(len) > 0]
        if not cold_users.empty:
            cold_user_ids = set(cold_users['user_id'])
            cold_interaction_counts = (interactions[interactions['user_id'].isin(cold_user_ids)]
                                      .groupby('user_id').size())
            
            cold_coverage = (cold_interaction_counts >= 3).mean() if not cold_interaction_counts.empty else 0
            print(f"Cold-start users with 3+ interactions: {cold_coverage:.1%}")
            assert cold_coverage > 0.8, f"Cold-start coverage insufficient: {cold_coverage:.1%}"
        else:
            print("Warning: No cold-start users identified")
    except Exception as e:
        print(f"Warning: Could not validate cold-start coverage: {e}")
        raise  # Re-raise to ensure validation fails

def main(output_dir='.', n_users=50000, n_events=10000, n_interactions=500000, validate=True):
    """Main function to generate and save all synthetic datasets."""
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        print(f"Generating {n_users} users...")
        users_df = generate_users(n_users)
        
        print(f"Generating {n_events} events...")
        events_df = generate_events(n_events)
        
        print(f"Generating {n_interactions} base interactions...")
        interactions_df = generate_interactions(users_df, events_df, n_interactions)
        
        print("Adding cold-start interactions...")
        full_interactions = augment_cold_start(users_df, events_df, interactions_df)
        
        # Validate data if requested
        if validate:
            print("\n=== Data Validation ===")
            validate_data(users_df, events_df, full_interactions)
            print("=== Validation Successful ===\n")
        
        # Save to CSV files
        users_df.to_csv(os.path.join(output_dir, 'synthetic_users.csv'), index=False)
        events_df.to_csv(os.path.join(output_dir, 'synthetic_events.csv'), index=False)
        full_interactions.to_csv(os.path.join(output_dir, 'synthetic_interactions.csv'), index=False)
        
        print(f"Successfully generated:")
        print(f"- {len(users_df)} users")
        print(f"- {len(events_df)} events")
        print(f"- {len(full_interactions)} interactions (including {len(full_interactions) - len(interactions_df)} cold-start interactions)")
        print(f"Files saved to {output_dir}")
        
        return users_df, events_df, full_interactions
        
    except AssertionError as ae:
        print(f"Validation failed: {ae}")
        raise
    except Exception as e:
        print(f"Error during data generation: {e}")
        raise

if __name__ == "__main__":
    main()

Generating 50000 users...
Generating 10000 events...
Generating 500000 base interactions...
Distance distribution in sampling:
0.0-5.0 km: 32539 samples (1.3%)
5.0-20.0 km: 154390 samples (6.2%)
20.0-50.0 km: 10834 samples (0.4%)
50.0-100.0 km: 17372 samples (0.7%)
100.0-200.0 km: 60599 samples (2.4%)
200.0-inf km: 2224266 samples (89.0%)

Final distance distribution in interactions:
count    88262.000000
mean        14.117202
std         31.302218
min          0.020860
25%          4.828174
50%          8.028938
75%         12.098520
max        299.968107
Name: distance_to_event, dtype: float64
Adding cold-start interactions...

=== Data Validation ===
Total users: 50000
Total events: 10000
Total interactions: 213573
City distribution:
city
New York     0.20016
London       0.14892
Mumbai       0.10296
Paris        0.10106
Tokyo        0.10032
Berlin       0.09880
Toronto      0.09834
Dubai        0.05072
Sydney       0.04960
São Paulo    0.04912
Name: proportion, dtype: float64
Outdo

  bin_stats = interactions.groupby(['distance_bin', 'interaction_type']).size().unstack(fill_value=0)


Successfully generated:
- 50000 users
- 10000 events
- 213573 interactions (including 125311 cold-start interactions)
Files saved to .


In [2]:
users = pd.read_csv('/home/nkama/masters_thesis_project/thesis/sythesize_from_scratch/synthetic_users.csv')
users.head()


Unnamed: 0,user_id,location_lat,location_lon,city,weather_preference,age,declared_interests,signup_date,social_connectedness
0,3da1ea2f-4c4f-44d2-9a0c-5da6738ab760,48.946743,2.398599,Paris,outdoor,40,"['food', 'fashion']",2023-08-27 20:24:17.685679,11
1,8fe5bce2-e435-40af-9ac3-849c7b71072d,51.512351,-0.141411,London,outdoor,32,"['sports', 'cinema', 'music', 'travel']",2025-01-30 20:41:02.329210,16
2,a0377d00-0f58-4c3b-a57b-866dcc246a1a,40.802577,-73.912874,New York,outdoor,31,"['music', 'fitness', 'food', 'literature']",2024-06-22 23:44:22.111659,15
3,87be619c-cd68-416e-9e9e-72a4a39a2106,25.259827,55.3587,Dubai,indoor,30,['travel'],2023-07-07 12:44:32.996677,19
4,f7460053-45bb-4a34-86c8-109c1c465423,51.515939,-0.199615,London,indoor,36,[],2024-08-21 04:00:07.703554,13


In [3]:
interactions = pd.read_csv('/home/nkama/masters_thesis_project/thesis/sythesize_from_scratch/synthetic_interactions.csv')
interactions.head()


Unnamed: 0,interaction_id,user_id,event_id,interaction_type,interaction_time,weather_at_interaction,distance_to_event,distance_bin
0,04b70e35-a19d-4c3d-84ee-07c555538476,de772063-09eb-4751-a71b-70e9cb3966cc,9e71bfde-b7f6-4ebe-abe9-c50db4a34009,click,2025-07-10 03:37:52.030895,Clear,4.197641,"(0, 5]"
1,a435450e-838b-495f-81b2-2ea21aac5c75,9b3f59f2-b5a8-4c71-9c5e-5746d64f4206,e5dd2d2e-d7d0-46dd-bd7a-57715de6beb0,save,2025-06-09 14:42:00.436545,Cloudy,5.632956,"(5, 20]"
2,7d6d27b7-3c25-4423-a3de-c213701cc61f,cbef590d-7074-48dd-8772-27ee3c5bf296,d0ef7ea8-aa4b-4cc7-8827-6f6e0010214f,click,2025-06-26 19:11:35.270447,Clear,4.046758,"(0, 5]"
3,754e67f3-d9c8-46a0-97d6-bbde51424bcb,61b3280f-2aa1-47d2-8a50-a4729cc29c14,dae457fe-bd9a-4d4b-bbcf-1e3f3391cf2a,click,2025-06-30 01:29:04.915806,Clear,12.100435,"(5, 20]"
4,6fe87c17-1f3c-4340-9457-a0e4ff8dc829,ebb0c722-1a5e-4443-b059-5bad4ae09fa4,ee782cfa-4000-45b4-bbe4-bb7007bae697,click,2025-03-08 00:04:26.987499,Cloudy,13.378184,"(5, 20]"


In [12]:
interactions["interaction_type"].isnull().sum()

np.int64(0)

In [None]:
interactions.isnu

In [4]:
events = pd.read_csv('/home/nkama/masters_thesis_project/thesis/sythesize_from_scratch/synthetic_events.csv')
events.head()

Unnamed: 0,event_id,title,event_type,location_lat,location_lon,city,start_time,duration,weather_condition,historical_attendance_rate,indoor_capability
0,4491f835-61c3-4a0d-89a0-9da110ca2b68,Face-to-face optimal service-desk Exhibition i...,exhibition,35.668559,139.720674,Tokyo,2025-07-15 19:34:05.914671,180,Cloudy,32.448197,True
1,de398dc6-2fd3-4e77-91b8-7d689a673407,Distributed context-sensitive service-desk Exh...,exhibition,40.773806,-73.946061,New York,2025-04-05 18:11:13.680569,360,Snow,8.377251,True
2,1f168037-8f2d-489e-8acb-654d5bbfb5e3,User-centric fault-tolerant workforce Sports i...,sports,19.071756,72.911718,Mumbai,2025-08-18 19:08:48.845907,360,Clear,22.178179,False
3,459e72e4-310a-41ab-b47e-f437b99724d6,Enterprise-wide system-worthy analyzer Sports ...,sports,52.570974,13.396175,Berlin,2025-05-21 13:38:38.711574,120,Clear,46.583396,False
4,d00363e2-303f-401e-a3d6-0b3279825d1e,Phased exuding project Sports in New York,sports,40.755926,-73.948998,New York,2025-05-15 19:41:37.281529,480,Clear,24.978247,False


In [5]:
interactions.columns


Index(['interaction_id', 'user_id', 'event_id', 'interaction_type',
       'interaction_time', 'weather_at_interaction', 'distance_to_event',
       'distance_bin'],
      dtype='object')

In [7]:
users.columns


Index(['user_id', 'location_lat', 'location_lon', 'city', 'weather_preference',
       'age', 'declared_interests', 'signup_date', 'social_connectedness'],
      dtype='object')

In [13]:
import numpy as np
import pandas as pd
from faker import Faker
from geopy.distance import geodesic
from datetime import datetime, timedelta
import random
from scipy.stats import skewnorm, dirichlet
import os

# Initialize Faker and set seed for reproducibility
fake = Faker()
np.random.seed(42)
random.seed(42)

# City coordinates and probabilities
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
city_probs = [0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05]
# Verify that probabilities sum to 1
if not np.isclose(sum(city_probs), 1.0):
    raise ValueError(f"City probabilities must sum to 1.0, but sum to {sum(city_probs)}")

city_coords = {
    'New York': (40.7128, -74.0060), 'London': (51.5074, -0.1278), 'Paris': (48.8566, 2.3522),
    'Tokyo': (35.6762, 139.6503), 'Sydney': (-33.8688, 151.2093), 'Berlin': (52.5200, 13.4050),
    'Mumbai': (19.0760, 72.8777), 'São Paulo': (-23.5505, -46.6333), 'Toronto': (43.6532, -79.3832),
    'Dubai': (25.2048, 55.2708)
}

def generate_location(city):
    """Generate random coordinates near a city center, with occasional outliers."""
    if city not in city_coords:
        raise ValueError(f"Unknown city: {city}")
        
    base_lat, base_lon = city_coords[city]
    if random.random() < 0.8:
        # Most locations are near city center
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
    else:
        # Some locations are farther out (suburbs or neighboring areas)
        lat = base_lat + np.random.uniform(-2, 2)
        lon = base_lon + np.random.uniform(-2, 2)
    return lat, lon

def generate_users(n_users=20000):
    """Generate synthetic user data."""
    if n_users <= 0:
        raise ValueError("Number of users must be positive")
        
    users = []
    interests = ['music', 'sports', 'tech', 'food', 'art', 'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Ensure age is reasonable (avoid negative values from skewnorm)
        age = max(18, min(100, int(skewnorm.rvs(5, loc=25, scale=15))))
        
        # Generate weather preference probabilities
        weather_probs = dirichlet.rvs([0.3, 0.5, 0.2])[0]
        
        users.append({
            'user_id': fake.uuid4(),
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'weather_preference': np.random.choice(['indoor', 'outdoor', 'any'], p=weather_probs),
            'age': age,
            'declared_interests': random.sample(interests, k=random.randint(0, min(4, len(interests)))) if random.random() < 0.7 else [],
            'signup_date': fake.date_time_between(start_date='-2y', end_date='now'),
            'social_connectedness': np.random.poisson(lam=15)
        })
    return pd.DataFrame(users)

def generate_events(n_events=5000):
    """Generate synthetic event data."""
    if n_events <= 0:
        raise ValueError("Number of events must be positive")
        
    events = []
    # Updated event types
    event_types = [
        'Education & Learning', 'Technology', 'Seasonal & Festivals', 'Arts & Culture', 
        'Entertainment', 'Sports & Fitness', 'Business & Networking', 'Health & Wellness', 
        'Music & Concerts', 'Food & Drink', 'Community & Causes', 'Immersive Experiences'
    ]
    weather_conditions = ['Clear', 'Rain', 'Snow', 'Cloudy', 'Windy']
    weather_probs = [0.5, 0.2, 0.05, 0.2, 0.05]
    
    # Verify weather probabilities sum to 1
    if not np.isclose(sum(weather_probs), 1.0):
        raise ValueError(f"Weather probabilities must sum to 1.0, but sum to {sum(weather_probs)}")
    
    for _ in range(n_events):
        event_type = np.random.choice(event_types)
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Weather conditions with conditional probabilities based on event type
        if event_type in ['Sports & Fitness', 'Seasonal & Festivals']:
            # Ensure most outdoor events have Clear weather to meet validation requirements
            weather_condition = 'Clear' if random.random() < 0.8 else np.random.choice(['Rain', 'Cloudy'])
        elif event_type in ['Education & Learning', 'Technology', 'Business & Networking']:
            weather_condition = np.random.choice(['Clear', 'Cloudy'])
        else:
            weather_condition = np.random.choice(weather_conditions, p=weather_probs)
        
        # Generate start time with reasonable hours based on weekday/weekend
        start_time = fake.date_time_between(start_date='now', end_date='+6M')
        is_weekend = start_time.weekday() >= 5
        hour_choices = [10, 14, 18] if is_weekend else [9, 13, 18, 19]
        start_time = start_time.replace(hour=np.random.choice(hour_choices))
        
        # Generate other event attributes
        events.append({
            'event_id': fake.uuid4(),
            'title': f"{fake.catch_phrase()} {event_type} in {city}",
            'event_type': event_type,
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'start_time': start_time,
            'duration': np.random.choice([120, 180, 240, 360, 480]),  # Duration in minutes
            'weather_condition': weather_condition,
            'historical_attendance_rate': np.random.beta(a=2, b=5) * 100,  # Percentage
            'indoor_capability': event_type in ['Education & Learning', 'Technology', 'Business & Networking', 
                                               'Arts & Culture', 'Entertainment', 'Immersive Experiences']
        })
    return pd.DataFrame(events)


def calculate_time_weight(interaction_time, current_time, half_life=30):
    time_diff = (current_time - interaction_time).days
    return np.exp(np.log(0.5) * time_diff / half_life)


def generate_interactions(users, events, n_interactions=100000):
    """Generate synthetic user-event interactions with strong distance decay pattern."""
    if n_interactions <= 0:
        raise ValueError("Number of interactions must be positive")
    if users.empty or events.empty:
        raise ValueError("Users and events DataFrames cannot be empty")
        
    interactions = []
    event_weather = dict(zip(events['event_id'], events['weather_condition']))
    
    # Updated interaction types
    interaction_types = ['maybe', 'invited & maybe', 'no', 'yes', 'invited & yes', 'invited & no', 'invited']
    
    # Track distance distribution for reporting
    all_distances = []
    
    # Sample more potential interactions but only keep ones meeting our criteria
    attempts = n_interactions * 5  # Sample more to account for filtering
    
    for _ in range(attempts):
        if len(interactions) >= n_interactions:
            break
            
        # Sample a random user and event
        user = users.sample(1).iloc[0]
        event = events.sample(1).iloc[0]
        
        # Calculate distance between user and event
        distance = geodesic((user['location_lat'], user['location_lon']), 
                           (event['location_lat'], event['location_lon'])).km
        all_distances.append(distance)
        
        # Much stronger distance decay formula
        # Use a steeper exponential decay for distance
        distance_score = np.exp(-distance/10)  # Reduced from /5 to /20 for stronger decay
        
        # Calculate other factors
        weather_score = 1.2 if (event['weather_condition'] == 'Clear' and 
                               user['weather_preference'] in ['outdoor', 'any']) else 0.5
        social_score = np.log1p(user['social_connectedness']) / 10
        
        # Give much more weight to distance in the interaction probability
        interaction_prob = 0.85*distance_score + 0.1*weather_score + 0.05*social_score

        # Adjust max distance based on interaction probability
        max_distance = 50 if random.random() < 0.7 else 300
        
        # Create interaction with probability more strongly influenced by distance
        if distance < max_distance and (random.random() < interaction_prob):
            interaction_time = fake.date_time_between(
                start_date=event['start_time'] - timedelta(days=30), 
                end_date=event['start_time']
            )
            # Add the suggested time weighting
            current_time = datetime(2025, 3, 22, 19, 10, 0)  # Use the provided date and time
            time_weight = calculate_time_weight(interaction_time, current_time)
            interaction_prob *= time_weight
            
            # Determine probabilities for different interaction types based on distance
            if distance <= 5:
                interaction_type_probs = [0.15, 0.20, 0.05, 0.25, 0.20, 0.05, 0.10]  # High positive response for very close events
            elif distance <= 20:
                interaction_type_probs = [0.20, 0.15, 0.10, 0.20, 0.15, 0.10, 0.10]  # Good mix for nearby events
            elif distance <= 50:
                interaction_type_probs = [0.25, 0.10, 0.15, 0.15, 0.10, 0.15, 0.10]  # More maybe/no for medium distance
            elif distance <= 100:
                interaction_type_probs = [0.20, 0.05, 0.25, 0.10, 0.05, 0.20, 0.15]  # Higher no rate for longer distance
            else:
                interaction_type_probs = [0.15, 0.05, 0.30, 0.05, 0.05, 0.25, 0.15]  # Highest no rate for distant events
                
            interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(interaction_types, p=interaction_type_probs),
                'interaction_time': interaction_time,
                'weather_at_interaction': event_weather[event['event_id']],
                'distance_to_event': distance
            })
    
    result_df = pd.DataFrame(interactions)
    
    # Report distance distribution if enough interactions
    if all_distances:
        print("Distance distribution in sampling:")
        dist_bins = [0, 5, 20, 50, 100, 200, np.inf]
        hist, edges = np.histogram(all_distances, bins=dist_bins)
        for i in range(len(hist)):
            print(f"{edges[i]:.1f}-{edges[i+1] if edges[i+1] != np.inf else 'inf'} km: {hist[i]} samples ({hist[i]/len(all_distances):.1%})")
    
    # Limit to requested number of interactions
    if len(result_df) > n_interactions:
        result_df = result_df.sample(n_interactions)
        
    # Report final distance statistics for accepted interactions
    if not result_df.empty:
        print("\nFinal distance distribution in interactions:")
        print(result_df['distance_to_event'].describe())
    
    return result_df

def augment_cold_start(users, events, interactions):
    """Add interactions for new users to help with cold-start problem."""
    if users.empty or events.empty:
        raise ValueError("Users and events DataFrames cannot be empty")
        
    # Get users with declared interests (potential cold start users)
    cold_users = users[users['declared_interests'].apply(len) > 0]
    if cold_users.empty:
        print("Warning: No users with declared interests found for cold start augmentation")
        return interactions
        
    # Get trending events
    trending_events = events.nlargest(min(100, len(events)), 'historical_attendance_rate')
    
    # Updated interaction types
    interaction_types = ['maybe', 'invited & maybe', 'no', 'yes', 'invited & yes', 'invited & no', 'invited']
    
    cold_interactions = []
    for _, user in cold_users.iterrows():
        # Ensure sufficient interactions for cold-start users (at least 3)
        for _ in range(max(3, random.randint(3, 6))):
            event = trending_events.sample(1).iloc[0]
            
            # Calculate distance
            distance = geodesic((user['location_lat'], user['location_lon']), 
                              (event['location_lat'], event['location_lon'])).km
            
            # Add similar distance-based probabilities for cold start interactions
            if distance <= 5:
                interaction_type_probs = [0.15, 0.20, 0.05, 0.25, 0.20, 0.05, 0.10]  # High positive response for very close events
            elif distance <= 20:
                interaction_type_probs = [0.20, 0.15, 0.10, 0.20, 0.15, 0.10, 0.10]  # Good mix for nearby events
            elif distance <= 50:
                interaction_type_probs = [0.25, 0.10, 0.15, 0.15, 0.10, 0.15, 0.10]  # More maybe/no for medium distance
            elif distance <= 100:
                interaction_type_probs = [0.20, 0.05, 0.25, 0.10, 0.05, 0.20, 0.15]  # Higher no rate for longer distance
            else:
                interaction_type_probs = [0.15, 0.05, 0.30, 0.05, 0.05, 0.25, 0.15]  # Highest no rate for distant events
                
            cold_interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(interaction_types, p=interaction_type_probs),
                'interaction_time': fake.date_time_between(
                    start_date=user['signup_date'], 
                    end_date=user['signup_date'] + timedelta(days=7)
                ),
                'weather_at_interaction': event['weather_condition'],
                'distance_to_event': distance
            })
    
    # Combine with original interactions
    return pd.concat([interactions, pd.DataFrame(cold_interactions)], ignore_index=True)

def validate_data(users, events, interactions):
    """
    Validate the generated data to ensure it meets quality requirements.
    Raises AssertionError if validation fails.
    """
    print(f"Total users: {len(users)}")
    print(f"Total events: {len(events)}")
    print(f"Total interactions: {len(interactions)}")
    
    # Check user-event ratio
    assert len(users) > len(events), "There should be more users than events"
    
    # Check city distribution
    city_distribution = users['city'].value_counts(normalize=True)
    print("City distribution:")
    print(city_distribution)
    assert len(city_distribution) == 10, "Should have 10 cities"
    
    # Check weather-event alignment
    outdoor_events = events[events['event_type'].isin(['Sports & Fitness', 'Seasonal & Festivals'])]
    outdoor_clear_rate = (outdoor_events['weather_condition'] == 'Clear').mean()
    print(f"Outdoor events with Clear weather: {outdoor_clear_rate:.1%}")
    assert outdoor_clear_rate > 0.7, f"Too few outdoor events have Clear weather ({outdoor_clear_rate:.1%})"
    
    # Check distance decay pattern in interaction types
    try:
        # More robust distance binning approach
        distance_bins = [0, 5, 20, 50, 100, 200]
        bin_labels = [f"({distance_bins[i]}, {distance_bins[i+1]}]" for i in range(len(distance_bins)-1)]
        
        # Create a categorical bin column
        interactions['distance_bin'] = pd.cut(
            interactions['distance_to_event'], 
            bins=distance_bins,
            labels=bin_labels,
            include_lowest=True
        )
        
        # Compute aggregated positive response rates by distance bin
        # Considering 'yes' and 'invited & yes' as positive responses
        bin_stats = interactions.groupby('distance_bin')['interaction_type'].apply(
            lambda x: (x.isin(['yes', 'invited & yes'])).mean()
        ).reset_index(name='positive_rate')
        
        # Display results for all bins
        print("\nDistance Decay Pattern Analysis:")
        print(bin_stats)
        
        # Only compare bins that have data
        valid_bins = bin_stats['distance_bin'].dropna().tolist()
        if len(valid_bins) >= 2:
            first_bin = valid_bins[0]
            last_bin = valid_bins[-1]
            
            first_bin_rate = bin_stats.loc[bin_stats['distance_bin'] == first_bin, 'positive_rate'].values[0]
            last_bin_rate = bin_stats.loc[bin_stats['distance_bin'] == last_bin, 'positive_rate'].values[0]
            
            print(f"\nFirst bin ({first_bin}): {first_bin_rate:.2%} positive response rate")
            print(f"Last bin ({last_bin}): {last_bin_rate:.2%} positive response rate")
            
            if first_bin_rate > 0 and last_bin_rate > 0:
                decay_ratio = first_bin_rate / last_bin_rate
                print(f"Decay ratio: {decay_ratio:.2f}x")
                
                # Better validation criterion
                assert decay_ratio > 1.75, f"Distance decay ratio ({decay_ratio:.2f}) is too low"
            else:
                print("Warning: Cannot calculate decay ratio due to zero values")
        else:
            print("Warning: Not enough distance bins for analysis")
    except Exception as e:
        print(f"Warning: Could not analyze distance decay pattern: {e}")
        raise  # Re-raise to ensure validation fails if this check fails
    
    # Check cold-start coverage
    try:
        cold_users = users[users['declared_interests'].apply(len) > 0]
        if not cold_users.empty:
            cold_user_ids = set(cold_users['user_id'])
            cold_interaction_counts = (interactions[interactions['user_id'].isin(cold_user_ids)]
                                      .groupby('user_id').size())
            
            cold_coverage = (cold_interaction_counts >= 3).mean() if not cold_interaction_counts.empty else 0
            print(f"Cold-start users with 3+ interactions: {cold_coverage:.1%}")
            assert cold_coverage > 0.8, f"Cold-start coverage insufficient: {cold_coverage:.1%}"
        else:
            print("Warning: No cold-start users identified")
    except Exception as e:
        print(f"Warning: Could not validate cold-start coverage: {e}")
        raise  # Re-raise to ensure validation fails

def main(output_dir='.', n_users=10000, n_events=5000, n_interactions=100000, validate=True):
    """Main function to generate and save all synthetic datasets."""
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        print(f"Generating {n_users} users...")
        users_df = generate_users(n_users)
        
        print(f"Generating {n_events} events...")
        events_df = generate_events(n_events)
        
        print(f"Generating {n_interactions} base interactions...")
        interactions_df = generate_interactions(users_df, events_df, n_interactions)
        
        print("Adding cold-start interactions...")
        full_interactions = augment_cold_start(users_df, events_df, interactions_df)
        
        # Validate data if requested
        if validate:
            print("\n=== Data Validation ===")
            validate_data(users_df, events_df, full_interactions)
            print("=== Validation Successful ===\n")
        
        # Save to CSV files
        users_df.to_csv(os.path.join(output_dir, 'synthetic_users.csv'), index=False)
        events_df.to_csv(os.path.join(output_dir, 'synthetic_events.csv'), index=False)
        full_interactions.to_csv(os.path.join(output_dir, 'synthetic_interactions.csv'), index=False)
        
        print(f"Successfully generated:")
        print(f"- {len(users_df)} users")
        print(f"- {len(events_df)} events")
        print(f"- {len(full_interactions)} interactions (including {len(full_interactions) - len(interactions_df)} cold-start interactions)")
        print(f"Files saved to {output_dir}")
        
        return users_df, events_df, full_interactions
        
    except AssertionError as ae:
        print(f"Validation failed: {ae}")
        raise
    except Exception as e:
        print(f"Error during data generation: {e}")
        raise

if __name__ == "__main__":
    main()

Generating 10000 users...
Generating 5000 events...
Generating 100000 base interactions...
Distance distribution in sampling:
0.0-5.0 km: 6761 samples (1.4%)
5.0-20.0 km: 30485 samples (6.1%)
20.0-50.0 km: 2031 samples (0.4%)
50.0-100.0 km: 3355 samples (0.7%)
100.0-200.0 km: 12019 samples (2.4%)
200.0-inf km: 445349 samples (89.1%)

Final distance distribution in interactions:
count    17423.000000
mean        13.441899
std         29.695545
min          0.045958
25%          4.658636
50%          7.870139
75%         11.955413
max        298.508345
Name: distance_to_event, dtype: float64
Adding cold-start interactions...

=== Data Validation ===
Total users: 10000
Total events: 5000
Total interactions: 42455
City distribution:
city
New York     0.1979
London       0.1541
Paris        0.1036
Mumbai       0.1002
Tokyo        0.1000
Toronto      0.0981
Berlin       0.0937
Dubai        0.0513
Sydney       0.0512
São Paulo    0.0499
Name: proportion, dtype: float64
Outdoor events with Cle

  bin_stats = interactions.groupby('distance_bin')['interaction_type'].apply(


Successfully generated:
- 10000 users
- 5000 events
- 42455 interactions (including 25032 cold-start interactions)
Files saved to .


In [2]:
import pandas as pd
users = pd.read_csv('/home/nkama/masters_thesis_project/thesis/sythesize_from_scratch/synthetic_users.csv')
users.head()


Unnamed: 0,user_id,location_lat,location_lon,city,weather_preference,age,declared_interests,signup_date,social_connectedness
0,35e1b687-69d8-463c-8dc8-ca69a6fb443c,48.946743,2.398599,Paris,outdoor,40,"['food', 'fashion']",2024-04-14 18:40:47.717122,11
1,adc6995a-f086-4a20-b9bf-9833a4fbd8c9,51.512351,-0.141411,London,outdoor,32,"['sports', 'cinema', 'music', 'travel']",2024-12-27 07:12:36.083451,16
2,563c8269-d89a-4d3f-9f16-ba0616e7c0a3,40.802577,-73.912874,New York,outdoor,31,"['music', 'fitness', 'food', 'literature']",2024-04-15 07:50:36.930433,15
3,d02cc747-d2c1-433e-9aa3-dbdd087d02f2,25.259827,55.3587,Dubai,indoor,30,['travel'],2024-01-18 05:13:44.494596,19
4,935aa5d5-64e2-448a-96b2-1c1bdb869bec,51.515939,-0.199615,London,indoor,36,[],2025-02-25 06:10:20.602422,13


In [4]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   user_id               10000 non-null  object 
 1   location_lat          10000 non-null  float64
 2   location_lon          10000 non-null  float64
 3   city                  10000 non-null  object 
 4   weather_preference    10000 non-null  object 
 5   age                   10000 non-null  int64  
 6   declared_interests    10000 non-null  object 
 7   signup_date           10000 non-null  object 
 8   social_connectedness  10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


In [None]:
users

In [3]:
interactions = pd.read_csv('/home/nkama/masters_thesis_project/thesis/sythesize_from_scratch/synthetic_interactions.csv')
interactions.head()


Unnamed: 0,interaction_id,user_id,event_id,interaction_type,interaction_time,distance_to_event,distance_bin
0,80c36646-fe7f-487f-bcba-248d663ff0bf,44814877-2f5d-4fef-a420-f305c0a33357,7be347c2-7ba9-423a-9565-610d6b21d90e,invited,2025-06-09 02:31:00.533790,6.152134,"(5, 20]"
1,0c4caf5b-d079-49bb-80ad-aa303da749f6,9f366ca7-e1b5-4db5-8e31-e9676a4165f2,39a2d3ae-c768-4bd6-b7c7-2d89f27a0968,invited & maybe,2025-09-02 12:17:35.772288,5.526306,"(5, 20]"
2,6d597450-4b74-4960-ba89-ec35c816cb2c,da08c987-35b5-414f-8e51-300714fb2edb,e0d7517b-999c-4aa3-a8c7-19b26a7d4644,invited & no,2025-06-13 22:34:14.975167,20.687302,"(20, 50]"
3,d45245de-9c57-481a-a840-d12a4b220daa,05afca6c-914c-4f73-a29f-82b59efc163a,92b39250-3685-4a79-97ec-35434f1f143f,invited,2025-03-11 21:10:31.995579,17.244582,"(5, 20]"
4,7a976069-e90b-40cb-9c75-1a65caeba6be,031b412a-bf2b-4edc-9177-774af0d6f753,05e91fb2-6982-4989-8881-e25df3c71532,invited & yes,2025-07-07 20:26:55.940371,10.094918,"(5, 20]"


In [4]:
events = pd.read_csv('/home/nkama/masters_thesis_project/thesis/sythesize_from_scratch/synthetic_events.csv')
events.head()

Unnamed: 0,event_id,title,event_type,location_lat,location_lon,city,start_time,duration,weather_condition,temperature,historical_attendance_rate,indoor_capability
0,bd885fb7-6ab6-4033-9983-fa0929c4b5dd,Visionary heuristic orchestration Seasonal & F...,Seasonal & Festivals,40.626475,-74.061008,New York,2025-06-04 13:59:31.784973,120,Clear,18.4,20.656472,False
1,519d3580-b6a2-4c33-b24c-cbc479f3602d,Horizontal background customer loyalty Seasona...,Seasonal & Festivals,-23.6175,-46.709916,São Paulo,2025-05-24 14:43:05.604231,120,Clear,28.9,48.875548,False
2,26cb44b3-589b-4b39-be7a-265ff482ffd4,Persistent upward-trending hierarchy Education...,Education & Learning,40.718535,-72.365064,New York,2025-08-25 19:46:52.947500,180,Clear,19.2,40.198146,True
3,e77360aa-c46e-49fb-8f78-6b942cd41b91,Implemented static installation Education & Le...,Education & Learning,49.643585,0.210627,London,2025-06-15 18:10:08.978852,180,Cloudy,13.5,21.595591,True
4,43d8e18c-fa2e-42f7-a7fa-ea0e144a2746,Inverse multi-tasking circuit Arts & Culture i...,Arts & Culture,19.140716,72.974755,Mumbai,2025-04-17 09:48:38.589681,120,Clear,32.5,30.103833,True


In [8]:
interaction_users_events = interactions[['event_id','user_id']].merge(users, on='user_id', how='inner') \
                                           .merge(events, on='event_id', how='inner')

len(interaction_users_events)

42455

In [9]:
interaction_users_events.head()

Unnamed: 0,event_id,user_id,location_lat_x,location_lon_x,city_x,weather_preference,age,declared_interests,signup_date,social_connectedness,title,event_type,location_lat_y,location_lon_y,city_y,start_time,duration,weather_condition,historical_attendance_rate,indoor_capability
0,7fe13531-2c5f-4dd1-8fcf-a2ab793639f0,bf071cf2-9594-4bb8-b631-3936dce28e01,40.78785,-73.979098,New York,any,27,"['music', 'travel']",2024-10-30 05:24:14.367302,13,Inverse analyzing flexibility Seasonal & Festi...,Seasonal & Festivals,40.750939,-74.061531,New York,2025-04-28 09:56:09.170428,240,Clear,22.808446,False
1,17940102-d5c4-489c-854a-25ecc439dfa6,086c164a-e2e9-451d-9b83-1509a969b72b,52.604626,13.438382,Berlin,any,39,"['food', 'music', 'cinema', 'sports']",2024-08-23 20:38:57.363965,14,Public-key tangible contingency Sports & Fitne...,Sports & Fitness,52.452497,13.375187,Berlin,2025-08-01 09:44:58.106131,360,Clear,61.169269,False
2,3ad182f4-823c-48e1-9af5-0ac64e806ecb,f76ecafb-fc42-47a0-ad28-d6ae9061b18d,51.493821,-0.199043,London,indoor,42,[],2024-03-27 01:11:13.720787,19,Balanced executive open system Entertainment i...,Entertainment,51.592055,-0.142099,London,2025-07-01 19:58:34.853074,180,Clear,16.363479,True
3,29b0111c-900e-4c04-ba0c-ae6e3dbaf8c4,3f0bda95-366a-4ccc-961b-f7ce45731749,51.506281,-0.154534,London,any,38,[],2024-06-11 22:33:52.514397,12,Intuitive uniform workforce Entertainment in L...,Entertainment,51.524089,-0.157917,London,2025-06-20 09:50:12.155368,120,Clear,57.868725,True
4,d4c5fdf3-ac81-485c-80ff-40ff7824f910,49c92ca8-6c05-4136-b5a5-ca48f91817cc,40.674816,-73.974169,New York,any,28,"['literature', 'cinema']",2023-11-13 23:55:33.990432,19,Right-sized methodical Local Area Network Educ...,Education & Learning,40.637339,-73.947771,New York,2025-06-30 18:32:57.854252,120,Clear,32.121606,True


In [11]:
interaction_users_events['event_id'].nunique()

4169

In [13]:
len(users)

10000

In [10]:
interaction_users_events['user_id'].nunique()

8628

In [14]:
import numpy as np
import pandas as pd
from faker import Faker
from geopy.distance import geodesic
from datetime import datetime, timedelta
import random
from scipy.stats import skewnorm, dirichlet
import os

# Initialize Faker and set seed for reproducibility
fake = Faker()
np.random.seed(42)
random.seed(42)

# City coordinates and probabilities
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
city_probs = [0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05]
# Verify that probabilities sum to 1.0
if not np.isclose(sum(city_probs), 1.0):
    raise ValueError(f"City probabilities must sum to 1.0, but sum to {sum(city_probs)}")

city_coords = {
    'New York': (40.7128, -74.0060), 'London': (51.5074, -0.1278), 'Paris': (48.8566, 2.3522),
    'Tokyo': (35.6762, 139.6503), 'Sydney': (-33.8688, 151.2093), 'Berlin': (52.5200, 13.4050),
    'Mumbai': (19.0760, 72.8777), 'São Paulo': (-23.5505, -46.6333), 'Toronto': (43.6532, -79.3832),
    'Dubai': (25.2048, 55.2708)
}

def generate_location(city):
    """Generate random coordinates near a city center, with occasional outliers."""
    if city not in city_coords:
        raise ValueError(f"Unknown city: {city}")
        
    base_lat, base_lon = city_coords[city]
    if random.random() < 0.8:
        # Most locations are near city center
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
    else:
        # Some locations are farther out (suburbs or neighboring areas)
        lat = base_lat + np.random.uniform(-2, 2)
        lon = base_lon + np.random.uniform(-2, 2)
    return lat, lon

def generate_users(n_users=20000):
    """Generate synthetic user data."""
    if n_users <= 0:
        raise ValueError("Number of users must be positive")
        
    users = []
    interests = ['music', 'sports', 'tech', 'food', 'art', 'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Ensure age is reasonable (avoid negative values from skewnorm)
        age = max(18, min(100, int(skewnorm.rvs(5, loc=25, scale=15))))
        
        # Generate weather preference probabilities
        weather_probs = dirichlet.rvs([0.3, 0.5, 0.2])[0]
        
        users.append({
            'user_id': fake.uuid4(),
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'weather_preference': np.random.choice(['indoor', 'outdoor', 'any'], p=weather_probs),
            'age': age,
            'declared_interests': random.sample(interests, k=random.randint(0, min(4, len(interests)))) if random.random() < 0.7 else [],
            'signup_date': fake.date_time_between(start_date='-2y', end_date='now'),
            'social_connectedness': np.random.poisson(lam=15)
        })
    return pd.DataFrame(users)

def generate_events(n_events=5000):
    """Generate synthetic event data."""
    if n_events <= 0:
        raise ValueError("Number of events must be positive")
        
    events = []
    # Updated event types
    event_types = [
        'Education & Learning', 'Technology', 'Seasonal & Festivals', 'Arts & Culture', 
        'Entertainment', 'Sports & Fitness', 'Business & Networking', 'Health & Wellness', 
        'Music & Concerts', 'Food & Drink', 'Community & Causes', 'Immersive Experiences'
    ]
    weather_conditions = ['Clear', 'Rain', 'Snow', 'Cloudy', 'Windy']
    weather_probs = [0.5, 0.2, 0.05, 0.2, 0.05]
    
    # Verify weather probabilities sum to 1
    if not np.isclose(sum(weather_probs), 1.0):
        raise ValueError(f"Weather probabilities must sum to 1.0, but sum to {sum(weather_probs)}")
    
    for _ in range(n_events):
        event_type = np.random.choice(event_types)
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Weather conditions with conditional probabilities based on event type
        if event_type in ['Sports & Fitness', 'Seasonal & Festivals']:
            # Ensure most outdoor events have Clear weather to meet validation requirements
            weather_condition = 'Clear' if random.random() < 0.8 else np.random.choice(['Rain', 'Cloudy'])
        elif event_type in ['Education & Learning', 'Technology', 'Business & Networking']:
            weather_condition = np.random.choice(['Clear', 'Cloudy'])
        else:
            weather_condition = np.random.choice(weather_conditions, p=weather_probs)
        
        # Generate temperature based on location and weather condition (Celsius)
        base_temp = {
            'New York': 15, 'London': 12, 'Paris': 16, 'Tokyo': 20, 
            'Sydney': 22, 'Berlin': 14, 'Mumbai': 28, 'São Paulo': 24, 
            'Toronto': 10, 'Dubai': 32
        }[city]
        
        # Adjust temperature based on weather condition
        temp_adjustment = {
            'Clear': np.random.uniform(2, 5),
            'Rain': np.random.uniform(-3, 0),
            'Snow': np.random.uniform(-8, -3),
            'Cloudy': np.random.uniform(-1, 2),
            'Windy': np.random.uniform(-2, 1)
        }[weather_condition]
        
        temperature = round(base_temp + temp_adjustment, 1)
        
        # Generate start time with reasonable hours based on weekday/weekend
        start_time = fake.date_time_between(start_date='now', end_date='+6M')
        is_weekend = start_time.weekday() >= 5
        hour_choices = [10, 14, 18] if is_weekend else [9, 13, 18, 19]
        start_time = start_time.replace(hour=np.random.choice(hour_choices))
        
        # Generate other event attributes
        events.append({
            'event_id': fake.uuid4(),
            'title': f"{fake.catch_phrase()} {event_type} in {city}",
            'event_type': event_type,
            'location_lat': lat,
            'location_lon': lon,
            'city': city,
            'start_time': start_time,
            'duration': np.random.choice([120, 180, 240, 360, 480]),  # Duration in minutes
            'weather_condition': weather_condition,
            'temperature': temperature,
            'historical_attendance_rate': np.random.beta(a=2, b=5) * 100,  # Percentage
            'indoor_capability': event_type in ['Education & Learning', 'Technology', 'Business & Networking', 
                                               'Arts & Culture', 'Entertainment', 'Immersive Experiences']
        })
    return pd.DataFrame(events)

def calculate_time_weight(interaction_time, current_time, half_life=30):
    time_diff = (current_time - interaction_time).days
    return np.exp(np.log(0.5) * time_diff / half_life)

def generate_interactions(users, events, n_interactions=100000):
    """Generate synthetic user-event interactions with strong distance decay pattern."""
    if n_interactions <= 0:
        raise ValueError("Number of interactions must be positive")
    if users.empty or events.empty:
        raise ValueError("Users and events DataFrames cannot be empty")
        
    interactions = []
    
    # Updated interaction types
    interaction_types = ['maybe', 'invited & maybe', 'no', 'yes', 'invited & yes', 'invited & no', 'invited']
    
    # Track distance distribution for reporting
    all_distances = []
    
    # Sample more potential interactions but only keep ones meeting our criteria
    attempts = n_interactions * 5  # Sample more to account for filtering
    
    for _ in range(attempts):
        if len(interactions) >= n_interactions:
            break
            
        # Sample a random user and event
        user = users.sample(1).iloc[0]
        event = events.sample(1).iloc[0]
        
        # Calculate distance between user and event
        distance = geodesic((user['location_lat'], user['location_lon']), 
                           (event['location_lat'], event['location_lon'])).km
        all_distances.append(distance)
        
        # Much stronger distance decay formula
        # Use a steeper exponential decay for distance
        distance_score = np.exp(-distance/10)  # Reduced from /5 to /20 for stronger decay
        
        # Calculate other factors
        weather_score = 1.2 if (event['weather_condition'] == 'Clear' and 
                               user['weather_preference'] in ['outdoor', 'any']) else 0.5
        social_score = np.log1p(user['social_connectedness']) / 10
        
        # Give much more weight to distance in the interaction probability
        interaction_prob = 0.85*distance_score + 0.1*weather_score + 0.05*social_score

        # Adjust max distance based on interaction probability
        max_distance = 50 if random.random() < 0.7 else 300
        
        # Create interaction with probability more strongly influenced by distance
        if distance < max_distance and (random.random() < interaction_prob):
            interaction_time = fake.date_time_between(
                start_date=event['start_time'] - timedelta(days=30), 
                end_date=event['start_time']
            )
            # Add the suggested time weighting
            current_time = datetime(2025, 3, 22, 19, 10, 0)  # Use the provided date and time
            time_weight = calculate_time_weight(interaction_time, current_time)
            interaction_prob *= time_weight
            
            # Determine probabilities for different interaction types based on distance
            if distance <= 5:
                interaction_type_probs = [0.15, 0.20, 0.05, 0.25, 0.20, 0.05, 0.10]  # High positive response for very close events
            elif distance <= 20:
                interaction_type_probs = [0.20, 0.15, 0.10, 0.20, 0.15, 0.10, 0.10]  # Good mix for nearby events
            elif distance <= 50:
                interaction_type_probs = [0.25, 0.10, 0.15, 0.15, 0.10, 0.15, 0.10]  # More maybe/no for medium distance
            elif distance <= 100:
                interaction_type_probs = [0.20, 0.05, 0.25, 0.10, 0.05, 0.20, 0.15]  # Higher no rate for longer distance
            else:
                interaction_type_probs = [0.15, 0.05, 0.30, 0.05, 0.05, 0.25, 0.15]  # Highest no rate for distant events
                
            interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(interaction_types, p=interaction_type_probs),
                'interaction_time': interaction_time,
                'distance_to_event': distance
            })
    
    result_df = pd.DataFrame(interactions)
    
    # Report distance distribution if enough interactions
    if all_distances:
        print("Distance distribution in sampling:")
        dist_bins = [0, 5, 20, 50, 100, 200, np.inf]
        hist, edges = np.histogram(all_distances, bins=dist_bins)
        for i in range(len(hist)):
            print(f"{edges[i]:.1f}-{edges[i+1] if edges[i+1] != np.inf else 'inf'} km: {hist[i]} samples ({hist[i]/len(all_distances):.1%})")
    
    # Limit to requested number of interactions
    if len(result_df) > n_interactions:
        result_df = result_df.sample(n_interactions)
        
    # Report final distance statistics for accepted interactions
    if not result_df.empty:
        print("\nFinal distance distribution in interactions:")
        print(result_df['distance_to_event'].describe())
    
    return result_df

def augment_cold_start(users, events, interactions):
    """Add interactions for new users to help with cold-start problem."""
    if users.empty or events.empty:
        raise ValueError("Users and events DataFrames cannot be empty")
        
    # Get users with declared interests (potential cold start users)
    cold_users = users[users['declared_interests'].apply(len) > 0]
    if cold_users.empty:
        print("Warning: No users with declared interests found for cold start augmentation")
        return interactions
        
    # Get trending events
    trending_events = events.nlargest(min(100, len(events)), 'historical_attendance_rate')
    
    # Updated interaction types
    interaction_types = ['maybe', 'invited & maybe', 'no', 'yes', 'invited & yes', 'invited & no', 'invited']
    
    cold_interactions = []
    for _, user in cold_users.iterrows():
        # Ensure sufficient interactions for cold-start users (at least 3)
        for _ in range(max(3, random.randint(3, 6))):
            event = trending_events.sample(1).iloc[0]
            
            # Calculate distance
            distance = geodesic((user['location_lat'], user['location_lon']), 
                              (event['location_lat'], event['location_lon'])).km
            
            # Add similar distance-based probabilities for cold start interactions
            if distance <= 5:
                interaction_type_probs = [0.15, 0.20, 0.05, 0.25, 0.20, 0.05, 0.10]  # High positive response for very close events
            elif distance <= 20:
                interaction_type_probs = [0.20, 0.15, 0.10, 0.20, 0.15, 0.10, 0.10]  # Good mix for nearby events
            elif distance <= 50:
                interaction_type_probs = [0.25, 0.10, 0.15, 0.15, 0.10, 0.15, 0.10]  # More maybe/no for medium distance
            elif distance <= 100:
                interaction_type_probs = [0.20, 0.05, 0.25, 0.10, 0.05, 0.20, 0.15]  # Higher no rate for longer distance
            else:
                interaction_type_probs = [0.15, 0.05, 0.30, 0.05, 0.05, 0.25, 0.15]  # Highest no rate for distant events
                
            cold_interactions.append({
                'interaction_id': fake.uuid4(),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(interaction_types, p=interaction_type_probs),
                'interaction_time': fake.date_time_between(
                    start_date=user['signup_date'], 
                    end_date=user['signup_date'] + timedelta(days=7)
                ),
                'distance_to_event': round(distance)
            })
    
    # Combine with original interactions
    return pd.concat([interactions, pd.DataFrame(cold_interactions)], ignore_index=True)

def validate_data(users, events, interactions):
    """
    Validate the generated data to ensure it meets quality requirements.
    Raises AssertionError if validation fails.
    """
    print(f"Total users: {len(users)}")
    print(f"Total events: {len(events)}")
    print(f"Total interactions: {len(interactions)}")
    
    # Check user-event ratio
    assert len(users) > len(events), "There should be more users than events"
    
    # Check city distribution
    city_distribution = users['city'].value_counts(normalize=True)
    print("City distribution:")
    print(city_distribution)
    assert len(city_distribution) == 10, "Should have 10 cities"
    
    # Check weather-event alignment
    outdoor_events = events[events['event_type'].isin(['Sports & Fitness', 'Seasonal & Festivals'])]
    outdoor_clear_rate = (outdoor_events['weather_condition'] == 'Clear').mean()
    print(f"Outdoor events with Clear weather: {outdoor_clear_rate:.1%}")
    assert outdoor_clear_rate > 0.7, f"Too few outdoor events have Clear weather ({outdoor_clear_rate:.1%})"
    
    # Check temperature distribution
    print("\nTemperature Distribution:")
    print(events['temperature'].describe())
    
    # Consistency check for temperature by city and weather condition
    temp_by_city_weather = events.groupby(['city', 'weather_condition'])['temperature'].agg(['mean', 'min', 'max'])
    print("\nTemperature by City and Weather Condition:")
    print(temp_by_city_weather)
    
    # Check distance decay pattern in interaction types
    try:
        # More robust distance binning approach
        distance_bins = [0, 5, 20, 50, 100, 200]
        bin_labels = [f"({distance_bins[i]}, {distance_bins[i+1]}]" for i in range(len(distance_bins)-1)]
        
        # Create a categorical bin column
        interactions['distance_bin'] = pd.cut(
            interactions['distance_to_event'], 
            bins=distance_bins,
            labels=bin_labels,
            include_lowest=True
        )
        
        # Compute aggregated positive response rates by distance bin
        # Considering 'yes' and 'invited & yes' as positive responses
        bin_stats = interactions.groupby('distance_bin')['interaction_type'].apply(
            lambda x: (x.isin(['yes', 'invited & yes'])).mean()
        ).reset_index(name='positive_rate')
        
        # Display results for all bins
        print("\nDistance Decay Pattern Analysis:")
        print(bin_stats)
        
        # Only compare bins that have data
        valid_bins = bin_stats['distance_bin'].dropna().tolist()
        if len(valid_bins) >= 2:
            first_bin = valid_bins[0]
            last_bin = valid_bins[-1]
            
            first_bin_rate = bin_stats.loc[bin_stats['distance_bin'] == first_bin, 'positive_rate'].values[0]
            last_bin_rate = bin_stats.loc[bin_stats['distance_bin'] == last_bin, 'positive_rate'].values[0]
            
            print(f"\nFirst bin ({first_bin}): {first_bin_rate:.2%} positive response rate")
            print(f"Last bin ({last_bin}): {last_bin_rate:.2%} positive response rate")
            
            if first_bin_rate > 0 and last_bin_rate > 0:
                decay_ratio = first_bin_rate / last_bin_rate
                print(f"Decay ratio: {decay_ratio:.2f}x")
                
                # Better validation criterion
                assert decay_ratio > 1.75, f"Distance decay ratio ({decay_ratio:.2f}) is too low"
            else:
                print("Warning: Cannot calculate decay ratio due to zero values")
        else:
            print("Warning: Not enough distance bins for analysis")
    except Exception as e:
        print(f"Warning: Could not analyze distance decay pattern: {e}")
        raise  # Re-raise to ensure validation fails if this check fails
    
    # Check cold-start coverage
    try:
        cold_users = users[users['declared_interests'].apply(len) > 0]
        if not cold_users.empty:
            cold_user_ids = set(cold_users['user_id'])
            cold_interaction_counts = (interactions[interactions['user_id'].isin(cold_user_ids)]
                                      .groupby('user_id').size())
            
            cold_coverage = (cold_interaction_counts >= 3).mean() if not cold_interaction_counts.empty else 0
            print(f"Cold-start users with 3+ interactions: {cold_coverage:.1%}")
            assert cold_coverage > 0.8, f"Cold-start coverage insufficient: {cold_coverage:.1%}"
        else:
            print("Warning: No cold-start users identified")
    except Exception as e:
        print(f"Warning: Could not validate cold-start coverage: {e}")
        raise  # Re-raise to ensure validation fails

def main(output_dir='.', n_users=10000, n_events=5000, n_interactions=100000, validate=True):
    """Main function to generate and save all synthetic datasets."""
    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)
        
        print(f"Generating {n_users} users...")
        users_df = generate_users(n_users)
        
        print(f"Generating {n_events} events...")
        events_df = generate_events(n_events)
        
        print(f"Generating {n_interactions} base interactions...")
        interactions_df = generate_interactions(users_df, events_df, n_interactions)
        
        print("Adding cold-start interactions...")
        full_interactions = augment_cold_start(users_df, events_df, interactions_df)
        
        # Validate data if requested
        if validate:
            print("\n=== Data Validation ===")
            validate_data(users_df, events_df, full_interactions)
            print("=== Validation Successful ===\n")
        
        # Save to CSV files
        users_df.to_csv(os.path.join(output_dir, 'synthetic_users.csv'), index=False)
        events_df.to_csv(os.path.join(output_dir, 'synthetic_events.csv'), index=False)
        full_interactions.to_csv(os.path.join(output_dir, 'synthetic_interactions.csv'), index=False)
        
        print(f"Successfully generated:")
        print(f"- {len(users_df)} users")
        print(f"- {len(events_df)} events")
        print(f"- {len(full_interactions)} interactions (including {len(full_interactions) - len(interactions_df)} cold-start interactions)")
        print(f"Files saved to {output_dir}")
        
        return users_df, events_df, full_interactions
        
    except AssertionError as ae:
        print(f"Validation failed: {ae}")
        raise
    except Exception as e:
        print(f"Error during data generation: {e}")
        raise

if __name__ == "__main__":
    main()

Generating 10000 users...
Generating 5000 events...
Generating 100000 base interactions...
Distance distribution in sampling:
0.0-5.0 km: 6383 samples (1.3%)
5.0-20.0 km: 30395 samples (6.1%)
20.0-50.0 km: 2131 samples (0.4%)
50.0-100.0 km: 3478 samples (0.7%)
100.0-200.0 km: 12059 samples (2.4%)
200.0-inf km: 445554 samples (89.1%)

Final distance distribution in interactions:
count    17402.000000
mean        13.638628
std         30.182407
min          0.034959
25%          4.761164
50%          7.969416
75%         11.969957
max        298.766857
Name: distance_to_event, dtype: float64
Adding cold-start interactions...

=== Data Validation ===
Total users: 10000
Total events: 5000
Total interactions: 42447
City distribution:
city
New York     0.1979
London       0.1541
Paris        0.1036
Mumbai       0.1002
Tokyo        0.1000
Toronto      0.0981
Berlin       0.0937
Dubai        0.0513
Sydney       0.0512
São Paulo    0.0499
Name: proportion, dtype: float64
Outdoor events with Cle

  bin_stats = interactions.groupby('distance_bin')['interaction_type'].apply(


Successfully generated:
- 10000 users
- 5000 events
- 42447 interactions (including 25045 cold-start interactions)
Files saved to .
