In [None]:
import numpy as np
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta
from geopy.distance import geodesic
from scipy.stats import skewnorm, dirichlet
from mimesis import Generic
import re

# Initialize Faker and set seed for reproducibility
fake = Faker()
generic = Generic('en')
np.random.seed(42)
random.seed(42)

# ====================== #
# Global City Configurations #
# ====================== #
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 
          'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
city_probs = [0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05]
city_coords = {
    'New York': (40.7128, -74.0060),
    'London': (51.5074, -0.1278),
    'Paris': (48.8566, 2.3522),
    'Tokyo': (35.6762, 139.6503),
    'Sydney': (-33.8688, 151.2093),
    'Berlin': (52.5200, 13.4050),
    'Mumbai': (19.0760, 72.8777),
    'São Paulo': (-23.5505, -46.6333),
    'Toronto': (43.6532, -79.3832),
    'Dubai': (25.2048, 55.2708)
}

# ====================== #
# 1. Location Generation #
# ====================== #
def generate_location(city):
    base_lat, base_lon = city_coords[city]
    if random.random() < 0.8:  # 80% nearby, 20% further out
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
    else:
        lat = base_lat + np.random.uniform(-2, 2)
        lon = base_lon + np.random.uniform(-2, 2)
    return lat, lon

# ====================== #
# 2. User Generation #
# ====================== #
def generate_users(n_users=20000):
    users = []
    interests = ['music', 'sports', 'tech', 'food', 'art', 
                'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        # City selection uses global cities and city_probs
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Rest of user generation logic...

interest_event_map = {
    'music': ['Music & Concerts', 'Festivals'],
    'sports': ['Sports Competitions', 'Fitness Events'],
    'tech': ['Tech Conferences', 'Startup Events'],
    'food': ['Food Festivals', 'Culinary Workshops'],
    'art': ['Art Exhibitions', 'Museum Events'],
    'literature': ['Book Fairs', 'Writing Workshops'],
    'cinema': ['Film Festivals', 'Movie Premieres'],
    'travel': ['Travel Expos', 'Adventure Tours'],
    'fitness': ['Marathons', 'Yoga Retreats'],
    'fashion': ['Fashion Shows', 'Designer Events']
}

event_type_weights = {
    'Music & Concerts': 0.18, 'Sports Competitions': 0.15, 
    'Tech Conferences': 0.12, 'Food Festivals': 0.15,
    'Art Exhibitions': 0.10, 'Book Fairs': 0.08,
    'Film Festivals': 0.10, 'Travel Expos': 0.05,
    'Marathons': 0.04, 'Fashion Shows': 0.03
}

# ====================== #
# 2. User Generation #
# ====================== #
def generate_users(n_users=20000):
    users = []
    all_interests = ['music', 'sports', 'tech', 'food', 'art', 
                    'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        age = max(18, min(100, int(skewnorm.rvs(5, loc=25, scale=15))))
        
        # Gender-aware preferences
        gender = generic.person.gender()
        age_group = 'young' if age < 30 else 'middle' if age < 50 else 'senior'
        
        # Generate core interest + 1-3 additional interests
        core_interest = random.choice(list(interest_event_map.keys()))
        interests = [core_interest] + random.sample(interests, k=random.randint(1, 3))
        interests = list(set(interests))[:4]  # Max 4 unique interests
        
        users.append({
            'user_id': generic.person.identifier(mask='@@###@'),
            'gender': gender,
            'age_group': age_group,
            'user_lat': lat,
            'user_lon': lon,
            'user_city': city,
            'user_weather_preference': np.random.choice(['indoor', 'outdoor', 'any'], 
                                                      p=[0.2, 0.3, 0.5]),
            'age': age,
            'core_interest': core_interest,
            'secondary_interests': ','.join(interests[1:]),
            'social_connectedness': np.random.poisson(lam=15)
        })
    return pd.DataFrame(users)

# ====================== #
# 3. Event Generation #
# ====================== #
def generate_events(n_events=5000):
    events = []
    event_types = list(event_type_weights.keys())
    
    for _ in range(n_events):
        # Weighted event type selection
        event_type = np.random.choice(event_types, p=list(event_type_weights.values()))
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        # Temporal patterns
        start_time = generate_time_based_on_type(event_type)
        
        events.append({
            'event_id': generic.person.identifier(mask='@@###@'),
            'title': f"{fake.catch_phrase()} {event_type}",
            'event_type': event_type,
            'primary_category': event_type.split()[0],
            'event_lat': lat,
            'event_lon': lon,
            'event_city': city,
            'start_time': start_time,
            'duration': np.random.choice([120, 180, 240], p=[0.4, 0.4, 0.2]),
            'expected_attendance': int(np.random.lognormal(mean=6, sigma=0.5))
        })
    return pd.DataFrame(events)

# ====================== #
# 4. Interaction Generation #
# ====================== #
def generate_interactions(users, events, n_interactions=100000):
    interactions = []
    hard_negatives = []
    
    # Create popularity baseline
    event_popularity = events['expected_attendance'].rank(pct=True)
    
    # Generate positive interactions
    for _ in range(n_interactions):
        user, event = sample_pair(users, events)
        
        # Calculate match score using theoretical framework
        match_score = calculate_match_score(user, event, event_popularity)
        
        if np.random.rand() < match_score:
            interactions.append(create_interaction(user, event, 'positive'))
            
    # Generate hard negatives (HNS theory from paper [3])
    for _ in range(n_interactions//10):
        user, event = sample_pair(users, events)
        
        if is_hard_negative(user, event):
            hard_negatives.append(create_interaction(user, event, 'hard_negative'))
    
    return pd.concat([pd.DataFrame(interactions), pd.DataFrame(hard_negatives)])

# ====================== #
# 5. Core Matching Logic #
# ====================== #
def calculate_match_score(user, event, popularity):
    # Interest alignment (paper [5] correlation preservation)
    interest_score = len(set(get_user_interests(user)) & 
                        set(interest_event_map.get(event['primary_category'], [])))
    
    # Geographic proximity
    distance = geodesic((user['user_lat'], user['user_lon']),
                       (event['event_lat'], event['event_lon'])).km
    geo_score = np.exp(-distance/15)
    
    # Temporal relevance
    time_score = 1.2 if event['start_time'].weekday() >= 5 else 0.8
    
    # Popularity bias (paper [3] HNS theory)
    popularity_score = popularity[event.name]
    
    # Demographic alignment
    demo_score = 1.0
    if user['age_group'] == 'young' and event['event_type'] in ['Music & Concerts', 'Festivals']:
        demo_score *= 1.3
    elif user['age_group'] == 'senior' and event['event_type'] in ['Book Fairs']:
        demo_score *= 1.4
        
    # Combine scores using learned weights
    return sigmoid(0.4*interest_score + 0.3*geo_score + 0.2*time_score + 0.1*popularity_score)

# ====================== #
# 6. Hard Negative Sampling #
# ====================== #
def is_hard_negative(user, event):
    # Partial match criteria (paper [3] theory)
    user_interests = get_user_interests(user)
    event_categories = interest_event_map.get(event['primary_category'], [])
    
    return (len(set(user_interests) & set(event_categories)) > 0) and \
           (event['event_id'] not in user['interacted_events'])

# ====================== #
# 7. Helper Functions #
# ====================== #
def get_user_interests(user):
    return [user['core_interest']] + user['secondary_interests'].split(',')

def generate_time_based_on_type(event_type):
    base_date = datetime(2025, 1, 1)
    if 'Music' in event_type:
        return fake.date_time_between(start_date=base_date, end_date=base_date+timedelta(days=180))
    elif 'Conference' in event_type:
        return fake.date_time_between(start_date=base_date+timedelta(days=180), end_date=base_date+timedelta(days=360))
    else:
        return fake.date_time_between(start_date=base_date, end_date=base_date+timedelta(days=365))

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# ====================== #
# 8. Execution #
# ====================== #
if __name__ == "__main__":
    users_df = generate_users()
    events_df = generate_events()
    interactions_df = generate_interactions(users_df, events_df)
    
    # # Save datasets with correlation preservation (paper [5])
    # users_df.to_csv('synthetic_users.csv', index=False)
    # events_df.to_csv('synthetic_events.csv', index=False)
    # interactions_df.to_csv('synthetic_interactions.csv', index=False)


ValueError: Sample larger than population or is negative

In [61]:
import numpy as np
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta
from geopy.distance import geodesic
from scipy.stats import skewnorm
from mimesis import Generic
import re

# Initialize Faker and set seed for reproducibility
fake = Faker()
generic = Generic('en')
np.random.seed(42)
random.seed(42)

# ====================== #
# Global City Configurations #
# ====================== #
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 
          'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
city_probs = [0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05]
city_coords = {
    'New York': (40.7128, -74.0060),
    'London': (51.5074, -0.1278),
    'Paris': (48.8566, 2.3522),
    'Tokyo': (35.6762, 139.6503),
    'Sydney': (-33.8688, 151.2093),
    'Berlin': (52.5200, 13.4050),
    'Mumbai': (19.0760, 72.8777),
    'São Paulo': (-23.5505, -46.6333),
    'Toronto': (43.6532, -79.3832),
    'Dubai': (25.2048, 55.2708)
}

# ====================== #
# 1. Location Generation #
# ====================== #
def generate_location(city):
    base_lat, base_lon = city_coords[city]
    if random.random() < 0.8:  # 80% nearby, 20% further out
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
    else:
        lat = base_lat + np.random.uniform(-2, 2)
        lon = base_lon + np.random.uniform(-2, 2)
    return lat, lon

# ====================== #
# 2. User Generation #
# ====================== #
def generate_users(n_users=20000):
    users = []
    all_interests = ['music', 'sports', 'tech', 'food', 'art', 
                    'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        age = max(18, min(100, int(skewnorm.rvs(5, loc=25, scale=15))))
        
        # Gender-aware preferences
        gender = generic.person.gender()
        age_group = 'young' if age < 30 else 'middle' if age < 50 else 'senior'
        
        # Generate core interest + 1-3 additional interests
        core_interest = random.choice(all_interests)
        k = random.randint(1, 3)
        additional_interests = random.sample(
            [i for i in all_interests if i != core_interest], 
            k=min(k, len(all_interests)-1)
        )
        user_interests = list(set([core_interest] + additional_interests))
        
        users.append({
            'user_id': generic.person.identifier(mask='@@###@'),
            'gender': gender,
            'age_group': age_group,
            'user_lat': lat,
            'user_lon': lon,
            'user_city': city,
            'user_weather_preference': np.random.choice(
                ['indoor', 'outdoor', 'any'], 
                p=[0.2, 0.3, 0.5]
            ),
            'age': age,
            'core_interest': core_interest,
            'secondary_interests': ','.join(user_interests[1:]),
            'social_connectedness': np.random.poisson(lam=15)
        })
    return pd.DataFrame(users)

# ====================== #
# 3. Event Generation #
# ====================== #
interest_event_map = {
    'music': ['Music & Concerts', 'Festivals'],
    'sports': ['Sports Competitions', 'Fitness Events'],
    'tech': ['Tech Conferences', 'Startup Events'],
    'food': ['Food Festivals', 'Culinary Workshops'],
    'art': ['Art Exhibitions', 'Museum Events'],
    'literature': ['Book Fairs', 'Writing Workshops'],
    'cinema': ['Film Festivals', 'Movie Premieres'],
    'travel': ['Travel Expos', 'Adventure Tours'],
    'fitness': ['Marathons', 'Yoga Retreats'],
    'fashion': ['Fashion Shows', 'Designer Events']
}

event_type_weights = list(interest_event_map.values())
event_type_weights = [item for sublist in event_type_weights for item in sublist]
event_type_weights = {
    et: 0.5/len(event_type_weights) for et in event_type_weights
}  # Equal weights for simplicity

def generate_events(n_events=5000):
    events = []
    event_types = list(event_type_weights.keys())
    
    for _ in range(n_events):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        event_type = np.random.choice(event_types)
        events.append({
            'event_id': generic.person.identifier(mask='@@###@'),
            'title': f"{fake.catch_phrase()} {event_type} in {city}",
            'event_type': random.choice(event_types),
            'event_lat': lat,
            'event_lon': lon,
            'event_city': city,
            'start_time': fake.date_time_between(
                start_date=datetime(2025, 1, 1), 
                end_date=datetime(2025, 12, 31)
            ),
            'duration': random.choice([60, 90, 120]),
            'expected_attendance': int(np.random.lognormal(mean=6, sigma=0.5))
        })
    return pd.DataFrame(events)

# ====================== #
# 4. Interaction Generation #
# ====================== #
def generate_interactions(users, events, n_interactions=100000):
    interactions = []
    
    # Create popularity baseline
    event_popularity = events['expected_attendance'].rank(pct=True)
    
    for _ in range(n_interactions):
        user = users.sample(1).iloc[0]
        event = events.sample(1).iloc[0]
        
        # Calculate geographic distance
        distance = geodesic(
            (user['user_lat'], user['user_lon']),
            (event['event_lat'], event['event_lon'])
        ).km
        
        # Calculate interest alignment
        user_interests = [user['core_interest']] + user['secondary_interests'].split(',')
        event_categories = interest_event_map.get(
            re.sub(r'\s+', ' ', event['event_type'].split()[0].lower()), []
        )
        interest_match = len(set(user_interests) & set(event_categories))
        
        # Generate interaction probability
        interaction_prob = (
            0.4 * interest_match +
            0.3 * np.exp(-distance/50) +  # 50km distance decay
            0.2 * event_popularity[event.name] +
            0.1 * random.random()
        )
        
        if random.random() < interaction_prob:
            interactions.append({
                'interaction_id': generic.person.identifier(mask='@@###@'),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': random.choices(
                    ['view', 'save', 'attend'],
                    weights=[0.7, 0.2, 0.1]
                )[0],
                'distance_to_event': distance
            })
    
    return pd.DataFrame(interactions)

# ====================== #
# 5. Execution #
# ====================== #
if __name__ == "__main__":
    users_df = generate_users(5000)
    events_df = generate_events(2000)
    interactions_df = generate_interactions(users_df, events_df, 10000)
    
    # # Save datasets
    # users_df.to_csv('synthetic_users.csv', index=False)
    # events_df.to_csv('synthetic_events.csv', index=False)
    # interactions_df.to_csv('synthetic_interactions.csv', index=False)


In [18]:
users_df.head()

Unnamed: 0,user_id,gender,age_group,user_lat,user_lon,user_city,user_weather_preference,age,core_interest,secondary_interests,social_connectedness
0,OM335X,Male,middle,48.946743,2.398599,Paris,indoor,40,music,"art,literature,music",16
1,ID403V,Female,middle,40.806782,-73.939511,New York,indoor,35,sports,"fashion,tech,sports",13
2,HG972A,Other,middle,48.814846,2.374571,Paris,outdoor,33,music,music,14
3,BF417Q,Other,young,40.715647,-73.987517,New York,indoor,25,fashion,fashion,13
4,GV450O,Female,middle,52.50803,13.329408,Berlin,any,32,fitness,"food,travel",12


In [19]:
users_df["user_id"].nunique()

4999

In [39]:
events_df.head()

Unnamed: 0,event_id,title,event_type,event_lat,event_lon,event_city,start_time,duration,weather_condition,temperature,attendance_rate,event_indoor_capability
0,RY823V,Innovative incremental circuit Seasonal & Fest...,Seasonal & Festivals,38.986291,-75.106163,New York,2025-04-08 13:10:17.349674,120,Clear,18.4,20.656472,False
1,CN297A,Re-engineered explicit knowledge user Seasonal...,Seasonal & Festivals,-23.6175,-46.709916,São Paulo,2025-07-15 09:25:44.908898,240,Cloudy,24.3,14.775524,False
2,VV502T,Profit-focused context-sensitive infrastructur...,Sports & Fitness,37.317136,138.85086,Tokyo,2025-07-02 19:11:25.074699,180,Clear,22.8,40.198146,False
3,QB605T,Secured empowering success Education & Learnin...,Education & Learning,51.414209,-0.110879,London,2025-05-18 18:20:35.709724,180,Cloudy,13.5,21.595591,True
4,XN991I,Centralized bandwidth-monitored initiative Art...,Arts & Culture,20.370323,74.818798,Mumbai,2025-04-12 10:27:30.465703,120,Clear,32.5,30.103833,True


In [38]:
events_df["title"][1:5].values

array(['Re-engineered explicit knowledge user Seasonal & Festivals in São Paulo',
       'Profit-focused context-sensitive infrastructure Sports & Fitness in Tokyo',
       'Secured empowering success Education & Learning in London',
       'Centralized bandwidth-monitored initiative Arts & Culture in Mumbai'],
      dtype=object)

In [22]:
interactions_df.head()


Unnamed: 0,interaction_id,user_id,event_id,interaction_type,distance_to_event
0,QP412E,HF620X,JV517H,view,658.259646
1,AW641F,TI615N,XE760J,save,10.173327
2,HQ068A,LC341M,NT411W,save,6293.82367
3,UN837C,AI471V,KW121S,view,339.571753
4,SJ872I,EF057Z,LO190F,view,5690.197951


In [62]:
interactions_df[["user_id","event_id"]].groupby("user_id").value_counts()

user_id  event_id
AA384G   VJ390I      1
AA420W   UW503I      1
AA485N   QE637J      1
AA558Q   TT639R      1
AA888P   OI625M      1
                    ..
ZZ109E   JB399Y      1
ZZ603L   GC412F      1
ZZ604X   EZ480V      1
ZZ644X   CI397P      1
ZZ902E   SC888L      1
Name: count, Length: 1682, dtype: int64

In [24]:
import numpy as np
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta
from geopy.distance import geodesic
from scipy.stats import skewnorm, dirichlet
from mimesis import Generic
import hopsworks
import re

# Initialize Faker and set seed for reproducibility
fake = Faker()
generic = Generic('en')

np.random.seed(42)
random.seed(42)

# City coordinates and probabilities
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
city_probs = [0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05]
city_coords = {
    'New York': (40.7128, -74.0060), 'London': (51.5074, -0.1278), 'Paris': (48.8566, 2.3522),
    'Tokyo': (35.6762, 139.6503), 'Sydney': (-33.8688, 151.2093), 'Berlin': (52.5200, 13.4050),
    'Mumbai': (19.0760, 72.8777), 'São Paulo': (-23.5505, -46.6333), 'Toronto': (43.6532, -79.3832),
    'Dubai': (25.2048, 55.2708)
}

#generic = Person()

def generate_location(city):
    base_lat, base_lon = city_coords[city]
    if random.random() < 0.8:
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
    else:
        lat = base_lat + np.random.uniform(-2, 2)
        lon = base_lon + np.random.uniform(-2, 2)
    return lat, lon

def generate_users(n_users=20000):
    users = []
    interests = ['music', 'sports', 'tech', 'food', 'art', 'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        age = max(18, min(100, int(skewnorm.rvs(5, loc=25, scale=15))))
        weather_probs = dirichlet.rvs([0.3, 0.5, 0.2])[0]
        
        # Ensure at least one interest is selected
        user_interests = random.sample(interests, k=random.randint(1, min(4, len(interests))))
        
        users.append({
            'user_id': generic.person.identifier(mask='@@###@'),
            'user_lat': lat,
            'user_lon': lon,
            'user_city': city,
            'user_weather_preference': np.random.choice(['indoor', 'outdoor', 'any'], p=weather_probs),
            'age': age,
            'user_interests': ','.join(user_interests),  # Join interests into a comma-separated string
            'signup_date': fake.date_time_between(start_date='-2y', end_date='now'),
            'social_connectedness': np.random.poisson(lam=15)
        })
    return pd.DataFrame(users)


def generate_events(n_events=5000):
    events = []
    event_types = [
        'Education & Learning', 'Technology', 'Seasonal & Festivals', 'Arts & Culture', 
        'Entertainment', 'Sports & Fitness', 'Business & Networking', 'Health & Wellness', 
        'Music & Concerts', 'Food & Drink', 'Community & Causes', 'Immersive Experiences'
    ]
    weather_conditions = ['Clear', 'Rain', 'Snow', 'Cloudy', 'Windy']
    weather_probs = [0.5, 0.2, 0.05, 0.2, 0.05]
    
    current_date = datetime(2025, 3, 27, 11, 48)  # Current date and time
    
    for _ in range(n_events):
        event_type = np.random.choice(event_types)
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        
        if event_type in ['Sports & Fitness', 'Seasonal & Festivals']:
            weather_condition = 'Clear' if random.random() < 0.8 else np.random.choice(['Rain', 'Cloudy'])
        elif event_type in ['Education & Learning', 'Technology', 'Business & Networking']:
            weather_condition = np.random.choice(['Clear', 'Cloudy'])
        else:
            weather_condition = np.random.choice(weather_conditions, p=weather_probs)
        
        base_temp = {
            'New York': 15, 'London': 12, 'Paris': 16, 'Tokyo': 20, 
            'Sydney': 22, 'Berlin': 14, 'Mumbai': 28, 'São Paulo': 24, 
            'Toronto': 10, 'Dubai': 32
        }[city]
        
        temp_adjustment = {
            'Clear': np.random.uniform(2, 5),
            'Rain': np.random.uniform(-3, 0),
            'Snow': np.random.uniform(-8, -3),
            'Cloudy': np.random.uniform(-1, 2),
            'Windy': np.random.uniform(-2, 1)
        }[weather_condition]
        
        temperature = round(base_temp + temp_adjustment, 1)
        
        start_time = fake.date_time_between(start_date=current_date, end_date=current_date + timedelta(days=180))
        is_weekend = start_time.weekday() >= 5
        hour_choices = [10, 14, 18] if is_weekend else [9, 13, 18, 19]
        start_time = start_time.replace(hour=np.random.choice(hour_choices))
        
        events.append({
            'event_id': generic.person.identifier(mask='@@###@'),
            'title': f"{fake.catch_phrase()} {event_type} in {city}",
            'event_type': event_type,
            'event_lat': lat,
            'event_lon': lon,
            'event_city': city,
            'start_time': start_time,
            'duration': np.random.choice([120, 180, 240, 360, 480]),
            'weather_condition': weather_condition,
            'temperature': temperature,
            'attendance_rate': np.random.beta(a=2, b=5) * 100,
            'event_indoor_capability': event_type in ['Education & Learning', 'Technology', 'Business & Networking', 
                                               'Arts & Culture', 'Entertainment', 'Immersive Experiences']
        })
    return pd.DataFrame(events)

def calculate_time_weight(interaction_time, current_time, half_life=30):
    time_diff = (current_time - interaction_time).days
    return np.exp(np.log(0.5) * time_diff / half_life)

def generate_interactions(users, events, n_interactions=100000):
    interactions = []
    interaction_types = ['maybe', 'invited & maybe', 'no', 'yes', 'invited & yes', 'invited & no', 'invited']
    attempts = n_interactions * 5
    current_time = datetime(2025, 3, 27, 11, 48)
    
    for _ in range(attempts):
        if len(interactions) >= n_interactions:
            break
            
        user = users.sample(1).iloc[0]
        event = events.sample(1).iloc[0]
        
        distance = geodesic((user['user_lat'], user['user_lon']), 
                           (event['event_lat'], event['event_lon'])).km
        
        distance_score = np.exp(-distance/10)
        weather_score = 1.2 if (event['weather_condition'] == 'Clear' and 
                               user['user_weather_preference'] in ['outdoor', 'any']) else 0.5
        social_score = np.log1p(user['social_connectedness']) / 10
        
        interaction_prob = 0.85*distance_score + 0.1*weather_score + 0.05*social_score

        max_distance = 50 if random.random() < 0.7 else 300
        
        if distance < max_distance and (random.random() < interaction_prob):
            interaction_time = fake.date_time_between(
                start_date=event['start_time'] - timedelta(days=30), 
                end_date=event['start_time']
            )
            time_weight = calculate_time_weight(interaction_time, current_time)
            interaction_prob *= time_weight
            
            if distance <= 5:
                interaction_type_probs = [0.15, 0.20, 0.05, 0.25, 0.20, 0.05, 0.10]
            elif distance <= 20:
                interaction_type_probs = [0.20, 0.15, 0.10, 0.20, 0.15, 0.10, 0.10]
            elif distance <= 50:
                interaction_type_probs = [0.25, 0.10, 0.15, 0.15, 0.10, 0.15, 0.10]
            elif distance <= 100:
                interaction_type_probs = [0.20, 0.05, 0.25, 0.10, 0.05, 0.20, 0.15]
            else:
                interaction_type_probs = [0.15, 0.05, 0.30, 0.05, 0.05, 0.25, 0.15]
                
            interactions.append({
                'interaction_id': generic.person.identifier(mask='@@###@'),
                'user_id': user['user_id'],
                'event_id': event['event_id'],
                'interaction_type': np.random.choice(interaction_types, p=interaction_type_probs),
                'distance_to_event': distance,
                "interaction_label": interactions_df['interaction_type'].apply(
                    lambda x: 1 if x in ['maybe', 'invited & maybe', 'yes', 'invited & yes'] else 0
                )
                            })
    
    return pd.DataFrame(interactions)

In [25]:
#Generate data
print("Generating user data...")
users_df = generate_users(10000)

print("Generating event data...")
events_df = generate_events(10000)

print("Generating interaction data...")
interactions_df = generate_interactions(users_df, events_df, 100000)

Generating user data...
Generating event data...
Generating interaction data...


In [29]:
events_per_user = interactions_df.groupby("user_id").size()
max(events_per_user)

13

In [63]:
events_per_user = interactions_df.groupby("user_id").size()
max(events_per_user)

for i in range(1, 14):
    print(f"{events_per_user[events_per_user == i].size} attended {i} events")

1205 attended 1 events
206 attended 2 events
19 attended 3 events
2 attended 4 events
0 attended 5 events
0 attended 6 events
0 attended 7 events
0 attended 8 events
0 attended 9 events
0 attended 10 events
0 attended 11 events
0 attended 12 events
0 attended 13 events


In [45]:
interactions_df["interaction_label"] = interactions_df['interaction_type'].apply(
    lambda x: 1 if x in ['maybe', 'invited & maybe', 'yes', 'invited & yes'] else 0
)

In [48]:
interactions_df["interaction_label"].value_counts()

interaction_label
1    12497
0     5081
Name: count, dtype: int64

In [56]:
import numpy as np
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta
from geopy.distance import geodesic
from scipy.stats import skewnorm, dirichlet
from mimesis import Generic
import re

# Initialize Faker and set seed for reproducibility
fake = Faker()
generic = Generic('en')
np.random.seed(42)
random.seed(42)

# City configurations
cities = ['New York', 'London', 'Paris', 'Tokyo', 'Sydney', 
          'Berlin', 'Mumbai', 'São Paulo', 'Toronto', 'Dubai']
city_probs = [0.2, 0.15, 0.1, 0.1, 0.05, 0.1, 0.1, 0.05, 0.1, 0.05]
city_coords = {
    k: (float(v[0]), float(v[1])) for k,v in {
        'New York': (40.7128, -74.0060), 'London': (51.5074, -0.1278),
        'Paris': (48.8566, 2.3522), 'Tokyo': (35.6762, 139.6503),
        'Sydney': (-33.8688, 151.2093), 'Berlin': (52.5200, 13.4050),
        'Mumbai': (19.0760, 72.8777), 'São Paulo': (-23.5505, -46.6333),
        'Toronto': (43.6532, -79.3832), 'Dubai': (25.2048, 55.2708)
    }.items()
}

def generate_location(city):
    base_lat, base_lon = city_coords[city]
    if random.random() < 0.8:
        lat = base_lat + np.random.uniform(-0.1, 0.1)
        lon = base_lon + np.random.uniform(-0.1, 0.1)
    else:
        lat = base_lat + np.random.uniform(-2, 2)
        lon = base_lon + np.random.uniform(-2, 2)
    return lat, lon

def generate_users(n_users=20000):
    users = []
    interests = ['music', 'sports', 'tech', 'food', 'art', 
                'literature', 'cinema', 'travel', 'fitness', 'fashion']
    
    for _ in range(n_users):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        age = max(18, min(100, int(skewnorm.rvs(5, loc=25, scale=15))))
        weather_probs = dirichlet.rvs([0.3, 0.5, 0.2])[0]
        
        # Generate interests with power-law distribution
        num_interests = min(np.random.zipf(1.2), 5)
        user_interests = random.sample(interests, k=num_interests)
        user_interests = list(set(user_interests))[:4]  # Max 4 unique interests
        
        users.append({
            'user_id': generic.person.identifier(mask='@@###@'),
            'user_lat': lat,
            'user_lon': lon,
            'user_city': city,
            'user_weather_preference': np.random.choice(['indoor', 'outdoor', 'any'], p=weather_probs),
            'age': age,
            'user_interests': ','.join(user_interests),
            'signup_date': fake.date_time_between(start_date='-2y', end_date='now'),
            'social_connectedness': np.random.poisson(lam=15)
        })
    return pd.DataFrame(users)

def generate_events(n_events=5000):
    events = []
    event_types = [
        'Education & Learning', 'Technology', 'Seasonal & Festivals', 
        'Arts & Culture', 'Entertainment', 'Sports & Fitness', 
        'Business & Networking', 'Health & Wellness', 'Music & Concerts', 
        'Food & Drink', 'Community & Causes', 'Immersive Experiences'
    ]
    current_date = datetime(2025, 3, 27, 11, 48)
    
    for _ in range(n_events):
        city = np.random.choice(cities, p=city_probs)
        lat, lon = generate_location(city)
        event_type = random.choice(event_types)
        start_time = fake.date_time_between(
            start_date=current_date, 
            end_date=current_date + timedelta(days=180)
        )
        
        # Weather and temperature logic
        if event_type in ['Sports & Fitness', 'Seasonal & Festivals']:
            weather_condition = 'Clear' if random.random() < 0.8 else random.choice(['Rain', 'Cloudy'])
        else:
            weather_condition = random.choice(['Clear', 'Cloudy', 'Rain', 'Snow', 'Windy'])
        
        base_temp = {
            'New York': 15, 'London': 12, 'Paris': 16, 'Tokyo': 20, 
            'Sydney': 22, 'Berlin': 14, 'Mumbai': 28, 'São Paulo': 24, 
            'Toronto': 10, 'Dubai': 32
        }[city]
        
        temp_adjustment = {
            'Clear': np.random.uniform(2, 5),
            'Rain': np.random.uniform(-3, 0),
            'Snow': np.random.uniform(-8, -3),
            'Cloudy': np.random.uniform(-1, 2),
            'Windy': np.random.uniform(-2, 1)
        }[weather_condition]
        
        events.append({
            'event_id': generic.person.identifier(mask='@@###@'),
            'title': f"{fake.catch_phrase()} {event_type} in {city}",
            'event_type': event_type,
            'event_lat': lat,
            'event_lon': lon,
            'event_city': city,
            'start_time': start_time,
            'duration': np.random.choice([120, 180, 240, 360, 480]),
            'weather_condition': weather_condition,
            'temperature': round(base_temp + temp_adjustment, 1),
            'attendance_rate': np.random.beta(a=2, b=5) * 100,
            'event_indoor_capability': event_type in ['Education & Learning', 'Technology', 
                                                    'Business & Networking', 'Arts & Culture',
                                                    'Entertainment', 'Immersive Experiences']
        })
    return pd.DataFrame(events)

def generate_interactions(users, events, n_interactions=100000):
    interactions = []
    zipf_param = 1.8  # More skewed distribution
    max_events_per_user = 25
    current_time = datetime(2025, 3, 27, 11, 48)
    
    def safe_sample(df, n):
        return df.sample(n, replace=True) if len(df) > 0 else pd.DataFrame()

    # Phase 1: Core interactions with power-law distribution
    for user in users.itertuples():
        user_interests = set(user.user_interests.split(','))
        base_count = np.random.zipf(zipf_param)
        events_to_attend = min(base_count, max_events_per_user)
        
        # Get interest-matching events with case-insensitive check
        candidate_events = events[
            events['event_type'].apply(
                lambda x: any(i.lower() in x.lower() for i in user_interests)
            )
        ]
        
        if len(candidate_events) == 0:
            continue  # Skip users with no matching events
            
        candidate_sample = safe_sample(candidate_events, 50)
        if len(candidate_sample) == 0:
            continue

        for event in candidate_sample.itertuples():
            if events_to_attend <= 0:
                break
            
            distance = geodesic(
                (user.user_lat, user.user_lon),
                (event.event_lat, event.event_lon)
            ).km
            
            if distance < 50 and random.random() < 0.7:
                interactions.append({
                    'user_id': user.user_id,
                    'event_id': event.event_id,
                    'distance_to_event': distance,
                    'event_type_in_user_interests': 1,
                    'interaction_label': 1
                })
                events_to_attend -= 1

    # Phase 2: Power users with many interactions
    power_users = users.nlargest(500, 'social_connectedness')
    for user in power_users.itertuples():
        additional_events = min(np.random.zipf(zipf_param*1.5), max_events_per_user)
        candidate_events = events.sample(100)
        
        for event in candidate_events.itertuples():
            if additional_events <= 0:
                break
            
            distance = geodesic(
                (user.user_lat, user.user_lon),
                (event.event_lat, event.event_lon)
            ).km
            
            if distance < 100 and random.random() < 0.8:
                interactions.append({
                    'user_id': user.user_id,
                    'event_id': event.event_id,
                    'distance_to_event': distance,
                    'event_type_in_user_interests': int(
                        any(i in event.event_type for i in user.user_interests.split(','))
                    ),
                    'interaction_label': 1
                })
                additional_events -= 1

    # Phase 3: Fill remaining interactions
    while len(interactions) < n_interactions:
        user = users.sample(1).iloc[0]
        event = events.sample(1).iloc[0]
        distance = geodesic(
            (user['user_lat'], user['user_lon']),
            (event['event_lat'], event['event_lon'])
        ).km
        
        interactions.append({
            'user_id': user['user_id'],
            'event_id': event['event_id'],
            'distance_to_event': distance,
            'event_type_in_user_interests': int(
                any(i in event['event_type'] for i in user['user_interests'].split(','))
            ),
            'interaction_label': 1 if random.random() < 0.3 else 0
        })

    return pd.DataFrame(interactions[:n_interactions])

# Generate datasets
# users_df = generate_users(10000)
# events_df = generate_events(5000)
# interactions_df = generate_interactions(users_df, events_df, 150000)

# # Save to CSV
# users_df.to_csv('synthetic_users.csv', index=False)
# events_df.to_csv('synthetic_events.csv', index=False)
# interactions_df.to_csv('synthetic_interactions.csv', index=False)

    
    # # Save datasets
    # users_df.to_csv('synthetic_users.csv', index=False)
    # events_df.to_csv('synthetic_events.csv', index=False)
    # interactions_df.to_csv('synthetic_interactions.csv', index=False)



# Generate datasets
print("Generating user data...")
users_df = generate_users(10000)
print("Generating event data...")
events_df = generate_events(5000)
print("Generating interactions data...")
interactions_df = generate_interactions(users_df, events_df, 150000)

# # Save to CSV
# users_df.to_csv('users.csv', index=False)
# events_df.to_csv('events.csv', index=False)
# interactions_df.to_csv('interactions.csv', index=False)


Generating user data...
Generating event data...
Generating interactions data...


In [60]:
events_per_user = interactions_df.groupby("user_id").size()
max(events_per_user)

for i in range(1, 14):
    print(f"{events_per_user[events_per_user == i].size} attended {i} events")

0 attended 1 events
1 attended 2 events
1 attended 3 events
9 attended 4 events
17 attended 5 events
60 attended 6 events
115 attended 7 events
210 attended 8 events
346 attended 9 events
534 attended 10 events
649 attended 11 events
822 attended 12 events
875 attended 13 events


In [58]:
interactions_df["interaction_label"].value_counts()

interaction_label
0    95645
1    54355
Name: count, dtype: int64