In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import os

# Configuration
# Take a single timestamp so both bounds derive from the same instant and the
# simulated window is exactly 365 days long (two separate now() calls drift).
END_DATE = datetime.now()
START_DATE = END_DATE - timedelta(days=365)
MIN_ACTIVITIES = 15  # Minimum for well-being days eligibility

# Catalogue of simulated sports and the parameters used to draw an activity.
#   has_distance   -> whether a distance is generated for this sport
#   distance_range -> (min, max) clamp applied to the generated distance
#   speed_range    -> (min, max) speed multiplied by the duration to get a distance
#   duration_range -> (min, max) activity length in seconds
# NOTE(review): distances look like metres and speeds like m/s -- confirm.
SPORT_TYPES = {
    'Course à pied': {
        'has_distance': True,
        'distance_range': (2000, 25000),
        'speed_range': (2.5, 4.5),
        'duration_range': (1200, 7200),
    },
    'Vélo': {
        'has_distance': True,
        'distance_range': (5000, 50000),
        'speed_range': (4.0, 10.0),
        'duration_range': (1800, 10800),
    },
    'Marche': {
        'has_distance': True,
        'distance_range': (1000, 15000),
        'speed_range': (1.0, 1.8),
        'duration_range': (1800, 5400),
    },
    'Randonnée': {
        'has_distance': True,
        'distance_range': (3000, 30000),
        'speed_range': (0.8, 1.5),
        'duration_range': (3600, 14400),
    },
    'Trottinette': {
        'has_distance': True,
        'distance_range': (2000, 20000),
        'speed_range': (3.0, 6.0),
        'duration_range': (900, 3600),
    },
    'Natation': {
        'has_distance': True,
        'distance_range': (500, 3000),
        'speed_range': (0.5, 1.5),
        'duration_range': (1800, 5400),
    },
    # Sports without a distance only carry a duration range.
    'Escalade': {
        'has_distance': False,
        'duration_range': (1800, 7200),
    },
    'Yoga': {
        'has_distance': False,
        'duration_range': (1800, 5400),
    },
    'Musculation': {
        'has_distance': False,
        'duration_range': (1800, 7200),
    },
}

def load_employee_data(hr_file_path):
    """Read the HR Excel workbook and return its contents as a DataFrame.

    Args:
        hr_file_path: Path of the Excel file, passed to ``pd.read_excel``.

    Returns:
        The DataFrame produced by ``pd.read_excel``.

    Raises:
        Exception: Any error raised while loading is reported on stdout and
            then re-raised unchanged.
    """
    try:
        employees = pd.read_excel(hr_file_path)
        employee_count = len(employees)
        print(f"Loaded HR data with {employee_count} employees")
    except Exception as err:
        # Report the failure for the console user, then let it propagate.
        print(f"Error loading HR file: {err}")
        raise
    return employees

# Optional free-text comments per sport; sports without an entry fall back to
# the default list. Built once instead of on every generate_activity() call.
_ACTIVITY_COMMENTS = {
    'Course à pied': ["Entraînement matinal", "Préparation course", "Endurance fondamentale"],
    'Vélo': ["Sortie en groupe", "Entraînement intensif", "Parcours vallonné"],
    'Randonnée': ["Belle vue aujourd'hui", "Randonnée en montagne", "Découverte d'un nouveau sentier"],
    'Natation': ["Séance de crawl", "Nage en eau libre", "Exercices techniques"],
    'Yoga': ["Séance détente", "Yoga matinal", "Session de méditation"],
}
_DEFAULT_ACTIVITY_COMMENTS = ["Bonne séance d'entraînement"]


def generate_activity(employee_id):
    """Generate a single random activity record for one employee.

    The sport is picked uniformly from ``SPORT_TYPES``. The duration is drawn
    from the sport's ``duration_range`` (seconds) and the start time is then
    chosen so that the whole activity fits between ``START_DATE`` and
    ``END_DATE``; an activity can therefore never end after ``END_DATE``.

    Args:
        employee_id: Identifier stored as-is in the ``ID_salarié`` field.

    Returns:
        dict with keys ``ID_salarié``, ``Date_de_début``, ``Type``,
        ``Distance`` (``None`` for sports without distance), ``Date_de_fin``
        and ``Commentaire`` (``None`` about 70% of the time).
    """
    sport_type = random.choice(list(SPORT_TYPES))
    sport_params = SPORT_TYPES[sport_type]

    # Draw the duration first so the start time can be bounded accordingly.
    duration = random.randint(*sport_params['duration_range'])

    # Latest admissible start offset keeps start + duration <= END_DATE.
    # max(0, ...) guards against a window shorter than the activity itself.
    window_seconds = int((END_DATE - START_DATE).total_seconds())
    latest_offset = max(0, window_seconds - duration)
    start_time = START_DATE + timedelta(seconds=random.randint(0, latest_offset))
    end_time = start_time + timedelta(seconds=duration)

    # Distance = speed * duration, clamped to the sport's plausible bounds.
    distance = None
    if sport_params['has_distance']:
        speed = random.uniform(*sport_params['speed_range'])
        min_distance, max_distance = sport_params['distance_range']
        distance = max(min_distance, min(int(speed * duration), max_distance))

    # Attach a comment in roughly 30% of the activities.
    comment = None
    if random.random() < 0.3:
        comment = random.choice(
            _ACTIVITY_COMMENTS.get(sport_type, _DEFAULT_ACTIVITY_COMMENTS)
        )

    return {
        'ID_salarié': employee_id,
        'Date_de_début': start_time,
        'Type': sport_type,
        'Distance': distance,
        'Date_de_fin': end_time,
        'Commentaire': comment
    }

def generate_employee_activities(employee_id, hr_row=None):
    """Build the list of simulated activities for a single employee.

    The number of activities is drawn at random; employees whose
    ``'Moyen de déplacement'`` value is an active transport mode get a higher
    range than the others.

    Args:
        employee_id: Identifier forwarded to ``generate_activity``.
        hr_row: Optional mapping/Series for the employee; when given it must
            expose the ``'Moyen de déplacement'`` key.

    Returns:
        list of activity dicts as produced by ``generate_activity``.
    """
    # NOTE(review): membership below is an exact string match -- confirm the
    # HR column really uses these labels, otherwise nobody counts as "active".
    active_modes = ('Vélo', 'Marche', 'Trottinette')

    transport_mode = None if hr_row is None else hr_row['Moyen de déplacement']

    if transport_mode in active_modes:
        # Active commuters: noticeably more activities.
        lower, upper = MIN_ACTIVITIES + 10, MIN_ACTIVITIES * 3
    else:
        # Everyone else: standard activity level, never below zero.
        lower, upper = max(0, MIN_ACTIVITIES - 5), MIN_ACTIVITIES * 2
    activity_count = random.randint(lower, upper)

    activities = []
    for _ in range(activity_count):
        activities.append(generate_activity(employee_id))
    return activities

def generate_all_activities(hr_df):
    """Generate activities for every employee and return them as one DataFrame.

    Args:
        hr_df: DataFrame of employees. Each row must provide the
            ``'ID salarié'`` column; the whole row is also forwarded to
            ``generate_employee_activities``.

    Returns:
        DataFrame with columns ``ID``, ``ID_salarié``, ``Date_de_début``,
        ``Type``, ``Distance``, ``Date_de_fin``, ``Commentaire``. ``ID`` is a
        1-based sequential identifier. When no activity is generated (empty
        ``hr_df`` or zero activities drawn) an empty DataFrame with the same
        columns is returned instead of raising ``KeyError``.
    """
    record_columns = [
        'ID_salarié', 'Date_de_début', 'Type',
        'Distance', 'Date_de_fin', 'Commentaire',
    ]

    all_activities = []
    for _, employee in hr_df.iterrows():
        all_activities.extend(
            generate_employee_activities(employee['ID salarié'], employee)
        )

    # Passing the columns explicitly fixes the order and keeps the schema
    # intact even when the record list is empty.
    df = pd.DataFrame(all_activities, columns=record_columns)

    # Add unique ID as the first column.
    df.insert(0, 'ID', range(1, len(df) + 1))

    return df

# Main execution: load the HR workbook, simulate activities, export them to
# CSV and print a small random sample.
if __name__ == "__main__":
    # Path to HR data file
    hr_file_path = 'data/DonneesRH.xlsx'

    # Fail early with a clear message when the input workbook is missing.
    if not os.path.exists(hr_file_path):
        raise FileNotFoundError(f"HR data file not found at: {hr_file_path}")

    print(f"Loading employee data from {hr_file_path}...")
    hr_df = load_employee_data(hr_file_path)

    print("Generating synthetic Strava-like activity data...")
    activities_df = generate_all_activities(hr_df)

    # Save to CSV (utf-8-sig writes a BOM along with the accented headers)
    csv_path = 'strava_simulation.csv'
    activities_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
    print(f"\nGenerated {len(activities_df)} activities for {len(hr_df)} employees")
    print(f"Data saved to {csv_path}")

    # Sample output; cap the sample size because DataFrame.sample raises
    # ValueError when asked for more rows than exist.
    print("\nSample data:")
    sample_size = min(3, len(activities_df))
    print(activities_df.sample(sample_size).to_markdown(index=False))

Loading employee data from data/DonneesRH.xlsx...
Loaded HR data with 161 employees
Generating synthetic Strava-like activity data...

Generated 3211 activities for 161 employees
Data saved to strava_simulation.csv

Sample data:
|   ID |   ID_salarié | Date_de_début              | Type      |   Distance | Date_de_fin                | Commentaire           |
|-----:|-------------:|:---------------------------|:----------|-----------:|:---------------------------|:----------------------|
| 1661 |        45221 | 2024-09-01 20:50:40.283923 | Vélo      |      44351 | 2024-09-01 23:22:30.283923 |                       |
|  870 |        37866 | 2024-10-06 21:55:56.283923 | Vélo      |      37486 | 2024-10-06 23:37:25.283923 | Parcours vallonné     |
|   18 |        19841 | 2025-07-04 03:27:50.283923 | Randonnée |       3395 | 2025-07-04 04:32:41.283923 | Randonnée en montagne |
