In [24]:
import pandas as pd
from datetime import datetime, timedelta
import random
import os

# Configuration
# Take a single "now" snapshot and derive the window start from it, so the
# window is exactly 365 days wide (two separate datetime.now() calls differ by
# the time elapsed between them).
END_DATE = datetime.now()
START_DATE = END_DATE - timedelta(days=365)
# Seeds only the stdlib `random` module; generated timestamps still depend on
# the moment the notebook is run because the window is anchored on "now".
random.seed(42)

# Catalogue of simulated sports.
#   has_distance   -> whether generate_activity computes a distance for the sport
#   distance_range -> (min, max) clamp applied to the distance, in metres
#   speed_range    -> (min, max) speed multiplied by the duration, i.e. metres per second
#   duration_range -> (min, max) activity length, in seconds
# Key order is significant: generate_activity picks from list(SPORT_TYPES.keys()).
SPORT_TYPES = {
    'Course à pied': {
        'has_distance': True,
        'distance_range': (2000, 25000),
        'speed_range': (2.5, 4.5),
        'duration_range': (1200, 7200),
    },
    'Vélo': {
        'has_distance': True,
        'distance_range': (5000, 50000),
        'speed_range': (4.0, 10.0),
        'duration_range': (1800, 10800),
    },
    'Marche': {
        'has_distance': True,
        'distance_range': (1000, 15000),
        'speed_range': (1.0, 1.8),
        'duration_range': (1800, 5400),
    },
    'Randonnée': {
        'has_distance': True,
        'distance_range': (3000, 30000),
        'speed_range': (0.8, 1.5),
        'duration_range': (3600, 14400),
    },
    'Trottinette': {
        'has_distance': True,
        'distance_range': (2000, 20000),
        'speed_range': (3.0, 6.0),
        'duration_range': (900, 3600),
    },
    'Natation': {
        'has_distance': True,
        'distance_range': (500, 3000),
        'speed_range': (0.5, 1.5),
        'duration_range': (1800, 5400),
    },
    'Escalade': {
        'has_distance': False,
        'duration_range': (1800, 7200),
    },
    'Yoga': {
        'has_distance': False,
        'duration_range': (1800, 5400),
    },
    'Musculation': {
        'has_distance': False,
        'duration_range': (1800, 7200),
    },
}

def load_employee_data(hr_file_path):
    """Read the HR Excel workbook into a DataFrame.

    Args:
        hr_file_path: Location of the Excel file, passed to ``pd.read_excel``.

    Returns:
        The DataFrame produced by ``pd.read_excel``; its row count is printed.

    Raises:
        Exception: Any error raised while loading is reported on stdout and
            then re-raised unchanged.
    """
    try:
        employees = pd.read_excel(hr_file_path)
        employee_count = len(employees)
        print(f"Loaded HR data with {employee_count} employees")
    except Exception as load_error:
        # Report the failure for the notebook reader, but never swallow it
        print(f"Error loading HR file: {load_error}")
        raise
    return employees

# Sport-specific comment pools; sports missing here use the generic fallback.
# Defined once at module level so the table is not rebuilt on every call.
_ACTIVITY_COMMENTS = {
    'Course à pied': ["Entraînement matinal", "Préparation course", "Endurance fondamentale"],
    'Vélo': ["Sortie en groupe", "Entraînement intensif", "Parcours vallonné"],
    'Randonnée': ["Belle vue aujourd'hui", "Randonnée en montagne", "Découverte d'un nouveau sentier"],
    'Natation': ["Séance de crawl", "Nage en eau libre", "Exercices techniques"],
    'Yoga': ["Séance détente", "Yoga matinal", "Session de méditation"]
}
_DEFAULT_ACTIVITY_COMMENTS = ["Bonne séance d'entraînement"]


def generate_activity(employee_id):
    """Generate a single synthetic activity record.

    The sport is drawn uniformly from ``SPORT_TYPES``. The start time is a
    random whole-second offset inside ``[START_DATE, END_DATE]`` and the
    duration is drawn from the sport's ``duration_range`` (seconds). An
    activity that would finish after ``END_DATE`` is moved earlier so that it
    ends exactly at ``END_DATE``; its duration is kept.

    Args:
        employee_id: Identifier stored as-is in the ``ID_salarie`` field.

    Returns:
        dict with keys ``ID_salarie``, ``Date_de_debut``, ``Sport_type``,
        ``Distance_m`` (int, or None for sports without distance),
        ``Date_de_fin`` and ``Commentaire`` (str, or None about 70% of the time).
    """
    sport_type = random.choice(list(SPORT_TYPES.keys()))
    sport_params = SPORT_TYPES[sport_type]

    # Random start time inside the configured window
    window_seconds = int((END_DATE - START_DATE).total_seconds())
    start_time = START_DATE + timedelta(seconds=random.randint(0, window_seconds))

    # Duration depends on the sport
    duration = random.randint(*sport_params['duration_range'])
    elapsed = timedelta(seconds=duration)
    end_time = start_time + elapsed

    # A start close to END_DATE would otherwise produce an activity that ends
    # in the future. Slide it back instead of redrawing, so the sequence of
    # random draws is unchanged. If the window is shorter than the duration
    # the start is pinned to START_DATE and the overrun cannot be avoided.
    if end_time > END_DATE:
        start_time = max(START_DATE, END_DATE - elapsed)
        end_time = start_time + elapsed

    # Distance only for sports that track one
    distance = None
    if sport_params['has_distance']:
        speed = random.uniform(*sport_params['speed_range'])
        min_distance, max_distance = sport_params['distance_range']
        # speed * duration, clamped to the sport's allowed distance range
        distance = int(max(min_distance, min(int(speed * duration), max_distance)))

    # Comment on roughly 30% of activities
    comment = None
    if random.random() < 0.3:
        comment = random.choice(_ACTIVITY_COMMENTS.get(sport_type, _DEFAULT_ACTIVITY_COMMENTS))

    return {
        'ID_salarie': employee_id,
        'Date_de_debut': start_time,
        'Sport_type': sport_type,
        'Distance_m': distance,
        'Date_de_fin': end_time,
        'Commentaire': comment
    }

# Active commuting modes, stored case-folded for case-insensitive matching
_ACTIVE_TRANSPORT_MODES = frozenset({'vélo', 'marche', 'trottinette'})


def _uses_active_transport(transport_mode):
    """Return True if any '/'- or ','-separated part of the label is an active mode."""
    if not isinstance(transport_mode, str):
        # None, NaN or any other non-text value: treated as not active
        return False
    parts = transport_mode.replace(',', '/').split('/')
    return any(part.strip().casefold() in _ACTIVE_TRANSPORT_MODES for part in parts)


def generate_employee_activities(employee_id, hr_row=None):
    """Generate the list of activities for one employee.

    Employees whose commuting mode is an active one (Vélo, Marche,
    Trottinette) get between 5 and 50 activities; everyone else gets between
    0 and 20. The mode is matched case-insensitively, ignoring surrounding
    whitespace, against each '/'- or ','-separated part of the label, so
    both ``'Vélo'`` and a combined label such as ``'Vélo/Trottinette'`` count.
    NOTE(review): the real labels in the HR file are not visible here —
    confirm they are covered by this matching.

    Args:
        employee_id: Identifier forwarded to ``generate_activity``.
        hr_row: Optional mapping/Series for the employee; when given it must
            contain the ``'Moyen de déplacement'`` key.

    Returns:
        list of activity records as produced by ``generate_activity``
        (possibly empty).
    """
    # Determine number of activities based on transport mode
    transport_mode = hr_row['Moyen de déplacement'] if hr_row is not None else None

    if _uses_active_transport(transport_mode):
        # More active if using active transport
        num_activities = random.randint(5, 50)
    else:
        # Standard activity level
        num_activities = random.randint(0, 20)

    return [generate_activity(employee_id) for _ in range(num_activities)]

def generate_all_activities(hr_df):
    """Generate activities for every employee and assemble them in one DataFrame.

    Args:
        hr_df: DataFrame with one row per employee. Each row must provide
            ``'ID salarié'``; the whole row is also passed on to
            ``generate_employee_activities``.

    Returns:
        DataFrame with columns ``ID``, ``ID_salarie``, ``Date_de_debut``,
        ``Sport_type``, ``Distance_m``, ``Date_de_fin``, ``Commentaire``.
        ``ID`` is a 1-based running number; missing distances are stored as 0.
        When no activity is generated at all, an empty frame with these same
        columns is returned.
    """
    record_columns = ['ID_salarie', 'Date_de_debut', 'Sport_type', 'Distance_m', 'Date_de_fin', 'Commentaire']

    all_activities = []
    for _, employee in hr_df.iterrows():
        all_activities.extend(generate_employee_activities(employee['ID salarié'], employee))

    # Passing the column list explicitly fixes the column order and keeps the
    # schema intact when the record list is empty (a bare DataFrame([]) has
    # no columns, so selecting them afterwards would raise KeyError).
    df = pd.DataFrame(all_activities, columns=record_columns)

    # Add unique ID as the first column
    df.insert(0, 'ID', range(1, len(df) + 1))

    # Sports without distance carry None/NaN: store 0 and keep the column integer
    df['Distance_m'] = df['Distance_m'].fillna(0).astype(int)
    return df



In [25]:
# Main execution: load the HR roster, then generate activities for it

# HR workbook location, relative to the notebook's working directory
hr_file_path = '../data/DonneesRH.xlsx'

# Only attempt the load when the file is present; otherwise stop with an explicit error
if os.path.exists(hr_file_path):
    print(f"Loading employee data from {hr_file_path}...")
    hr_df = load_employee_data(hr_file_path)
else:
    raise FileNotFoundError(f"HR data file not found at: {hr_file_path}")

print("Generating synthetic Strava-like activity data...")
activities_df = generate_all_activities(hr_df)

Loading employee data from ../data/DonneesRH.xlsx...
Loaded HR data with 161 employees
Generating synthetic Strava-like activity data...


In [26]:
# Preview the first five generated activities
activities_df.head(n=5)

Unnamed: 0,ID,ID_salarie,Date_de_debut,Sport_type,Distance_m,Date_de_fin,Commentaire
0,1,59019,2024-08-18 12:19:38.076346,Vélo,34489,2024-08-18 14:04:44.076346,Sortie en groupe
1,2,59019,2024-09-11 13:32:20.076346,Musculation,0,2024-09-11 15:22:57.076346,
2,3,59019,2024-09-14 04:30:47.076346,Course à pied,8869,2024-09-14 05:20:38.076346,
3,4,59019,2024-10-25 00:30:41.076346,Musculation,0,2024-10-25 02:29:24.076346,
4,5,59019,2024-11-02 09:51:56.076346,Escalade,0,2024-11-02 11:23:15.076346,


In [27]:
# Display the whole frame as the cell result; the recorded output below is the
# truncated first/last-rows view.
activities_df

Unnamed: 0,ID,ID_salarie,Date_de_debut,Sport_type,Distance_m,Date_de_fin,Commentaire
0,1,59019,2024-08-18 12:19:38.076346,Vélo,34489,2024-08-18 14:04:44.076346,Sortie en groupe
1,2,59019,2024-09-11 13:32:20.076346,Musculation,0,2024-09-11 15:22:57.076346,
2,3,59019,2024-09-14 04:30:47.076346,Course à pied,8869,2024-09-14 05:20:38.076346,
3,4,59019,2024-10-25 00:30:41.076346,Musculation,0,2024-10-25 02:29:24.076346,
4,5,59019,2024-11-02 09:51:56.076346,Escalade,0,2024-11-02 11:23:15.076346,
...,...,...,...,...,...,...,...
1618,1619,94680,2025-04-28 12:05:18.076346,Yoga,0,2025-04-28 12:50:36.076346,
1619,1620,94680,2025-08-05 05:39:40.076346,Marche,5442,2025-08-05 06:43:28.076346,
1620,1621,94680,2024-11-29 16:45:31.076346,Vélo,50000,2024-11-29 19:23:22.076346,
1621,1622,94680,2024-12-03 11:04:42.076346,Yoga,0,2024-12-03 11:44:50.076346,


In [28]:
# Save to CSV
csv_path = 'strava_simulation.csv'
# 'utf-8-sig' writes a UTF-8 byte-order mark at the start of the file
# (presumably so spreadsheet tools detect the encoding of the accented labels).
activities_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
print(f"\nGenerated {len(activities_df)} activities for {len(hr_df)} employees")
print(f"Data saved to {csv_path}")

# Sample output
print("\nSample data:")
# Cap the sample size so frames with fewer than 3 rows do not raise, and pin
# random_state: random.seed() does not affect pandas sampling, so without it
# the sample changes on every run.
print(activities_df.sample(min(3, len(activities_df)), random_state=42).to_markdown(index=False))


Generated 1623 activities for 161 employees
Data saved to strava_simulation.csv

Sample data:
|   ID |   ID_salarie | Date_de_debut              | Sport_type   |   Distance_m | Date_de_fin                | Commentaire   |
|-----:|-------------:|:---------------------------|:-------------|-------------:|:---------------------------|:--------------|
|  532 |        99401 | 2025-06-11 19:11:11.076346 | Marche       |         7301 | 2025-06-11 20:28:14.076346 |               |
|  193 |        70643 | 2025-05-17 18:29:45.076346 | Trottinette  |        11697 | 2025-05-17 19:05:07.076346 |               |
| 1184 |        31222 | 2024-10-10 18:18:58.076346 | Marche       |         5977 | 2024-10-10 19:48:16.076346 |               |
