In [1]:
import os
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random

In [None]:
# Config
SEED = 42                                    # fixed seed so a fresh-kernel re-run reproduces the same dataset
rng = np.random.default_rng(SEED)
num_users = int(rng.integers(1000, 10000))   # 1,000 to 9,999 users (upper bound is exclusive)
start_date = datetime(2023, 4, 1)            # 6 months window
end_date = datetime(2023, 9, 30)

# Output folder
output_dir = "synthetic_user_data"
os.makedirs(output_dir, exist_ok=True)

# Expanded activities
activities = ["walking", "running", "cycling", "swimming", "yoga", "gym_workout", "hiking"]

# Sampling parameters per activity:
#   (steps_low, steps_high, calories_per_step, calories_low, calories_high, hr_low, hr_high)
# Every *_high bound is exclusive. calories_per_step is 0.0 for activities whose
# calories are drawn independently of the step count.
ACTIVITY_PROFILES = {
    "walking":     (3000, 15000, 0.04, 100, 300, 70, 110),
    "running":     (5000, 20000, 0.06, 200, 500, 120, 170),
    "cycling":     (500, 5000, 0.1, 300, 700, 100, 150),     # fewer steps
    "swimming":    (100, 1000, 0.0, 300, 800, 110, 160),     # minimal steps
    "yoga":        (50, 500, 0.0, 100, 300, 60, 100),        # very few steps
    "gym_workout": (1000, 6000, 0.0, 300, 900, 100, 160),
    "hiking":      (7000, 25000, 0.05, 400, 1000, 110, 160),
}

# Generate date range
dates = pd.date_range(start=start_date, end=end_date, freq="D")


def generate_activity_frame(num_users, dates, rng):
    """Build one synthetic activity row per (user, day) pair.

    Parameters
    ----------
    num_users : int
        Number of users; user ids run from 1 to ``num_users`` inclusive.
    dates : pandas.DatetimeIndex
        Days to generate; each is stored as a ``YYYY-MM-DD`` string.
    rng : numpy.random.Generator
        Source of randomness. The same generator state yields the same frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``date``, ``steps``, ``calories_burned``,
        ``heart_rate_avg``, ``sleep_hours``, ``activity_type``. Rows are
        ordered by user first, then by date.
    """
    n_days = len(dates)
    n_rows = num_users * n_days

    # One parameter row per activity, in the same order as `activities`.
    params = np.array([ACTIVITY_PROFILES[name] for name in activities], dtype=float)
    steps_lo = params[:, 0].astype(np.int64)
    steps_hi = params[:, 1].astype(np.int64)
    cal_per_step = params[:, 2]
    cal_lo = params[:, 3].astype(np.int64)
    cal_hi = params[:, 4].astype(np.int64)
    hr_lo = params[:, 5].astype(np.int64)
    hr_hi = params[:, 6].astype(np.int64)

    # Uniformly pick an activity for every row, then draw each metric from
    # that activity's own range in a single vectorized call.
    pick = rng.integers(0, len(activities), size=n_rows)
    steps = rng.integers(steps_lo[pick], steps_hi[pick])
    calorie_base = rng.integers(cal_lo[pick], cal_hi[pick])
    calories_burned = np.round(steps * cal_per_step[pick] + calorie_base, 2)
    heart_rate_avg = rng.integers(hr_lo[pick], hr_hi[pick])
    sleep_hours = np.round(rng.uniform(4, 10, size=n_rows), 2)

    date_labels = dates.strftime("%Y-%m-%d").to_numpy(dtype=object)

    return pd.DataFrame({
        "user_id": np.repeat(np.arange(1, num_users + 1, dtype=np.int64), n_days),
        "date": np.tile(date_labels, num_users),
        "steps": steps,
        "calories_burned": calories_burned,
        "heart_rate_avg": heart_rate_avg,
        "sleep_hours": sleep_hours,
        "activity_type": np.array(activities, dtype=object)[pick],
    })


# Create DataFrame (built column-wise; no intermediate per-row list is kept)
df = generate_activity_frame(num_users, dates, rng)


In [3]:
# Export the dataset one calendar day at a time into a
# year=YYYY/month=MM/day=DD directory tree under `output_dir`.
for date_label, daily_rows in df.groupby("date"):
    stamp = pd.to_datetime(date_label)

    # Build the partition directory piece by piece, then make sure it exists.
    year_part = f"year={stamp.year}"
    month_part = f"month={stamp.month:02d}"
    day_part = f"day={stamp.day:02d}"
    partition_dir = os.path.join(output_dir, year_part, month_part, day_part)
    os.makedirs(partition_dir, exist_ok=True)

    # Each day's rows are written twice: once as CSV, once as Parquet.
    csv_target = os.path.join(partition_dir, "data.csv")
    parquet_target = os.path.join(partition_dir, "data.parquet")
    daily_rows.to_csv(csv_target, index=False)
    daily_rows.to_parquet(parquet_target, index=False)

# Report how much was generated and where it was written.
print(f"Synthetic data generated for {num_users} users over {len(dates)} days.")
print(f"Data saved in folder: {output_dir}")


Synthetic data generated for 1959 users over 183 days.
Data saved in folder: synthetic_user_data


In [4]:
import os
import pandas as pd
import json
from glob import glob

# Path to synthetic dataset
data_dir = "synthetic_user_data"

# Collect the per-day Parquet partitions (the CSV copies are not read here).
# glob() returns paths in arbitrary order, so sort them for a deterministic load order.
files = sorted(glob(os.path.join(data_dir, "year=*/*/*/data.parquet")))
if not files:
    # Without this guard pd.concat([]) fails with "No objects to concatenate",
    # which says nothing about the real cause.
    raise FileNotFoundError(
        f"No data.parquet files found under {data_dir!r}; run the export cell first."
    )

summary = {
    "dataset_overview": {
        "total_files": len(files),
        "time_range": [],
        "num_records": 0,
        "num_users": 0
    },
    "monthly_summary": {},   # per month stats
    "activity_summary": {}   # per activity stats
}

# Load every partition, then concatenate into a single frame.
# The whole dataset is held in memory; no sampling is applied.
all_dfs = [pd.read_parquet(parquet_path) for parquet_path in files]

# NOTE: this rebinds `df`, the name the generation/export cells also use, and
# adds a `month` column below. Re-running the export cell after this one would
# therefore write that extra column into the partition files.
df = pd.concat(all_dfs, ignore_index=True)

# Fill dataset overview
summary["dataset_overview"]["num_records"] = len(df)
summary["dataset_overview"]["num_users"] = df["user_id"].nunique()
# Dates are stored as YYYY-MM-DD strings, so string min/max is also chronological.
summary["dataset_overview"]["time_range"] = [df["date"].min(), df["date"].max()]

# Monthly summary
df["month"] = pd.to_datetime(df["date"]).dt.to_period("M")
monthly = df.groupby("month").agg(
    num_records=("user_id", "count"),
    num_users=("user_id", "nunique"),
    avg_steps=("steps", "mean"),
    avg_calories=("calories_burned", "mean"),
    avg_hr=("heart_rate_avg", "mean"),
    avg_sleep=("sleep_hours", "mean")
).reset_index()

summary["monthly_summary"] = monthly.to_dict(orient="records")

# Activity summary
activity = df.groupby("activity_type").agg(
    num_records=("user_id", "count"),
    num_users=("user_id", "nunique"),
    avg_steps=("steps", "mean"),
    min_steps=("steps", "min"),
    max_steps=("steps", "max"),
    avg_calories=("calories_burned", "mean"),
    avg_hr=("heart_rate_avg", "mean"),
    avg_sleep=("sleep_hours", "mean")
).reset_index()

summary["activity_summary"] = activity.to_dict(orient="records")

# Save to JSON. default=str serializes values json cannot encode natively
# (such as the monthly Period keys) via their string form.
with open("dataset_summary.json", "w", encoding="utf-8") as summary_file:
    json.dump(summary, summary_file, indent=4, default=str)

print("Summary saved to dataset_summary.json")


Summary saved to dataset_summary.json
