In [2]:
import numpy as np
import pandas as pd
import os

In [5]:
np.random.seed(42)
N = 2000

priority_choices = ["low", "medium", "high"]
category_choices = ["work", "study", "chores", "health", "personal"]
urgency_choices = ["low", "medium", "high"]
busy_choices = ["low", "medium", "high"]

tasks = pd.DataFrame({
    "task_id": np.arange(1, N+1),
    "task_length_minutes": np.random.randint(5, 180, N),
    "priority": np.random.choice(priority_choices, N),
    "category": np.random.choice(category_choices, N),
    "due_in_days": np.random.randint(0, 14, N),
    "reminders_set": np.random.randint(0, 2, N),
    "past_completion_rate": np.round(np.random.uniform(0.2, 1, N), 2),
    "is_weekend": np.random.randint(0, 2, N),
    "day_of_week": np.random.randint(0, 7, N),  # 0=Mon, 6=Sun
    "dead_urgency": np.random.choice(urgency_choices, N),
    "user_busy_level": np.random.choice(busy_choices, N)
})

# probability of completion based on features
prob = (
    0.3 +
    0.2 * (tasks["priority"] == "high") +
    0.15 * (tasks["reminders_set"]) +
    0.25 * tasks["past_completion_rate"] -
    0.1 * (tasks["due_in_days"] > 7) -
    0.05 * (tasks["user_busy_level"] == "high") +
    0.05 * (tasks["dead_urgency"] == "high")
)

prob = np.clip(prob, 0, 1)
tasks["completed"] = np.random.binomial(1, prob)

In [6]:
# create folder if missing and save CSV
os.makedirs("data", exist_ok=True)
tasks.to_csv("data/tasks.csv", index=False)

tasks.head()

Unnamed: 0,task_id,task_length_minutes,priority,category,due_in_days,reminders_set,past_completion_rate,is_weekend,day_of_week,dead_urgency,user_busy_level,completed
0,1,107,low,personal,6,1,0.65,1,1,medium,medium,1
1,2,97,low,study,4,1,0.2,0,0,low,high,1
2,3,19,high,health,9,0,0.26,1,2,medium,low,1
3,4,111,low,personal,13,1,0.91,1,6,low,medium,0
4,5,76,high,health,13,0,0.26,1,1,medium,high,0
