In [29]:
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

# Setting the seed for reproducibility
np.random.seed(42)

# 1. Random Sampling Rate

## a) Random missingness: 30 Days

In a set time range of 30 days, random full days will be missing, and random portions of the day will be missing

In [32]:

def generate_datetime_range_with_varied_seconds_and_days(start_date, end_date, min_days=25):
    # Determine the total number of days in the range and calculate missing days fraction
    total_days = (end_date - start_date).days + 1
    fraction_missing_days = np.random.uniform(0, 0.3)
    days_to_include = int((1 - fraction_missing_days) * total_days)
    
    # Randomly select days to include data
    all_days = [start_date + timedelta(days=i) for i in range(total_days)]
    included_days = sorted(np.random.choice(all_days, days_to_include, replace=False))
    
    datetime_range = []
    for day in included_days:
        current_date = datetime(day.year, day.month, day.day)
        end_of_day = current_date + timedelta(days=1)
        
        while current_date < end_of_day:
            # Determine if we should introduce missingness
            if np.random.rand() < 0.1:  # 10% chance to skip data for a batch of hours
                current_date += timedelta(hours=np.random.rand())  # Skip between 0 to 1 hours
            else:
                # Add random seconds to the datetime
                seconds = np.random.randint(0, 60)
                current_date_with_seconds = current_date.replace(second=seconds)
                datetime_range.append(current_date_with_seconds)
                current_date += timedelta(minutes=np.random.randint(1, 7))  # Increment by 1 to 7 minutes sampling rate
            
            # Break the loop if the day is fully covered
            if current_date >= end_of_day:
                break

    return datetime_range

# Now let's test the updated function by generating a smaller sample dataframe for one person
test_person_id = 100001
start_datetime = datetime(2024, 1, 1)
end_datetime = datetime(2024, 1, 30, 23, 59)

# Generate datetimes with varied seconds and ensure coverage over more than 25 days with the updated function
datetimes = generate_datetime_range_with_varied_seconds_and_days(start_datetime, end_datetime)
heart_rates = np.random.randint(60, 100, size=len(datetimes))  # Random heart rate between 60 and 100

test_df = pd.DataFrame({
    'person_id': test_person_id,
    'datetime': datetimes,
    'heart_rate': heart_rates
})

test_df.head(), len(test_df)


(   person_id                   datetime  heart_rate
 0     100001 2024-01-02 00:00:50.000000          85
 1     100001 2024-01-02 00:04:33.000000          87
 2     100001 2024-01-02 00:58:52.653376          91
 3     100001 2024-01-02 01:03:49.653376          84
 4     100001 2024-01-02 01:05:21.653376          66,
 4671)

Generate dataset:

In [33]:
# Generate 50 unique person_ids
n_people = 50
person_ids = np.random.choice(range(100000, 999999), 50, replace=False)

start_datetime = datetime(2024, 1, 1)
end_datetime = datetime(2024, 1, 30, 23, 59)

# Creating the dataframe
data = []
for person_id in person_ids:
    datetimes = generate_datetime_range_with_varied_seconds_and_days(start_datetime, end_datetime)
    heart_rates = np.random.randint(60, 100, size=len(datetimes))  # Random heart rate between 60 and 100

    for dt, hr in zip(datetimes, heart_rates):
        data.append([person_id, dt, hr])

df = pd.DataFrame(data, columns=["person_id", "datetime", "heart_rate"])

In [36]:
df.to_csv('Random_Missingness_30_days.csv')

## Fixed Missingness Time-Range Clusters

All patients will fall into a cluster of "more probabalistic missingness" during a fixed time period

e.g. some patients will have high missingness in the first 7 days, some will have high missingness in the first 15 days, some will have missingness in the last 10 days 

In [37]:
np.random.seed(42)  # For reproducibility

def adjust_missingness(current_date, start_date, cluster_type):
    """
    Adjust the probability of missingness based on the cluster type and the date.
    """
    delta_days = (current_date - start_date).days
    if cluster_type == 1 and delta_days > 23:  # Last 7 days for a 30-day period
        return 0.3  # Increase to 30% chance
    elif cluster_type == 2 and delta_days > 15:  # Last 15 days
        return 0.3
    elif cluster_type == 3 and 10 <= delta_days <= 20:  # Between day 10 and 20
        return 0.3
    else:
        return 0.1  # Default 10% chance

def generate_datetime_range_with_varied_seconds_and_days(start_date, end_date, min_days=25, cluster_type=4):
    total_days = (end_date - start_date).days + 1
    all_days = [start_date + timedelta(days=i) for i in range(total_days)]


        # Initialize probabilities for each day to be included
    p = np.ones(total_days) / total_days  # Default equal probability
    
    # Adjust probabilities based on the cluster type
    if cluster_type == 1:  # Last 7 days have higher missingness
        for i in range(-7, 0):
            p[i] *= 0.5  # Reduce the probability of including these days
    elif cluster_type == 2:  # Last 15 days have higher missingness
        for i in range(-15, 0):
            p[i] *= 0.5
    elif cluster_type == 3:  # Higher missingness between day 10 and 20
        for i in range(9, 20):
            if i < total_days:  # Check needed to avoid index error
                p[i] *= 0.5
    
    # Normalize probabilities to sum to 1
    p /= p.sum()
    
    # Determine days to include with adjusted probabilities
    fraction_missing_days = np.random.uniform(0, 0.3)
    days_to_include = int((1 - fraction_missing_days) * total_days)
    included_days = sorted(np.random.choice(all_days, days_to_include, replace=False, p=p))
    
    datetime_range = []
    for day in included_days:
        current_date = datetime(day.year, day.month, day.day)
        end_of_day = current_date + timedelta(days=1)
        
        while current_date < end_of_day:
            missingness_chance = adjust_missingness(current_date, start_date, cluster_type)
            if np.random.rand() < missingness_chance:
                current_date += timedelta(hours=np.random.rand())
            else:
                seconds = np.random.randint(0, 60)
                current_date_with_seconds = current_date.replace(second=seconds)
                datetime_range.append(current_date_with_seconds)
                current_date += timedelta(minutes=np.random.randint(1, 7)) # Random sampling rate
            
            if current_date >= end_of_day:
                break

    return datetime_range

# Generate 50 unique person_ids and assign them to 4 clusters
n_people = 50
person_ids = np.random.choice(range(100000, 999999), n_people, replace=False)
clusters = np.repeat([1, 2, 3, 4], repeats=n_people // 4)

start_datetime = datetime(2024, 1, 1)
end_datetime = datetime(2024, 1, 30, 23, 59)

# Creating the dataframe
data = []
for person_id, cluster in zip(person_ids, clusters):
    datetimes = generate_datetime_range_with_varied_seconds_and_days(start_datetime, end_datetime, cluster_type=cluster)
    heart_rates = np.random.randint(60, 100, size=len(datetimes))

    for dt, hr in zip(datetimes, heart_rates):
        data.append([person_id, dt, hr])

df = pd.DataFrame(data, columns=["person_id", "datetime", "heart_rate"])


In [38]:
df.to_csv('Four_Cluster_Missingness_30_days.csv')