In [1]:
!pip install matplotlib
!pip install seaborn
!pip install pandas
!pip install scipy
!pip install ctgan



In [2]:
# Let's read the CSV file into a pandas DataFrame using the given local file path.
import pandas as pd

# Location of the input CSV, resolved relative to the notebook's working directory
file_path = 'combined_synthetic_data_with_patient_ids.csv'

# Load the complete file into memory; later cells use this frame as `combined_df`
combined_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as the cell's output
combined_df.head(n=5)


Unnamed: 0,Datetime,Code,Activity_Type,Heart rate___beats/minute,Calories burned_kcal,Exercise duration_s,Sleep type duration_minutes,Sleep duration_minutes,Floors climbed___floors,Patient ID
0,2022=12-08 14:17,,No Physical Activity,72.0,,,,,,User0
1,2022=12-08 21:05,LA11836-6,Running,156.0,20.0,40851.0,,,,User0
2,2022=12-08 22:01,LA11836-6,Running,170.0,30.0,44878.0,,,,User0
3,2022=12-08 22:06,LA11836-6,Running,166.0,20.0,21497.0,,,,User0
4,2022=12-08 22:16,,No Physical Activity,78.0,,,,,,User0


In [3]:
# Import necessary libraries
import pandas as pd
from ctgan import CTGAN
import numpy as np

# Load the real dataset (combined_df should already exist from previous steps)

# Integer code assigned to every activity label (codes start at 1, in this order)
activity_mapping = {
    label: code
    for code, label in enumerate(
        [
            'Light Sleep',
            'No Physical Activity',
            'Running',
            'Floors Climbed',
            'REM Sleep',
            'Walking',
        ],
        start=1,
    )
}

# Encode 'Activity_Type' in place: from here on `combined_df` carries the
# integer codes instead of the text labels.
combined_df['Activity_Type'] = combined_df['Activity_Type'].replace(activity_mapping)

# One training frame per group of activities. Each keeps 'Activity_Type' plus
# only the measurements selected for that group, and drops incomplete rows.

# Sleep stages (1 = Light Sleep, 5 = REM Sleep)
df_1_5 = combined_df.loc[
    combined_df['Activity_Type'].isin([1, 5]),
    ['Activity_Type', 'Heart rate___beats/minute', 'Sleep type duration_minutes', 'Sleep duration_minutes'],
].dropna()

# Exercise (3 = Running, 6 = Walking)
df_3_6 = combined_df.loc[
    combined_df['Activity_Type'].isin([3, 6]),
    ['Activity_Type', 'Heart rate___beats/minute', 'Calories burned_kcal', 'Exercise duration_s'],
].dropna()

# 2 = No Physical Activity
df_2 = combined_df.loc[
    combined_df['Activity_Type'].eq(2),
    ['Activity_Type', 'Heart rate___beats/minute'],
].dropna()

# 4 = Floors Climbed
df_4 = combined_df.loc[
    combined_df['Activity_Type'].eq(4),
    ['Activity_Type', 'Heart rate___beats/minute', 'Floors climbed___floors'],
].dropna()

# Helper function to train and generate synthetic data using CTGAN
def generate_synthetic_data(real_data, num_samples, epochs=100, discrete_columns=()):
    """Fit a fresh CTGAN model on ``real_data`` and sample synthetic rows from it.

    Note: a later cell in this notebook defines another function with the same
    name; once that cell has run, the name refers to that function instead.

    Parameters:
        real_data (pd.DataFrame): Training rows for the model.
        num_samples (int): Number of synthetic rows to draw from the fitted model.
        epochs (int): Number of training epochs. Defaults to 100.
        discrete_columns (list-like): Names of the columns CTGAN should model as
            categorical. Columns not listed are modelled as continuous, so an
            integer-coded label column (such as 'Activity_Type') is only treated
            as categorical when it is named here. Defaults to no discrete columns.

    Returns:
        pd.DataFrame: ``num_samples`` synthetic rows with the columns of ``real_data``.
    """
    # `epochs` is configured on the model itself; passing it to `fit` is the
    # deprecated form of the same setting.
    ctgan = CTGAN(epochs=epochs)
    ctgan.fit(real_data, discrete_columns=discrete_columns)
    return ctgan.sample(num_samples)

# Generate synthetic data for each subset (one independently trained model per subset)
synthetic_df_1_5 = generate_synthetic_data(df_1_5, num_samples=10000)
synthetic_df_3_6 = generate_synthetic_data(df_3_6, num_samples=10000)
synthetic_df_2 = generate_synthetic_data(df_2, num_samples=10000)
synthetic_df_4 = generate_synthetic_data(df_4, num_samples=10000)

# Combine all the generated synthetic data. The subsets have different
# measurement columns, so columns absent from a subset are filled with missing values.
synthetic_combined_df = pd.concat([synthetic_df_1_5, synthetic_df_3_6, synthetic_df_2, synthetic_df_4], ignore_index=True)

# Convert the relevant columns to integer (keeping only the integer part).
# The nullable 'Int64' dtype is used so missing values survive the conversion.
cols_to_convert = ['Heart rate___beats/minute', 'Sleep type duration_minutes', 'Sleep duration_minutes',
                   'Calories burned_kcal', 'Exercise duration_s', 'Floors climbed___floors']

for col in cols_to_convert:
    if col in synthetic_combined_df.columns:
        synthetic_combined_df[col] = np.floor(synthetic_combined_df[col]).astype('Int64')  # Keep only integer part

# Reverse the mapping to convert Activity_Type back to original string values
reverse_activity_mapping = {
    1: 'Light Sleep',
    2: 'No Physical Activity',
    3: 'Running',
    4: 'Floors Climbed',
    5: 'REM Sleep',
    6: 'Walking'
}

synthetic_combined_df['Activity_Type'] = synthetic_combined_df['Activity_Type'].replace(reverse_activity_mapping)

# Ensure 'Floors climbed___floors' is always 1 for "Floors Climbed" activity.
# This has to run after the labels are restored: while 'Activity_Type' still
# holds integer codes, comparing it with the string label matches no rows.
synthetic_combined_df.loc[synthetic_combined_df['Activity_Type'] == 'Floors Climbed', 'Floors climbed___floors'] = 1

# Define the mapping for the Code column (activities without a code map to an empty string)
code_mapping = {
    'Light Sleep': '93830-8',
    'No Physical Activity': '',
    'Running': 'LA11836-6',
    'Floors Climbed': '',
    'REM Sleep': '93829-0',
    'Walking': '370'
}

# Add the Code column to the DataFrame
synthetic_combined_df['Code'] = synthetic_combined_df['Activity_Type'].map(code_mapping)

# Show the final synthetic data with original Activity_Type values and Code
print(synthetic_combined_df.head())

# Save the synthetic data to a CSV file
synthetic_combined_df.to_csv('synthetic_data_with_activity_type.csv', index=False)


  combined_df['Activity_Type'] = combined_df['Activity_Type'].replace(activity_mapping)
  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


  Activity_Type  Heart rate___beats/minute  Sleep type duration_minutes  \
0   Light Sleep                         55                            6   
1   Light Sleep                         66                           13   
2   Light Sleep                         79                            6   
3   Light Sleep                         79                           13   
4   Light Sleep                         80                            9   

   Sleep duration_minutes  Calories burned_kcal  Exercise duration_s  \
0                      10                  <NA>                 <NA>   
1                      33                  <NA>                 <NA>   
2                      32                  <NA>                 <NA>   
3                      36                  <NA>                 <NA>   
4                      13                  <NA>                 <NA>   

   Floors climbed___floors     Code  
0                     <NA>  93830-8  
1                     <NA>  93830-8  
2 

In [4]:
import pandas as pd
import numpy as np
import random
from datetime import timedelta

def generate_synthetic_data(combined_df, start_date, end_date, num_samples):
    """
    Generate synthetic datetime and activity type data based on the original distribution.

    Time-of-day values and activity types are each drawn independently from
    their empirical distribution in ``combined_df``. Calendar dates are drawn
    uniformly from the whole days between ``start_date`` and ``end_date``
    (both included). ``combined_df`` itself is not modified.

    Parameters:
        combined_df (pd.DataFrame): Original DataFrame containing 'Datetime' and 'Activity_Type' columns.
            'Datetime' strings may use '=' in place of '-'; values that still
            cannot be parsed are ignored when learning the time-of-day distribution.
        start_date (str or pd.Timestamp): The start date for generating synthetic datetime values.
        end_date (str or pd.Timestamp): The end date for generating synthetic datetime values.
        num_samples (int): The number of synthetic samples to generate.

    Returns:
        synthetic_df (pd.DataFrame): A DataFrame containing synthetic 'Datetime' and 'Activity_Type' columns,
            with 'Datetime' in ascending order.

    Raises:
        ValueError: If samples are requested and ``end_date`` is earlier than
            ``start_date``, or 'Datetime' / 'Activity_Type' holds no usable values.
    """
    def sample_from(distribution, size, column):
        """Draw ``size`` labels from ``distribution``'s index, weighted by its values."""
        if size <= 0:
            return np.array([], dtype=object)
        if distribution.empty:
            raise ValueError(f"Column '{column}' contains no usable values to sample from")
        return np.random.choice(distribution.index.to_numpy(), size=size, p=distribution.to_numpy())

    # Steps 1-3: clean and parse 'Datetime' on a private Series so the caller's
    # DataFrame keeps its original values and columns.
    datetime_text = combined_df['Datetime']
    if datetime_text.dtype != 'object':
        datetime_text = datetime_text.astype(str)
    datetime_text = datetime_text.str.replace('=', '-', regex=False)
    parsed_datetimes = pd.to_datetime(datetime_text, errors='coerce')

    # Steps 4-5: empirical distribution of the minute of the day
    # (unparseable timestamps become missing values and are left out of the counts).
    minutes_of_day = parsed_datetimes.dt.hour * 60 + parsed_datetimes.dt.minute
    minutes_distribution = minutes_of_day.value_counts(normalize=True)

    # Step 6: draw every time of day in one vectorised call
    sampled_minutes = sample_from(minutes_distribution, num_samples, 'Datetime').astype(int)

    # Step 7: draw a random whole-day offset inside the requested range
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    span_days = (end_date - start_date).days
    if num_samples > 0 and span_days < 0:
        raise ValueError("end_date must not be earlier than start_date")
    day_offsets = [random.randint(0, span_days) for _ in range(num_samples)]

    # Step 8: combine date and time of day, then sort ascending
    generated_datetimes_sorted = (
        start_date
        + pd.to_timedelta(day_offsets, unit='D')
        + pd.to_timedelta(sampled_minutes, unit='min')
    ).sort_values()

    # Steps 9-10: draw activity types from their empirical distribution.
    # They are independent of the timestamps, so pairing them with the sorted
    # datetimes does not bias either column.
    activity_distribution = combined_df['Activity_Type'].value_counts(normalize=True)
    generated_activity_types = sample_from(activity_distribution, num_samples, 'Activity_Type')

    # Step 11: assemble the result
    synthetic_df = pd.DataFrame({
        'Datetime': generated_datetimes_sorted,
        'Activity_Type': generated_activity_types
    })

    return synthetic_df

# Example usage: draw 380 synthetic (Datetime, Activity_Type) pairs for
# 8-30 December 2022 from the frame loaded earlier in the notebook.
# combined_df = ...  # Load or create your DataFrame here
synthetic_data = generate_synthetic_data(
    combined_df,
    start_date="2022-12-08",
    end_date="2022-12-30",
    num_samples=380,
)

# Preview the first rows, then persist the result for the next cell to read back
print(synthetic_data.head(5))
synthetic_data.to_csv(path_or_buf='synthetic_data_with_datetime_and_activity.csv', index=False)


             Datetime  Activity_Type
0 2022-12-08 00:48:00              4
1 2022-12-08 01:47:00              1
2 2022-12-08 01:53:00              5
3 2022-12-08 02:39:00              1
4 2022-12-08 04:37:00              4


In [6]:
import pandas as pd
import numpy as np

# Load the two intermediate datasets written by the earlier cells
synthetic_data_with_datetime = pd.read_csv('synthetic_data_with_datetime_and_activity.csv')
synthetic_data_with_activity_type = pd.read_csv('synthetic_data_with_activity_type.csv')

# Define the reverse activity mapping (integer code -> activity label)
reverse_activity_mapping = {
    1: 'Light Sleep',
    2: 'No Physical Activity',
    3: 'Running',
    4: 'Floors Climbed',
    5: 'REM Sleep',
    6: 'Walking'
}

# Map Activity_Type in datetime DataFrame to activity names
synthetic_data_with_datetime['Activity_Type'] = synthetic_data_with_datetime['Activity_Type'].replace(reverse_activity_mapping)

# Split the measurement rows by activity once, instead of re-filtering the
# whole table for every timestamp row.
rows_by_activity = {
    activity: candidates
    for activity, candidates in synthetic_data_with_activity_type.groupby('Activity_Type', sort=False)
}

# For each timestamp row, attach one randomly chosen measurement row of the
# same activity. Timestamp rows whose activity has no measurement rows are skipped.
new_rows = []

for index, row in synthetic_data_with_datetime.iterrows():
    matching_rows = rows_by_activity.get(row['Activity_Type'])
    if matching_rows is None:
        continue

    # Randomly select one row from matching_rows
    random_row = matching_rows.sample(n=1).iloc[0]

    # Values from the measurement row win on shared keys ('Activity_Type')
    new_rows.append({**row, **random_row})

# Column order of the final dataset
column_order = [
    'Datetime',
    'Code',
    'Activity_Type',
    'Heart rate___beats/minute',
    'Calories burned_kcal',
    'Exercise duration_s',
    'Sleep duration_minutes',
    'Sleep type duration_minutes',
    'Floors climbed___floors'
]

# Build the final frame directly in the target column order; this also yields a
# well-formed (empty) frame when no timestamp row found a match.
final_combined_data = pd.DataFrame(new_rows, columns=column_order)

# Save the final DataFrame to a new CSV file
final_combined_data.to_csv('Synthetic_User1.csv', index=False)

# Display the head of the final DataFrame
print(final_combined_data.head())


              Datetime     Code   Activity_Type  Heart rate___beats/minute  \
0  2022-12-08 00:48:00      NaN  Floors Climbed                         67   
1  2022-12-08 01:47:00  93830-8     Light Sleep                         49   
2  2022-12-08 01:53:00  93829-0       REM Sleep                         54   
3  2022-12-08 02:39:00  93830-8     Light Sleep                         80   
4  2022-12-08 04:37:00      NaN  Floors Climbed                         64   

   Calories burned_kcal  Exercise duration_s  Sleep duration_minutes  \
0                   NaN                  NaN                     NaN   
1                   NaN                  NaN                     9.0   
2                   NaN                  NaN                     4.0   
3                   NaN                  NaN                    34.0   
4                   NaN                  NaN                     NaN   

   Sleep type duration_minutes  Floors climbed___floors  
0                          NaN          