In [1]:
# Mounting Google Drive
# Colab-only step: the CSV path used by the later cells points inside this
# mount point, so this cell has to run before any data is loaded.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# File path to your Google Drive
# Location of the patient measurements CSV that the later cells pass to pd.read_csv.
# It sits under the '/content/drive' mount point; edit it to match your own Drive layout.
file_path = "/content/drive/MyDrive/Dr.Kumo/patients_data.csv"

**Draft 1: windows defined by number of measurements (disabled)**

In [3]:
# Disabled draft: the whole cell is a single bare string literal, so none of the
# code below runs; the notebook only echoes the string back as the cell output.
# This version builds its windows by row position within each vital sign
# (30 "past" rows and 15 "future" rows, stepping 15 rows at a time) rather than
# by calendar dates.
'''
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Load data
df = pd.read_csv(file_path) # Change the path into your path

# Convert dateTime column to datetime type and sort it in descending order
df['dateTime'] = pd.to_datetime(df['dateTime'])
df.sort_values('dateTime', ascending=False, inplace=True)

# Loop over each patient
patient_ids = df['patientId'].unique()
output = []
for id in patient_ids:
    patient_data = df[df['patientId'] == id]

    # Separate systolic, diastolic, and heart rate data
    systolic = patient_data[patient_data['vitalSign'] == 'systolic_blood_pressure']
    diastolic = patient_data[patient_data['vitalSign'] == 'diastolic_blood_pressure']
    heart_rate = patient_data[patient_data['vitalSign'] == 'heart_rate']

    # Loop over each 45-day period
    for i in range(45, len(patient_data), 15):
        past_30_days_systolic = systolic[i-30:i]
        past_30_days_diastolic = diastolic[i-30:i]
        past_30_days_heart_rate = heart_rate[i-30:i]

        future_15_days_systolic = systolic[i-45:i-30]
        future_15_days_diastolic = diastolic[i-45:i-30]

        # Skip if less than 30 days of past data or 15 days of future data are available
        if len(past_30_days_systolic) < 30 or len(past_30_days_diastolic) < 30 or len(past_30_days_heart_rate) < 30 or len(future_15_days_systolic) < 15 or len(future_15_days_diastolic) < 15:
            continue

        # Calculate statistics for this 45-day period
        stats = {}
        stats['patientId'] = id
        stats['Age'] = patient_data['Age'].iloc[0]
        stats['Gender'] = patient_data['Gender'].iloc[0]
        stats['Weight'] = patient_data['Weight'].iloc[0]
        stats['Height'] = patient_data['Height'].iloc[0]
        stats['start_date'] = future_15_days_systolic['dateTime'].max()
        stats['end_date'] = past_30_days_systolic['dateTime'].min()

        for name, data in [('systolic', past_30_days_systolic), ('diastolic', past_30_days_diastolic), ('heart_rate', past_30_days_heart_rate)]:
            stats['past_' + name + '_mean'] = data['value'].mean()
            stats['past_' + name + '_max'] = data['value'].max()
            stats['past_' + name + '_min'] = data['value'].min()
            stats['past_' + name + '_median'] = data['value'].median()
            stats['past_' + name + '_quartile_75'] = data['value'].quantile(0.75)

            # Calculate slope using Linear Regression
            X = np.array(range(len(data))).reshape(-1, 1)
            y = data['value'].values
            model = LinearRegression()
            model.fit(X, y)
            stats['past_' + name + '_slope'] = model.coef_[0]

            # Percentage of measurements above certain thresholds
            if name == 'systolic':
                stats['past_%_systolic_above_140'] = (past_30_days_systolic['value'] > 140).mean() * 100
            elif name == 'diastolic':
                stats['past_%_diastolic_above_90'] = (past_30_days_diastolic['value'] > 90).mean() * 100

        # Future 15 days stats
        stats['future_%_systolic_above_140'] = (future_15_days_systolic['value'] > 140).mean() * 100
        stats['future_%_diastolic_above_90'] = (future_15_days_diastolic['value'] > 90).mean() * 100
        stats['future_%_systolic_above_150'] = (future_15_days_systolic['value'] > 150).mean() * 100
        stats['future_%_diastolic_above_100'] = (future_15_days_diastolic['value'] > 100).mean() * 100

        output.append(stats)

# Convert output to DataFrame
output_df = pd.DataFrame(output)

# Save to CSV
output_df.to_csv('output.csv', index=False)
'''

"\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LinearRegression\n\n# Load data\ndf = pd.read_csv(file_path) # Change the path into your path\n\n# Convert dateTime column to datetime type and sort it in descending order\ndf['dateTime'] = pd.to_datetime(df['dateTime'])\ndf.sort_values('dateTime', ascending=False, inplace=True)\n\n# Loop over each patient\npatient_ids = df['patientId'].unique()\noutput = []\nfor id in patient_ids:\n    patient_data = df[df['patientId'] == id]\n\n    # Separate systolic, diastolic, and heart rate data\n    systolic = patient_data[patient_data['vitalSign'] == 'systolic_blood_pressure']\n    diastolic = patient_data[patient_data['vitalSign'] == 'diastolic_blood_pressure']\n    heart_rate = patient_data[patient_data['vitalSign'] == 'heart_rate']\n\n    # Loop over each 45-day period\n    for i in range(45, len(patient_data), 15):\n        past_30_days_systolic = systolic[i-30:i]\n        past_30_days_diastolic = diastolic[i-30:i]

**Draft 2: windows defined by calendar days (disabled)**

In [4]:
# Disabled draft: the whole cell is a single bare string literal, so none of the
# code below runs; the notebook only echoes the string back as the cell output.
# This version selects the 15-day "future" window and the 30-day "past" window
# before it by timestamp instead of by row position.
# NOTE(review): systolic/diastolic/heart_rate are split out before the loop, but
# the statistics loop passes the unfiltered past_data under all three names, so
# the three stat groups are computed from the same rows.
'''
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import datetime

# Load data
df = pd.read_csv(file_path)  # Change the path into your path

# Convert dateTime column to datetime type and sort it in descending order
df['dateTime'] = [datetime.datetime.strptime(x[:23].replace('T', ' '), '%Y-%m-%d %H:%M:%S.%f') for x in df['dateTime']]
df.sort_values('dateTime', ascending=False, inplace=True)

past_range = 30
future_range = 15

# Loop over each patient
patient_ids = df['patientId'].unique()

output = []
for id in patient_ids:
    patient_data = df[df['patientId'] == id]
    patient_data.sort_values('dateTime', ascending=False, inplace=True)
    print(f"Processing patient: {id}")
    print(f"Data length: {len(patient_data)}")

    # Separate systolic, diastolic, and heart rate data
    systolic = patient_data[patient_data['vitalSign'] == 'systolic_blood_pressure']
    diastolic = patient_data[patient_data['vitalSign'] == 'diastolic_blood_pressure']
    heart_rate = patient_data[patient_data['vitalSign'] == 'heart_rate']

    # Loop over each 45-day period
    i = 0
    while i < len(patient_data) - past_range - future_range:
        current_datetime = patient_data['dateTime'].iloc[i]
        future_data = patient_data[(patient_data['dateTime'] <= current_datetime) & (patient_data['dateTime'] > (current_datetime - datetime.timedelta(days=future_range)))]
        past_data = patient_data[(patient_data['dateTime'] <= current_datetime - datetime.timedelta(days=future_range)) & (patient_data['dateTime'] > (current_datetime - datetime.timedelta(days=future_range + past_range)))]

        # Skip if less than 30 days of past data or 15 days of future data are available
        #if len(past_data) < 30 or len(future_data) < 15:
        #    continue

        # Calculate statistics for this 45-day period
        stats = {}
        stats['patientId'] = id
        stats['Age'] = patient_data['Age'].iloc[0]
        stats['Gender'] = patient_data['Gender'].iloc[0]
        stats['Weight'] = patient_data['Weight'].iloc[0]
        stats['Height'] = patient_data['Height'].iloc[0]
        stats['start_date'] = future_data['dateTime'].max()
        stats['end_date'] = past_data['dateTime'].min()

        for name, data in [('systolic', past_data), ('diastolic', past_data), ('heart_rate', past_data)]:
            stats['past_' + name + '_mean'] = data['value'].mean()
            stats['past_' + name + '_max'] = data['value'].max()
            stats['past_' + name + '_min'] = data['value'].min()
            stats['past_' + name + '_median'] = data['value'].median()
            stats['past_' + name + '_quartile_75'] = data['value'].quantile(0.75)

            # Calculate slope using Linear Regression
            X = np.array(range(len(data))).reshape(-1, 1)
            y = data['value'].values
            model = LinearRegression()
            model.fit(X, y)
            stats['past_' + name + '_slope'] = model.coef_[0]

            # Percentage of measurements above certain thresholds
            if name == 'systolic':
                stats['past_%_systolic_above_140'] = (data['value'] > 140).mean() * 100
            elif name == 'diastolic':
                stats['past_%_diastolic_above_90'] = (data['value'] > 90).mean() * 100

        # Future 15 days stats
        stats['future_%_systolic_above_140'] = (future_data['value'] > 140).mean() * 100
        stats['future_%_diastolic_above_90'] = (future_data['value'] > 90).mean() * 100
        stats['future_%_systolic_above_150'] = (future_data['value'] > 150).mean() * 100
        stats['future_%_diastolic_above_100'] = (future_data['value'] > 100).mean() * 100

        output.append(stats)

        check_next_future = list(patient_data['dateTime'] < future_data['dateTime'].iloc[-1])
        #print(check_next_future)
        # using filter() + lambda + index()
        res = check_next_future.index(next(filter(lambda i: i == True, check_next_future)))

        # printing result
        print("The values till first True value : " + str(res))

        i = res #len(patient_data)

# Convert output to DataFrame
output_df = pd.DataFrame(output)

# Save to CSV
output_df.to_csv('output.csv', index=False)
'''

'\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LinearRegression\nimport datetime\n\n# Load data\ndf = pd.read_csv(file_path)  # Change the path into your path\n\n# Convert dateTime column to datetime type and sort it in descending order\ndf[\'dateTime\'] = [datetime.datetime.strptime(x[:23].replace(\'T\', \' \'), \'%Y-%m-%d %H:%M:%S.%f\') for x in df[\'dateTime\']]\ndf.sort_values(\'dateTime\', ascending=False, inplace=True)\n\npast_range = 30\nfuture_range = 15\n\n# Loop over each patient\npatient_ids = df[\'patientId\'].unique()\n\noutput = []\nfor id in patient_ids:\n    patient_data = df[df[\'patientId\'] == id]\n    patient_data.sort_values(\'dateTime\', ascending=False, inplace=True)\n    print(f"Processing patient: {id}")\n    print(f"Data length: {len(patient_data)}")\n\n    # Separate systolic, diastolic, and heart rate data\n    systolic = patient_data[patient_data[\'vitalSign\'] == \'systolic_blood_pressure\']\n    diastolic = patient_data[patie

**Semi-final code (disabled)**

In [5]:
# Disabled draft: the whole cell is a single bare string literal, so none of the
# code below runs; the notebook only echoes the string back as the cell output.
# Same date-window approach as the previous draft, with the per-vital-sign
# statistics factored out into calculate_statistics() and patients iterated
# via groupby.
# NOTE(review): this draft reads 'patients_data.csv' from the working directory
# instead of file_path, and it advances i with idxmax(), which returns an index
# label while i is used as an .iloc position -- verify both before reviving it.
'''
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import datetime

# Load data
df = pd.read_csv('patients_data.csv')  # Change the path into your path

# Remove the milliseconds part from the 'dateTime' column
df['dateTime'] = df['dateTime'].str.replace(r'\.\d+', '', regex=True)
# Convert dateTime column to datetime type and sort it in descending order
df['dateTime'] = [datetime.datetime.strptime(x[:23].replace('T', ' '), '%Y-%m-%d %H:%M:%S') for x in df['dateTime']]
df.sort_values('dateTime', ascending=False, inplace=True)

past_range = 30
future_range = 15

# Function to calculate statistics
def calculate_statistics(name, data):
        stats = {}
        stats['past_' + name + '_mean'] = data['value'].mean()
        stats['past_' + name + '_max'] = data['value'].max()
        stats['past_' + name + '_min'] = data['value'].min()
        stats['past_' + name + '_median'] = data['value'].median()
        stats['past_' + name + '_quartile_75'] = data['value'].quantile(0.75)

        # Calculate slope using Linear Regression
        X = np.array(range(len(data))).reshape(-1, 1)
        y = data['value'].values
        model = LinearRegression()
        model.fit(X, y)
        stats['past_' + name + '_slope'] = model.coef_[0]

        # Percentage of measurements above certain thresholds
        if name == 'systolic':
            stats['past_%_systolic_above_140'] = (data['value'] > 140).mean() * 100
        elif name == 'diastolic':
            stats['past_%_diastolic_above_90'] = (data['value'] > 90).mean() * 100

        return stats

output = []
for id, group in df.groupby('patientId'):
    print(f"Processing patient: {id}")
    print(f"Data length: {len(group)}")

    # Loop over each 45-day period
    i = 0
    while i < len(group) - past_range - future_range:
        current_datetime = group['dateTime'].iloc[i]
        future_data = group[(group['dateTime'] <= current_datetime) & (group['dateTime'] > (current_datetime - pd.Timedelta(days=future_range)))]
        past_data = group[(group['dateTime'] <= current_datetime - pd.Timedelta(days=future_range)) & (group['dateTime'] > (current_datetime - pd.Timedelta(days=future_range + past_range)))]

        # Calculate statistics for this 45-day period
        stats = {}
        stats['patientId'] = id
        stats['Age'] = group['Age'].iloc[0]
        stats['Gender'] = group['Gender'].iloc[0]
        stats['Weight'] = group['Weight'].iloc[0]
        stats['Height'] = group['Height'].iloc[0]
        stats['start_date'] = future_data['dateTime'].max()
        stats['end_date'] = past_data['dateTime'].min()

        for name in ['systolic_blood_pressure', 'diastolic_blood_pressure', 'heart_rate']:
            data = past_data[past_data['vitalSign'] == name]
            stats.update(calculate_statistics(name, data))

        # Future 15 days stats
        stats['future_%_systolic_above_140'] = (future_data['value'] > 140).mean() * 100
        stats['future_%_diastolic_above_90'] = (future_data['value'] > 90).mean() * 100
        stats['future_%_systolic_above_150'] = (future_data['value'] > 150).mean() * 100
        stats['future_%_diastolic_above_100'] = (future_data['value'] > 100).mean() * 100

        output.append(stats)

        next_future = group['dateTime'] < future_data['dateTime'].iloc[-1]
        i = next_future.idxmax() if any(next_future) else len(group)

# Convert output to DataFrame
output_df = pd.DataFrame(output)

# Save to CSV
output_df.to_csv('output.csv', index=False)
'''

'\nimport pandas as pd\nimport numpy as np\nfrom sklearn.linear_model import LinearRegression\nimport datetime\n\n# Load data\ndf = pd.read_csv(\'patients_data.csv\')  # Change the path into your path\n\n# Remove the milliseconds part from the \'dateTime\' column\ndf[\'dateTime\'] = df[\'dateTime\'].str.replace(r\'\\.\\d+\', \'\', regex=True)\n# Convert dateTime column to datetime type and sort it in descending order\ndf[\'dateTime\'] = [datetime.datetime.strptime(x[:23].replace(\'T\', \' \'), \'%Y-%m-%d %H:%M:%S\') for x in df[\'dateTime\']]\ndf.sort_values(\'dateTime\', ascending=False, inplace=True)\n\npast_range = 30\nfuture_range = 15\n\n# Function to calculate statistics\ndef calculate_statistics(name, data):\n        stats = {}\n        stats[\'past_\' + name + \'_mean\'] = data[\'value\'].mean()\n        stats[\'past_\' + name + \'_max\'] = data[\'value\'].max()\n        stats[\'past_\' + name + \'_min\'] = data[\'value\'].min()\n        stats[\'past_\' + name + \'_median\'] = 

**Final code: also outputs the dates in the middle of each window (end of the future period and start of the past period)**

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import datetime

# Load the measurements table. The rest of this cell reads the columns
# patientId, dateTime, vitalSign, value, Age, Gender, Weight and Height.
df = pd.read_csv(file_path)  # Change the path into your path

# Parse the timestamps in one vectorised pass:
#   1. strip the fractional-seconds part ('.123'),
#   2. keep at most the first 23 characters (same slice the row-wise parser used),
#   3. parse with an explicit format so malformed values fail loudly.
timestamp_text = df['dateTime'].str.replace(r'\.\d+', '', regex=True).str[:23]
df['dateTime'] = pd.to_datetime(timestamp_text, format='%Y-%m-%d %H:%M:%S')

# Newest measurements first; the windowing loop below relies on this order.
df = df.sort_values('dateTime', ascending=False)

# Window sizes, in days.
past_range = 30
future_range = 15

# Function to calculate statistics
def calculate_statistics(name, data):
    """Summarise the past-window readings of a single vital sign.

    Parameters
    ----------
    name : str
        Vital-sign label; it is embedded in every output key
        (``'past_<name>_mean'``, ``'past_<name>_slope'``, ...). Labels that
        start with ``'systolic'`` or ``'diastolic'`` (for example
        ``'systolic_blood_pressure'``) additionally get the share of readings
        above the 140 / 90 threshold.
    data : pandas.DataFrame
        Rows belonging to that vital sign. Only the ``'value'`` column is read.

    Returns
    -------
    dict
        Mean, max, min, median, 75th percentile and regression slope of
        ``data['value']``, plus the threshold percentage where applicable.
        For an empty ``data`` every statistic is NaN.

    Notes
    -----
    The slope is fitted against the row position (0, 1, 2, ...), so its sign
    follows whatever row order the caller supplies.
    """
    values = data['value']
    prefix = 'past_' + name

    stats = {}
    stats[prefix + '_mean'] = values.mean()
    stats[prefix + '_max'] = values.max()
    stats[prefix + '_min'] = values.min()
    stats[prefix + '_median'] = values.median()
    stats[prefix + '_quartile_75'] = values.quantile(0.75)

    # Calculate slope using Linear Regression. An empty window cannot be
    # fitted (LinearRegression.fit raises on zero samples), so report NaN
    # instead of aborting the whole run.
    if len(values) == 0:
        stats[prefix + '_slope'] = np.nan
    else:
        X = np.arange(len(values)).reshape(-1, 1)
        model = LinearRegression()
        model.fit(X, values.values)
        stats[prefix + '_slope'] = model.coef_[0]

    # Percentage of measurements above certain thresholds. Match on the
    # prefix: callers pass the full vitalSign label
    # ('systolic_blood_pressure'), which an equality test against 'systolic'
    # would never match.
    if name.startswith('systolic'):
        stats['past_%_systolic_above_140'] = (values > 140).mean() * 100
    elif name.startswith('diastolic'):
        stats['past_%_diastolic_above_90'] = (values > 90).mean() * 100

    return stats

# Loop over each patient and emit one feature row per (past window, future window) pair.
# Each row holds demographics, summary statistics of the 30-day "past" window and
# threshold-exceedance percentages of the 15-day "future" window that follows it.
patient_ids = df['patientId'].unique()
output = []
# NOTE(review): the [1:] slice skips the first patient id returned by unique();
# kept as-is -- confirm the exclusion is intentional.
for patient_id in patient_ids[1:]:
    print(f"Processing patient: {patient_id}")
    patient_data = df[df['patientId'] == patient_id]

    # Skip patients without age or gender
    if pd.isnull(patient_data['Age'].iloc[0]) or pd.isnull(patient_data['Gender'].iloc[0]):
        print(f"Skipping patient {patient_id} due to missing age or gender.")
        continue

    # Newest first; sort_values returns a new frame, so df itself is untouched.
    patient_data = patient_data.sort_values('dateTime', ascending=False)
    timestamps = patient_data['dateTime']
    n_rows = len(patient_data)

    print(f"Data length: {n_rows}")

    future_span = pd.Timedelta(days=future_range)
    total_span = pd.Timedelta(days=future_range + past_range)

    # Walk backwards in time. Row i anchors the newest end of a future window.
    # NOTE(review): the stop condition compares a row count with day counts
    # (past_range/future_range are days); kept as in the original.
    i = 0
    while i < n_rows - past_range - future_range:
        current_datetime = timestamps.iloc[i]
        future_cutoff = current_datetime - future_span   # boundary between future and past
        past_cutoff = current_datetime - total_span      # oldest edge of the past window

        future_data = patient_data[(timestamps <= current_datetime) & (timestamps > future_cutoff)]
        past_data = patient_data[(timestamps <= future_cutoff) & (timestamps > past_cutoff)]

        if len(future_data) > 0 and len(past_data) > 0:
            # Calculate statistics for this 45-day period
            stats = {}
            stats['patientId'] = patient_id
            stats['Age'] = patient_data['Age'].iloc[0]
            stats['Gender'] = patient_data['Gender'].iloc[0]
            stats['Weight'] = patient_data['Weight'].iloc[0]
            stats['Height'] = patient_data['Height'].iloc[0]
            stats['start_date'] = future_data['dateTime'].max()
            stats['end_date'] = past_data['dateTime'].min()
            stats['end_of_future_date'] = future_data['dateTime'].min()
            stats['start_of_past_date'] = past_data['dateTime'].max()

            for name in ['systolic_blood_pressure', 'diastolic_blood_pressure', 'heart_rate']:
                stats.update(calculate_statistics(name, past_data[past_data['vitalSign'] == name]))

            # Future 15 days stats. Each label is computed from its own vital
            # sign only; using every row in the window would mix diastolic and
            # heart-rate readings into the systolic percentages (and vice versa).
            future_systolic = future_data.loc[
                future_data['vitalSign'] == 'systolic_blood_pressure', 'value']
            future_diastolic = future_data.loc[
                future_data['vitalSign'] == 'diastolic_blood_pressure', 'value']
            stats['future_%_systolic_above_140'] = (future_systolic > 140).mean() * 100
            stats['future_%_diastolic_above_90'] = (future_diastolic > 90).mean() * 100
            stats['future_%_systolic_above_150'] = (future_systolic > 150).mean() * 100
            stats['future_%_diastolic_above_100'] = (future_diastolic > 100).mean() * 100

            output.append(stats)

        # Advance to the first row at or before the end of this future window
        # (or the last row). Rows before i are all newer than the cutoff, so
        # the scan can start at i instead of 0 with the same result.
        j = i
        while j < n_rows - 1 and timestamps.iloc[j] > future_cutoff:
            j += 1
        i = j


# Convert output to DataFrame
output_df = pd.DataFrame(output)

# Save to CSV
output_df.to_csv('output.csv', index=False)

Processing patient: 6965717038627028992
Data length: 424
Processing patient: 6970466310777995264
Data length: 348
Processing patient: 6965723974827245568
Data length: 1200
Processing patient: 6967925542544736256
Data length: 2064
Processing patient: 6970465837496926208
Data length: 399
Processing patient: 6975585679572992000
Data length: 1287
Processing patient: 6967927629554909184
Data length: 429
Processing patient: 6975586417564975104
Data length: 474
Processing patient: 6970144858853867520
Data length: 222
Processing patient: 6968561481335963648
Data length: 1686
Processing patient: 6968562349888241664
Data length: 723
Processing patient: 6976596037771198464
Data length: 204
Processing patient: 6973772143402156032
Data length: 228
Processing patient: 6978104816941662208
Data length: 390
Processing patient: 6965770322649808896
Data length: 948
Processing patient: 6975584282580353024
Data length: 975
Processing patient: 6970144463138062336
Data length: 489
Processing patient: 6968564