In [4]:
import pandas as pd

file_path = "../data_downloads/aleppo_processed.csv"
data = pd.read_csv(file_path).drop(["Unnamed: 0.2", "Unnamed: 0.1", "Unnamed: 0"], axis=1)

In [8]:
data[data["p_num"] == 140].sort_values("date")

Unnamed: 0,p_num,date,bgl
4536910,140,2015-03-30 16:47:35,372.0
4536909,140,2015-03-30 16:48:04,320.0
4536908,140,2015-03-31 06:19:28,338.0
4538736,140,2015-03-31 20:53:15,161.0
4538735,140,2015-04-01 12:14:52,84.0
...,...,...,...
4478752,140,2015-12-05 13:33:27,105.0
4478751,140,2015-12-05 13:38:27,106.0
4438855,140,2015-12-05 13:43:27,93.0
4438854,140,2015-12-05 13:48:27,90.0


In [12]:
from src.data.preprocessing.sampling import ensure_regular_time_intervals

p = ensure_regular_time_intervals(p_data.rename({"date": "datetime", "bgl": "bg-0:00"}, axis=1))

In [44]:
import plotly.express as px

p_data = data[data["p_num"] == 2]
p_data = p_data.sort_values("date")

sliced = p_data[3000:5000]
fig = px.line(sliced, x='date', y="bgl")
fig.show()

In [54]:
p_data

Unnamed: 0_level_0,p_num,bgl
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-05-22 10:37:40,2,144.0
2015-05-22 10:38:32,2,154.0
2015-05-22 10:41:29,2,143.0
2015-05-22 10:46:29,2,151.0
2015-05-22 10:51:29,2,149.0
...,...,...
2016-01-19 07:33:58,2,81.0
2016-01-19 07:38:58,2,80.0
2016-01-19 07:43:58,2,80.0
2016-01-19 07:48:58,2,79.0


In [60]:
pd.Series(p_data.reset_index()['date'].diff())

0                   NaT
1       0 days 00:00:52
2       0 days 00:02:57
3       0 days 00:05:00
4       0 days 00:05:00
              ...      
68050   0 days 00:05:00
68051   0 days 00:05:00
68052   0 days 00:05:00
68053   0 days 00:05:00
68054   0 days 00:06:20
Name: date, Length: 68055, dtype: timedelta64[ns]

In [67]:
from src.data.datasets.aleppo.clean_data import ensure_datetime_index
p_data = ensure_datetime_index(p_data)
p_data['diff'] = p_data.index.to_series().diff()
p_data.sort_values("diff", ascending=False)

Unnamed: 0_level_0,p_num,bgl,diff
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-09-21 18:55:18,2,59.0,0 days 10:09:57
2015-08-29 21:22:40,2,76.0,0 days 09:45:56
2015-05-29 18:12:00,2,118.0,0 days 09:41:22
2015-07-25 13:41:52,2,154.0,0 days 06:51:42
2015-07-01 23:04:46,2,97.0,0 days 05:12:09
...,...,...,...
2015-09-14 07:56:14,2,127.0,0 days 00:00:01
2015-11-07 21:51:14,2,168.0,0 days 00:00:01
2015-10-22 18:18:12,2,64.0,0 days 00:00:01
2015-11-18 07:44:57,2,178.0,0 days 00:00:00


In [None]:
import polars as pl
from datetime import timedelta

def create_subpatients(df: pd.DataFrame, gap_threshold: timedelta = timedelta(hours=2), 
                       min_sample_size: int = 100, datetime_col: str = "datetime", p_col: str = "p_num"):
    """If large gaps occur for a particular patient, then split them
    into subpatients"""
    pl_df = pl.DataFrame(df).sort(datetime_col, descending=False)
    print(pl_df.schema)
    pl_df = pl_df.with_columns(
        pl.col(datetime_col).diff().over(p_col).alias("diff")
    )
    pl_df = pl_df.with_columns(
        pl.when(pl.col("diff") > gap_threshold)
        .then(1)
        .otherwise(0)
        .cum_sum()
        .over(p_col)
        .alias("group_ids")
    )
    relevant_groups = pl_df.group_by(p_col, "group_ids").agg(pl.len()).filter(pl.col("len") > min_sample_size)
    # pl_df = pl_df.join(group_id_counts, on="group_ids")
    print(pl_df.schema)
    pl_df = (
        pl_df
        .join(relevant_groups, on=["p_num", "group_ids"])
        .with_columns(
            (pl.col("p_num").cast(str) + "_" + pl.col("group_ids").cast(str)).alias("p_num")
        )
        .drop(["len", "group_ids", "diff"])
    )

    return pl_df

In [75]:
data = ensure_datetime_index(data)

In [81]:
data

Unnamed: 0_level_0,p_num,bgl
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-05-16 05:35:41,183,162.0
2015-05-16 05:30:41,183,164.0
2015-05-16 05:25:41,183,168.0
2015-05-16 05:20:41,183,169.0
2015-05-16 05:15:41,183,170.0
...,...,...
2015-09-04 08:47:46,293,210.0
2015-09-04 08:42:46,293,211.0
2015-09-04 08:37:46,293,210.0
2015-09-04 08:32:46,293,207.0


In [125]:
q = create_subpatients(data.reset_index(), datetime_col="date")
q

Schema([('date', Datetime(time_unit='ns', time_zone=None)), ('p_num', Int64), ('bgl', Float64)])
Schema([('date', Datetime(time_unit='ns', time_zone=None)), ('p_num', Int64), ('bgl', Float64), ('diff', Duration(time_unit='ns')), ('group_ids', Int32)])


date,p_num,bgl,diff
datetime[ns],str,f64,duration[ns]
2013-10-04 17:53:22,"""170_13""",52.0,8h 38m 40s
2013-10-04 17:58:22,"""170_13""",54.0,5m
2013-10-04 18:03:22,"""170_13""",57.0,5m
2013-10-04 18:08:22,"""170_13""",60.0,5m
2013-10-04 18:13:22,"""170_13""",62.0,5m
…,…,…,…
2016-04-15 14:17:32,"""240_168""",188.0,5m
2016-04-15 14:22:32,"""240_168""",186.0,5m
2016-04-15 14:27:32,"""240_168""",185.0,5m
2016-04-15 14:32:32,"""240_168""",184.0,5m


In [86]:
create_subpatients(data.reset_index(), datetime_col="date")[["p_num", "date", "group_ids"]]

Schema([('date', Datetime(time_unit='ns', time_zone=None)), ('p_num', Int64), ('bgl', Float64)])


Unnamed: 0,p_num,date,group_ids
0,170,2013-09-26 11:33:42,0
1,170,2013-09-27 08:26:25,1
2,170,2013-09-27 08:26:39,1
3,170,2013-09-27 14:46:23,2
4,170,2013-09-28 06:55:13,3
...,...,...,...
14950656,240,2016-04-15 14:17:32,168
14950657,240,2016-04-15 14:22:32,168
14950658,240,2016-04-15 14:27:32,168
14950659,240,2016-04-15 14:32:32,168


In [69]:
import polars as pl
pl.DataFrame(p_data.reset_index())

date,p_num,bgl,diff
datetime[ns],i64,f64,duration[ns]
2015-05-22 10:37:40,2,144.0,
2015-05-22 10:38:32,2,154.0,52s
2015-05-22 10:41:29,2,143.0,2m 57s
2015-05-22 10:46:29,2,151.0,5m
2015-05-22 10:51:29,2,149.0,5m
…,…,…,…
2016-01-19 07:33:58,2,81.0,5m
2016-01-19 07:38:58,2,80.0,5m
2016-01-19 07:43:58,2,80.0,5m
2016-01-19 07:48:58,2,79.0,5m
