# Forecast of creatinine values

## Requirements
* An AI model is delivered
* The model takes a series of creatinine measures (with the dates) and a time intervall as an input and outputs the extended curve (as a numpy array sampled every two weeks) over the time intervall (starting at the last given measurement)

## Outline of the project
1. Data processing :
    * Querying the creatinine measurement history of the patients where there have been made at least 10 such measurements
    * Interpolating the points to get a numpy array with the expected creatinine values every 2 weeks
2. Training the model
3. Testing the model


In [None]:
from Data_tools import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

min_total_measurements = 20
cutoff_duration = 90 #Days
min_measurements_before_t = 14
n = 1000000

query = f"""
WITH last_measurements AS (
    SELECT subject_id, MAX(CAST(storetime AS DATE)) AS last_measurement_date
    FROM hosp.labevents
    WHERE itemid = '50912'
    GROUP BY subject_id
),
eligible_patients AS (
    SELECT l.subject_id
    FROM hosp.labevents l
    JOIN last_measurements lm ON l.subject_id = lm.subject_id
    WHERE l.itemid = '50912'
    GROUP BY l.subject_id, lm.last_measurement_date
    HAVING COUNT(*) > {min_total_measurements}
       AND COUNT(*) FILTER (WHERE CAST(l.storetime AS DATE) >= lm.last_measurement_date - INTERVAL '{cutoff_duration} DAY') > {min_measurements_before_t}
)
SELECT 
    p.subject_id AS patient_id,
    l.storetime AS measurement_time,
    p.anchor_age + EXTRACT(YEAR FROM CAST(l.storetime AS DATE)) - p.anchor_year AS patient_age,
    MAX(l.valuenum) FILTER (WHERE dl.label = 'Creatinine') AS creatinine
FROM hosp.labevents l
JOIN hosp.d_labitems dl ON l.itemid = dl.itemid
JOIN eligible_patients ep ON l.subject_id = ep.subject_id
JOIN (
    SELECT 
        di.subject_id, 
        p.anchor_age,
        p.anchor_year,
        a.admittime,
        di.icd_code 
    FROM hosp.diagnoses_icd di 
    JOIN hosp.d_icd_diagnoses did ON di.icd_code = did.icd_code 
    JOIN hosp.patients p ON di.subject_id = p.subject_id 
    JOIN hosp.admissions a ON di.hadm_id = a.hadm_id 
) AS p ON l.subject_id = p.subject_id 
WHERE l.itemid = '50912'
GROUP BY p.subject_id, l.storetime, p.anchor_age, p.anchor_year
LIMIT {n};
"""
df = execute_query(query, **params) 
df


Unnamed: 0,patient_id,measurement_time,patient_age,creatinine
0,10000935,2182-05-10 20:26:00,52.0,0.7
1,10000935,2182-08-25 17:55:00,52.0,0.8
2,10000935,2183-03-27 12:27:00,53.0,0.7
3,10000935,2183-10-28 05:47:00,53.0,0.8
4,10000935,2183-10-29 08:45:00,53.0,0.7
...,...,...,...,...
995,10011427,2136-03-21 08:20:00,70.0,4.9
996,10011427,2136-03-21 16:49:00,70.0,4.6
997,10011427,2136-03-22 03:10:00,70.0,4.7
998,10011427,2136-03-22 15:32:00,70.0,1.9


In [3]:
save_df = df.copy()

In [4]:
df = save_df.copy()

In [5]:
# Convert measurement_time to datetime
df["measurement_time"] = pd.to_datetime(df["measurement_time"])

# Function to interpolate creatinine values for each patient
def interpolate_creatinine(patient_df):
    patient_df = patient_df.set_index("measurement_time").sort_index()
    
    # Create a complete time range with 14-day intervals
    full_time_range = pd.date_range(start=patient_df.index.min(),
                                    end=patient_df.index.max(),
                                    freq="14D")

    # Reindex to get a continuous timeline
    interpolated_df = patient_df.reindex(full_time_range)

    # Interpolate missing creatinine values
    interpolated_df["creatinine"] = interpolated_df["creatinine"].interpolate(method="linear")

    # Fill patient metadata
    interpolated_df["patient_id"] = patient_df["patient_id"].iloc[0]
    interpolated_df["patient_age"] = patient_df["patient_age"].iloc[0]

    # Reset index to restore measurement_time as a column
    interpolated_df = interpolated_df.reset_index().rename(columns={"index": "measurement_time"})

    # Normalize time: subtract the first measurement time
    interpolated_df["days_since_first_measurement"] = (interpolated_df["measurement_time"] - interpolated_df["measurement_time"].min()).dt.days

    return interpolated_df

# Apply interpolation
df_interpolated = df.groupby("patient_id", group_keys=False, as_index=False).apply(interpolate_creatinine)

# Reset index for final clean DataFrame
df_interpolated = df_interpolated.reset_index(drop=True)

# Drop the original measurement_time column if not needed
df_interpolated = df_interpolated.drop(columns=["measurement_time"])

# Display 
df_interpolated


  df_interpolated = df.groupby("patient_id", group_keys=False, as_index=False).apply(interpolate_creatinine)


Unnamed: 0,patient_id,patient_age,creatinine,days_since_first_measurement
0,10000935,52.0,0.7,0
1,10000935,52.0,0.7,14
2,10000935,52.0,0.7,28
3,10000935,52.0,0.7,42
4,10000935,52.0,0.7,56
...,...,...,...,...
1606,10011427,69.0,1.4,56
1607,10011427,69.0,1.4,70
1608,10011427,69.0,1.4,84
1609,10011427,69.0,1.4,98


In [6]:

# Pivot so that each column is a patient’s creatinine series
creatinine_df = df_interpolated.pivot(index='days_since_first_measurement', columns='patient_id', values='creatinine')
creatinine_df[:min_total_measurements]

patient_id,10000935,10001884,10002013,10003299,10003400,10004401,10005817,10005866,10006029,10007818,10008924,10010440,10011365,10011427
days_since_first_measurement,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,2.4,2.3,0.9,0.7,1.4
14,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,2.4,2.3,0.9,0.7,1.4
28,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,2.4,2.3,0.9,0.7,1.4
42,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,2.4,2.3,0.9,0.7,1.4
56,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,2.4,2.3,0.9,0.7,1.4
70,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,,2.3,0.9,0.7,1.4
84,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,,2.3,0.9,0.7,1.4
98,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,,2.3,0.9,0.7,1.4
112,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,,2.3,0.9,0.7,1.4
126,0.7,0.8,0.9,0.9,1.0,1.6,1.0,1.0,1.6,,2.3,,0.7,


In [21]:
# Assuming your DataFrame is named df
df_filtered = creatinine_df.dropna(axis=1, thresh=min_total_measurements)
df_filtered

patient_id
10000935    0.7
10001884    0.8
10002013    0.9
10003299    0.9
10003400    1.0
10004401    1.6
10005817    1.0
10005866    1.0
10006029    1.6
10011365    0.7
Name: 700, dtype: float64

In [23]:
# Index: days_since_first_measurement (e.g., 0, 14, 28, ...)
# Columns: patient IDs (e.g., '10000935', '10001884', etc.)
# Values: creatinine levels

# For reproducibility, set the random seed
np.random.seed(123)

# Get the list of patient IDs from the DataFrame columns
patient_ids = df_filtered.columns.tolist()

# Determine the number of patients to use for training (70% of total)
n_train = int(0.7 * len(patient_ids))

# Randomly select 70% of patient IDs for training
train_ids = np.random.choice(patient_ids, n_train, replace=False)

# The remaining patient IDs will be used for testing
test_ids = [pid for pid in patient_ids if pid not in train_ids]

# Create training and testing DataFrames (all share the same index)
data_train = df_filtered[train_ids]
data_test  = df_filtered[test_ids]

# Print shapes for verification
print("Training Data Shape:", data_train.shape)
print("Testing Data Shape:", data_test.shape)

# Initialize the forecaster with your chosen regressor and number of lags.
forecaster = ForecasterRecursiveMultiSeries(
    regressor=Ridge(random_state=123),  # You can try other regressors too.
    lags=14                           # Use the past 14 time steps as predictors.
)

# Fit the forecaster on the training data (all patients at once).
forecaster.fit(series=data_train)


Training Data Shape: (324, 7)
Testing Data Shape: (324, 3)




In [32]:
# Number of time steps to forecast
steps = 6  

# Choose a valid patient ID from df_filtered; for example, 10000935.
patient_id = 10000935  # or "10000935" if your columns are strings

# Extract the last window as a DataFrame using double brackets and using the length of the lags array.
last_window = df_filtered[[patient_id]].iloc[-len(forecaster.lags):]

# Forecast for the selected patient by passing last_window explicitly.
predictions = forecaster.predict(steps=steps, levels=[patient_id], last_window=last_window)
print("Predictions for patient", patient_id, ":")
print(predictions)




ValueError: Input X contains NaN.
Ridge does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:

# Backtesting example:
# Define the initial training size (e.g., 70% of the total time steps in your DataFrame)
initial_train_size = int(0.7 * len(creatinine_df))


metrics, backtest_predictions = backtesting_forecaster_multiseries(
    forecaster            = forecaster,
    series                = creatinine_df,         # Wide DataFrame with patient IDs as columns
    steps                 = steps,
    metric                = 'mean_absolute_error',
    initial_train_size    = initial_train_size,
    fixed_train_size      = True,
    gap                   = 0,
    allow_incomplete_fold = True,
    refit                 = True,
    verbose               = False
)

print("Backtesting metrics:")
print(metrics)


In [27]:
# Number of time steps to forecast
steps = 6  

# Forecasting for a single patient (using a list for levels)
predictions = forecaster.predict(steps=steps, levels=['patient_1'])
print("Predictions for patient_1:")
print(predictions)

# Define the initial training size (e.g., 70% of the total time steps in your DataFrame)
initial_train_size = int(0.7 * len(creatinine_df))

# Perform backtesting on all patients
metrics, backtest_predictions = backtesting_forecaster_multiseries(
    forecaster            = forecaster,
    series                = creatinine_df,         # Wide DataFrame with patient ids as columns
    steps                 = steps,
    metric                = 'mean_absolute_error',
    initial_train_size    = initial_train_size,
    fixed_train_size      = True,
    gap                   = 0,
    allow_incomplete_fold = True,
    refit                 = True,
    verbose               = False
)

print("Backtesting metrics:")
print(metrics)


ValueError: No series to predict. None of the series {'patient_1'} are present in `last_window_` attribute. Provide `last_window` as argument in predict method.