# Sprint 1: Foundational Setup

## Task 1 (CIR-8): Load and Validate all Datasets

In [1]:
import pandas as pd
import numpy as np
import os
import logging

# Plots
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Set up logging
# Messages go both to a log file and to the notebook output stream.
# logging.FileHandler opens its target as soon as it is constructed and raises
# FileNotFoundError when the parent folder is missing, so create the folder first.
LOG_FILE = "logs/CIR-8.log"
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(LOG_FILE),
        logging.StreamHandler()
    ]
)

In [3]:
# CSVs Directory 
data_path = "../04_ANN/CSV/exports/split_set/without_multiple_rows"
# os.listdir returns entries in arbitrary order; sorting makes the load order
# (and therefore every later log, summary and plot order) reproducible.
all_files = sorted(os.listdir(data_path))

logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Start Loading Dataframes.")

# Load CSVs into a dictionary of dataframes, keyed by a Python-identifier-friendly
# version of the file name (extension removed, "-" replaced by "_").
dataframes = {}
for file in all_files:
    if file.endswith(".csv"):
        # splitext removes only the trailing extension; a plain
        # str.replace(".csv", "") would also strip ".csv" from the middle of a name.
        var_name = os.path.splitext(file)[0].replace("-", "_")
        logging.info(f"Loading... -> {file}")
        # Every column is cast to float32 right after parsing.
        dataframes[var_name] = pd.read_csv(os.path.join(data_path, file)).astype('float32')

# Log loaded datasets
for var_name, df in dataframes.items():
    # Each frame is also published as a notebook-level variable of the same name.
    # Prefer dataframes[var_name] in new code; the global is kept for cells that
    # already reference names such as o1_X_train directly.
    globals()[var_name] = df
    logging.info(f"{var_name} loaded successfully with shape {df.shape}")
logging.info("Load Complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-03-23 15:55:23,866 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-03-23 15:55:23,868 - INFO - Start Loading Dataframes.
2025-03-23 15:55:23,869 - INFO - Loading... -> o1_X_external.csv
2025-03-23 15:55:30,749 - INFO - Loading... -> o1_X_test.csv
2025-03-23 15:55:31,275 - INFO - Loading... -> o1_X_train.csv
2025-03-23 15:55:35,267 - INFO - Loading... -> o1_X_validate.csv
2025-03-23 15:55:35,779 - INFO - Loading... -> o1_y_external_los.csv
2025-03-23 15:55:35,832 - INFO - Loading... -> o1_y_external_mortality.csv
2025-03-23 15:55:35,868 - INFO - Loading... -> o1_y_test_los.csv
2025-03-23 15:55:35,883 - INFO - Loading... -> o1_y_test_mortality.csv
2025-03-23 15:55:35,895 - INFO - Loading... -> o1_y_train_los.csv
2025-03-23 15:55:35,945 - INFO - Loading... -> o1_y_train_mortality.csv
2025-03-23 15:55:35,964 - INFO - Loading... -> o1_y_validate_los.csv
2025-03-23 15:55:35,987 - INFO - Loading... -> o1_y_validate_mortality.csv
2025-03-23 15:55:35,999 - INFO - Loading... -> o2_

In [4]:
# Log, for every loaded dataset, how many columns it has of each dtype.
banner = "++++++++++++++++++++++++++++++++++++++++++"
logging.info(banner)
logging.info("Check Datatypes...")
for dataset_name in dataframes:
    dtype_counts = dataframes[dataset_name].dtypes.value_counts()
    logging.info(f"{dataset_name} types:\n{dtype_counts}")
logging.info("Check Complete.")
logging.info(banner)

2025-03-23 15:55:49,666 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-03-23 15:55:49,668 - INFO - Check Datatypes...
2025-03-23 15:55:49,675 - INFO - o1_X_external types:
float32    345
Name: count, dtype: int64
2025-03-23 15:55:49,678 - INFO - o1_X_test types:
float32    345
Name: count, dtype: int64
2025-03-23 15:55:49,679 - INFO - o1_X_train types:
float32    345
Name: count, dtype: int64
2025-03-23 15:55:49,681 - INFO - o1_X_validate types:
float32    345
Name: count, dtype: int64
2025-03-23 15:55:49,683 - INFO - o1_y_external_los types:
float32    1
Name: count, dtype: int64
2025-03-23 15:55:49,684 - INFO - o1_y_external_mortality types:
float32    1
Name: count, dtype: int64
2025-03-23 15:55:49,686 - INFO - o1_y_test_los types:
float32    1
Name: count, dtype: int64
2025-03-23 15:55:49,688 - INFO - o1_y_test_mortality types:
float32    1
Name: count, dtype: int64
2025-03-23 15:55:49,690 - INFO - o1_y_train_los types:
float32    1
Name: count, dtype: int64
2025-03-23 15

In [5]:
# Log the number of missing cells per dataset and how many columns contain them.
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Calculate missing values...")
for name, df in dataframes.items():
    # Build the boolean null mask once and derive both figures from it.
    null_mask = df.isnull()
    missing_total = null_mask.sum().sum()
    missing_cols = list(df.columns[null_mask.any()])
    n_affected_cols = len(missing_cols)
    logging.info(f"{name}: {missing_total} missing values across {n_affected_cols} columns")
logging.info("Missing values calculation complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-03-23 15:55:49,771 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-03-23 15:55:49,772 - INFO - Calculate missing values...
2025-03-23 15:55:49,948 - INFO - o1_X_external: 39527616 missing values across 308 columns
2025-03-23 15:55:49,963 - INFO - o1_X_test: 1985664 missing values across 292 columns
2025-03-23 15:55:50,053 - INFO - o1_X_train: 15876288 missing values across 304 columns
2025-03-23 15:55:50,067 - INFO - o1_X_validate: 2007696 missing values across 296 columns
2025-03-23 15:55:50,070 - INFO - o1_y_external_los: 0 missing values across 0 columns
2025-03-23 15:55:50,072 - INFO - o1_y_external_mortality: 0 missing values across 0 columns
2025-03-23 15:55:50,075 - INFO - o1_y_test_los: 0 missing values across 0 columns
2025-03-23 15:55:50,077 - INFO - o1_y_test_mortality: 0 missing values across 0 columns
2025-03-23 15:55:50,079 - INFO - o1_y_train_los: 0 missing values across 0 columns
2025-03-23 15:55:50,081 - INFO - o1_y_train_mortality: 0 missing values acros

In [6]:
# Log the number of distinct values per column for every dataset whose name
# contains 'y_' (the target frames).
logging.info("++++++++++++++++++++++++++++++++++++++++++")
logging.info("Calculate unique values.")
target_frames = {key: frame for key, frame in dataframes.items() if 'y_' in key}
for name, df in target_frames.items():
    distinct_per_column = df.nunique().to_dict()
    logging.info(f"{name} unique target values: {distinct_per_column}")

logging.info("Unique values calculation complete.")
logging.info("++++++++++++++++++++++++++++++++++++++++++")

2025-03-23 15:55:50,480 - INFO - ++++++++++++++++++++++++++++++++++++++++++
2025-03-23 15:55:50,481 - INFO - Calculate unique values.
2025-03-23 15:55:50,491 - INFO - o1_y_external_los unique target values: {'los': 830}
2025-03-23 15:55:50,495 - INFO - o1_y_external_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 15:55:50,497 - INFO - o1_y_test_los unique target values: {'los': 319}
2025-03-23 15:55:50,498 - INFO - o1_y_test_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 15:55:50,502 - INFO - o1_y_train_los unique target values: {'los': 2548}
2025-03-23 15:55:50,504 - INFO - o1_y_train_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 15:55:50,506 - INFO - o1_y_validate_los unique target values: {'los': 319}
2025-03-23 15:55:50,507 - INFO - o1_y_validate_mortality unique target values: {'hospital_expire_flag': 2}
2025-03-23 15:55:50,511 - INFO - o2_y_external_los unique target values: {'los': 830}
2025-03-23 15:55:50,5

In [7]:
# Build a one-row-per-dataset overview (shape, missing cells, affected columns,
# missing percentage) and export it as CSV.
SUMMARY_CSV = "CSV/exports/01_dataset_summary.csv"

summary = []
for name, df in dataframes.items():
    # One null mask per frame; both counts below are derived from it.
    null_mask = df.isnull()
    missing_values = null_mask.sum().sum()
    missing_cols = null_mask.any().sum()
    total_cells = df.shape[0] * df.shape[1]
    summary.append({
        'Dataset': name,
        'Shape': df.shape,
        'Total Missing Values': missing_values,
        # NOTE: the key is misspelled ("Accross") but it is the exported CSV
        # header, so it is kept unchanged for anything that reads the file.
        'Accross Missing Columns': missing_cols,
        # A frame without cells has no meaningful percentage; report NaN
        # instead of dividing by zero.
        'Missing %': 100 * missing_values / total_cells if total_cells else float('nan')
    })

summary_df = pd.DataFrame(summary)
# Save to file (to_csv does not create missing parent folders)
os.makedirs(os.path.dirname(SUMMARY_CSV), exist_ok=True)
summary_df.to_csv(SUMMARY_CSV, index=False)

In [8]:
# For each training feature set, count the rows in which at most `percent` % of
# the cells are missing, and report that count as a share of all rows.
percent = 7

# Dictionary of datasets for iteration.
# The frames are taken from `dataframes` explicitly rather than from the
# same-named notebook globals, so this cell does not depend on the globals()
# injection performed by the loading cell.
train_set_names = ["o1_X_train", "o2_X_train", "o3_X_train", "o4_X_train"]
datasets = {set_name: dataframes[set_name] for set_name in train_set_names}

# Loop through each dataset
for name, df in datasets.items():
    # Share of missing cells in each row, in percent.
    missing_percentage_per_row = df.isnull().mean(axis=1) * 100
    # Despite the name, this is the number of rows at or below the threshold.
    missing_rows = (missing_percentage_per_row <= percent).sum()
    total_rows, total_columns = df.shape
    # Guard against an empty frame instead of dividing by zero.
    percent_between = (missing_rows * 100) / total_rows if total_rows else float('nan')

    print(f"\nDataset: {name}")
    print(f"Total Rows: {total_rows}, Total Columns: {total_columns}")
    print(f"Number of rows with missing values up to {percent}%: {missing_rows}")
    print(f"The percentage between total and missing values sets is {percent_between:.2f}%")


Dataset: o1_X_train
Total Rows: 122496, Total Columns: 345
Number of rows with missing values up to 7%: 48
The percentage between total and missing values sets is 0.04%

Dataset: o2_X_train
Total Rows: 61248, Total Columns: 345
Number of rows with missing values up to 7%: 24
The percentage between total and missing values sets is 0.04%

Dataset: o3_X_train
Total Rows: 40832, Total Columns: 345
Number of rows with missing values up to 7%: 16
The percentage between total and missing values sets is 0.04%

Dataset: o4_X_train
Total Rows: 30624, Total Columns: 345
Number of rows with missing values up to 7%: 12
The percentage between total and missing values sets is 0.04%


## Task 2 (CIR-9): Analyze Missingness (per row/column)

In [9]:
# Per-dataset missingness statistics (overall, per column, per row), exported as
# CSV and previewed sorted by overall missing percentage.
MISSINGNESS_CSV = "CSV/exports/02_missingness_summary.csv"

missingness_summary = []

for name, df in dataframes.items():
    if not isinstance(df, pd.DataFrame):
        continue

    # Compute the null mask once; on the larger frames recomputing it for every
    # statistic is the dominant cost of this cell.
    null_mask = df.isnull()

    total_cells = df.shape[0] * df.shape[1]
    total_missing = null_mask.sum().sum()

    col_missing = null_mask.mean() * 100        # % missing per column
    row_missing = null_mask.mean(axis=1) * 100  # % missing per row

    # Appended directly so the `summary` list built by the dataset-summary cell
    # is not rebound to a dict here.
    missingness_summary.append({
        'Dataset': name,
        'Shape': df.shape,
        'Total Missing Cells': total_missing,
        'Total Missing %': round(100 * total_missing / total_cells, 2),
        # Despite the "(%)" in the label this is a count of columns.
        'Columns with Missing (%)': (col_missing > 0).sum(),
        'Max % Missing in Row': round(row_missing.max(), 2),
        'Mean % Missing in Row': round(row_missing.mean(), 2),
        'Min % Missing in Row': round(row_missing.min(), 2)
    })

# Create DataFrame for summary, sorted by most missing.
# sort_values returns a new frame that keeps the original row labels.
missingness_df = pd.DataFrame(missingness_summary).sort_values(
    "Total Missing %", ascending=False
)

# Save to file (to_csv does not create missing parent folders)
os.makedirs(os.path.dirname(MISSINGNESS_CSV), exist_ok=True)
missingness_df.to_csv(MISSINGNESS_CSV, index=False)

# Show the 16 rows with the highest missing percentage. Nothing is filtered out
# here: in the logged run the y_ (label) frames report 0 missing values, so they
# sort below the X frames.
display(missingness_df.head(16))

Unnamed: 0,Dataset,Shape,Total Missing Cells,Total Missing %,Columns with Missing (%),Max % Missing in Row,Mean % Missing in Row,Min % Missing in Row
0,o1_X_external,"(234720, 345)",39527616,48.81,308,89.28,48.81,13.91
12,o2_X_external,"(117360, 345)",19763572,48.81,308,89.28,48.81,13.91
36,o4_X_external,"(58680, 345)",9881900,48.81,308,89.28,48.81,13.91
24,o3_X_external,"(78240, 345)",13175808,48.81,308,89.28,48.81,13.91
27,o3_X_validate,"(5104, 345)",669232,38.01,296,78.84,38.01,9.28
3,o1_X_validate,"(15312, 345)",2007696,38.01,296,78.84,38.01,9.28
15,o2_X_validate,"(7656, 345)",1003848,38.01,296,78.84,38.01,9.28
39,o4_X_validate,"(3828, 345)",501924,38.01,296,78.84,38.01,9.28
25,o3_X_test,"(5104, 345)",661888,37.59,292,80.0,37.59,10.43
1,o1_X_test,"(15312, 345)",1985664,37.59,292,80.0,37.59,10.43


## Task 3 (CIR-10): Visualize Missingness

In [10]:
# Create output folder
# Figures saved by the plotting loop further down are written below this
# directory; exist_ok=True keeps the cell safe to re-run when it already exists.
output_dir = "figures/task3_missingness"
os.makedirs(output_dir, exist_ok=True)

# Functions for plotting
def plot_missing_heatmap(
    df, title, max_rows=200, save_path=None, suffix_filter='(Min)', cmap='Blues'
):
    """Draw a heatmap of missing cells for the first rows of selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; column labels must be strings.
    title : str
        Text appended to the figure title (typically the dataset name).
    max_rows : int, default 200
        Upper bound on the number of leading rows that are drawn.
    save_path : str or None, default None
        When given, the figure is written to this path at 300 dpi.
    suffix_filter : str, default '(Min)'
        Only columns whose name ends with this suffix are drawn.
    cmap : str, default 'Blues'
        Colormap passed to seaborn.

    Returns
    -------
    None
        The figure is closed before returning. When no column matches the suffix,
        or there are no rows to draw, a warning is logged and nothing is plotted.
    """
    filtered_cols = [col for col in df.columns if col.endswith(suffix_filter)]
    df_filtered = df[filtered_cols].head(max_rows)

    # seaborn cannot draw a heatmap of a zero-sized frame; skip instead of crashing
    # (this is also what label frames without any matching column run into).
    if df_filtered.empty:
        logging.warning(
            f"No data to plot for '{title}' with suffix '{suffix_filter}'; heatmap skipped."
        )
        return

    # Report the number of rows actually drawn, which is smaller than max_rows
    # for short frames.
    n_rows = len(df_filtered)

    fig = plt.figure(figsize=(16, 6))
    sns.heatmap(df_filtered.isnull(), cbar=False, cmap=cmap, yticklabels=False,
                linecolor='lightgrey', linewidths=0.001)

    plt.title(f"Missing Data Heatmap – First {n_rows} Rows\n({suffix_filter}) – {title}",
              fontsize=14, weight='bold')
    plt.xlabel("Features", fontsize=12)
    plt.ylabel("Rows", fontsize=12)
    plt.xticks(fontsize=8, rotation=90)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    # Close this figure explicitly so repeated calls in a loop do not accumulate
    # open figures.
    plt.close(fig)


def plot_column_missing_bar(
    df, title, save_path=None, top_n=40, suffix_filter='(Min)', color='#3E8ED0'
):
    """Draw a horizontal bar chart of the columns with the most missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; column labels must be strings.
    title : str
        Text appended to the figure title (typically the dataset name).
    save_path : str or None, default None
        When given, the figure is written to this path at 300 dpi.
    top_n : int, default 40
        Maximum number of columns shown, taken from the highest missing share down.
    suffix_filter : str, default '(Min)'
        Only columns whose name ends with this suffix are considered.
    color : str, default '#3E8ED0'
        Bar colour.

    Returns
    -------
    None
        The figure is closed before returning. When no considered column has any
        missing value, a warning is logged and nothing is plotted.
    """
    filtered_cols = [col for col in df.columns if col.endswith(suffix_filter)]
    if not filtered_cols:
        logging.warning(
            f"No column of '{title}' ends with '{suffix_filter}'; bar plot skipped."
        )
        return

    missing_perc = df[filtered_cols].isnull().mean() * 100
    missing_perc = missing_perc[missing_perc > 0].sort_values(ascending=False).head(top_n)

    # The figure height is proportional to the number of bars, so an empty
    # selection would request a zero-height figure; skip it instead.
    if missing_perc.empty:
        logging.warning(
            f"No missing values in the '{suffix_filter}' columns of '{title}'; bar plot skipped."
        )
        return

    fig = plt.figure(figsize=(10, 0.4 * len(missing_perc)))
    missing_perc.plot(kind='barh', color=color)

    plt.title(f"Top {len(missing_perc)} Features with Missing Values\n({suffix_filter}) – {title}",
              fontsize=14, weight='bold')
    plt.xlabel("Percentage Missing", fontsize=12)
    plt.ylabel("Feature", fontsize=12)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=8)
    plt.grid(axis='x', linestyle='--', linewidth=0.5)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    # Close this figure explicitly so repeated calls in a loop do not accumulate
    # open figures.
    plt.close(fig)


# Filter only X datasets from the summary.
# Dataset names carry a prefix (e.g. "o1_y_train_los"), so the label marker sits in
# the middle of the name and a startswith('y_') test never matches. Match "y_" at
# the start of the name or right after an underscore instead.
is_label_set = missingness_df['Dataset'].str.contains(r'(?:^|_)y_', regex=True)
is_loaded = missingness_df['Dataset'].isin(dataframes.keys())
top_datasets = missingness_df.loc[~is_label_set & is_loaded, 'Dataset'].head(16).tolist()

# Generate plots: one heatmap and one bar plot per selected dataset, saved under
# output_dir as "<dataset>_heatmap.png" and "<dataset>_barplot.png".
for dataset_name in top_datasets:
    df = dataframes.get(dataset_name)
    if df is not None:
        print(f"Generating plots for: {dataset_name}")
        heatmap_path = os.path.join(output_dir, f"{dataset_name}_heatmap.png")
        barplot_path = os.path.join(output_dir, f"{dataset_name}_barplot.png")
        
        plot_missing_heatmap(df, dataset_name, save_path=heatmap_path)
        plot_column_missing_bar(df, dataset_name, save_path=barplot_path)
    else:
        print(f"Dataset not found in dataframes: {dataset_name}")

Generating plots for: o1_X_external
Generating plots for: o2_X_external
Generating plots for: o4_X_external
Generating plots for: o3_X_external
Generating plots for: o3_X_validate
Generating plots for: o1_X_validate
Generating plots for: o2_X_validate
Generating plots for: o4_X_validate
Generating plots for: o3_X_test
Generating plots for: o1_X_test
Generating plots for: o2_X_test
Generating plots for: o4_X_test
Generating plots for: o3_X_train
Generating plots for: o1_X_train
Generating plots for: o2_X_train
Generating plots for: o4_X_train


## Task 4 (CIR-11): Define Clinical Value Ranges

In [11]:
# Collect, for every feature column with the chosen suffix, the smallest and
# largest value observed across all loaded feature datasets.

# Step 1: Choose suffix
SUFFIX = '(Min)'  # or '(Mean)', etc.

# Step 2: Collect global min/max per feature
global_min_max = {}

for name, df in dataframes.items():
    # Skip target datasets. Names carry a prefix (e.g. "o1_y_train_los"), so the
    # marker has to be looked for inside the name as well as at its start.
    if name.startswith('y_') or '_y_' in name:
        continue
    filtered_cols = [col for col in df.columns if col.endswith(SUFFIX)]
    filtered_df = df[filtered_cols]
    
    # For each feature, collect min and max
    for col in filtered_cols:
        # These are NaN when the column has no value at all in this dataset.
        col_min = filtered_df[col].min(skipna=True)
        col_max = filtered_df[col].max(skipna=True)
        
        if col not in global_min_max:
            global_min_max[col] = {'min': col_min, 'max': col_max}
        else:
            # np.fmin / np.fmax ignore NaN when the other operand is a number.
            # The builtin min()/max() keep whichever operand comes first when a
            # NaN is involved, so one all-NaN column could hide real values
            # depending on dataset order.
            global_min_max[col]['min'] = np.fmin(global_min_max[col]['min'], col_min)
            global_min_max[col]['max'] = np.fmax(global_min_max[col]['max'], col_max)

# Step 3: Convert to DataFrame (one row per feature, sorted by feature name)
clinical_ranges_df = pd.DataFrame.from_dict(global_min_max, orient='index')
clinical_ranges_df = clinical_ranges_df.rename(columns={'min': 'Observed Min', 'max': 'Observed Max'})
clinical_ranges_df = clinical_ranges_df.sort_index()

# Save it to a CSV file
#clinical_ranges_df.to_csv("CSV/exports/03_observed_clinical_ranges.csv")

# Show a preview
clinical_ranges_df.head(10)

Unnamed: 0,Observed Min,Observed Max
Alanine_Aminotransferase_(ALT)_(Min),1.0,7510.0
Albumin_(Min),1.0,5.1
Alkaline_Phosphatase_(Min),8.0,1448.0
Ammonia_(Min),2.2,130.0
Amylase_(Min),7.0,4867.0
Anion_Gap_(Min),-3.0,49.0
Arterial_Blood_Pressure_diastolic_(mmHg)_(Min),-2.0,355.0
Arterial_Blood_Pressure_mean_(mmHg)_(Min),-37.0,363.0
Arterial_Blood_Pressure_systolic_(mmHg)_(Min),0.0,365.0
Asparate_Aminotransferase_(AST)_(Min),0.0,12652.0


In [21]:
# Load the ICU item dictionary (d_items.csv.gz). Despite its name, the variable
# holds d_items, not lab events.
# Forward slashes match the other paths in this notebook and resolve on every OS;
# the backslash-separated form only works on Windows.
labevents_df = pd.read_csv("../00_Datasets/mimic-iv-3_1/icu/d_items.csv.gz")

In [22]:
# Render the item dictionary frame held in labevents_df.
display(labevents_df)

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,220001,Problem List,Problem List,chartevents,General,,Text,,
1,220003,ICU Admission date,ICU Admission date,datetimeevents,ADT,,Date and time,,
2,220045,Heart Rate,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
3,220046,Heart rate Alarm - High,HR Alarm - High,chartevents,Alarms,bpm,Numeric,,
4,220047,Heart Rate Alarm - Low,HR Alarm - Low,chartevents,Alarms,bpm,Numeric,,
...,...,...,...,...,...,...,...,...,...
4090,230172,Patient Reversed,Patient Reversed,procedureevents,3-Significant Events,,Processes,,
4091,230173,Patient - Fast Track Protocol,Patient - Fast Track Protocol,procedureevents,3-Significant Events,,Processes,,
4092,230174,Nerve block in OR,Nerve block in OR,procedureevents,3-Significant Events,,Processes,,
4093,230176,IUC Stabilization Device,IUC Stabilization Device,chartevents,GI/GU,,Checkbox,,


In [25]:

# Select the item definitions whose label begins with
# "Arterial Blood Pressure mean"; rows with a missing label count as non-matching.
label_matches = labevents_df['label'].str.startswith('Arterial Blood Pressure mean', na=False)
abp_df = labevents_df.loc[label_matches]

# Show the matching item definitions
display(abp_df)


Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
8,220052,Arterial Blood Pressure mean,ABPm,chartevents,Routine Vital Signs,mmHg,Numeric,,


In [26]:
# Load the chart-event export from the 01_MimicIV folder, with all of its columns.
temp_df = pd.read_csv(r"../01_MimicIV/CSV/Exports/o04_icu_chartevent.csv")

In [27]:
# Render the chart-event export frame.
display(temp_df)

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,seq_num,icd_code,icd_version,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
0,10004733,27411876,39635619,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2174-12-04 11:28:24,2174-12-12 20:03:01,8.357373,2,43491,9,8991.0,2174-12-11 20:00:00,2174-12-11 21:48:00,220045,81,81.000,bpm,0.0
1,10004733,27411876,39635619,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2174-12-04 11:28:24,2174-12-12 20:03:01,8.357373,2,43491,9,8991.0,2174-12-11 20:00:00,2174-12-11 21:48:00,220179,159,159.000,mmHg,0.0
2,10004733,27411876,39635619,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2174-12-04 11:28:24,2174-12-12 20:03:01,8.357373,2,43491,9,8991.0,2174-12-11 20:00:00,2174-12-11 21:48:00,220180,99,99.000,mmHg,0.0
3,10004733,27411876,39635619,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2174-12-04 11:28:24,2174-12-12 20:03:01,8.357373,2,43491,9,8991.0,2174-12-11 20:00:00,2174-12-11 21:48:00,220181,112,112.000,mmHg,0.0
4,10004733,27411876,39635619,Medical/Surgical Intensive Care Unit (MICU/SICU),Medical/Surgical Intensive Care Unit (MICU/SICU),2174-12-04 11:28:24,2174-12-12 20:03:01,8.357373,2,43491,9,8991.0,2174-12-11 20:00:00,2174-12-11 21:48:00,220210,19,19.000,insp/min,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8530606,19999987,23865745,36195440,Trauma SICU (TSICU),Trauma SICU (TSICU),2145-11-02 22:59:00,2145-11-04 21:29:30,1.937847,1,431,9,,2145-11-04 10:40:00,2145-11-04 11:28:00,225642,7.3,7.300,%,0.0
8530607,19999987,23865745,36195440,Trauma SICU (TSICU),Trauma SICU (TSICU),2145-11-02 22:59:00,2145-11-04 21:29:30,1.937847,1,431,9,,2145-11-04 10:40:00,2145-11-04 11:28:00,225643,68.6,68.600,%,0.0
8530608,19999987,23865745,36195440,Trauma SICU (TSICU),Trauma SICU (TSICU),2145-11-02 22:59:00,2145-11-04 21:29:30,1.937847,1,431,9,,2145-11-04 10:40:00,2145-11-04 11:28:00,227457,120,120.000,K/uL,1.0
8530609,19999987,23865745,36195440,Trauma SICU (TSICU),Trauma SICU (TSICU),2145-11-02 22:59:00,2145-11-04 21:29:30,1.937847,1,431,9,,2145-11-04 10:40:00,2145-11-04 11:30:00,220734,5.5,5.500,units,0.0


In [33]:
# Keep only the item id, numeric value and unit columns.
# NOTE(review): the next cell reassigns filtered_df from temp_df before this
# result is used, so this cell looks like a leftover.
filtered_df = temp_df[['itemid', 'valuenum', 'valueuom']]

In [34]:
# Rows for itemid 220052 only (the id listed for "Arterial Blood Pressure mean" in
# the item dictionary output above), reduced to item id, numeric value and unit.
is_target_item = temp_df['itemid'] == 220052
filtered_df = temp_df.loc[is_target_item, ['itemid', 'valuenum', 'valueuom']]
display(filtered_df)

Unnamed: 0,itemid,valuenum,valueuom
4589,220052,102.0,mmHg
4596,220052,102.0,mmHg
4640,220052,101.0,mmHg
4648,220052,107.0,mmHg
4663,220052,119.0,mmHg
...,...,...,...
8528240,220052,86.0,mmHg
8528260,220052,94.0,mmHg
8528294,220052,84.0,mmHg
8528298,220052,85.0,mmHg


In [35]:
# Order the rows by numeric value, largest first, so extreme readings surface at
# both ends of the frame.
filtered_df = filtered_df.sort_values('valuenum', ascending=False)

In [36]:
# Render the sorted rows (highest values first, lowest last).
display(filtered_df)

Unnamed: 0,itemid,valuenum,valueuom
5757288,220052,1410.0,mmHg
1482453,220052,1100.0,mmHg
6357711,220052,1085.0,mmHg
3572462,220052,667.0,mmHg
6529446,220052,361.0,mmHg
...,...,...,...
375709,220052,-36.0,mmHg
912433,220052,-36.0,mmHg
444747,220052,-37.0,mmHg
5511956,220052,-39.0,mmHg


In [38]:
# Stream the compressed chartevents file and keep only the itemid 220052 rows.

# Source file
file_path = "../00_Datasets/mimic-iv-3_1/icu/chartevents.csv.gz"

# Rows read per chunk
chunksize = 1000000  # Adjust based on your memory capacity

# Reading in chunks (and only the four needed columns) bounds memory use.
chunk_reader = pd.read_csv(
    file_path,
    chunksize=chunksize,
    compression='gzip',
    usecols=['subject_id', 'itemid', 'valuenum', 'valueuom'],
)

# One filtered piece per chunk: rows with itemid == 220052
filtered_chunks = [piece[piece['itemid'] == 220052] for piece in chunk_reader]

# Stitch the pieces together with a fresh 0..n-1 index
result_df = pd.concat(filtered_chunks, ignore_index=True)

# Show a preview
print(result_df.head())

# Optional: Save to CSV
# result_df.to_csv("filtered_chartevents_220052.csv", index=False)


   subject_id  itemid  valuenum valueuom
0    10002013  220052      89.0     mmHg
1    10002013  220052      73.0     mmHg
2    10002013  220052      86.0     mmHg
3    10002013  220052      66.0     mmHg
4    10002013  220052      80.0     mmHg


In [39]:
# Order the collected rows by numeric value, largest first, then render them.
result_df = result_df.sort_values('valuenum', ascending=False)
display(result_df)

In [40]:
# Render result_df again.
# NOTE(review): this repeats the display call that ends the previous cell.
display(result_df)

Unnamed: 0,subject_id,itemid,valuenum,valueuom
1562086,15063388,220052,930000.0,mmHg
2931144,19524390,220052,117113.0,mmHg
1391541,14504850,220052,115123.0,mmHg
1370362,14441902,220052,115105.0,mmHg
2871240,19329421,220052,108122.0,mmHg
...,...,...,...,...
3007319,19738358,220052,-60.0,mmHg
749330,12385813,220052,-86.0,mmHg
1681133,15452443,220052,-99.0,mmHg
1432872,14661924,220052,-100.0,mmHg


In [41]:
# Stream the compressed chartevents file and keep every row whose numeric value
# is negative, across all item ids.

# Source file
file_path = "../00_Datasets/mimic-iv-3_1/icu/chartevents.csv.gz"

# Rows read per chunk
chunksize = 1000000  # Adjust based on your system's RAM

chunk_reader = pd.read_csv(
    file_path,
    chunksize=chunksize,
    compression='gzip',
    usecols=['subject_id', 'itemid', 'value', 'valuenum', 'valueuom'],
)

# Per chunk: discard rows without a numeric value, then keep the negative ones.
negative_valuenum_chunks = [
    piece.dropna(subset=['valuenum']).loc[lambda part: part['valuenum'] < 0]
    for piece in chunk_reader
]

# Stitch the pieces together with a fresh 0..n-1 index.
# Note that this rebinds result_df, replacing the itemid-220052 rows held before.
result_df = pd.concat(negative_valuenum_chunks, ignore_index=True)

# Display a few rows
print(result_df.head())

# Optional: Save to CSV
# result_df.to_csv("negative_valuenum_chartevents.csv", index=False)


   subject_id  itemid                                              value  \
0    10001217  228096  -1 Awakens to voice (eye opening/contact) > 10...   
1    10001217  228096  -1 Awakens to voice (eye opening/contact) > 10...   
2    10001843  228096  -2 Light sedation, briefly awakens to voice (e...   
3    10001843  228096  -1 Awakens to voice (eye opening/contact) > 10...   
4    10001843  228096  -2 Light sedation, briefly awakens to voice (e...   

   valuenum valueuom  
0      -1.0      NaN  
1      -1.0      NaN  
2      -2.0      NaN  
3      -1.0      NaN  
4      -2.0      NaN  


In [46]:
# Order the rows by numeric value, smallest (most negative) first, then render them.
result_df = result_df.sort_values('valuenum', ascending=True)
display(result_df)

Unnamed: 0,subject_id,itemid,value,valuenum,valueuom
399075,12424661,229896,-1e+07,-10000000.00,Watts
239015,11442414,220277,-951234,-951234.00,%
768582,14665633,224162,-897868,-897868.00,insp/min
109835,10688510,228005,-601600,-601600.00,ml/hr
109827,10688510,228005,-601600,-601600.00,ml/hr
...,...,...,...,...,...
604622,13668411,224746,-0.03,-0.03,cmH2O
795831,14816494,224746,-0.03,-0.03,cmH2O
846659,15097100,224746,-0.02,-0.02,cmH2O
240832,11448985,224746,-0.02,-0.02,cmH2O
