In [5]:
import pandas as pd
import numpy as np

# Load your dataset
file_path = 'sepsis_Data2.csv'  # Update this with your file path
df = pd.read_csv(file_path)

def generate_dataset_report(dataframe, target_column=None):
    # Basic information
    report = {}
    report['shape'] = dataframe.shape
    report['dtypes'] = dataframe.dtypes.value_counts().to_dict()
    
    # Missing values analysis (fixed np.product -> np.prod)
    missing_values = dataframe.isnull().sum()
    report['missing_values'] = {
        'total_missing': missing_values.sum(),
        'columns_with_missing': missing_values[missing_values > 0].to_dict(),
        'missing_percentage': (missing_values.sum() / np.prod(dataframe.shape)) * 100
    }
    
    # Descriptive statistics (handle empty data cases)
    report['description'] = {}
    
    # Numeric features
    numeric_desc = dataframe.select_dtypes(include=[np.number])
    if not numeric_desc.empty:
        report['description']['numeric'] = numeric_desc.describe().to_dict()
    
    # Categorical features
    categorical_desc = dataframe.select_dtypes(include=['object'])
    if not categorical_desc.empty:
        report['description']['categorical'] = categorical_desc.describe().to_dict()
    
    # Duplicates analysis
    report['duplicates'] = {
        'total_duplicates': dataframe.duplicated().sum(),
        'percentage': (dataframe.duplicated().sum() / len(dataframe)) * 100
    }
    
    # Unique values analysis
    unique_values = dataframe.nunique()
    report['unique_values'] = {
        'average_unique': unique_values.mean(),
        'high_cardinality_features': unique_values[unique_values > 100].to_dict(),
        'constant_features': unique_values[unique_values == 1].to_dict()
    }
    
    # Target analysis (if specified)
    if target_column:
        if target_column in dataframe.columns:
            report['target_distribution'] = {
                'class_distribution': dataframe[target_column].value_counts(normalize=True).to_dict(),
                'class_imbalance': (dataframe[target_column].value_counts(normalize=True).max() > 0.7)
            }
    
    # Correlation analysis (numeric features only)
    if not numeric_desc.empty:
        corr_matrix = numeric_desc.corr().abs()
        report['correlation'] = {
            'top_correlations': corr_matrix.unstack().sort_values(ascending=False).drop_duplicates().head(10).to_dict()
        }
    
    return report

# Generate the report
target_col = 'SepsisLabel'  # Set to None if not applicable
dataset_report = generate_dataset_report(df, target_column=target_col)

# Print the report
import pprint
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(dataset_report)

{   'correlation': {   'top_correlations': {   ('BUN', 'Creatinine'): 0.6377219504463209,
                                               ('Bilirubin_total', 'Bilirubin_direct'): 0.7432911599557059,
                                               ('Hgb', 'Hct'): 0.9087244159445476,
                                               ('ICULOS', 'Unnamed: 0'): 0.998711085224039,
                                               ('PaCO2', 'SaO2'): 0.5634699506265477,
                                               ('PaCO2', 'pH'): 0.8830538230577867,
                                               ('Phosphate', 'Calcium'): 0.6038935897249273,
                                               ('SBP', 'MAP'): 0.6510978516648795,
                                               ('SaO2', 'pH'): 0.5600162145094302,
                                               ('Unnamed: 0', 'Unnamed: 0'): 1.0}},
    'description': {   'numeric': {   'AST': {   '25%': 0.0,
                                                 '50%

In [31]:
import pandas as pd

# Replace the file path with your dataset's path.
# For example, if your data is in CSV format:
df = pd.read_csv("sepsis_Data2.csv")

# Inspect the first few rows and the data summary
print("First 5 rows:")
print(df.head())
print("\nDataFrame Info:")
print(df.info())


First 5 rows:
   Unnamed: 0    HR  O2Sat   Temp    SBP    MAP   DBP  Resp  EtCO2  \
0           1  61.0   99.0  36.44  124.0   65.0  43.0  17.5    0.0   
1           2  64.0   98.0  36.44  125.0   64.0  41.0  27.0    0.0   
2           3  56.0  100.0  36.44  123.0   65.0  41.0   9.0    0.0   
3           4  66.0   99.0  36.22  120.0   67.0  43.0  23.0    0.0   
4           5  94.0  100.0  36.22  194.0  116.0  66.0  14.0    0.0   

   BaseExcess  ...   WBC  Fibrinogen  Platelets    Age  Gender  Unit1  Unit2  \
0         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   
1         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   
2         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   
3         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   
4         0.0  ...  11.0         0.0      158.0  75.91       0    0.0    1.0   

   HospAdmTime  ICULOS  SepsisLabel  
0        -98.6       2            0  
1       

In [33]:
# Step 2: Remove Constant Features
if 'EtCO2' in df.columns:
    unique_vals = df['EtCO2'].nunique()
    print(f"'EtCO2' unique values count: {unique_vals}")
    if unique_vals == 1:
        df.drop(columns=['EtCO2'], inplace=True)
        print("'EtCO2' column removed as it is constant.")
    else:
        print("'EtCO2' column is not constant.")
else:
    print("'EtCO2' column not found in the dataframe.")

# Display remaining columns
print("\nRemaining columns:")
print(df.columns.tolist())


'EtCO2' unique values count: 1
'EtCO2' column removed as it is constant.

Remaining columns:
['Unnamed: 0', 'HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Bilirubin_direct', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hct', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS', 'SepsisLabel']


In [35]:
import numpy as np
from sklearn.impute import SimpleImputer

# Step 3: Handle Suspicious Zero Values
# Define the features where 0 might be erroneous
erroneous_zero_features = ['pH', 'Bilirubin_direct', 'Bilirubin_total', 'Calcium']
# Filter the features that exist in the dataframe
erroneous_cols = [col for col in erroneous_zero_features if col in df.columns]

print("Columns to check for erroneous zeros:", erroneous_cols)

# Replace 0 with NaN in these columns and print the number of NaNs introduced
for col in erroneous_cols:
    df[col] = df[col].replace(0, np.nan)
    n_missing = df[col].isna().sum()
    print(f"{col}: {n_missing} missing values after replacing zeros.")

# Impute these columns using the median
if erroneous_cols:
    imputer = SimpleImputer(strategy='median')
    df[erroneous_cols] = imputer.fit_transform(df[erroneous_cols])
    print("Imputation complete for columns:", erroneous_cols)
    
# Display a sample of the updated columns
print("\nData sample after imputation:")
print(df[erroneous_cols].head())


Columns to check for erroneous zeros: ['pH', 'Bilirubin_direct', 'Bilirubin_total', 'Calcium']
pH: 17353 missing values after replacing zeros.
Bilirubin_direct: 56331 missing values after replacing zeros.
Bilirubin_total: 39990 missing values after replacing zeros.
Calcium: 8597 missing values after replacing zeros.
Imputation complete for columns: ['pH', 'Bilirubin_direct', 'Bilirubin_total', 'Calcium']

Data sample after imputation:
    pH  Bilirubin_direct  Bilirubin_total  Calcium
0  7.4               1.1              0.6      7.9
1  7.4               1.1              0.6      7.9
2  7.4               1.1              0.6      7.9
3  7.4               1.1              0.6      7.9
4  7.4               1.1              0.6      7.9


In [37]:
# Step 4: Address Multicollinearity

# Drop 'Unnamed: 0' if it exists
if 'Unnamed: 0' in df.columns:
    df.drop(columns=['Unnamed: 0'], inplace=True)
    print("'Unnamed: 0' column removed.")
else:
    print("'Unnamed: 0' column not found.")

# If both 'Hgb' and 'Hct' exist, drop 'Hct' (keeping 'Hgb')
if 'Hgb' in df.columns and 'Hct' in df.columns:
    df.drop(columns=['Hct'], inplace=True)
    print("'Hct' column removed to address multicollinearity with 'Hgb'.")
else:
    print("Either 'Hgb' or 'Hct' column is missing.")

# If both 'Bilirubin_direct' and 'Bilirubin_total' exist, drop 'Bilirubin_direct'
if 'Bilirubin_direct' in df.columns and 'Bilirubin_total' in df.columns:
    df.drop(columns=['Bilirubin_direct'], inplace=True)
    print("'Bilirubin_direct' column removed, keeping 'Bilirubin_total'.")
else:
    print("Either 'Bilirubin_direct' or 'Bilirubin_total' column is missing.")

# Display remaining columns after handling multicollinearity
print("\nRemaining columns after handling multicollinearity:")
print(df.columns.tolist())


'Unnamed: 0' column removed.
'Hct' column removed to address multicollinearity with 'Hgb'.
'Bilirubin_direct' column removed, keeping 'Bilirubin_total'.

Remaining columns after handling multicollinearity:
['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS', 'SepsisLabel']


In [63]:
# IMPORTANT: Split the target from features BEFORE applying numeric transformations!
y = df_raw['SepsisLabel'].copy()  # Save the target separately
X = df_raw.drop(columns=['SepsisLabel']).copy()  # Features only

In [65]:
import numpy as np

# Step 5 (Updated): Outlier Clipping Using IQR (Exclude the target 'SepsisLabel')
# Select numeric columns, excluding 'SepsisLabel'
numeric_cols = [col for col in df.select_dtypes(include=[np.number]).columns if col != 'SepsisLabel']
print("Numeric columns to be processed for outliers (excluding 'SepsisLabel'):", numeric_cols)

# Apply IQR-based clipping for each numeric column (excluding the target)
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])
    print(f"Outliers in {col} have been clipped.")

# Show summary statistics for numeric columns after clipping to verify
print("\nSummary statistics after clipping outliers:")
print(df[numeric_cols].describe())


Numeric columns to be processed for outliers (excluding 'SepsisLabel'): ['HR', 'O2Sat', 'Temp', 'SBP', 'MAP', 'DBP', 'Resp', 'BaseExcess', 'HCO3', 'FiO2', 'pH', 'PaCO2', 'SaO2', 'AST', 'BUN', 'Alkalinephos', 'Calcium', 'Chloride', 'Creatinine', 'Glucose', 'Lactate', 'Magnesium', 'Phosphate', 'Potassium', 'Bilirubin_total', 'TroponinI', 'Hgb', 'PTT', 'WBC', 'Fibrinogen', 'Platelets', 'Age', 'Gender', 'Unit1', 'Unit2', 'HospAdmTime', 'ICULOS']
Outliers in HR have been clipped.
Outliers in O2Sat have been clipped.
Outliers in Temp have been clipped.
Outliers in SBP have been clipped.
Outliers in MAP have been clipped.
Outliers in DBP have been clipped.
Outliers in Resp have been clipped.
Outliers in BaseExcess have been clipped.
Outliers in HCO3 have been clipped.
Outliers in FiO2 have been clipped.
Outliers in pH have been clipped.
Outliers in PaCO2 have been clipped.
Outliers in SaO2 have been clipped.
Outliers in AST have been clipped.
Outliers in BUN have been clipped.
Outliers in Alk

In [69]:
from sklearn.preprocessing import StandardScaler

# Step 6: Feature Scaling
# Exclude target column 'SepsisLabel' from scaling
scaling_cols = [col for col in numeric_cols if col != 'SepsisLabel']

scaler = StandardScaler()
df[scaling_cols] = scaler.fit_transform(df[scaling_cols])
print("Scaling complete for numeric columns (excluding 'SepsisLabel').")

# Show a sample of scaled features
print("\nData sample after scaling:")
print(df[scaling_cols].head())


Scaling complete for numeric columns (excluding 'SepsisLabel').

Data sample after scaling:
         HR     O2Sat      Temp       SBP       MAP       DBP      Resp  \
0 -1.415342  0.711454 -0.679109  0.168608 -0.949892  0.063991 -0.233614   
1 -1.238289  0.288212 -0.679109  0.213133 -1.019396 -0.001462  1.597754   
2 -1.710431  1.134695 -0.679109  0.124083 -0.949892 -0.001462 -1.872206   
3 -1.120254  0.711454 -0.979456 -0.009493 -0.810885  0.063991  0.826652   
4  0.532240  1.134695 -0.979456  2.728815  2.594798  0.816705 -0.908328   

   BaseExcess      HCO3      FiO2  ...      PTT       WBC  Fibrinogen  \
0   -0.149054 -0.588904 -1.156218  ... -1.70339 -0.013064         0.0   
1   -0.149054 -0.588904 -1.156218  ... -1.70339 -0.013064         0.0   
2   -0.149054 -0.588904 -1.156218  ... -1.70339 -0.013064         0.0   
3   -0.149054 -0.588904 -1.156218  ... -1.70339 -0.013064         0.0   
4   -0.149054 -0.588904 -1.156218  ... -1.70339 -0.013064         0.0   

   Platelets      

In [71]:
# (Optional) Re-check target distribution
print("Target distribution (should remain unchanged):")
print(y.value_counts())

Target distribution (should remain unchanged):
SepsisLabel
0    57070
1     1306
Name: count, dtype: int64


In [73]:
# Step 6: Apply SMOTE to the feature matrix and target
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print("After SMOTE, target distribution:")
print(pd.Series(y_resampled).value_counts())

# Combine the resampled features and target
df_resampled = pd.concat(
    [pd.DataFrame(X_resampled, columns=X.columns),
     pd.DataFrame(y_resampled, columns=['SepsisLabel'])],
    axis=1
)
print("SMOTE applied successfully. Resampled dataset shape:", df_resampled.shape)



After SMOTE, target distribution:
SepsisLabel
0    57070
1    57070
Name: count, dtype: int64
SMOTE applied successfully. Resampled dataset shape: (114140, 42)


In [75]:
from sklearn.model_selection import train_test_split

# Separate features and target from the resampled dataset
X = df_resampled.drop('SepsisLabel', axis=1)
y = df_resampled['SepsisLabel']

# Perform a stratified train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("\nTraining target distribution:")
print(y_train.value_counts())
print("\nTest target distribution:")
print(y_test.value_counts())


Train set shape: (91312, 41)
Test set shape: (22828, 41)

Training target distribution:
SepsisLabel
0    45656
1    45656
Name: count, dtype: int64

Test target distribution:
SepsisLabel
0    11414
1    11414
Name: count, dtype: int64
