In [1]:
# =========================
# Importing Libraries
# =========================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from scipy.stats import ttest_ind, mannwhitneyu

from matplotlib.backends.backend_pdf import PdfPages
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Bidirectional
from xgboost import XGBClassifier

# =========================
# Load Dataset
# =========================
# Adjust the file path if needed
df = pd.read_csv('Paitients_Files_Train.csv')

# Display DataFrame information
print("First 5 rows:")
print(df.head())
print("\nLast 5 rows:")
print(df.tail())
print("\nRandom sample of 5 rows:")
print(df.sample(5))
print("\nData Shape:", df.shape)
print("\nData Types:")
print(df.dtypes)
print("\nSummary Statistics for Numerical Features:")
print(df.describe())
print("\nSummary for Object/Categorical Features:")
print(df.describe(include=['object']))

# =========================
# Handling Missing Values (Optional)
# =========================
# If certain columns have zeros that represent missing values, uncomment the block below and adjust columns as needed.
# cols_with_invalid_zeros = ['PRG', 'PL', 'PR', 'SK', 'TS', 'M11', 'BD2']
# for col in cols_with_invalid_zeros:
#     df.loc[df[col] == 0, col] = np.nan
# print("Missing values per column after replacing zeros with NaN:")
# print(df.isnull().sum())

# Optional median imputation for selected columns:
# cols_to_impute = ['PRG', 'PL', 'PR', 'SK', 'TS', 'M11']
# imputer = SimpleImputer(strategy='median')
# df[cols_to_impute] = imputer.fit_transform(df[cols_to_impute])
# print("Missing values after imputation:")
# print(df[cols_to_impute].isnull().sum())
# print("\nUpdated summary statistics for imputed columns:")
# print(df[cols_to_impute].describe())

# =========================
# Checking For Duplicates
# =========================
duplicates = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicates)
if duplicates > 0:
    df.drop_duplicates(inplace=True)
    print("Duplicates removed.")

# =========================
# Exploratory Data Analysis (EDA)
# =========================
# Define feature and target columns.
# Change 'Sepssis' to 'Sepsis' if that is the correct column name.
features = ['PRG', 'PL', 'PR', 'SK', 'TS', 'M11', 'BD2', 'Age']
target = 'Sepssis'
# Ensure the target is categorical
df[target] = df[target].astype('category')

# Define PDF filename for saving plots
pdf_filename = 'eda_results.pdf'

# Create PDF with boxplots, histograms, and correlation heatmap.
with PdfPages(pdf_filename) as pdf:
    # 1. Boxplots grouped by target
    for feature in features:
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.boxplot(x=target, y=feature, data=df, ax=ax)
        ax.set_title(f'Boxplot of {feature} by Sepsis Outcome')
        ax.set_xlabel('Sepsis')
        ax.set_ylabel(feature)
        pdf.savefig(fig)
        plt.close(fig)
    
    # 2. Histograms with KDE
    for feature in features:
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(df[feature], kde=True, bins=30, ax=ax)
        ax.set_title(f'Histogram of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')
        pdf.savefig(fig)
        plt.close(fig)
    
    # 3. Correlation Heatmap
    corr_matrix = df[features].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
    ax.set_title('Correlation Matrix of Features')
    pdf.savefig(fig)
    plt.close(fig)

print(f"\nAll EDA plots have been saved into {pdf_filename}")

# =========================
# Hypothesis Testing
# =========================
# Convert target to binary (mapping: Negative -> 0, Positive -> 1)
df[target] = df[target].map({'Negative': 0, 'Positive': 1})

# Function to choose test based on skewness
def choose_test(series):
    return abs(series.skew()) < 0.5  # True for approximate normality

results = []
for feature in features:
    group0 = df[df[target] == 0][feature]
    group1 = df[df[target] == 1][feature]
    
    if choose_test(group0) and choose_test(group1):
        test_stat, p_value = ttest_ind(group0, group1, nan_policy='omit')
        test_used = 't-test'
    else:
        test_stat, p_value = mannwhitneyu(group0, group1, alternative='two-sided')
        test_used = 'Mann-Whitney U'
    
    results.append({
        'Feature': feature,
        'Test': test_used,
        'Statistic': test_stat,
        'p-value': p_value
    })

results_df = pd.DataFrame(results)
print("\nHypothesis Testing Results:")
print(results_df)

# =========================
# Feature Scaling
# =========================
# Apply Power Transformation to selected features to reduce skewness.
transformer = PowerTransformer(method='yeo-johnson')
for col in ["SK", "TS", "M11", "BD2", "Age"]:
    df[col] = transformer.fit_transform(df[[col]])

# Remove unnecessary column if present.
if "ID" in df.columns:
    df = df.drop(columns=["ID"])

# Separate features and target.
X = df.drop(columns=[target])
Y = df[target]

# Apply Robust Scaling to all features.
scaler = RobustScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
print("\nScaled Feature Columns:")
print(df_scaled.columns)

# =========================
# Train-Test Split for Modeling
# =========================
X_train, X_test, y_train, y_test = train_test_split(df_scaled, Y, test_size=0.2, random_state=42)

# =========================
# Model Training and Evaluation
# =========================

# --- 1. Basic Models ---
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC()
}

print("\n--- Basic Models Evaluation ---")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n{name} Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred))

# --- 2. Stacking Classifier ---
base_models = [
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ('SVM', SVC(probability=True))
]
meta_model = LogisticRegression()
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)

stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)
print("\n--- Stacking Classifier Evaluation ---")
print(f"Stacking Model Accuracy: {accuracy_score(y_test, y_pred_stack):.4f}")
print(classification_report(y_test, y_pred_stack))

# --- 3. Additional Models ---
# Gradient Boosting
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
print("\nGradient Boosting Accuracy:", accuracy_score(y_test, gb_pred))
print(classification_report(y_test, gb_pred))

# XGBoost
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)
print("\nXGBoost Accuracy:", accuracy_score(y_test, xgb_pred))
print(classification_report(y_test, xgb_pred))

# KNN
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)
print("\nKNN Accuracy:", accuracy_score(y_test, knn_pred))
print(classification_report(y_test, knn_pred))

# SGM (Simple Neural Network)
sgm_model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
sgm_model.fit(X_train, y_train)
sgm_pred = sgm_model.predict(X_test)
print("\nSGM Accuracy:", accuracy_score(y_test, sgm_pred))
print(classification_report(y_test, sgm_pred))

# Bi-SGM (Deeper Neural Network)
bi_sgm_model = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=500)
bi_sgm_model.fit(X_train, y_train)
bi_sgm_pred = bi_sgm_model.predict(X_test)
print("\nBi-SGM Accuracy:", accuracy_score(y_test, bi_sgm_pred))
print(classification_report(y_test, bi_sgm_pred))

# --- 4. RNN and Bi-RNN Models ---
# For RNN-based models, we need to reshape the input data.
X_train_rnn = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
X_test_rnn = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))

# RNN Model
rnn_model = Sequential([
    SimpleRNN(50, activation='relu', input_shape=(X_train.shape[1], 1)),
    Dense(1, activation='sigmoid')
])


First 5 rows:
          ID  PRG   PL  PR  SK   TS   M11    BD2  Age  Insurance   Sepssis
0  ICU200010    6  148  72  35    0  33.6  0.627   50          0  Positive
1  ICU200011    1   85  66  29    0  26.6  0.351   31          0  Negative
2  ICU200012    8  183  64   0    0  23.3  0.672   32          1  Positive
3  ICU200013    1   89  66  23   94  28.1  0.167   21          1  Negative
4  ICU200014    0  137  40  35  168  43.1  2.288   33          1  Positive

Last 5 rows:
            ID  PRG   PL  PR  SK   TS   M11    BD2  Age  Insurance   Sepssis
594  ICU200604    6  123  72  45  230  33.6  0.733   34          0  Negative
595  ICU200605    0  188  82  14  185  32.0  0.682   22          1  Positive
596  ICU200606    0   67  76   0    0  45.3  0.194   46          1  Negative
597  ICU200607    1   89  24  19   25  27.8  0.559   21          0  Negative
598  ICU200608    1  173  74   0    0  36.8  0.088   38          1  Positive

Random sample of 5 rows:
            ID  PRG   PL  PR  SK  

Parameters: { "use_label_encoder" } are not used.




XGBoost Accuracy: 0.7333333333333333
              precision    recall  f1-score   support

           0       0.81      0.77      0.79        77
           1       0.62      0.67      0.64        43

    accuracy                           0.73       120
   macro avg       0.71      0.72      0.72       120
weighted avg       0.74      0.73      0.74       120


KNN Accuracy: 0.7583333333333333
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        77
           1       0.69      0.58      0.63        43

    accuracy                           0.76       120
   macro avg       0.74      0.72      0.73       120
weighted avg       0.75      0.76      0.75       120






SGM Accuracy: 0.7333333333333333
              precision    recall  f1-score   support

           0       0.80      0.78      0.79        77
           1       0.62      0.65      0.64        43

    accuracy                           0.73       120
   macro avg       0.71      0.72      0.71       120
weighted avg       0.74      0.73      0.73       120



  super().__init__(**kwargs)



Bi-SGM Accuracy: 0.7583333333333333
              precision    recall  f1-score   support

           0       0.79      0.86      0.82        77
           1       0.69      0.58      0.63        43

    accuracy                           0.76       120
   macro avg       0.74      0.72      0.73       120
weighted avg       0.75      0.76      0.75       120

