# Neonatal Sepsis Data

In [None]:
import pandas as pd 
import seaborn as sns
from itertools import product
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the neonatal sepsis registry (path is relative to the notebook) and print its column summary.
df = pd.read_csv('../Data/Neonatal_Sepsis_Registry.csv')
df.info()

In [None]:
# Columns kept for the analysis; every other registry column is dropped below.
list_intresting_parameters = [
    "gestational_age_at_birth_weeks",
    "sex",
    "race",
    "birth_weight_kg",
    "sepsis_group",
    "onset_age_in_days",
    "onset_hour_of_day",
    # "time_to_antibiotics",
    "stat_abx",
    "intubated_at_time_of_sepsis_evaluation",
    "inotrope_at_time_of_sepsis_eval",
    "central_venous_line",
    "umbilical_arterial_line",
    "ecmo",
    "temp_celsius",
    "comorbidity_necrotizing_enterocolitis",
    "comorbidity_chronic_lung_disease",
    "comorbidity_cardiac",
    "comorbidity_surgical",
    "comorbidity_ivh_or_shunt",
]

# Take an explicit copy: later cells add columns to `df`, and assigning into a
# column-subset of the loaded frame triggers pandas' SettingWithCopyWarning.
df = df[list_intresting_parameters].copy()
df.info()

## Plot

In [None]:
# Gestational age at birth per sepsis group, coloured by temperature,
# with one facet row per value of "sex".
sns.catplot(
    data=df,
    x="sepsis_group",
    y="gestational_age_at_birth_weeks",
    hue="temp_celsius",
    row="sex",
    palette="Spectral",
)

In [None]:
# Summary statistics of the selected columns.
df.describe()

## Modelle ausprobieren


Split the data set into subsets without missing values (NA) and by sex (male/female).

In [None]:
# Treat the registry's 'NI' marker as a missing value (presumably "no information" — TODO confirm).
# Rebinding instead of inplace=True keeps the cell free of in-place mutation of a shared frame.
df = df.replace('NI', np.nan)


In [None]:
# Custom helper that collapses a value into the binary label 0 or 1
def map_to_binary(value):
    """Return 1 if ``value`` equals 1, 4, 5 or 6; return 0 for anything else."""
    return 1 if value in (1, 4, 5, 6) else 0

# Create a new column "sepsis_binary" by applying map_to_binary to every value of the "sepsis_group" column
df['sepsis_binary'] = df['sepsis_group'].apply(map_to_binary)



In [None]:
# Split by the "sex" code.
# NOTE(review): the variable names imply 1 = male and 0 = female — confirm against the data dictionary.
df_male = df[df["sex"] == 1]
df_female = df[df["sex"] == 0]

# Complete-case subsets (rows with any missing value removed). Each one is an
# explicit copy so that the column assignment below, and those in later cells,
# write to an independent frame instead of a slice of `df`, which can raise
# SettingWithCopyWarning.
df_without_nan = df.dropna().copy()
df_without_nan["race"] = df_without_nan["race"].astype(int)
df_male_without_nan = df_male.dropna().copy()
df_female_without_nan = df_female.dropna().copy()

In [None]:
# Column summary of the complete-case frame.
df_without_nan.info()

In [None]:
# One-hot encode the "race" column of the complete-case frame; the remaining columns are carried over.
df_encoded = pd.get_dummies(df_without_nan, columns=['race'])


In [None]:
# Count of rows per value of the binary label in the complete-case frame (class balance).
sns.catplot(data=df_without_nan, x ="sepsis_binary", kind="count")

### All genders, with NA

In [None]:
# NOTE: disabled experiment, kept for reference. As written it would not run on its own:
# the model is fitted on the unscaled X_train, but predict() is called on X_test_scaled,
# which this cell never defines.
# X = df[df.columns.difference(['sepsis_group', "sepsis_binary"])]
# df['sepsis_binary'] = df['sepsis_binary']
# y = df['sepsis_binary']

# # Train-Test-Split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# model = HistGradientBoostingClassifier()
# model.fit(X_train, y_train)

# y_pred = model.predict(X_test_scaled)
# # accuracy = accuracy_score(y_test, y_pred)
# # print(f"{name}: {accuracy}")
# print(len(X_test), len(y_test), len(y_pred), y_train.shape)

### All genders, without NA

In [None]:
# Features: every column of the one-hot encoded frame except the two label columns.
X = df_encoded[df_encoded.columns.difference(['sepsis_binary', 'sepsis_group'])]
y = df_encoded['sepsis_binary']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample with SMOTE on the training portion only; the test portion is left untouched.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling: fitted on the resampled training data, then applied to the test data
# scaler = MinMaxScaler(feature_range=(0, 1))
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Step 3: feature selection with SelectKBest on the resampled training data.
# mutual_info_classif is randomized, so it is seeded to keep the selected features stable across runs.
selector = SelectKBest(
    lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=3,  # keep the 3 highest-scoring features
)
X_train_selected = selector.fit_transform(X_train_scaled, y_train_resampled)
X_test_selected = selector.transform(X_test_scaled)

# Parameter sets noted from earlier tuning runs (presumably XGBoost and random forest):
# [('learning_rate', 0.2), ('max_depth', 4), ('n_estimators', 150)]
# criterion': 'entropy', 'max_depth': 30, 'max_features': 'auto', 'n_estimators': 200

# Initialize the models. Each key appears once: the former duplicate "RF" entry
# (a default forest) was silently overwritten by the tuned forest and never trained.
# Stochastic estimators are seeded with the same value the tuning cells use.
models = {
    "LR": LogisticRegression(),
    "SVM": SVC(),
    # "XGBoost": XGBClassifier(eval_metric='mlogloss', learning_rate=0.2, max_depth=4, n_estimators=150),
    "XGBoost": XGBClassifier(eval_metric='mlogloss', random_state=42),
    "RF": RandomForestClassifier(max_depth=30, min_samples_split=2, n_estimators=200,
                                 criterion='entropy', random_state=42),
    "DT": DecisionTreeClassifier(random_state=42),
    "NB": GaussianNB()
}

# Train each model on the selected training features and report test accuracy
for name, model in models.items():
    model.fit(X_train_selected, y_train_resampled.values.ravel())
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: {accuracy}")

# Indices of the features that SelectKBest kept
selected_feature_indices = selector.get_support(indices=True)

# Map the indices back to column names (X has the same column order as the data the selector was fitted on)
selected_feature_names = X.columns[selected_feature_indices]

# Print the names of the selected features
print("Ausgewählte Merkmale:")
print(selected_feature_names)

In [None]:
from sklearn.ensemble import VotingClassifier

# Base estimators of the ensemble, given as (label, estimator) pairs
models = [
    ('LR', LogisticRegression()),
    ('RF', RandomForestClassifier()),
    ('XGB', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
]

# Build the hard-voting ensemble and train it on the selected, resampled training features
voting_clf = VotingClassifier(estimators=models, voting='hard').fit(
    X_train_selected,
    y_train_resampled.values.ravel(),
)

# Score the ensemble on the held-out test features
y_pred = voting_clf.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f"Voting Classifier: {accuracy}")


In [None]:
# Male-only complete cases: features are all columns except the two label columns.
# NOTE(review): unlike the one-hot encoded cell, "race" is used as-is here and "sex" is
# constant within this subset — confirm that this is intended.
X = df_male_without_nan[df_male_without_nan.columns.difference(['sepsis_binary', 'sepsis_group'])]
y = df_male_without_nan['sepsis_binary']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample with SMOTE on the training portion only
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling: fitted on the resampled training data, then applied to the test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Initialize the models; stochastic estimators are seeded with the value the tuning cells use
models = {
    "LR": LogisticRegression(),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(random_state=42),
    "NB": GaussianNB()
}

# Train each model on all scaled features and report test accuracy
for name, model in models.items():
    model.fit(X_train_scaled, y_train_resampled.values.ravel())
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: {accuracy}")


### PCA

In [None]:


# Standardize before PCA: PCA centres the data but does not rescale it, so on raw values
# the components would mostly reflect whichever columns have the largest numeric range.
# Column order is unchanged, so loading index i still corresponds to df_without_nan.columns[i].
# NOTE(review): df_without_nan still contains the label columns sepsis_group / sepsis_binary —
# confirm they are meant to take part in the PCA.
pca = PCA(n_components=5)  # keep the first 5 principal components
X_pca = pca.fit_transform(StandardScaler().fit_transform(df_without_nan))

# import matplotlib.pyplot as plt

# Cumulative share of variance explained by the first 1..5 components
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
print(cumulative_variance)

plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), cumulative_variance, marker='o', linestyle='-')
plt.xlabel('Anzahl der Hauptkomponenten')
plt.ylabel('Kumulative erklärte Varianz')
plt.show()


In [None]:
# Pull the loading vectors of the first two principal components out of the fitted PCA
first_pc_loading_vector, second_pc_loading_vector = pca.components_[:2]

# Show both loading vectors
print("Ladevektor für die erste Hauptkomponente:")
print(first_pc_loading_vector)

print("\nLadevektor für die zweite Hauptkomponente:")
print(second_pc_loading_vector)


In [None]:
# Position of the largest absolute loading on the first principal component
max_contributor_index = np.abs(first_pc_loading_vector).argmax()

# Look up the column name at that position in the DataFrame
column_name = df_without_nan.columns[max_contributor_index]

print(f"Die Spalte mit dem größten Beitrag zur ersten Hauptkomponente ist '{column_name}'")


In [None]:
# Position of the largest absolute loading on the second principal component
max_contributor_index = np.abs(second_pc_loading_vector).argmax()

# Look up the column name at that position in the DataFrame
column_name = df_without_nan.columns[max_contributor_index]

print(f"Die Spalte mit dem größten Beitrag zur zweiten Hauptkomponente ist '{column_name}'")


In [None]:
# Complete cases of all sexes: features are all columns except the two label columns.
# NOTE(review): "race" is used as an integer code here rather than one-hot encoded — confirm this is intended.
X = df_without_nan[df_without_nan.columns.difference(['sepsis_binary', 'sepsis_group'])]
y = df_without_nan['sepsis_binary']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample with SMOTE on the training portion only
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Feature scaling: fitted on the resampled training data, then applied to the test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Step 3: feature selection with SelectKBest on the resampled training data.
# mutual_info_classif is randomized, so it is seeded to keep the selected features stable across runs.
selector = SelectKBest(
    lambda features, target: mutual_info_classif(features, target, random_state=42),
    k=5,  # keep the 5 highest-scoring features
)
X_train_selected = selector.fit_transform(X_train_scaled, y_train_resampled)
X_test_selected = selector.transform(X_test_scaled)


# Initialize the models; stochastic estimators are seeded with the value the tuning cells use
models = {
    "LR": LogisticRegression(),
    "SVM": SVC(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "DT": DecisionTreeClassifier(random_state=42),
    "NB": GaussianNB()
}

# Train each model on the selected training features and report test accuracy
for name, model in models.items():
    model.fit(X_train_selected, y_train_resampled.values.ravel())
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}: {accuracy}")

# Indices of the features that SelectKBest kept
selected_feature_indices = selector.get_support(indices=True)

# Map the indices back to column names (X has the same column order as the data the selector was fitted on)
selected_feature_names = X.columns[selected_feature_indices]

# Print the names of the selected features
print("Ausgewählte Merkmale:")
print(selected_feature_names)

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Search space for the XGBoost hyperparameters
xgb_params = dict(
    learning_rate=Real(0.01, 0.2, 'log-uniform'),
    max_depth=Integer(3, 10),
    n_estimators=Integer(50, 150),
)

# Bayesian search: 32 iterations with 5-fold CV on the selected, resampled training features
xgb_model = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_bayes_search = BayesSearchCV(
    xgb_model,
    xgb_params,
    n_iter=32,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
xgb_bayes_search.fit(X_train_selected, y_train_resampled.values.ravel())

print('Beste Parameter für XGBoost:', xgb_bayes_search.best_params_)


In [None]:
# Search space for the random-forest hyperparameters
rf_params = dict(
    n_estimators=Integer(50, 150),
    max_depth=Integer(10, 30),
    min_samples_split=Integer(2, 10),
)

# Bayesian search: 32 iterations with 5-fold CV on the selected, resampled training features
rf_model = RandomForestClassifier(random_state=42)
rf_bayes_search = BayesSearchCV(
    rf_model,
    rf_params,
    n_iter=32,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
rf_bayes_search.fit(X_train_selected, y_train_resampled.values.ravel())

print('Beste Parameter für RF:', rf_bayes_search.best_params_)


In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid for the random forest.
# 'auto' is intentionally not listed under max_features: for RandomForestClassifier it was
# an alias of 'sqrt' (so it only duplicated grid points), and scikit-learn 1.3 removed it,
# which makes the search fail on current versions.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'criterion': ['gini', 'entropy']
}

# 5-fold cross-validated search on the selected, resampled training features
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=5)
grid_search.fit(X_train_selected, y_train_resampled.values.ravel())

# Print the best parameter combination
print("Beste Parameter:", grid_search.best_params_)

# Keep the best estimator found by the search
best_rf_model = grid_search.best_estimator_
