In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import joblib
import matplotlib.pyplot as plt

# Train, evaluate, persist and plot a random-forest sepsis classifier.
#
# Steps: load data -> stratified train/test split -> SMOTE on the training
# split only -> standardize -> grid-search a RandomForestClassifier ->
# report accuracy/F1 on the untouched test split -> save model and scaler
# -> bar chart of the two test metrics.

SEED = 42  # one seed shared by the split, SMOTE and the forest

# Load the synthetic dataset
data = pd.read_excel('synthetic_sepsis_data_70_30.xlsx')

# Separate features and target variable
X = data.drop(columns=['Sepsis_Label'])  # Features
y = data['Sepsis_Label']  # Target variable

# Split BEFORE resampling. Oversampling the full dataset first would place
# synthetic points (interpolated from rows that may end up in the training
# split) into the test split, leaking information and inflating the metrics.
# `stratify=y` keeps the original class ratio in both splits.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Rebalance ONLY the training split using SMOTE
# (Synthetic Minority Oversampling Technique); the test split keeps its
# real, imbalanced class distribution.
smote = SMOTE(random_state=SEED)
X_train, y_train = smote.fit_resample(X_train_raw, y_train_raw)

# Scale features: fit on the training data only, then apply to the test data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=SEED),
    param_grid=param_grid,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

# Train the model with GridSearchCV.
# NOTE: the CV folds are cut from the already-resampled training data, so the
# internal CV scores used for model selection may still be optimistic; the
# held-out test metrics below are the ones to report.
grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_

# Make predictions on the (real, non-resampled) test set
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Optimized Model Accuracy: {accuracy:.2f}")
print(f"Optimized F1 Score: {f1:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Save the improved model
joblib.dump(best_model, 'sepsis_model.pkl')
print("Improved model saved as 'sepsis_model.pkl'")

# The model was fitted on standardized features, so the fitted scaler is
# required to transform raw inputs at inference time; persist it as well.
joblib.dump(scaler, 'sepsis_scaler.pkl')
print("Fitted scaler saved as 'sepsis_scaler.pkl'")

# Plotting Accuracy and F1 Score
scores = {'Accuracy': accuracy, 'F1 Score': f1}
plt.figure(figsize=(8, 5))
plt.bar(list(scores.keys()), list(scores.values()), color=['blue', 'green'])
plt.title('Improved Model Performance Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)  # Both scores are between 0 and 1
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Annotate each bar with its value, slightly above the bar top
for metric, value in scores.items():
    plt.text(metric, value + 0.02, f"{value:.2f}", ha='center', fontsize=12)
plt.show()


ModuleNotFoundError: No module named 'imblearn'

In [3]:
pip install imbalanced-learn
 

SyntaxError: invalid syntax (3761031963.py, line 1)