In [10]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

In [12]:
import pandas as pd

# Load your dataset
df = pd.read_csv('heart_failure_clinical_records_dataset.csv')

# Set the percentiles for capping
lower_percentile = 0.05
upper_percentile = 0.95

# Loop through every attribute (column) in the dataset
for column in df.columns:
    # Only apply capping for numeric columns
    if pd.api.types.is_numeric_dtype(df[column]):
        # Calculate lower and upper caps
        lower_cap = df[column].quantile(lower_percentile)
        upper_cap = df[column].quantile(upper_percentile)
        
        # Cap the outliers in the column
        df[column] = df[column].clip(lower=lower_cap, upper=upper_cap)

# Display the modified dataset
print(df.head())


    age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0  75.0        0                       582         0                 20   
1  55.0        0                      2263         0                 38   
2  65.0        0                       146         0                 20   
3  50.0        1                       111         0                 20   
4  65.0        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1  265000.00               1.9           130    1   
1                    0  263358.03               1.1           136    1   
2                    0  162000.00               1.3           130    1   
3                    0  210000.00               1.9           137    1   
4                    0  327000.00               2.7           130    0   

   smoking  time  DEATH_EVENT  
0        0  12.9            1  
1        0  12.9            1  
2       

  df[column] = df[column].clip(lower=lower_cap, upper=upper_cap)


In [14]:
X = df.drop(columns='DEATH_EVENT', axis=1)
Y = df['DEATH_EVENT']

In [16]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, stratify=Y, random_state=2)

In [20]:
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=0)
gb_model.fit(X_train, Y_train)

In [22]:
Y_pred_gb = gb_model.predict(X_test)

In [24]:
accuracy = accuracy_score(Y_test, Y_pred_gb)
print(f"Gradient Boosting Test Accuracy: {accuracy * 100:.2f}%")

Gradient Boosting Test Accuracy: 86.67%


In [26]:
X_train_cnn = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test_cnn = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [30]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Input
import warnings

In [32]:
model_cnn = Sequential([
    Input(shape=(X_train_cnn.shape[1], 1)),  # Use Input layer to define the input shape
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [34]:
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [36]:
history = model_cnn.fit(X_train_cnn, Y_train, epochs=20, validation_data=(X_test_cnn, Y_test), verbose=1)

Epoch 1/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 58ms/step - accuracy: 0.5566 - loss: 0.6787 - val_accuracy: 0.6833 - val_loss: 0.6403
Epoch 2/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7057 - loss: 0.6243 - val_accuracy: 0.6833 - val_loss: 0.6321
Epoch 3/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6392 - loss: 0.6586 - val_accuracy: 0.6833 - val_loss: 0.6250
Epoch 4/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6906 - loss: 0.6157 - val_accuracy: 0.6833 - val_loss: 0.6191
Epoch 5/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7031 - loss: 0.6023 - val_accuracy: 0.6833 - val_loss: 0.6106
Epoch 6/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6542 - loss: 0.6230 - val_accuracy: 0.6833 - val_loss: 0.6033
Epoch 7/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━

In [38]:
test_loss, test_accuracy = model_cnn.evaluate(X_test_cnn, Y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Test Accuracy: 73.33%


In [40]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Load and prepare the data
heart_df = pd.read_csv('heart_failure_clinical_records_dataset.csv')
X = heart_df.drop(columns='DEATH_EVENT', axis=1)
y = heart_df['DEATH_EVENT']

# Feature selection using feature importance
def select_important_features(X, y):
    initial_model = GradientBoostingClassifier(random_state=42)
    initial_model.fit(X, y)
    selector = SelectFromModel(initial_model, prefit=True, threshold='mean')
    feature_idx = selector.get_support()
    selected_features = X.columns[feature_idx]
    return X[selected_features], selected_features

# Custom evaluation function
def evaluate_model(y_true, y_pred, y_pred_proba=None):
    print("Model Performance Metrics:")
    print("-" * 50)
    print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
    if y_pred_proba is not None:
        print(f"ROC AUC: {roc_auc_score(y_true, y_pred_proba):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

# Preprocessing pipeline
def preprocess_data(X, y, test_size=0.2):
    # Feature selection
    X_selected, selected_features = select_important_features(X, y)
    print("Selected features:", selected_features.tolist())
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=test_size, stratify=y, random_state=42
    )
    
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Handle imbalanced data using SMOTE
    smote = SMOTE(random_state=42)
    X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)
    
    return X_train_balanced, X_test_scaled, y_train_balanced, y_test

# Hyperparameter tuning
def tune_hyperparameters(X_train, y_train):
    param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'subsample': [0.8, 0.9, 1.0]
    }
    
    gb_model = GradientBoostingClassifier(random_state=42)
    grid_search = GridSearchCV(
        estimator=gb_model,
        param_grid=param_grid,
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=0
    )
    grid_search.fit(X_train, y_train)
    
    print("Best parameters:", grid_search.best_params_)
    return grid_search.best_estimator_

# Main execution
def main():
    # Preprocess the data
    X_train, X_test, y_train, y_test = preprocess_data(X, y)
    
    # Tune and train the model
    best_model = tune_hyperparameters(X_train, y_train)
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    y_pred_proba = best_model.predict_proba(X_test)[:, 1]
    
    # Evaluate the model
    evaluate_model(y_test, y_pred, y_pred_proba)
    
    # Perform cross-validation
    cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
    print("\nCross-validation scores:", cv_scores)
    print(f"Average CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

if __name__ == "__main__":
    main()

Selected features: ['ejection_fraction', 'serum_creatinine', 'time']
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.9}
Model Performance Metrics:
--------------------------------------------------
Accuracy: 0.8333
ROC AUC: 0.8344

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.90      0.88        41
           1       0.76      0.68      0.72        19

    accuracy                           0.83        60
   macro avg       0.81      0.79      0.80        60
weighted avg       0.83      0.83      0.83        60


Confusion Matrix:
[[37  4]
 [ 6 13]]

Cross-validation scores: [0.78461538 0.87692308 0.92307692 0.92307692 0.890625  ]
Average CV score: 0.8797 (+/- 0.1017)


In [48]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Load the dataset
heart_df = pd.read_csv('xpanded_heart_failure_dataset.csv')

# Split features and target
X = heart_df.drop(columns='DEATH_EVENT', axis=1)
Y = heart_df['DEATH_EVENT']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, stratify=Y, random_state=2)

# Define a Gradient Boosting model with a simplified parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 150],        # Number of boosting stages
    'learning_rate': [0.05, 0.1],      # Shrinkage rate
    'max_depth': [3, 4]                # Maximum depth of individual estimators
}

# Initialize the Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=0)

# Set up GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')

# Fit the model with training data to find the best parameters
grid_search.fit(X_train, Y_train)

# Retrieve the best model from grid search
best_gb_model = grid_search.best_estimator_

# Make predictions on the test set
Y_pred_gb = best_gb_model.predict(X_test)

# Calculate and print the test accuracy
accuracy = accuracy_score(Y_test, Y_pred_gb)
print(f"Optimized Gradient Boosting Test Accuracy: {accuracy * 100:.2f}%")

# Print the confusion matrix and classification report
conf_matrix = confusion_matrix(Y_test, Y_pred_gb)
print("\nConfusion Matrix:\n", conf_matrix)

print("\nClassification Report:\n", classification_report(Y_test, Y_pred_gb))

# Print the best parameters
print("\nBest Parameters from Grid Search:\n", grid_search.best_params_)


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [46]:
import pandas as pd
import numpy as np

# Load the original dataset
heart_df = pd.read_csv('heart_failure_clinical_records_dataset.csv')

# Calculate the number of new rows needed to reach 10,000 rows
current_rows = heart_df.shape[0]
new_rows_needed = 10000 - current_rows

# Generate new synthetic rows by sampling from the existing data with replacement
# Add a small random noise to numerical columns for variation
synthetic_data = heart_df.sample(n=new_rows_needed, replace=True).reset_index(drop=True)

# Add small random noise to numeric columns
for col in synthetic_data.select_dtypes(include=['float64', 'int64']).columns:
    noise = np.random.normal(0, 0.01, size=synthetic_data[col].shape)  # Adjust the noise level if needed
    synthetic_data[col] += noise

# Append the synthetic data to the original data
expanded_heart_df = pd.concat([heart_df, synthetic_data], ignore_index=True)

# Save the expanded dataset to a new CSV file
expanded_heart_df.to_csv('xpanded_heart_failure_dataset.csv', index=False)

print(f"Expanded dataset shape: {expanded_heart_df.shape}")


Expanded dataset shape: (10000, 13)


In [50]:
import pandas as pd

# Load the dataset
file_path = 'heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(file_path)

# Convert DEATH_EVENT attribute to only 0 or 1 (ensure it is binary integer)
data['DEATH_EVENT'] = data['DEATH_EVENT'].astype(int).clip(0, 1)

# Convert all attributes to real integers
data = data.applymap(lambda x: int(x) if isinstance(x, (float, int)) else x)

# Limit the data to 1000 rows if it has more rows
data_limited = data.head(1000)

# Save the modified dataset to a new CSV file
data_limited.to_csv('modified_heart_failure_data.csv', index=False)

# Display the first few rows to confirm changes
print(data_limited.head())


   age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0   75        0                       582         0                 20   
1   55        0                      7861         0                 38   
2   65        0                       146         0                 20   
3   50        1                       111         0                 20   
4   65        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1     265000                 1           130    1   
1                    0     263358                 1           136    1   
2                    0     162000                 1           129    1   
3                    0     210000                 1           137    1   
4                    0     327000                 2           116    0   

   smoking  time  DEATH_EVENT  
0        0     4            1  
1        0     6            1  
2        1    

In [52]:
import pandas as pd

# Load the dataset
file_path = 'heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(file_path)

# Convert DEATH_EVENT attribute to only 0 or 1 (ensure it is binary integer)
data['DEATH_EVENT'] = data['DEATH_EVENT'].astype(int).clip(0, 1)

# Convert all attributes to real integers
data = data.applymap(lambda x: int(x) if isinstance(x, (float, int)) else x)

# Adjust dataset to contain exactly 1000 rows
if len(data) < 1000:
    # Repeat rows if the dataset has fewer than 1000 rows
    data_limited = pd.concat([data] * (1000 // len(data) + 1), ignore_index=True).head(1000)
else:
    # Truncate to 1000 rows if the dataset has more than 1000 rows
    data_limited = data.head(1000)

# Save the modified dataset to a new CSV file
data_limited.to_csv('modified_heart_failure_data_1000.csv', index=False)

# Display the first few rows to confirm changes
print(data_limited.head())


   age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0   75        0                       582         0                 20   
1   55        0                      7861         0                 38   
2   65        0                       146         0                 20   
3   50        1                       111         0                 20   
4   65        1                       160         1                 20   

   high_blood_pressure  platelets  serum_creatinine  serum_sodium  sex  \
0                    1     265000                 1           130    1   
1                    0     263358                 1           136    1   
2                    0     162000                 1           129    1   
3                    0     210000                 1           137    1   
4                    0     327000                 2           116    0   

   smoking  time  DEATH_EVENT  
0        0     4            1  
1        0     6            1  
2        1    