In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer



# Load the preprocessed dataset
data = pd.read_csv("dataset/stroke_data.csv")

# Separate features and target variable
X = data.drop(columns=["stroke"])
y = data["stroke"]

In [2]:
data.dtypes

sex                  float64
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int64
work_type              int64
Residence_type         int64
avg_glucose_level    float64
bmi                  float64
smoking_status         int64
stroke                 int64
dtype: object

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40910 entries, 0 to 40909
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sex                40907 non-null  float64
 1   age                40910 non-null  float64
 2   hypertension       40910 non-null  int64  
 3   heart_disease      40910 non-null  int64  
 4   ever_married       40910 non-null  int64  
 5   work_type          40910 non-null  int64  
 6   Residence_type     40910 non-null  int64  
 7   avg_glucose_level  40910 non-null  float64
 8   bmi                40910 non-null  float64
 9   smoking_status     40910 non-null  int64  
 10  stroke             40910 non-null  int64  
dtypes: float64(4), int64(7)
memory usage: 3.4 MB


In [4]:
data.shape

(40910, 11)

In [5]:
data.describe(include="all")

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,40907.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0,40910.0
mean,0.555162,51.327255,0.213835,0.127719,0.82134,3.461134,0.514886,122.075901,30.406355,0.488609,0.500122
std,0.496954,21.623969,0.410017,0.333781,0.383072,0.780919,0.499784,57.561531,6.835072,0.499876,0.500006
min,0.0,-9.0,0.0,0.0,0.0,0.0,0.0,55.12,11.5,0.0,0.0
25%,0.0,35.0,0.0,0.0,1.0,3.0,0.0,78.75,25.9,0.0,0.0
50%,1.0,52.0,0.0,0.0,1.0,4.0,1.0,97.92,29.4,0.0,1.0
75%,1.0,68.0,0.0,0.0,1.0,4.0,1.0,167.59,34.1,1.0,1.0
max,1.0,103.0,1.0,1.0,1.0,4.0,1.0,271.74,92.0,1.0,1.0


In [6]:
data.head()

Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,63.0,0,1,1,4,1,228.69,36.6,1,1
1,1.0,42.0,0,1,1,4,0,105.92,32.5,0,1
2,0.0,61.0,0,0,1,4,1,171.23,34.4,1,1
3,1.0,41.0,1,0,1,3,0,174.12,24.0,0,1
4,1.0,85.0,0,0,1,4,1,186.21,29.0,1,1


In [7]:
data.isnull().sum()
# This will give you the count of missing values in each column


sex                  3
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [8]:
# Handling Missing Values
# Impute missing values in the 'sex' column with the most frequent value
imputer = SimpleImputer(strategy='most_frequent')
data[['sex']] = imputer.fit_transform(data[['sex']])

# Encoding Categorical Variables (if needed)
# Assuming all categorical variables are already encoded

# Feature Scaling (if needed)
scaler = StandardScaler()
numerical_columns = ['age', 'avg_glucose_level', 'bmi']
data[numerical_columns] = scaler.fit_transform(data[numerical_columns])

# Splitting Data into Training and Testing Sets
X = data.drop('stroke', axis=1)
y = data['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Balancing the Dataset (if needed)
class_counts = y_train.value_counts()
# If classes are imbalanced, consider using techniques like oversampling or undersampling

# Now you have preprocessed data ready for training your model

In [10]:


# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Predicting on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluating the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9984111464189684


In [11]:
# Train Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

# Predicting on the test set using Decision Tree
dt_y_pred = dt_classifier.predict(X_test)

# Evaluate Random Forest Classifier
print("Random Forest Classifier:")
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))

# Evaluate Decision Tree Classifier
print("\nDecision Tree Classifier:")
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Accuracy:", dt_accuracy)
print("Classification Report:\n", classification_report(y_test, dt_y_pred))

Random Forest Classifier:
Accuracy: 0.9984111464189684
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4022
           1       1.00      1.00      1.00      4160

    accuracy                           1.00      8182
   macro avg       1.00      1.00      1.00      8182
weighted avg       1.00      1.00      1.00      8182


Decision Tree Classifier:
Accuracy: 0.9998777804937669
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4022
           1       1.00      1.00      1.00      4160

    accuracy                           1.00      8182
   macro avg       1.00      1.00      1.00      8182
weighted avg       1.00      1.00      1.00      8182



In [12]:
from sklearn.neighbors import KNeighborsClassifier

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a K-Nearest Neighbors Classifier
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Predictions for KNN Classifier
knn_y_pred = knn_classifier.predict(X_test)

# Evaluate KNN Classifier
print("K-Nearest Neighbors Classifier:")
print("Accuracy:", accuracy_score(y_test, knn_y_pred))
print("Classification Report:\n", classification_report(y_test, knn_y_pred))


K-Nearest Neighbors Classifier:
Accuracy: 0.8813248594475678
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.78      0.87      4022
           1       0.82      0.98      0.89      4160

    accuracy                           0.88      8182
   macro avg       0.90      0.88      0.88      8182
weighted avg       0.90      0.88      0.88      8182



In [13]:
from sklearn.model_selection import GridSearchCV

# Define parameter grids for Random Forest and Decision Tree classifiers
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Instantiate Random Forest and Decision Tree classifiers
rf_classifier = RandomForestClassifier(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)

# Instantiate GridSearchCV for Random Forest and Decision Tree classifiers
rf_grid_search = GridSearchCV(rf_classifier, rf_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
dt_grid_search = GridSearchCV(dt_classifier, dt_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the models
rf_grid_search.fit(X_train, y_train)
dt_grid_search.fit(X_train, y_train)

# Best parameters for Random Forest
print("Best parameters for Random Forest:", rf_grid_search.best_params_)

# Best parameters for Decision Tree
print("Best parameters for Decision Tree:", dt_grid_search.best_params_)

# Predicting on the test set using best estimators from grid search
rf_y_pred = rf_grid_search.predict(X_test)
dt_y_pred = dt_grid_search.predict(X_test)

# Evaluate Random Forest Classifier
print("\nRandom Forest Classifier:")
rf_accuracy = accuracy_score(y_test, rf_y_pred)
print("Accuracy:", rf_accuracy)
print("Classification Report:\n", classification_report(y_test, rf_y_pred))

# Evaluate Decision Tree Classifier
print("\nDecision Tree Classifier:")
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Accuracy:", dt_accuracy)
print("Classification Report:\n", classification_report(y_test, dt_y_pred))


Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best parameters for Decision Tree: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}

Random Forest Classifier:
Accuracy: 0.9982889269127353
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4022
           1       1.00      1.00      1.00      4160

    accuracy                           1.00      8182
   macro avg       1.00      1.00      1.00      8182
weighted avg       1.00      1.00      1.00      8182


Decision Tree Classifier:
Accuracy: 0.9998777804937669
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4022
           1       1.00      1.00      1.00      4160

    accuracy                           1.00      8182
   macro avg       1.00      1.00      1.00      8182
weighted avg    

In [17]:
from sklearn.preprocessing import LabelEncoder

# Encode the target variable
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Instantiate XGBoost classifier
xgb_classifier = XGBClassifier(random_state=42)

# Define parameter grid for XGBoost classifier
xgb_param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 200, 300],
    'gamma': [0, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2]
}

# Instantiate GridSearchCV for XGBoost classifier
xgb_grid_search = GridSearchCV(xgb_classifier, xgb_param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the model
xgb_grid_search.fit(X_train, y_train_encoded)

# Best parameters for XGBoost
print("Best parameters for XGBoost:", xgb_grid_search.best_params_)

# Predicting on the test set using best estimator from grid search
xgb_y_pred = xgb_grid_search.predict(X_test)

# Evaluate XGBoost Classifier
print("\nXGBoost Classifier:")
xgb_accuracy = accuracy_score(y_test_encoded, xgb_y_pred)
print("Accuracy:", xgb_accuracy)
print("Classification Report:\n", classification_report(y_test_encoded, xgb_y_pred))


Best parameters for XGBoost: {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 1.0}

XGBoost Classifier:
Accuracy: 0.9996333414813005
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4022
           1       1.00      1.00      1.00      4160

    accuracy                           1.00      8182
   macro avg       1.00      1.00      1.00      8182
weighted avg       1.00      1.00      1.00      8182



In [None]:




# Initialize LightGBM Classifier
lgb_classifier = LGBMClassifier(random_state=42)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.05, 0.1]
}

# Initialize LightGBM Classifier
best_lgb_classifier = LGBMClassifier(num_leaves=50, max_depth=7, learning_rate=0.05, n_estimators=300, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Perform GridSearchCV for LightGBM
grid_search_lgb = GridSearchCV(lgb_classifier, param_grid_lgb, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_lgb.fit(X_train_scaled, y_train)

# Get the best parameters for LightGBM
best_params_lgb = grid_search_lgb.best_params_
print("Best Parameters for LightGBM:", best_params_lgb)

# Train the LightGBM classifier with the best parameters
best_lgb_classifier = LGBMClassifier(**best_params_lgb, random_state=42)
best_lgb_classifier.fit(X_train_scaled, y_train)

# Predictions for the best LightGBM Classifier
best_lgb_y_pred = best_lgb_classifier.predict(X_test_scaled)

# Evaluate the best LightGBM Classifier
print("Best LightGBM Classifier:")
print("Accuracy:", accuracy_score(y_test, best_lgb_y_pred))
print("Classification Report:\n", classification_report(y_test, best_lgb_y_pred))


Best Parameters for LightGBM: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 200}
Best LightGBM Classifier:
Accuracy: 0.8510525070955535
Classification Report:
               precision    recall  f1-score   support

         0.0       0.86      0.98      0.92     42795
         1.0       0.00      0.00      0.00       944
         2.0       0.56      0.18      0.27      6997

    accuracy                           0.85     50736
   macro avg       0.48      0.39      0.40     50736
weighted avg       0.81      0.85      0.81     50736



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
imputer = SimpleImputer(strategy='most_frequent')
data[['sex']] = imputer.fit_transform(data[['sex']])

# Splitting Data into Training and Testing Sets
X = data.drop('stroke', axis=1)
y = data['stroke']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
numerical_columns = ['age', 'avg_glucose_level', 'bmi']
X_train[numerical_columns] = scaler.fit_transform(X_train[numerical_columns])
X_test[numerical_columns] = scaler.transform(X_test[numerical_columns])

# Define the ANN model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred = model.predict_classes(X_test)
print(classification_report(y_test, y_pred))

NameError: name 'tf' is not defined