In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer

In [36]:
# Load the dataset
file_path = 'healthcare-dataset-stroke-data.csv' # Update this path accordingly
stroke_data = pd.read_csv(file_path)
stroke_data.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [37]:
from sklearn.preprocessing import LabelEncoder
# Create a copy of the data to encode
encoded_stroke_data = stroke_data.copy()

# Use label encoder to code categotical variables
label_encoder = LabelEncoder()

# List of categorical columns to be encoded
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Apply label encoder for each categorical column
for column in categorical_columns:
    encoded_stroke_data[column] = label_encoder.fit_transform(encoded_stroke_data[column].astype(str))

# Display the encoded categorical variables
encoded_stroke_data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,1,67.0,0,1,1,2,1,228.69,36.6,1,1
1,51676,0,61.0,0,0,1,3,0,202.21,,2,1
2,31112,1,80.0,0,1,1,2,0,105.92,32.5,2,1
3,60182,0,49.0,0,0,1,2,1,171.23,34.4,3,1
4,1665,0,79.0,1,0,1,3,0,174.12,24.0,2,1


In [38]:
import statsmodels.api as sm
from sklearn.impute import SimpleImputer
# Handling missing values for BMI by mean imputation
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
encoded_stroke_data['bmi'] = imputer.fit_transform(encoded_stroke_data[['bmi']])

# Features and target variable
X = encoded_stroke_data.drop(['id', 'stroke'], axis=1)  # id is not a useful feature, stroke is the DV
y = encoded_stroke_data['stroke']

# Adding constant for intercept term as statsmodels doesn't add it by default
X = sm.add_constant(X)

# Fit the logistic regression model using statsmodels
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Getting the summary of the regression
regression_summary = result.summary()
p_values = result.pvalues

#regression_summary
p_values

Optimization terminated successfully.
         Current function value: 0.155533
         Iterations 9


const                4.384989e-41
gender               7.134724e-01
age                  1.143494e-39
hypertension         1.845758e-02
heart_disease        8.974717e-02
ever_married         3.842890e-01
work_type            4.673000e-01
Residence_type       4.725803e-01
avg_glucose_level    6.170394e-04
bmi                  8.780803e-01
smoking_status       9.951107e-01
dtype: float64

In [39]:
# Create a list of variables with p-values less than 0.1, which are significant at the 95% confidence level
significant_variables = p_values[p_values < 0.1].index.tolist()


# However, given consideration of clinical relevance and other domain-specific knowledge,
# We'd also want to involve smoking_statusand bmi index
key_features = significant_variables + ['bmi']
key_features.remove('const')
key_features

['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']

In [40]:
# Create a copy of the data to encode
encoded_stroke_data = stroke_data.copy()

# Use label encoder to code categorical variables
label_encoder = LabelEncoder()

# List of categorical columns to be encoded
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Apply label encoder for each categorical column
for column in categorical_columns:
    encoded_stroke_data[column] = label_encoder.fit_transform(encoded_stroke_data[column].astype(str))

# Handling missing values for BMI by mean imputation
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
encoded_stroke_data['bmi'] = imputer.fit_transform(encoded_stroke_data[['bmi']])

# Features and target variable
X = encoded_stroke_data.drop(['id', 'stroke'], axis=1)
y = encoded_stroke_data['stroke']

# Adding constant for intercept term as statsmodels doesn't add it by default
X = sm.add_constant(X)

# Fit the logistic regression model using statsmodels
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Getting the p-values of the regression
p_values = result.pvalues

# Select the best 7 features based on p-values
selected_features = p_values[p_values < 0.05].index.tolist()

# Remove the constant term from selected features
selected_features.remove('const')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_stroke_data[selected_features], y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the logistic regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

Optimization terminated successfully.
         Current function value: 0.155533
         Iterations 9
Accuracy: 0.9393346379647749
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [41]:
# Create a copy of the data to encode
encoded_stroke_data = stroke_data.copy()

# Use label encoder to code categorical variables
label_encoder = LabelEncoder()

# # List of categorical columns to be encoded
# categorical_columns = ['ever_married', 'work_type']

# # Apply label encoder for each categorical column
# for column in categorical_columns:
#     encoded_stroke_data[column] = label_encoder.fit_transform(encoded_stroke_data[column].astype(str))

# # Select key features and the target based on p-values
# key_features = ['age', 'avg_glucose_level', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'bmi']
# X = encoded_stroke_data[key_features]
# y = encoded_stroke_data['stroke']

# # Handle missing values for 'bmi' and 'avg_glucose_level' by mean imputation
# imputer = SimpleImputer(strategy='mean')
# X[['avg_glucose_level', 'bmi']] = imputer.fit_transform(X[['avg_glucose_level', 'bmi']])

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# # Normalize the numerical features
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# # Initialize and train the logistic regression model with class weights
# model = LogisticRegression(random_state=42, class_weight='balanced')
# model.fit(X_train, y_train)

# # Predict on the test set
# y_pred = model.predict(X_test)

# List of categorical columns to be encoded
categorical_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

# Apply label encoder for each categorical column
for column in categorical_columns:
    encoded_stroke_data[column] = label_encoder.fit_transform(encoded_stroke_data[column].astype(str))

# Handling missing values for BMI by mean imputation
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
encoded_stroke_data['bmi'] = imputer.fit_transform(encoded_stroke_data[['bmi']])

# Features and target variable
X = encoded_stroke_data.drop(['id', 'stroke'], axis=1)
y = encoded_stroke_data['stroke']

# Adding constant for intercept term as statsmodels doesn't add it by default
X = sm.add_constant(X)

# Fit the logistic regression model using statsmodels
logit_model = sm.Logit(y, X)
result = logit_model.fit()

# Getting the p-values of the regression
p_values = result.pvalues

# Select the best 7 features based on p-values
selected_features = p_values[p_values < 0.05].index.tolist()

# Remove the constant term from selected features
selected_features.remove('const')

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(encoded_stroke_data[selected_features], y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the logistic regression model
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = lr_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output the results
print("Accuracy of the model: {:.2f}%".format(accuracy * 100))
print("Classification Report:\n", report)

Optimization terminated successfully.
         Current function value: 0.155533
         Iterations 9
Accuracy of the model: 93.93%
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       960
           1       0.00      0.00      0.00        62

    accuracy                           0.94      1022
   macro avg       0.47      0.50      0.48      1022
weighted avg       0.88      0.94      0.91      1022



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Step 1: data cleansing; importing p-value test of relevence, dummy code the categorical variables

In [42]:
# Select key features and the target
X = stroke_data[key_features]
y = stroke_data['stroke']

# Handle missing values for 'bmi' by mean imputation
imputer = SimpleImputer(strategy='mean')
X['bmi'] = imputer.fit_transform(X[['bmi']])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Normalize the numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize and train the logistic regression model with class weights
model = LogisticRegression(random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Output the results
print("Accuracy of the model: {:.2f}%".format(accuracy * 100))
print("Classification Report:\n", report)

Accuracy of the model: 74.56%
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.74      0.85       972
           1       0.14      0.80      0.24        50

    accuracy                           0.75      1022
   macro avg       0.56      0.77      0.54      1022
weighted avg       0.94      0.75      0.82      1022



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['bmi'] = imputer.fit_transform(X[['bmi']])


In [None]:
def predict_stroke(age, hypertension, heart_disease, avg_glucose_level, bmi):
    """
    Function to predict stroke risk based on user input.
    """
    # Data preprocessing (ensure all inputs are float or int as required)
    input_data = {
        'age': float(age),
        'hypertension': int(hypertension),
        'heart_disease': int(heart_disease),
        'avg_glucose_level': float(avg_glucose_level),
        'bmi': float(bmi)
    }

    # Using the previously defined model and pre-processors
    input_df = pd.DataFrame([input_data])
    input_df['bmi'] = imputer.transform(input_df[['bmi']])
    input_df = scaler.transform(input_df)
    prediction = model.predict(input_df)

    if prediction[0] == 1:
        return "High risk of stroke. Please consult a healthcare provider immediately."
    else:
        return "Low risk of stroke. Continue to maintain a healthy lifestyle."

# Collect inputs from the user
print("Please enter the following information:")
age = input("Age: ")
hypertension = input("Hypertension (1 if yes, 0 if no): ")
heart_disease = input("Heart disease (1 if yes, 0 if no): ")
avg_glucose_level = input("Average Glucose Level (a normal glucose level is a level of 70 to 99 mg/dL): ")
bmi = input("Body Mass Index (BMI): ")

# Call the prediction function
result = predict_stroke(age, hypertension, heart_disease, avg_glucose_level, bmi)
print(result)


Please enter the following information:
