**IMPORTING THE NECESSARY LIBRARIES**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek

**LOADING AND CLEANING THE DATASET**

In [2]:
#Loading the dataset
stroke_data = pd.read_csv('/content/healthcare-stroke-data.csv')

In [None]:
stroke_data.head(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
6,53882,Male,74.0,1,1,Yes,Private,Rural,70.09,27.4,never smoked,1
7,10434,Female,69.0,0,0,No,Private,Urban,94.39,22.8,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
9,60491,Female,78.0,0,0,Yes,Private,Urban,58.57,24.2,Unknown,1


In [None]:
stroke_data.tail(10)

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
5100,68398,Male,82.0,1,0,Yes,Self-employed,Rural,71.97,28.3,never smoked,0
5101,36901,Female,45.0,0,0,Yes,Private,Urban,97.95,24.5,Unknown,0
5102,45010,Female,57.0,0,0,Yes,Private,Rural,77.93,21.7,never smoked,0
5103,22127,Female,18.0,0,0,No,Private,Urban,82.85,46.9,Unknown,1
5104,14180,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.2,40.0,never smoked,1
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,1
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0
5109,44679,Female,44.0,0,0,Yes,Govt_job,Urban,85.28,26.2,Unknown,0


In [None]:
stroke_data.shape

(5110, 12)

In [3]:
stroke_data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.265949,0.097456,0.054012,106.147677,28.893237,0.368102
std,21161.721625,22.593362,0.296607,0.226063,45.28356,7.854067,0.482336
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,1.0
max,72940.0,88.0,1.0,1.0,271.74,97.6,1.0


In [4]:
stroke_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [5]:
#Checking for null values
stroke_data.isnull().sum()

Unnamed: 0,0
id,0
gender,0
age,0
hypertension,0
heart_disease,0
ever_married,0
work_type,0
Residence_type,0
avg_glucose_level,0
bmi,201


In [6]:
#Dropping null values
stroke_data = stroke_data.dropna()

In [7]:
#Replacing the 'Unknown' value in the 'smoking status' column with 'passive smoker'.
stroke_data['smoking_status'] = stroke_data['smoking_status'].replace('Unknown', 'passive smoker')

The column 'smoking_status' had a categorical value 'Unknown' which was replaced by 'passive smoker'. This was done since the unknown patients upon diagnosis, were found to posses traits and bear the effects of an active smoker whereas they had no history of actively smoking.

HANDLING OUTLIERS

In [8]:
# Calculate Q1, Q3, and IQR for bmi column
q1 = np.percentile(stroke_data['bmi'], 25)
q3 = np.percentile(stroke_data['bmi'], 75)
iqr = q3 - q1

# Define bounds
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

#Check outliers
outliers = stroke_data[(stroke_data['bmi'] < lower_bound) | (stroke_data['bmi'] > upper_bound)]

print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)
print("Number of Outliers:", len(outliers))





Lower Bound: 9.099999999999998
Upper Bound: 47.5
Number of Outliers: 110


In [9]:
#Calculate Q1,Q3 and IQR for avg_glucose_level
Q1 = np.percentile(stroke_data['avg_glucose_level'], 25)
Q3 = np.percentile(stroke_data['avg_glucose_level'], 75)
IQR = Q3 - Q1

#Define bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

#Check outliers
outliers = stroke_data[(stroke_data['avg_glucose_level'] < lower_bound) | (stroke_data['avg_glucose_level'] > upper_bound)]

print("Lower Bound:", lower_bound)
print("Upper Bound:", upper_bound)
print("Number of Outliers:", len(outliers))

Lower Bound: 22.319999999999993
Upper Bound: 168.32
Number of Outliers: 567


In [10]:
#Capping both bmi and avg.glucose level outliers to the bounds

# Capping BMI
stroke_data['bmi'] = stroke_data['bmi'].clip(lower=11.99, upper=47.2)

# Capping Glucose levels
stroke_data['avg_glucose_level'] = stroke_data['avg_glucose_level'].clip(lower=20.46, upper=171.73)





*  Capping both BMI and Avg.Glucose levels column's outliers ensures extreme values are adjusted while retaining as much valid data as possible within the dataset while also Keeping the natural variability in the data and reducing the influence of extreme values.




**HYPOTHESIS AND HYPOTHESIS TESTING**

HYPOTHESIS

* Ho: There is no significant relationship between stroke occurrence and extra-personal social factors (marital status, work type and residence) in a developing economy's population.
* H1: There is a significant relationship between stroke occurrence and extra-personal social factors (marital status, work type and residence) in a developing economy's population.



HYPOTHESIS TESTING

In [None]:
# Selecting relevant columns for the analysis
data_subset = stroke_data[['ever_married', 'work_type', 'Residence_type', 'stroke']]

# Encode categorical variables
label_encoders = {}
for column in ['ever_married', 'work_type', 'Residence_type']:
    le = LabelEncoder()
    data_subset.loc[:,column] = data_subset[column].astype(str)
    data_subset.loc[:,column] = pd.to_numeric(le.fit_transform(data_subset[column]))
    label_encoders[column] = le

    # Splitting predictors (X) and outcome (y)
X = data_subset[['ever_married', 'work_type', 'Residence_type']]
y = data_subset['stroke']

# Adding a constant term to the predictors for statsmodels
X_sm = sm.add_constant(X)

X_sm = X_sm.astype(float)
y = y.astype(int)

# Fit logistic regression model using statsmodels
logit_model = sm.Logit(y, X_sm).fit()

# Summary of the model
print(logit_model.summary())



Optimization terminated successfully.
         Current function value: 0.669305
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                 stroke   No. Observations:                 4238
Model:                          Logit   Df Residuals:                     4234
Method:                           MLE   Df Model:                            3
Date:                Thu, 16 Jan 2025   Pseudo R-squ.:                 0.01233
Time:                        10:55:29   Log-Likelihood:                -2836.5
converged:                       True   LL-Null:                       -2871.9
Covariance Type:            nonrobust   LLR p-value:                 2.879e-15
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const             -0.8317      0.100     -8.299      0.000      -1.028      -0.635
ever_married     

RESULTS AND INTERPRETETIONS


1. Const: -0.8317, The log-odds of having a stroke when all predictors are at their baseline is -0.8317. This translates to a baseline probability of 0.303
2. Ever_married: 0.6241 , Being married increases the log-odds of having a stroke by 0.6241. The associated p-value (p < 0.001) indicates this is statistically significant, meaning married individuals are more likely to have a stroke compared to those who are not married.
3. Work_type:-0.0190, the coefficient is small, negative, and not significant (p=0.594). Work type does not seem to have a meaningful effect on stroke odds in this model.
4. Residence_type: 0.0644, Living in a particular type of residence has a small, positive but non-significant (p=0.306) effect on stroke likelihood.



CONCLUSION

* We reject the null hypothesis as from the statistical test, extra-personal social factors affect the occurence of stroke in the population. This is evident in the marital status factor which significantly affects the occurence of stroke within a developing economy's population.



**MACHINE LEARNING AND MODEL PREDICTIONS**

* From our above stroke dataset, we would like to come up with a highly accurate model that predicts the outcome of stroke occurence based on the various different features from our dataset.
* After feature engineering and train-test data split, different models were tested on our data to see which would have the best results.



FEATURE ENGINEERING AND DATA SPLIT

In [None]:
# Define features (X) and target (y)
X = stroke_data.drop(columns=['id', 'stroke'])
y = stroke_data['stroke']

# Perform one-hot encoding for categorical features
categorical_features = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']  # Add all your categorical columns
X = pd.get_dummies(X, columns=categorical_features, drop_first=True) # drop_first to avoid multicollinearity

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (3390, 15)
Test set size: (848, 15)


1(a).Logistic regression



In [None]:
# Initialize and train logistic regression
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train, y_train)

# Make predictions
y_pred_lr = logistic_model.predict(X_test)
y_prob_lr = logistic_model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("AUC-ROC:", roc_auc_score(y_test, y_prob_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr, zero_division=1))


Logistic Regression Performance:
Accuracy: 0.8231132075471698
AUC-ROC: 0.8819104991394147
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.88      0.85       498
           1       0.81      0.74      0.78       350

    accuracy                           0.82       848
   macro avg       0.82      0.81      0.81       848
weighted avg       0.82      0.82      0.82       848



1(b).Logistic regresssion with class weights

In [None]:
# Initialize and train logistic regression
logistic_model = LogisticRegression(class_weight='balanced',max_iter=1000, random_state=42)
logistic_model.fit(X_train, y_train)

# Make predictions
y_pred_lr = logistic_model.predict(X_test)
y_prob_lr = logistic_model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Logistic Regression Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("AUC-ROC:", roc_auc_score(y_test, y_prob_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr, zero_division=1))


Logistic Regression Performance:
Accuracy: 0.8148584905660378
AUC-ROC: 0.882071141709696
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.83      0.84       498
           1       0.76      0.80      0.78       350

    accuracy                           0.81       848
   macro avg       0.81      0.81      0.81       848
weighted avg       0.82      0.81      0.82       848



1(c).Logistic regression with undersampling

In [None]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train Logistic Regression on SMOTE data
lr = LogisticRegression(max_iter=500, solver='lbfgs')
lr.fit(X_train_smote, y_train_smote)

# Predictions and Evaluation
y_pred = lr.predict(X_test)
y_pred_proba = lr.predict_proba(X_test)[:, 1]

#Evaluate the model
print("Logistic Regression Performance (SMOTE):")
print("Accuracy (SMOTE):",accuracy_score(y_test, y_pred))
print("AUC-ROC (SMOTE):", roc_auc_score(y_test, y_pred_proba))
print("Classification Report (SMOTE):\n", classification_report(y_test, y_pred))


Logistic Regression Performance (SMOTE):
Accuracy (SMOTE): 0.8101415094339622
AUC-ROC (SMOTE): 0.8755421686746989
Classification Report (SMOTE):
               precision    recall  f1-score   support

           0       0.85      0.82      0.84       498
           1       0.76      0.79      0.77       350

    accuracy                           0.81       848
   macro avg       0.80      0.81      0.81       848
weighted avg       0.81      0.81      0.81       848



2(a).Random forest



In [None]:
# Initialize and train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Random Forest Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("AUC-ROC:", roc_auc_score(y_test, y_prob_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf, zero_division=1))


Random Forest Performance:
Accuracy: 0.8408018867924528
AUC-ROC: 0.8909982788296041
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.84      0.86       498
           1       0.79      0.84      0.81       350

    accuracy                           0.84       848
   macro avg       0.84      0.84      0.84       848
weighted avg       0.84      0.84      0.84       848



2(b).Random forest with class weights

In [None]:
# Initialize and train Random Forest
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_model.predict(X_test)
y_prob_rf = rf_model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Random Forest Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("AUC-ROC:", roc_auc_score(y_test, y_prob_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf, zero_division=1))


Random Forest Performance:
Accuracy: 0.8349056603773585
AUC-ROC: 0.8876104417670683
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.84      0.86       498
           1       0.79      0.82      0.80       350

    accuracy                           0.83       848
   macro avg       0.83      0.83      0.83       848
weighted avg       0.84      0.83      0.84       848



2(c).Random Forest with undersampling

In [None]:
# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Train Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_smote, y_train_smote)

# Predictions and Evaluation
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Random Forest Performance (SMOTE):")
print("Accuracy (SMOTE):", accuracy_score(y_test, y_pred))
print("AUC-ROC (SMOTE + Random Forest):", roc_auc_score(y_test, y_pred_proba))
print("Classification Report (SMOTE + Random Forest):\n",classification_report(y_test, y_pred))


Random Forest Performance (SMOTE):
Accuracy (SMOTE): 0.8431603773584906
AUC-ROC (SMOTE + Random Forest): 0.8860900745840505
Classification Report (SMOTE + Random Forest):
               precision    recall  f1-score   support

           0       0.89      0.83      0.86       498
           1       0.78      0.86      0.82       350

    accuracy                           0.84       848
   macro avg       0.84      0.85      0.84       848
weighted avg       0.85      0.84      0.84       848



3. Gradient boosting




In [None]:
# Initialize and train XGBoost
xgb_model = XGBClassifier( eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("XGBoost Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("AUC-ROC:", roc_auc_score(y_test, y_prob_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb, zero_division=1))


XGBoost Performance:
Accuracy: 0.8408018867924528
AUC-ROC: 0.8879288582903041
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.85      0.86       498
           1       0.79      0.83      0.81       350

    accuracy                           0.84       848
   macro avg       0.84      0.84      0.84       848
weighted avg       0.84      0.84      0.84       848



RESULTS AND INTERPRETATIONS

* The models that were employed in this study were logistic regression, random forest and gradient boosting. All three models performed exemplarily well and all could be deployed in predicting stroke cases.
* However, the random forest with model, with undersampling, works best for our dataset requirenments. It delivers the best recall (0.86) and a strong F1-score (0.82) and its accuracy (0.8432) and AUC-ROC (0.8861) are competitive which demonstrates overall reliability. The undersampling ensures better class balance, which can be crucial for imbalanced datasets in the future.





