# Statistics in Machine Learning: STA 208
# Final Project
## Stroke Prediction/Prevention

URL: https://www.kaggle.com/lirilkumaramal/heart-stroke


### Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
folder_mac = "/Users/Dylan/Desktop/School Stuff/Master's Year/Spring 2021/STA 208/Final Project/"
filename = "train_strokes.csv"
df = pd.read_csv(folder_mac+filename, na_values="N/A")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43400 entries, 0 to 43399
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 43400 non-null  int64  
 1   gender             43400 non-null  object 
 2   age                43400 non-null  float64
 3   hypertension       43400 non-null  int64  
 4   heart_disease      43400 non-null  int64  
 5   ever_married       43400 non-null  object 
 6   work_type          43400 non-null  object 
 7   Residence_type     43400 non-null  object 
 8   avg_glucose_level  43400 non-null  float64
 9   bmi                41938 non-null  float64
 10  smoking_status     30108 non-null  object 
 11  stroke             43400 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 4.0+ MB


## Data Preparation

### Dealing with Null values

In [3]:
df.isna().sum()

id                       0
gender                   0
age                      0
hypertension             0
heart_disease            0
ever_married             0
work_type                0
Residence_type           0
avg_glucose_level        0
bmi                   1462
smoking_status       13292
stroke                   0
dtype: int64

In [4]:
# Only the bmi variable has NA values. I think the best course of action is to replace
# BMI nas with the median value of the patient's gender
# Ex. If patitent is female, replace their BMI with the median value for females
# We also want to drop the na values so that we can come up with a median value
male_df = df.loc[df['gender'] == "Male"].dropna()
male_bmis = male_df["bmi"].to_numpy(dtype="float32")
male_bmi_median = np.median(male_bmis)
male_bmi_avg = male_bmis.mean()
print("The median male bmi is: ", male_bmi_median)
print("The average male bmi is: ", male_bmi_avg)

# Performing the same work on the female_bmis
female_df = df.loc[df['gender'] == "Female"].dropna()
female_bmis = female_df["bmi"].to_numpy(dtype="float32")
female_bmi_median = np.median(female_bmis)
female_bmi_avg = female_bmis.mean()
print("The median female bmi is: ", female_bmi_median)
print("The average female bmi is: ", female_bmi_avg)

The median male bmi is:  29.5
The average male bmi is:  30.251165
The median female bmi is:  28.5
The average female bmi is:  29.931274


The average and median bmis for males and females are very similar. I will replace the male bmi na values with the median value since the median does a better job of accounting for outliers.

In [5]:
# Finding Patient ids for male and females that don't have a bmi set
na_bmis = df[df.isnull().any(1)]
na_female_patients = na_bmis.loc[na_bmis["gender"] == "Female"]["id"].to_numpy(dtype="int64")
na_male_patients = na_bmis.loc[na_bmis["gender"] == "Male"]["id"].to_numpy(dtype="int64")
na_other_patients = na_bmis.loc[na_bmis["gender"] == "Other"]["id"].to_numpy(dtype="int64")

# Replacing BMI NA values with median
df.loc[df["id"].isin(na_male_patients), "bmi"] = male_bmi_median
df.loc[df["id"].isin(na_female_patients), "bmi"] = female_bmi_median
df.loc[df["id"].isin(na_other_patients), "bmi"] = (female_bmi_median + male_bmi_median)/2

In [6]:
df["smoking_status"].value_counts()

never smoked       16053
formerly smoked     7493
smokes              6562
Name: smoking_status, dtype: int64

In [7]:
# Replace all NA smoking values
na_smoking_ids = df[df.isnull().any(1)]["id"].to_numpy(dtype="float32")
df.loc[df["id"].isin(na_smoking_ids), "smoking_status"] = "Unknown"

# Check if we still have any na values
df.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

### Dealing with String Data

In [8]:
df["work_type"].value_counts()

Private          24834
Self-employed     6793
children          6156
Govt_job          5440
Never_worked       177
Name: work_type, dtype: int64

In [9]:
df["Residence_type"].value_counts()

Urban    21756
Rural    21644
Name: Residence_type, dtype: int64

In [10]:
df["stroke"].value_counts()

0    42617
1      783
Name: stroke, dtype: int64

In [11]:
df["ever_married"].value_counts()

Yes    27938
No     15462
Name: ever_married, dtype: int64

In [12]:
df["gender"].value_counts()

Female    25665
Male      17724
Other        11
Name: gender, dtype: int64

In [13]:
dummies = pd.get_dummies(df[["work_type", "smoking_status"]])

# Drop the columns that we made dummy variables for
temp_df = df.drop(["work_type", "smoking_status", "id"], axis=1)
df = pd.concat([temp_df, dummies], axis=1)

In [14]:
# Replace the other binary string types with 0 or 1
# Gender
male = 1
female = 0

# For ever_married
yes = 1
no = 0

# for residence type
Urban = 1
Rural = 0

# Create a dictionary with all these encoded values
replace_dict = {"Male":1, "Female":0, "Other":2, "Yes":1, "No":0, "Urban":1, "Rural":0}
final_df = df.replace(replace_dict)
final_df.head(10)

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1,3.0,0,0,0,0,95.12,29.5,0,0,0,0,0,1,1,0,0,0
1,1,58.0,1,0,1,1,87.96,39.2,0,0,0,1,0,0,0,0,1,0
2,0,8.0,0,0,0,1,110.89,28.5,0,0,0,1,0,0,1,0,0,0
3,0,70.0,0,0,1,0,69.04,35.9,0,0,0,1,0,0,0,1,0,0
4,1,14.0,0,0,0,0,161.28,29.5,0,0,1,0,0,0,1,0,0,0
5,0,47.0,0,0,1,1,210.95,28.5,0,0,0,1,0,0,1,0,0,0
6,0,52.0,0,0,1,1,77.59,17.7,0,0,0,1,0,0,0,1,0,0
7,0,75.0,0,1,1,0,243.53,27.0,0,0,0,0,1,0,0,0,1,0
8,0,32.0,0,0,1,0,77.67,32.3,0,0,0,1,0,0,0,0,0,1
9,0,74.0,1,0,1,1,205.84,54.6,0,0,0,0,1,0,0,0,1,0


## Data Split and Scaling

In [24]:
# Convert our pandas dataframe into x and y data
y = final_df["stroke"]
X = final_df.drop(["stroke"], axis=1)

# Split our data into train and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scale our data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Lasso Fit With KFold

In [36]:
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.linear_model import LassoCV
kf = KFold(n_splits=5)
kf_train = kf.get_n_splits(X_train)
lasso = LassoCV(cv=kf_train, max_iter=2000, random_state=2)
lasso.fit(X_train, y_train)
lasso.get_params()

{'alphas': None,
 'copy_X': True,
 'cv': 5,
 'eps': 0.001,
 'fit_intercept': True,
 'max_iter': 2000,
 'n_alphas': 100,
 'n_jobs': None,
 'normalize': False,
 'positive': False,
 'precompute': 'auto',
 'random_state': 2,
 'selection': 'cyclic',
 'tol': 0.0001,
 'verbose': False}

In [37]:
print(f"Lasso lambda is : {lasso.alpha_:.2E}")
print("Lasso coefficients: \n", lasso.coef_)

Lasso lambda is : 4.04E-04
Lasso coefficients: 
 [ 0.00000000e+00  2.53913795e-02  3.09574320e-03  8.68538257e-03
 -5.37051955e-03  6.63503931e-05  5.00848371e-03 -2.47438760e-03
 -1.04874680e-03  7.70747561e-05 -0.00000000e+00  0.00000000e+00
  6.72497605e-03 -0.00000000e+00  0.00000000e+00 -2.28277216e-04
  0.00000000e+00]


In [38]:
# Print out unused columns
print("Columns not in active set:", np.array(list(X.columns))[lasso.coef_ == 0])

Columns not in active set: ['gender' 'work_type_Private' 'work_type_Self-employed'
 'smoking_status_Unknown' 'smoking_status_formerly smoked'
 'smoking_status_smokes']


### Change the number of KFold splits
This dataset is too large to apply leave one out cross validation in a reasonable amount of time

In [39]:
kf_100 = KFold(n_splits=100)
kf_train_100 = kf_100.get_n_splits(X_train)
lasso_kf = LassoCV(cv=kf_train_100, max_iter=2000, random_state=2)
lasso_kf.fit(X_train, y_train)

LassoCV(cv=100, max_iter=2000, random_state=2)

In [40]:
print(f"KFold 100 Lasso lambda is : {lasso_kf.alpha_:.2E}")
print("KFold 100 Lasso coefficients: \n", lasso_kf.coef_)

KFold 100 Lasso lambda is : 3.76E-04
KFold 100 Lasso coefficients: 
 [ 0.00000000e+00  2.54993484e-02  3.11373133e-03  8.69494989e-03
 -5.43909532e-03  9.37039258e-05  5.02821093e-03 -2.50257233e-03
 -1.06889524e-03  1.07547194e-04 -0.00000000e+00  0.00000000e+00
  6.78001993e-03 -0.00000000e+00  0.00000000e+00 -2.47630422e-04
  0.00000000e+00]


In [41]:
# Print out unused columns
print("Columns not in active set:", np.array(list(X.columns))[lasso_kf.coef_ == 0])

Columns not in active set: ['gender' 'work_type_Private' 'work_type_Self-employed'
 'smoking_status_Unknown' 'smoking_status_formerly smoked'
 'smoking_status_smokes']


## Compare Lasso Resuls with LinearRegression

In [42]:
from sklearn.linear_model import LinearRegression
# Perform linear regression prediction
reg = LinearRegression()
reg.fit(X_train, y_train)
pred = reg.predict(X_test)

# Predict from both lasso
pred_lasso_10 = lasso.predict(X_test)
pred_lasso_100 = lasso_kf.predict(X_test)

# Compare MSE
mses = []
mses.append(((pred - y_test)**2).mean())
mses.append(((pred_lasso_10 - y_test)**2).mean())
mses.append(((pred_lasso_100 - y_test)**2).mean())
print(f"Linear Regression MSE: {mses[0]:.2E}")
print(f"Lasso KFold 10 MSE: {mses[1]:.2E}")
print(f"Lasso KFold 100 MSE: {mses[1]:.2E}")

Linear Regression MSE: 1.46E-02
Lasso KFold 10 MSE: 1.46E-02
Lasso KFold 100 MSE: 1.46E-02


### Checking Accuracy of Each Predictor

In [43]:
acc = (y_test.size - np.sum(np.abs(np.round(pred) - y_test)))/y_test.size
acc_kf_10 = (y_test.size - np.sum(np.abs(np.round(pred_lasso_10) - y_test)))/y_test.size
acc_kf_100 = (y_test.size - np.sum(np.abs(np.round(pred_lasso_100) - y_test)))/y_test.size
print(f"Linear Regression accuracy {acc*100:.2f} %")
print(f"Lasso KFold 10 accuracy {acc_kf_10*100:.2f} %")
print(f"Lasso KFold 100 accuracy {acc_kf_100*100:.2f} %")

Linear Regression accuracy 98.47 %
Lasso KFold 10 accuracy 98.47 %
Lasso KFold 100 accuracy 98.47 %


After fitting the lasso on this data set, it seems like residence type and gender don't have much of an effect on stroke prediction. I notice that a lot of the smoking status variables are also being thrown out, but I still see the never smokes variable in the active set. This suggests that there may be little significance between formerly smoking and currently smoking in terms of stroke prediction. These findings are supported by the results of the MSE and Accuracy testing between linear regression and both lasso regressions. The linear regression is training with all of the input variables, but the lasso regression is training without gender, residence type, and several other data categories. The accuracy and mse of the lasso predictions exactly match the linear predictions, suggesting that omitting these data categories has negligible impact on the ability to predict a stroke. 

In [44]:
np.sum(np.round(pred_lasso_10))


0.0

In [45]:
np.sum(y_test)

133