# **Download the Dataset**

In [1]:
! rm -r ~/.kaggle
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json

In [2]:
! kaggle datasets download fedesoriano/stroke-prediction-dataset

stroke-prediction-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


# **Data Cleaning & Exploration**

In [3]:
import zipfile
ZIP_PATH = "/content/stroke-prediction-dataset.zip"
EXTRACT_PATH = "/content/dataset"

with zipfile.ZipFile(ZIP_PATH,"r") as zip_ref:
  zip_ref.extractall(EXTRACT_PATH)


In [4]:
import pandas as pd

CSV_PATH = "/content/dataset/healthcare-dataset-stroke-data.csv"

dataset = pd.read_csv(CSV_PATH)

In [5]:
dataset.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [7]:
dataset = dataset.drop(["id"],axis=1)

In [8]:
#alcuni valori di bmi sono null -> li settiamo con la media/moda della colonna
#alcune feature sono non numeriche
#i valori non sono scalati tra 0 e 1

In [9]:
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler


ordinalencoder = OrdinalEncoder()
onehot = OneHotEncoder()

dataset["ever_married"] = ordinalencoder.fit_transform(dataset[["ever_married"]])
dataset["Residence_type"] = ordinalencoder.fit_transform(dataset[["Residence_type"]])
dataset["gender"] = onehot.fit_transform(dataset[["gender"]]).toarray()
dataset["work_type"] = onehot.fit_transform(dataset[["work_type"]]).toarray()

In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1,test_size=0.1)

for train_index,test_index in split.split(dataset,dataset["stroke"]):
  train_set = dataset.loc[train_index]
  test_set = dataset.loc[test_index]

In [11]:
train_labels = train_set["stroke"].copy()
train_set = train_set.drop("stroke",axis=1)

test_labels = test_set["stroke"].copy()
test_set = test_set.drop("stroke",axis=1)


In [12]:
bmi_median = train_set["bmi"].median()
train_set["bmi"].fillna(bmi_median,inplace=True)

bmi_median = test_set["bmi"].median()
test_set["bmi"].fillna(bmi_median,inplace=True)

In [13]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4599 entries, 2781 to 3431
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4599 non-null   float64
 1   age                4599 non-null   float64
 2   hypertension       4599 non-null   int64  
 3   heart_disease      4599 non-null   int64  
 4   ever_married       4599 non-null   float64
 5   work_type          4599 non-null   float64
 6   Residence_type     4599 non-null   float64
 7   avg_glucose_level  4599 non-null   float64
 8   bmi                4599 non-null   float64
 9   smoking_status     4599 non-null   object 
dtypes: float64(7), int64(2), object(1)
memory usage: 395.2+ KB


In [14]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 511 entries, 3100 to 3103
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             511 non-null    float64
 1   age                511 non-null    float64
 2   hypertension       511 non-null    int64  
 3   heart_disease      511 non-null    int64  
 4   ever_married       511 non-null    float64
 5   work_type          511 non-null    float64
 6   Residence_type     511 non-null    float64
 7   avg_glucose_level  511 non-null    float64
 8   bmi                511 non-null    float64
 9   smoking_status     511 non-null    object 
dtypes: float64(7), int64(2), object(1)
memory usage: 43.9+ KB


In [15]:
train_set["smoking_status"].value_counts()

never smoked       1691
Unknown            1390
formerly smoked     812
smokes              706
Name: smoking_status, dtype: int64

In [16]:
import numpy as np
train_set.loc[train_set["smoking_status"] == "Unknown","smoking_status"] = pd.NA


In [17]:
train_set["smoking_status"].value_counts()

never smoked       1691
formerly smoked     812
smokes              706
Name: smoking_status, dtype: int64

In [18]:
from sklearn.preprocessing import StandardScaler

stdscaler = StandardScaler()

smoking_status_mode = train_set["smoking_status"].mode()[0]
train_set["smoking_status"].fillna(smoking_status_mode,inplace=True)




In [19]:
train_set["smoking_status"].value_counts()

never smoked       3081
formerly smoked     812
smokes              706
Name: smoking_status, dtype: int64

In [20]:
ordinalencoder = OrdinalEncoder(categories=[['never smoked','formerly smoked','smokes']])
train_set["smoking_status"] = ordinalencoder.fit_transform(train_set[["smoking_status"]])

train_set = stdscaler.fit_transform(train_set)

test_set.loc[test_set["smoking_status"] == "Unknown","smoking_status"] = pd.NA
smoking_status_mode = test_set["smoking_status"].mode()[0]
test_set["smoking_status"].fillna(smoking_status_mode,inplace=True)

test_set["smoking_status"] = ordinalencoder.fit_transform(test_set[["smoking_status"]])
test_set = stdscaler.fit_transform(test_set)

In [21]:
train_set.shape

(4599, 10)

# **Training and Testing**

In [22]:
import tensorflow as tf

In [24]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(100,activation="relu",input_shape=(10,)))
model.add(tf.keras.layers.Dense(100,activation="relu"))
model.add(tf.keras.layers.Dense(1,activation="sigmoid"))

In [25]:
model.compile(optimizer='adam',loss="binary_crossentropy",metrics=["accuracy"])

In [26]:
def exponential_decay_fn(epoch):
  return 0.01 * 0.1**(epoch/20)

lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)

In [29]:
earlystopping = tf.keras.callbacks.EarlyStopping(monitor = 'loss',patience=3)

In [30]:
model.fit(train_set,train_labels,epochs=30,validation_split=0.2,batch_size=32,callbacks=[lr_scheduler,earlystopping])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fa47b229850>

In [31]:
model.evaluate(test_set,test_labels)



[0.4937700927257538, 0.9393346309661865]