In [1]:
!pip install pyyaml h5py #to save models in HDF5 format (if needed)



In [2]:
import os

import tensorflow as tf
from tensorflow import keras

print(tf.version.VERSION) #print tensor flow version


2.6.0


In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_score,recall_score,f1_score
import time as timer
from sklearn.inspection import permutation_importance



In [4]:
import tempfile

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

In [5]:
#load dataset from kaggle or elsewhere
url = "https://gist.githubusercontent.com/aishwarya8615/d2107f828d3f904839cbcb7eaa85bd04/raw/cec0340503d82d270821e03254993b6dede60afb/healthcare-dataset-stroke-data.csv"
data = pd.read_csv(url)
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [7]:


miss_val = data.isnull().sum()/len(data)*100
print(miss_val)
print("# Missing values in variable bmi\t\t: {:.2f}%".format(miss_val['bmi']))
print("# Missing values in variable smoking_status\t: {:.2f}%".format(miss_val['smoking_status']))
print("Data shape: {}".format(data.shape))



id                   0.000000
gender               0.000000
age                  0.000000
hypertension         0.000000
heart_disease        0.000000
ever_married         0.000000
work_type            0.000000
Residence_type       0.000000
avg_glucose_level    0.000000
bmi                  3.933464
smoking_status       0.000000
stroke               0.000000
dtype: float64
# Missing values in variable bmi		: 3.93%
# Missing values in variable smoking_status	: 0.00%
Data shape: (5110, 12)


In [8]:
pd.options.mode.chained_assignment = None 
# replace missing values for 'bmi' are replaced with its mean
data['bmi']=data['bmi'].fillna(data['bmi'].mean())
# drop missing values in variable 'smoking_status'
updated_data = data[data['smoking_status'].notnull()]
# drop 'id'
updated_data.drop(columns='id',axis=1,inplace=True)
# check for missing values
miss_val = updated_data.isnull().sum()/len(updated_data)*100
print(miss_val)
print("# Missing values in variable 'bmi'\t\t: {}".format(miss_val['bmi']))
print("# Missing values in variable 'smoking_status'\t: {}".format(miss_val['smoking_status']))
print("Shape of data without missing values: {}".format(updated_data.shape))

gender               0.0
age                  0.0
hypertension         0.0
heart_disease        0.0
ever_married         0.0
work_type            0.0
Residence_type       0.0
avg_glucose_level    0.0
bmi                  0.0
smoking_status       0.0
stroke               0.0
dtype: float64
# Missing values in variable 'bmi'		: 0.0
# Missing values in variable 'smoking_status'	: 0.0
Shape of data without missing values: (5110, 11)


In [9]:
# You can do this step instead of the one before if you wish to drop labels.
# data=data.dropna()
# data.columns

In [10]:
print("Unique 'gender': {}".format(updated_data['gender'].unique()))
print("Unique 'ever_married': {}".format(updated_data['ever_married'].unique()))
print("Unique 'work_type': {}".format(updated_data['work_type'].unique()))
print("Unique 'Residence_type': {}".format(updated_data['Residence_type'].unique()))
print("Unique 'smoking_status': {}".format(updated_data['smoking_status'].unique()))

Unique 'gender': ['Male' 'Female' 'Other']
Unique 'ever_married': ['Yes' 'No']
Unique 'work_type': ['Private' 'Self-employed' 'Govt_job' 'children' 'Never_worked']
Unique 'Residence_type': ['Urban' 'Rural']
Unique 'smoking_status': ['formerly smoked' 'never smoked' 'smokes' 'Unknown']


In [11]:
#Declare an Encoder for the labels above
label_gender = LabelEncoder()
label_married = LabelEncoder()
label_work = LabelEncoder()
label_residence = LabelEncoder()
label_smoking = LabelEncoder()

In [12]:
updated_data['gender'] = label_gender.fit_transform(updated_data['gender'])
updated_data['ever_married'] = label_married.fit_transform(updated_data['ever_married'])
updated_data['work_type']= label_work.fit_transform(updated_data['work_type'])
updated_data['Residence_type']= label_residence.fit_transform(updated_data['Residence_type'])
updated_data['smoking_status']= label_smoking.fit_transform(updated_data['smoking_status'])


In [13]:
updated_data['age'] = updated_data['age'].astype(np.float32)
updated_data['avg_glucose_level'] = updated_data['avg_glucose_level'].astype(np.float32)
updated_data['bmi'] = updated_data['bmi'].astype(np.float32)

In [14]:
with pd.option_context('expand_frame_repr', False):
    print(updated_data.head())

   gender   age  hypertension  heart_disease  ever_married  work_type  Residence_type  avg_glucose_level        bmi  smoking_status  stroke
0       1  67.0             0              1             1          2               1         228.690002  36.599998               1       1
1       0  61.0             0              0             1          3               0         202.210007  28.893236               2       1
2       1  80.0             0              1             1          2               0         105.919998  32.500000               2       1
3       0  49.0             0              0             1          2               1         171.229996  34.400002               3       1
4       0  79.0             1              0             1          3               0         174.119995  24.000000               2       1


In [15]:
df=updated_data[['gender','age', 'hypertension', 'heart_disease', 'ever_married','work_type', 'Residence_type', 'avg_glucose_level', 'bmi','smoking_status','stroke']]


In [16]:
# check data type of each column make sure float64 are now float32
df.dtypes

gender                 int64
age                  float32
hypertension           int64
heart_disease          int64
ever_married           int64
work_type              int64
Residence_type         int64
avg_glucose_level    float32
bmi                  float32
smoking_status         int64
stroke                 int64
dtype: object

In [17]:
updated_data['age']

0       67.0
1       61.0
2       80.0
3       49.0
4       79.0
        ... 
5105    80.0
5106    81.0
5107    35.0
5108    51.0
5109    44.0
Name: age, Length: 5110, dtype: float32

In [18]:
updated_data['bmi']

0       36.599998
1       28.893236
2       32.500000
3       34.400002
4       24.000000
          ...    
5105    28.893236
5106    40.000000
5107    30.600000
5108    25.600000
5109    26.200001
Name: bmi, Length: 5110, dtype: float32

In [19]:
updated_data['avg_glucose_level']

0       228.690002
1       202.210007
2       105.919998
3       171.229996
4       174.119995
           ...    
5105     83.750000
5106    125.199997
5107     82.989998
5108    166.289993
5109     85.279999
Name: avg_glucose_level, Length: 5110, dtype: float32

In [20]:
# we then encode non numerical columns as we did before with XGBoost
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['gender']=le.fit_transform(updated_data['gender'])
df['ever_married']=le.fit_transform(updated_data['ever_married'])
df['work_type']=le.fit_transform(updated_data['work_type'])
df['Residence_type']=le.fit_transform(updated_data['Residence_type'])
df['smoking_status']=le.fit_transform(updated_data['smoking_status'])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1,67.0,0,1,1,2,1,228.690002,36.599998,1,1
1,0,61.0,0,0,1,3,0,202.210007,28.893236,2,1
2,1,80.0,0,1,1,2,0,105.919998,32.5,2,1
3,0,49.0,0,0,1,2,1,171.229996,34.400002,3,1
4,0,79.0,1,0,1,3,0,174.119995,24.0,2,1


In [21]:
updated_data.shape,df.shape

((5110, 11), (5110, 11))

In [24]:
#  import io
#  towrite = io.BytesIO()
#  df.to_excel(towrite)  # write to BytesIO buffer
#  towrite.seek(0) 
#  print(towrite)
#  b''
#  print(type(towrite))
rec = df.to_records(index=False)

print(repr(rec))
# rec.array([(10, 18446744073709551615, 13240000000.0), (15, 230498234019, 3.14159),
#  (20, 32094812309, 234.1341)], 
#           dtype=[('a', '|u1'), ('b', '<u8'), ('c', '<f8')])

s = rec.tostring()
rec2 = np.fromstring(s, rec.dtype)

print(np.all(rec2 == rec))

rec.array([(1, 67., 0, 1, 1, 2, 1, 228.69, 36.6     , 1, 1),
           (0, 61., 0, 0, 1, 3, 0, 202.21, 28.893236, 2, 1),
           (1, 80., 0, 1, 1, 2, 0, 105.92, 32.5     , 2, 1), ...,
           (0, 35., 0, 0, 1, 3, 0,  82.99, 30.6     , 2, 0),
           (1, 51., 0, 0, 1, 2, 0, 166.29, 25.6     , 1, 0),
           (0, 44., 0, 0, 1, 0, 1,  85.28, 26.2     , 0, 0)],
          dtype=[('gender', '<i8'), ('age', '<f4'), ('hypertension', '<i8'), ('heart_disease', '<i8'), ('ever_married', '<i8'), ('work_type', '<i8'), ('Residence_type', '<i8'), ('avg_glucose_level', '<f4'), ('bmi', '<f4'), ('smoking_status', '<i8'), ('stroke', '<i8')])
True


  from ipykernel import kernelapp as app
  app.launch_new_instance()


In [25]:
rec.dtype

dtype((numpy.record, [('gender', '<i8'), ('age', '<f4'), ('hypertension', '<i8'), ('heart_disease', '<i8'), ('ever_married', '<i8'), ('work_type', '<i8'), ('Residence_type', '<i8'), ('avg_glucose_level', '<f4'), ('bmi', '<f4'), ('smoking_status', '<i8'), ('stroke', '<i8')]))

In [27]:
#save updated_data on a cvs file for later use with Rune
with open('datasets/encoded-healthcare-dataset-stroke-data.csv', 'wb') as f: 
  f.write(rec) 


In [28]:
y=df['stroke']
x=df.drop('stroke',axis=1)
x.shape,y.shape

((5110, 10), (5110,))

In [29]:
from sklearn.model_selection import train_test_split as tts
x_train,x_test,y_train,y_test=tts(x,y,test_size=0.2)

In [30]:
 # Define a simple sequential model
def create_model():
      ann_model = tf.keras.models.Sequential([
        tf.keras.layers.Flatten(input_shape=(1, 10)),
        tf.keras.layers.Dense(25, activation='relu'),
        tf.keras.layers.Dense(25, activation='relu'),
      
      tf.keras.layers.Dense(1, activation='sigmoid')
    ])

      ann_model.compile('adam','binary_crossentropy',metrics=['accuracy'])
      return ann_model

In [31]:
nn_model = create_model()
nn_model.compile('adam','binary_crossentropy',metrics=['accuracy'])

result=nn_model.fit(x_train,y_train, epochs=11)

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


In [32]:
nn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten (Flatten)            (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 25)                275       
_________________________________________________________________
dense_1 (Dense)              (None, 25)                650       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 26        
Total params: 951
Trainable params: 951
Non-trainable params: 0
_________________________________________________________________


In [33]:
from sklearn.metrics import confusion_matrix
y_pred=[]
for i in nn_model.predict(x_test):
    if i>0.5:
        y_pred.append(1)
    if i<0.5:
        y_pred.append(0)
confusion_matrix(y_test,y_pred)



array([[981,   0],
       [ 41,   0]])

In [34]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.9598825831702544

In [35]:
# Create a Callback During training
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

In [36]:
# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [37]:
# Train the model with the new callback
  # Pass callback to training
nn_model.fit(x_train,y_train,epochs=11, validation_data=(x_test, y_test),callbacks=[cp_callback])
# This may generate warnings related to saving the state of the optimizer.
# These warnings (and similar warnings throughout this notebook)
# are in place to discourage outdated usage, and can be ignored.

Epoch 1/11

Epoch 00001: saving model to training_1/cp.ckpt
Epoch 2/11

Epoch 00002: saving model to training_1/cp.ckpt
Epoch 3/11

Epoch 00003: saving model to training_1/cp.ckpt
Epoch 4/11

Epoch 00004: saving model to training_1/cp.ckpt
Epoch 5/11

Epoch 00005: saving model to training_1/cp.ckpt
Epoch 6/11

Epoch 00006: saving model to training_1/cp.ckpt
Epoch 7/11

Epoch 00007: saving model to training_1/cp.ckpt
Epoch 8/11

Epoch 00008: saving model to training_1/cp.ckpt
Epoch 9/11

Epoch 00009: saving model to training_1/cp.ckpt
Epoch 10/11

Epoch 00010: saving model to training_1/cp.ckpt
Epoch 11/11

Epoch 00011: saving model to training_1/cp.ckpt


<keras.callbacks.History at 0x7f54ca21ea90>

In [38]:
os.listdir(checkpoint_dir)


['cp.ckpt.data-00000-of-00001', 'cp.ckpt.index', 'checkpoint']

In [39]:
# Create a basic model instance
model = create_model()

# Evaluate the model
loss, acc = model.evaluate(x_test, y_test, verbose=2)
print("Untrained model, accuracy: {:5.2f}%".format(100 * acc))


32/32 - 0s - loss: 5.2657 - accuracy: 0.0548
Untrained model, accuracy:  5.48%


In [40]:
# Loads the weights
model.load_weights(checkpoint_path)

# Re-evaluate the model
loss, acc = model.evaluate(x_test, y_test, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))


32/32 - 0s - loss: 0.1669 - accuracy: 0.9579
Restored model, accuracy: 95.79%


In [41]:
# Create and train a new model instance.
model = create_model()
model.fit(x_train, y_train, epochs=11)

# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('saved_model/stroke_model')

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
INFO:tensorflow:Assets written to: saved_model/stroke_model/assets


In [42]:
ls saved_model

[0m[01;34mstroke_model[0m/


In [43]:
ls saved_model/stroke_model

[0m[01;34massets[0m/  keras_metadata.pb  saved_model.pb  [01;34mvariables[0m/


In [44]:
# Convert .pb into  .tflite
import logging
logging.getLogger("tensorflow").setLevel(logging.DEBUG)

import tensorflow as tf
import numpy as np
assert float(tf.__version__[:3]) >= 2.3


In [45]:
new_stroke_model = tf.keras.models.load_model('saved_model/stroke_model')

# Check its architecture
new_stroke_model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 10)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 25)                275       
_________________________________________________________________
dense_7 (Dense)              (None, 25)                650       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 26        
Total params: 951
Trainable params: 951
Non-trainable params: 0
_________________________________________________________________


In [46]:
# Evaluate the restored model
loss, acc = new_stroke_model.evaluate(x_test, y_test, verbose=2)
print('Restored model, accuracy: {:5.2f}%'.format(100 * acc))

print(new_stroke_model.predict(x_test).shape)


32/32 - 0s - loss: 0.1546 - accuracy: 0.9599
Restored model, accuracy: 95.99%
(1022, 1)


In [47]:
new_stroke_model.fit(x_train, y_train, epochs=11)

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<keras.callbacks.History at 0x7f54c9bfb850>

In [48]:
converter = tf.lite.TFLiteConverter.from_keras_model(new_stroke_model)

converter.target_spec.supported_ops = [
  tf.lite.OpsSet.TFLITE_BUILTINS, # enable TensorFlow Lite ops.
]


fb_model = converter.convert()

INFO:tensorflow:Assets written to: /tmp/tmpk64l4x19/assets


In [49]:
with open('stroke_tflite_models/stroke_model.tflite', 'wb') as f: 
  f.write(fb_model) 

In [50]:
ls -a


[0m[01;34m.[0m/   [01;34m.config[0m/   [01;34m.ipynb_checkpoints[0m/  [01;34msaved_model[0m/           [01;34mtraining_1[0m/
[01;34m..[0m/  [01;34mdatasets[0m/  [01;34msample_data[0m/         [01;34mstroke_tflite_models[0m/


In [51]:
pwd

'/content'

In [None]:
# path_to_zip_file = "rune/rune.x86_64-unknown-linux-gnu.zip"
# directory_to_extract_to = "rune/"

In [None]:
# import zipfile
# with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
#     zip_ref.extractall(directory_to_extract_to)

In [None]:
#mv rune ~/.cargo/bin when in workspace
