In [1]:
!pip install pyyaml h5py #to save models in HDF5 format (if needed)



In [2]:
import os

import tensorflow as tf
from tensorflow import keras

print(tf.version.VERSION) #print tensor flow version


2.6.0


In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_score,recall_score,f1_score
import time as timer
from sklearn.inspection import permutation_importance



In [4]:
import tempfile

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

In [5]:
#load dataset from kaggle or elsewhere
url = "https://gist.githubusercontent.com/aishwarya8615/d2107f828d3f904839cbcb7eaa85bd04/raw/cec0340503d82d270821e03254993b6dede60afb/healthcare-dataset-stroke-data.csv"
data = pd.read_csv(url)
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [6]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [8]:
#Again like we did before for XGBoost we can drop the null values
data=data.dropna()
data.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [9]:
data['age'] = data['age'].astype(np.float32)
data['avg_glucose_level'] = data['avg_glucose_level'].astype(np.float32)
data['bmi'] = data['bmi'].astype(np.float32)

In [10]:
df=data[['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi','stroke']]
df.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
0,67.0,0,1,228.690002,36.599998,1
2,80.0,0,1,105.919998,32.5,1
3,49.0,0,0,171.229996,34.400002,1
4,79.0,1,0,174.119995,24.0,1
5,81.0,0,0,186.210007,29.0,1


In [11]:
# check data type of each column make sure float64 are now float32
df.dtypes

age                  float32
hypertension           int64
heart_disease          int64
avg_glucose_level    float32
bmi                  float32
stroke                 int64
dtype: object

In [12]:
data['age']

0       67.0
2       80.0
3       49.0
4       79.0
5       81.0
        ... 
5104    13.0
5106    81.0
5107    35.0
5108    51.0
5109    44.0
Name: age, Length: 4909, dtype: float32

In [13]:
data['bmi']

0       36.599998
2       32.500000
3       34.400002
4       24.000000
5       29.000000
          ...    
5104    18.600000
5106    40.000000
5107    30.600000
5108    25.600000
5109    26.200001
Name: bmi, Length: 4909, dtype: float32

In [14]:
data['avg_glucose_level']

0       228.690002
2       105.919998
3       171.229996
4       174.119995
5       186.210007
           ...    
5104    103.080002
5106    125.199997
5107     82.989998
5108    166.289993
5109     85.279999
Name: avg_glucose_level, Length: 4909, dtype: float32

In [15]:
# we then encode non numerical columns as we did before with XGBoost
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['gender']=le.fit_transform(data['gender'])
df['ever_married']=le.fit_transform(data['ever_married'])
df['work_type']=le.fit_transform(data['work_type'])
df['Residence_type']=le.fit_transform(data['Residence_type'])
df['smoking_status']=le.fit_transform(data['smoking_status'])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender,ever_married,work_type,Residence_type,smoking_status
0,67.0,0,1,228.690002,36.599998,1,1,1,2,1,1
2,80.0,0,1,105.919998,32.5,1,1,1,2,0,2
3,49.0,0,0,171.229996,34.400002,1,0,1,2,1,3
4,79.0,1,0,174.119995,24.0,1,0,1,3,0,2
5,81.0,0,0,186.210007,29.0,1,1,1,2,1,1


In [16]:
data.shape,df.shape

((4909, 12), (4909, 11))

In [17]:
y=df['stroke']
x=df.drop('stroke',axis=1)
x.shape,y.shape

((4909, 10), (4909,))

In [18]:
from sklearn.model_selection import train_test_split as tts
x_train,x_test,y_train,y_test=tts(x,y,test_size=0.2)

In [19]:
 # Define a simple sequential model
def create_model():
  
      ann_model=tf.keras.Sequential()

      ann_model.add(tf.keras.layers.Dense(units=25,activation='relu'))

      ann_model.add(tf.keras.layers.Dense(units=25,activation='relu'))

      ann_model.add(tf.keras.layers.Dense(units=1,activation='sigmoid'))

      ann_model.compile('adam','binary_crossentropy',metrics=['accuracy'])
      return ann_model

In [20]:
nn_model = create_model()
result=nn_model.fit(x_train,y_train, epochs=11)

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


In [21]:
nn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 25)                275       
_________________________________________________________________
dense_1 (Dense)              (None, 25)                650       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 26        
Total params: 951
Trainable params: 951
Non-trainable params: 0
_________________________________________________________________


In [22]:
from sklearn.metrics import confusion_matrix
y_pred=[]
for i in nn_model.predict(x_test):
    if i>0.5:
        y_pred.append(1)
    if i<0.5:
        y_pred.append(0)
confusion_matrix(y_test,y_pred)

array([[941,   6],
       [ 35,   0]])

In [23]:
from sklearn.metrics import accuracy_score
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.9582484725050916

In [24]:
# Create a Callback During training
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

In [25]:
# Create a callback that saves the model's weights
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)

In [26]:
# Train the model with the new callback
  # Pass callback to training
nn_model.fit(x_train,y_train,epochs=11, validation_data=(x_test, y_test),callbacks=[cp_callback])
# This may generate warnings related to saving the state of the optimizer.
# These warnings (and similar warnings throughout this notebook)
# are in place to discourage outdated usage, and can be ignored.

Epoch 1/11

Epoch 00001: saving model to training_1/cp.ckpt
Epoch 2/11

Epoch 00002: saving model to training_1/cp.ckpt
Epoch 3/11

Epoch 00003: saving model to training_1/cp.ckpt
Epoch 4/11

Epoch 00004: saving model to training_1/cp.ckpt
Epoch 5/11

Epoch 00005: saving model to training_1/cp.ckpt
Epoch 6/11

Epoch 00006: saving model to training_1/cp.ckpt
Epoch 7/11

Epoch 00007: saving model to training_1/cp.ckpt
Epoch 8/11

Epoch 00008: saving model to training_1/cp.ckpt
Epoch 9/11

Epoch 00009: saving model to training_1/cp.ckpt
Epoch 10/11

Epoch 00010: saving model to training_1/cp.ckpt
Epoch 11/11

Epoch 00011: saving model to training_1/cp.ckpt


<keras.callbacks.History at 0x7f815e796fd0>

In [27]:
os.listdir(checkpoint_dir)


['cp.ckpt.data-00000-of-00001',
 'cp.ckpt.index',
 'checkpoint',
 '.ipynb_checkpoints']

In [28]:
# Create a basic model instance
model = create_model()

# Evaluate the model
loss, acc = model.evaluate(x_test, y_test, verbose=2)
print("Untrained model, accuracy: {:5.2f}%".format(100 * acc))


31/31 - 0s - loss: 0.6809 - accuracy: 0.9644
Untrained model, accuracy: 96.44%


In [29]:
# Loads the weights
model.load_weights(checkpoint_path)

# Re-evaluate the model
loss, acc = model.evaluate(x_test, y_test, verbose=2)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))


31/31 - 0s - loss: 0.1396 - accuracy: 0.9644
Restored model, accuracy: 96.44%


In [30]:
# Create and train a new model instance.
model = create_model()
model.fit(x_train, y_train, epochs=11)

# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('saved_model/stroke_model')

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11
INFO:tensorflow:Assets written to: saved_model/stroke_model/assets


In [31]:
ls saved_model

[0m[01;34mstroke_model[0m/


In [32]:
ls saved_model/stroke_model

[0m[01;34massets[0m/  keras_metadata.pb  saved_model.pb  [01;34mvariables[0m/


In [33]:
# Convert .pb into  .tflite
import logging
logging.getLogger("tensorflow").setLevel(logging.DEBUG)

import tensorflow as tf
import numpy as np
assert float(tf.__version__[:3]) >= 2.3


In [34]:
new_stroke_model = tf.keras.models.load_model('saved_model/stroke_model')

# Check its architecture
new_stroke_model.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 25)                275       
_________________________________________________________________
dense_7 (Dense)              (None, 25)                650       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 26        
Total params: 951
Trainable params: 951
Non-trainable params: 0
_________________________________________________________________


In [35]:
# Evaluate the restored model
loss, acc = new_stroke_model.evaluate(x_test, y_test, verbose=2)
print('Restored model, accuracy: {:5.2f}%'.format(100 * acc))

print(new_stroke_model.predict(x_test).shape)


31/31 - 0s - loss: 0.1419 - accuracy: 0.9644
Restored model, accuracy: 96.44%
(982, 1)


In [36]:
new_stroke_model.fit(x_train, y_train, epochs=11)

Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


<keras.callbacks.History at 0x7f815f9be810>

In [37]:
converter = tf.lite.TFLiteConverter.from_keras_model(new_stroke_model)



In [38]:
converter.target_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
converter.allow_custom_ops=True
converter.experimental_new_converter =True
tflite_model = converter.convert()

INFO:tensorflow:Assets written to: /tmp/tmpy9vo8fe0/assets


In [39]:
converter = tf.lite.TFLiteConverter.from_keras_model(new_stroke_model)


converter.optimizations = [tf.lite.Optimize.DEFAULT]


In [40]:
converter.target_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
converter.allow_custom_ops=True
converter.experimental_new_converter =True
tflite_model = converter.convert()

INFO:tensorflow:Assets written to: /tmp/tmp0rzlh4xz/assets


INFO:tensorflow:Assets written to: /tmp/tmp0rzlh4xz/assets


In [41]:
ls -a


[0m[01;34m.[0m/  [01;34m..[0m/  [01;34m.config[0m/  [01;34msample_data[0m/  [01;34msaved_model[0m/  [01;34mtraining_1[0m/


In [46]:
pwd

'/content'

In [43]:
#import pathlib
import pathlib

In [44]:
# saved the tflite model in the saved_model folder


tflite_models_dir = pathlib.Path("/tmp/stroke_tflite_models/")
tflite_models_dir.mkdir(exist_ok=True, parents=True)

# Save the unquantized/float model:
tflite_model_file = tflite_models_dir/"stroke_model.tflite"
tflite_model_file.write_bytes(tflite_model)


6016

In [45]:
ls -a /tmp/

[0m[30;42m.[0m/
[01;34m..[0m/
dap_multiplexer.c3ba2d7b0f03.root.log.INFO.20211106-160202.55
[01;36mdap_multiplexer.INFO[0m@
[01;35mdebugger_23k42w6jdx[0m=
[01;34minitgoogle_syslog_dir.0[0m/
[01;34m__pycache__[0m/
[01;34mstroke_tflite_models[0m/
tmp_azz0p5h.py
tmpme16cb2m.py
tmppz_3qzii.py
tmpyy_g8vyc.py
tmpz0icu0ut.py
