In [1]:
%pip install --upgrade tensorflow  # % sign executes a shell command through jupyter notebook

Note: you may need to restart the kernel to use updated packages.


In [9]:
import pandas as pd
import urllib.request

# Downloading data file and save in compute file store
url = "https://raw.githubusercontent.com/emmett-tomai/DHS-DUST-workshop/main/data/penguins.csv"
local_file_path = "penguins.csv"
urllib.request.urlretrieve(url, local_file_path)

# Read the data into a Pandas DataFrame
df =  pd.read_csv(local_file_path)

# Get rid of incomplete lines and show the first 5 rows
df = df.dropna()
df

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
1,37.8,17.1,186.0,3300.0,0
2,37.8,17.3,180.0,3700.0,0
3,41.1,17.6,182.0,3200.0,0
4,38.6,21.2,191.0,3800.0,0
5,34.6,21.1,198.0,4400.0,0
...,...,...,...,...,...
196,55.8,19.8,207.0,4000.0,2
197,43.5,18.1,202.0,3400.0,2
198,49.6,18.2,193.0,3775.0,2
199,50.8,19.0,210.0,4100.0,2


In [10]:
# Problem Data Definition
penguin_classes = ['Adelie', 'Gentoo', 'Chinstrap']
features = ['CulmenLength','CulmenDepth','FlipperLength','BodyMass']
label = 'Species'

In [16]:
# Pandas DataFrames and Series
# https://pandas.pydata.org/docs/user_guide/10min.html



# Deep Learning models work best when features are on similar scales
for x in features:
    df[x] = df[x]/max(df[x])


#df['CulmenLength'] = df['CulmenLength']/max(df['CulmenLength'])
#df['CulmenDepth'] = df['CulmenDepth']/max(df['CulmenDepth'])
#df['FlipperLength'] = df['FlipperLength']/max(df['FlipperLength'])
df

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
1,0.651724,0.795349,0.877358,0.687500,0
2,0.651724,0.804651,0.849057,0.770833,0
3,0.708621,0.818605,0.858491,0.666667,0
4,0.665517,0.986047,0.900943,0.791667,0
5,0.596552,0.981395,0.933962,0.916667,0
...,...,...,...,...,...
196,0.962069,0.920930,0.976415,0.833333,2
197,0.750000,0.841860,0.952830,0.708333,2
198,0.855172,0.846512,0.910377,0.786458,2
199,0.875862,0.883721,0.990566,0.854167,2


In [17]:
# Import tensorflow ANN library and easy-to-use Keras tools

import tensorflow
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import utils
from tensorflow.keras import optimizers

# Set random seed for reproducability
tensorflow.random.set_seed(8)

print("Libraries imported.")
print('TensorFlow version:',tensorflow.__version__)

2023-08-12 18:28:59.608217: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Libraries imported.
TensorFlow version: 2.13.0


In [75]:
import numpy as np

# Extract features as an array of arrays (tensorflow uses high-performance numpy arrays)
x_train = np.array([np.array(values) for values in df[features].values])
y_train = np.array([np.array(values) for values in df[label].values])

# Convert input values to float32 for network calculations
x_train = x_train.astype('float32')

# Set data types for categorical labels to "one-hot"
y_train = utils.to_categorical(y_train)

y_train


array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

In [87]:
# Create neural network

# Fully-connected "dense" layers
#  width, input dimentions, activation
#  relu and softmax (probabilities)

hl = 20 # Number of hidden layer nodes

model = Sequential()
model.add(Dense(hl, input_dim=len(features), activation='relu'))
model.add(Dense(hl, input_dim=hl, activation='relu')) 
model.add(Dense(len(penguin_classes), input_dim=hl, activation='softmax')) # Softmax used to display probablitiy

print(model.summary())

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_23 (Dense)            (None, 20)                100       
                                                                 
 dense_24 (Dense)            (None, 20)                420       
                                                                 
 dense_25 (Dense)            (None, 3)                 63        
                                                                 
Total params: 583 (2.28 KB)
Trainable params: 583 (2.28 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [89]:
# Train the model

# Optimizer and learning rate
learning_rate = 0.001
opt = optimizers.Adam(learning_rate=learning_rate)

# Loss function (minimize)
#  output: probability of each category being correct
#  cross-entroy: sum of error between predicted and actual probabilities
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# Training
#  train on full data set each epoch (and repeat)
#  batch samples together for parameter updates (more efficient, stable)

num_epochs = 50
history = model.fit(x_train, y_train, epochs=num_epochs, batch_size=30)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [91]:
# Model evalutaion (WAIT ON THIS PART)

# load testing set
url = "https://raw.githubusercontent.com/emmett-tomai/DHS-DUST-workshop/main/data/penguins_test.csv"
local_file_path = "penguins_test.csv"
urllib.request.urlretrieve(url, local_file_path)

# read the data into a Pandas DataFrame
testdf = pd.read_csv(local_file_path).dropna()

# pre-process (same as training data)
for x in features:
    testdf[x] = df[x]/max(df[x])

x_test = np.array([np.array(values) for values in testdf[features].values])
y_test = np.array([np.array(values) for values in testdf[label].values])

# Convert input values to float32 for network calculations
x_test = x_test.astype('float32')

# Set data types for categorical labels to "one-hot"
y_test = utils.to_categorical(y_test)
# get predictions from model (inference)
class_probabilities = model.predict(x_test)

# take the highest probability answer for each test sample
predictions = np.argmax(class_probabilities, axis=1)
true_label = np.argmax(y_test, axis=1)

# calculate accuracy (number right / total predictions)
right_ct = len([p for p, tl in zip(predictions, true_label) if p == tl])
acc = right_ct/len(predictions)

predictions
print(acc)

0.3103448275862069


In [94]:
# Plotting (WAIT ON THIS PART)

import matplotlib.pyplot as plt
%matplotlib inline

#plt.imshow(cm, interpolation="nearest", cmap=plt.cm.Blues)
plt.colorbar()
tick_marks = np.arange(len(penguin_classes))
plt.xticks(tick_marks, penguin_classes, rotation=85)
plt.yticks(tick_marks, penguin_classes)
plt.xlabel("Predicted Species")
plt.ylabel("Actual Species")
plt.show()

%matplotlib inline
from matplotlib import pyplot as plt

epoch_nums = range(1,num_epochs+1)
training_loss = history.history["loss"]
validation_loss = history.history["val_loss"]
plt.plot(epoch_nums, training_loss)
plt.plot(epoch_nums, validation_loss)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['training', 'validation'], loc='upper right')
plt.show()

RuntimeError: No mappable was found to use for colorbar creation. First define a mappable such as an image (with imshow) or a contour set (with contourf).

In [93]:
model.save("models/penguin-classifier.h5")


  saving_api.save_model(
