# Introduction to neural network classification with TensorFlow

Learning how to write neural networks for classification problems

Classification is identifying something as one thing or another.

Some types of classification problems:
* Binary Classification
* Multiclass Classification
* Multilabel Classification

## Creating data to view and fit

In [None]:
from sklearn.datasets import make_circles

# Number of points to generate
n_samples = 1000

# Generate the toy circles dataset (features in X, labels in y);
# the fixed random_state makes the generated data repeatable
X, y = make_circles(n_samples=n_samples, noise=0.03, random_state=42)

In [None]:
# Check out features
X

In [None]:
# Check the labels
y[:10]

Visualize the data

In [None]:
import polars as pl
# Tabulate both feature columns next to their label for easier inspection
circles = pl.DataFrame(
    data={
        "X0": X[:, 0],
        "X1": X[:, 1],
        "label": y,
    }
)
circles

In [None]:
# Visualize with a plot
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu);

## Input and output shapes

In [None]:
# Check the shapes of features and labels
X.shape, y.shape

In [None]:
# How many samples
len(X), len(y)

In [None]:
# View first sample of features and labels
X[0], y[0]

## Steps in modelling

1. Create or import a model
2. Compile the model
3. Fit the model
4. Evaluate the model
5. Tweak the model
6. Evaluate...

In [None]:
# Import TensorFlow
import tensorflow as tf
print(tf.__version__)

In [None]:
# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Create the model using the Sequential API (a single Dense unit)
model_1 = tf.keras.Sequential([
    tf.keras.layers.Dense(1)
])

# 2. Compile the model
# The labels are binary (0 or 1) and the model has a single output unit, so the
# loss has to be BinaryCrossentropy. CategoricalCrossentropy expects one-hot
# labels with one output per class and is not meaningful with a single output.
model_1.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

# 3. Fit the model
model_1.fit(X, y, epochs=5)

In [None]:
# Improve model by training longer
model_1.fit(X, y, epochs=50, verbose=0)
model_1.evaluate(X, y)

In [None]:
# New model: two stacked single-unit Dense layers

# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the Sequential model one layer at a time
model_2 = tf.keras.Sequential()
model_2.add(tf.keras.layers.Dense(1))
model_2.add(tf.keras.layers.Dense(1))

# 2. Compile with binary cross-entropy loss and the SGD optimizer
model_2.compile(
    optimizer=tf.keras.optimizers.SGD(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

# 3. Train for 50 epochs
model_2.fit(x=X, y=y, epochs=50)

In [None]:
# 4. Evaluate the model
model_2.evaluate(X, y)

## Improving model

1. Create a model - add more layers, increase number of hidden units in a layer, change or add activation layer
2. Compiling a model - Choose a different optimization function such as Adam instead of SGD
3. Fitting a model - fit model with more epochs (training for longer)

In [None]:
# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Assemble a three-layer Sequential model layer by layer
model_3 = tf.keras.Sequential()
model_3.add(tf.keras.layers.Dense(100))  # 100 units
model_3.add(tf.keras.layers.Dense(10))   # 10 units
model_3.add(tf.keras.layers.Dense(1))    # single output unit

# 2. Compile with binary cross-entropy loss and the Adam optimizer
model_3.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

# 3. Train for 100 epochs without progress output (verbose=0)
model_3.fit(x=X, y=y, epochs=100, verbose=0)

In [None]:
# 4. Evaluate the model
model_3.evaluate(X, y)

In [None]:
model_3.predict(X)

To visualize the model's predictions, create a function `plot_decision_boundary`. This function will:

* Take in a trained model, features (X) and labels (y)
* Create a meshgrid of the different X values
* Make predictions across the meshgrid
* Plot the prediction as well as a line between zones (where each unique class falls)

In [None]:
import numpy as np

def plot_decision_boundary(model, X, y):
    """
    Draw the zones a model predicts over the range of X, with the data on top.

    A 100 x 100 grid covering the range of X (widened by 0.1 on every side) is
    passed to `model.predict`. The predictions are drawn as filled contours and
    the points of X are scattered over them, coloured by y.

    Args:
        model: object with a `predict` method that accepts an array with one
            row per point and two columns.
        X: 2D array of features; column 0 goes on the x-axis, column 1 on the
            y-axis.
        y: values used to colour the scattered points.
    """
    pad = 0.1
    steps = 100

    # Plot limits: the range of each feature, widened by `pad`
    x_lo, x_hi = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_lo, y_hi = X[:, 1].min() - pad, X[:, 1].max() + pad

    # Regular grid of points covering the plotting area
    xx, yy = np.meshgrid(np.linspace(x_lo, x_hi, steps),
                         np.linspace(y_lo, y_hi, steps))

    # One row per grid point: (x coordinate, y coordinate)
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))

    raw_preds = model.predict(grid_points)

    # More than one value per prediction is treated as multiclass output
    if len(raw_preds[0]) > 1:
        print("doing multiclass classification")
        # The predicted class is the index of the largest value in each row
        zones = np.argmax(raw_preds, axis=1)
    else:
        print("doing binary classification")
        # Round the single output value to the nearest integer
        zones = np.round(raw_preds)

    # Give the predictions the same layout as the grid so they can be contoured
    zones = zones.reshape(xx.shape)

    plt.contourf(xx, yy, zones, cmap=plt.cm.RdYlBu, alpha=0.7)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.RdYlBu)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

In [None]:
# Plot the decision boundary
plot_decision_boundary(model=model_3, X=X, y=y)

## Non-linearity

In [None]:
# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build a model with a single Dense unit and a linear activation
model_4 = tf.keras.Sequential()
model_4.add(tf.keras.layers.Dense(1, activation="linear"))

# 2. Compile with binary cross-entropy loss and Adam (learning rate 0.001)
model_4.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=["accuracy"],
)

# 3. Train for 100 epochs and keep the training history
history = model_4.fit(x=X, y=y, epochs=100)

In [None]:
# Check out data
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.RdYlBu);

In [None]:
# Check the decision boundary for latest model
plot_decision_boundary(model_4, X, y)

Build neural network with a non-linear activation function

In [None]:
# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build a model with a single Dense unit and a non-linear (ReLU) activation
model_5 = tf.keras.Sequential()
model_5.add(tf.keras.layers.Dense(1, activation="relu"))

# 2. Compile with binary cross-entropy loss and Adam (learning rate 0.001)
model_5.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# 3. Train for 100 epochs and keep the training history
history = model_5.fit(x=X, y=y, epochs=100)

In [None]:
# A deeper model: two ReLU hidden layers followed by a single output unit

# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the model layer by layer
model_6 = tf.keras.Sequential()
model_6.add(tf.keras.layers.Dense(4, activation="relu"))
model_6.add(tf.keras.layers.Dense(4, activation="relu"))
model_6.add(tf.keras.layers.Dense(1))  # no activation on the output layer

# 2. Compile with binary cross-entropy loss and Adam (learning rate 0.001)
model_6.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# 3. Train for 250 epochs and keep the training history
history = model_6.fit(x=X, y=y, epochs=250)

In [None]:
# Plot decision boundary
plot_decision_boundary(model_6, X, y)

In [None]:
# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the model: two ReLU hidden layers and a sigmoid output unit
model_7 = tf.keras.Sequential()
model_7.add(tf.keras.layers.Dense(4, activation="relu"))
model_7.add(tf.keras.layers.Dense(4, activation="relu"))
model_7.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# 2. Compile with binary cross-entropy loss and Adam (learning rate 0.001)
model_7.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# 3. Train for 100 epochs and keep the training history
history = model_7.fit(x=X, y=y, epochs=100)

In [None]:
# 4. Evaluate model
model_7.evaluate(X, y)

In [None]:
plot_decision_boundary(model_7, X, y)

The combination of **linear (straight lines) and non-linear (non-straight lines) functions** is one of the key fundamentals of neural networks.

In [None]:
# Create a toy tensor 
A = tf.cast(tf.range(-10, 10), tf.float32)
A

In [None]:
# Visualize toy tensor
plt.plot(A)

In [None]:
# Hand-written sigmoid: sigmoid(x) = 1 / (1 + exp(-x))
def sigmoid(x):
    """Return 1 / (1 + exp(-x)), computed element-wise with `tf.exp`."""
    denominator = 1 + tf.exp(-x)
    return 1 / denominator

# Use sigmoid function
sigmoid(A)

In [None]:
# Plot tensor transformed by sigmoid
plt.plot(sigmoid(A))

In [None]:
# Hand-written ReLU: relu(x) = max(0, x)
def relu(x):
    """Return the element-wise maximum of 0 and x."""
    return tf.maximum(x=0, y=x)

# Pass toy tensor to relu function
relu(A)

In [None]:
# Plot tensor transformed by ReLU
plt.plot(relu(A))

In [None]:
tf.keras.activations.linear(A)

In [None]:
plt.plot(tf.keras.activations.linear(A))

## Evaluating and improving classification model

Create a training and test set.

In [None]:
# Check how many samples
len(X), len(y)

In [None]:
# Split into train and test sets
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Sizes of the four resulting arrays
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

In [None]:
# Train on the training split only, so the test split can be used for evaluation

# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the model (same layers as model_7)
model_8 = tf.keras.Sequential()
model_8.add(tf.keras.layers.Dense(4, activation="relu"))
model_8.add(tf.keras.layers.Dense(4, activation="relu"))
model_8.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# 2. Compile with a higher learning rate (0.01 instead of 0.001)
model_8.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# 3. Train for fewer epochs (25) and keep the training history
history = model_8.fit(x=X_train, y=y_train, epochs=25)


In [None]:
# 4. Evaluate the model (must be using test data)
model_8.evaluate(X_test, y_test)

In [None]:
# Side-by-side decision boundaries of model_8 on the training and test splits
plt.figure(figsize=(12, 6))

# Left panel: training data
plt.subplot(1, 2, 1).set_title("Train")
plot_decision_boundary(model=model_8, X=X_train, y=y_train)

# Right panel: test data
plt.subplot(1, 2, 2).set_title("Test")
plot_decision_boundary(model=model_8, X=X_test, y=y_test)

plt.show()

## Plot the loss (or training) curves

In [None]:
# Convert history object to a dataframe
pl.DataFrame(history.history)

In [None]:
# Plot the loss curves
plt.plot(pl.DataFrame(history.history))
# Take the legend labels from the DataFrame columns so every curve gets the
# name of the metric it actually shows, whatever order the history keys are in
plt.legend(pl.DataFrame(history.history).columns)
plt.title("Model 8 loss curves");

For many problems, the loss function going down means the model is improving (the predictions it's making are getting close to the ground truth labels)

## Finding the best learning rate

To find the ideal learning rate (the learning rate where the loss decreases the most during training), use the following:

* A learning rate **callback** - extra piece of functionality that can be added while training model
* Another model 
* Modified loss curves plot

In [None]:
# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the model: two ReLU hidden layers and a sigmoid output unit
model_9 = tf.keras.Sequential()
model_9.add(tf.keras.layers.Dense(4, activation="relu"))
model_9.add(tf.keras.layers.Dense(4, activation="relu"))
model_9.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# 2. Compile with binary cross-entropy loss and the default Adam optimizer
model_9.compile(
    optimizer="Adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# 3. Learning rate callback: start at 1e-4 and multiply by 10 every 20 epochs
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-4 * 10**(epoch/20))

# 4. Train for 100 epochs with the scheduler attached
history_9 = model_9.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    callbacks=[lr_scheduler],
)

In [None]:
# Check out the history
plt.figure(figsize=(10, 7))
plt.plot(pl.DataFrame(history_9.history))
# Take the legend labels from the DataFrame columns instead of hard-coding
# them, so they cannot get out of step with the order or names of the history keys
plt.legend(pl.DataFrame(history_9.history).columns)
plt.xlabel("epochs");

In [None]:
# Check out the learning rate versus the loss
# Rebuild the learning rate used in each epoch with the same formula as the
# scheduler passed to model_9.fit: 1e-4 * 10**(epoch/20).
# One value per recorded epoch keeps lrs and the loss history the same length.
lrs = 1e-4 * (10 ** (tf.range(len(history_9.history["loss"]))/20))
plt.figure(figsize=(10, 7))
plt.semilogx(lrs, history_9.history["loss"])
plt.xlabel("Learning rate")
plt.ylabel("Loss")
plt.title("Learning rate vs. Loss");

In [None]:
# Example of other typical learning rates values:
10**0, 10**-1, 10**-2, 10**-3, 1e-4

In [None]:
# Same layers again, this time with a higher fixed learning rate

# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the model: two ReLU hidden layers and a sigmoid output unit
model_10 = tf.keras.Sequential()
model_10.add(tf.keras.layers.Dense(4, activation="relu"))
model_10.add(tf.keras.layers.Dense(4, activation="relu"))
model_10.add(tf.keras.layers.Dense(1, activation="sigmoid"))

# 2. Compile with binary cross-entropy loss and Adam (learning rate 0.02)
model_10.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.02),
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# 3. Train for 20 epochs (no learning rate callback here)
history_10 = model_10.fit(x=X_train, y=y_train, epochs=20)

In [None]:
# 4. Evaluate model 10
model_10.evaluate(X_test, y_test)

In [None]:
# Evaluate model 8 
model_8.evaluate(X_test, y_test)

In [None]:
# Side-by-side decision boundaries of model_10 on the training and test splits
plt.figure(figsize=(12, 6))

# Left panel: training data
plt.subplot(1, 2, 1).set_title("Train")
plot_decision_boundary(model=model_10, X=X_train, y=y_train)

# Right panel: test data
plt.subplot(1, 2, 2).set_title("Test")
plot_decision_boundary(model=model_10, X=X_test, y=y_test)

plt.show()

## More classification evaluation methods

* Accuracy
* Precision 
* Recall
* F1-score 
* Confusion Matrix
* Classification Report (Scikit-learn)

In [None]:
# Check model accuracy
loss, accuracy = model_10.evaluate(X_test, y_test)
print(f"Model loss: {loss}")
print(f"Model accuracy: {(accuracy*100):.2f}%")

In [None]:
# Create confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Round the predicted probabilities to get binary predictions
y_preds = tf.round(model_10.predict(X_test))

# Build the confusion matrix from true vs. predicted labels and draw it
cm = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_true=y_test, y_pred=y_preds)
)
cm.plot(cmap="Blues");

# Working with a larger example (Multiclass classification)

When there are more than two classes of an option, it's known as **multi-class classification**

* This means having 3 different classes is multiclass classification.
* It also means that having 100 different classes is multiclass classification.

To practice multiclass classification, build a neural network to classify images of different items of clothing.

In [None]:
import tensorflow as tf
from tensorflow.keras.datasets import fashion_mnist

# The data has already been splitted into training and test sets
(train_data, train_labels), (test_data, test_labels) = fashion_mnist.load_data()

In [None]:
# Show the first training example
print(f"Training sample:\n{train_data[0]}\n")
print(f"Training label:\n{train_labels[0]}\n")

In [None]:
# Check the shape of a single example
train_data[0].shape, train_labels[0].shape

In [None]:
# Show a sample
import matplotlib.pyplot as plt
plt.imshow(train_data[8]);

In [None]:
# Check out samples label
train_labels[8]

In [None]:
# Create a small list to index training labels to become human readable
# Position in the list = integer label, value = human-readable name
class_names = [
    "T-shirt/top",
    "Trouser",
    "Pullover",
    "Dress",
    "Coat",
    "Sandal",
    "Shirt",
    "Sneaker",
    "Bag",
    "Ankle boot",
]
len(class_names)

In [None]:
# Plot sample image and label
chosen_index = 20
plt.imshow(train_data[chosen_index], cmap=plt.cm.binary)
plt.title(class_names[train_labels[chosen_index]]);

In [None]:
# Plot multiple random images of Fashion MNIST in a 2 x 2 grid
import random

plt.figure(figsize=(7, 7))
for i in range(4):
    plt.subplot(2, 2, i + 1)
    # randrange picks an index in [0, len(train_data)) directly,
    # without building a range object just to choose from it
    rand_index = random.randrange(len(train_data))
    plt.imshow(train_data[rand_index], cmap=plt.cm.binary)
    plt.title(class_names[train_labels[rand_index]])
    plt.axis(False)

## Building a multiclass classification model

For the multiclass classification model, a similar architecture of binary classification can be followed but a few things need to be tweaked:

* Input shape = 28 x 28 (the shape of one image)
* Output shape = 10 (one per class of clothing)
* Loss function = tf.keras.losses.CategoricalCrossentropy()
  * If labels are one-hot encoded use CategoricalCrossentropy(), but if labels are in integer form use SparseCategoricalCrossentropy()
* Output layer activation = Softmax (not Sigmoid)

In [None]:
# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the model layer by layer
model_11 = tf.keras.Sequential()
model_11.add(tf.keras.layers.Flatten(input_shape=(28, 28)))  # 28 x 28 image -> flat vector
model_11.add(tf.keras.layers.Dense(4, activation="relu"))
model_11.add(tf.keras.layers.Dense(4, activation="relu"))
model_11.add(tf.keras.layers.Dense(10, activation="softmax"))  # one output per class

# 2. Compile with categorical cross-entropy, which expects one-hot labels
model_11.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

# 3. Train on the unscaled data, one-hot encoding the integer labels on the fly
non_norm_history = model_11.fit(
    x=train_data,
    y=tf.one_hot(train_labels, depth=10),
    epochs=10,
    validation_data=(test_data, tf.one_hot(test_labels, depth=10)),
)


In [None]:
# Check model summary
model_11.summary()

In [None]:
# Check the min and max values of the training data
train_data.min(), train_data.max()

Neural networks prefer data to be scaled (or normalized). This means they prefer the numbers in the tensors to be between 0 and 1, which makes it easier for them to find patterns.

In [None]:
# Get training and testing data between 0 and 1 by dividing by the maximum
train_data_norm = train_data / 255.0
test_data_norm = test_data / 255.0

# Check the min and max values of the scaled training data
train_data_norm.min(), train_data_norm.max()

In [None]:
# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the model layer by layer
model_12 = tf.keras.Sequential()
model_12.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model_12.add(tf.keras.layers.Dense(4, activation="relu"))
model_12.add(tf.keras.layers.Dense(4, activation="relu"))
model_12.add(tf.keras.layers.Dense(10, activation="softmax"))

# 2. Compile with sparse categorical cross-entropy (labels stay as integers)
model_12.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# 3. Train on the scaled data and validate on the scaled test data
norm_history = model_12.fit(
    x=train_data_norm,
    y=train_labels,
    epochs=10,
    validation_data=(test_data_norm, test_labels),
)


In [None]:
import polars as pl
# Plot non-normalized data loss curves
plt.plot(pl.DataFrame(non_norm_history.history))
plt.legend(pl.DataFrame(non_norm_history.history).columns)
plt.title("Non-normalized data");

In [None]:
# Plot normalized data loss curves
plt.plot(pl.DataFrame(norm_history.history))
plt.legend(pl.DataFrame(norm_history.history).columns)
plt.title("Normalized data");

The same model with even *slightly* different data can produce *dramatically* different results. So when comparing models, it's important to make sure they are being compared on the same criteria (e.g. same architecture but different data or same data but different architecture)

## Find the ideal learning rate

In [None]:
# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the model layer by layer
model_13 = tf.keras.Sequential()
model_13.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model_13.add(tf.keras.layers.Dense(4, activation="relu"))
model_13.add(tf.keras.layers.Dense(4, activation="relu"))
model_13.add(tf.keras.layers.Dense(10, activation="softmax"))

# 2. Compile with sparse categorical cross-entropy and the Adam optimizer
model_13.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Learning rate callback: start at 1e-3 and multiply by 10 every 20 epochs
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-3 * 10**(epoch/20))

# 3. Train for 40 epochs with the scheduler attached
find_lr_history = model_13.fit(
    x=train_data_norm,
    y=train_labels,
    epochs=40,
    validation_data=(test_data_norm, test_labels),
    callbacks=[lr_scheduler],
)


In [None]:
# Plot the learning rate decay curve
lrs = 1e-3 * (10**(tf.range(40)/20))
plt.semilogx(lrs, find_lr_history.history["loss"])
plt.xlabel("Learning rate")
plt.ylabel("Loss")
plt.title("Finding the ideal learning rate")

In [None]:
# Train a fresh model with a fixed learning rate

# Set the global TensorFlow random seed
tf.random.set_seed(42)

# 1. Build the model layer by layer
model_14 = tf.keras.Sequential()
model_14.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model_14.add(tf.keras.layers.Dense(4, activation="relu"))
model_14.add(tf.keras.layers.Dense(4, activation="relu"))
model_14.add(tf.keras.layers.Dense(10, activation="softmax"))

# 2. Compile with sparse categorical cross-entropy and Adam (learning rate 0.001)
model_14.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# 3. Train for 20 epochs and validate on the scaled test data
history_14 = model_14.fit(
    x=train_data_norm,
    y=train_labels,
    epochs=20,
    validation_data=(test_data_norm, test_labels),
)


## Evaluating multiclass classification model

* Evaluate its performance using other classification metrics
* Assess some of its predictions (through visualizations)
* Improve its results (by training it for longer or changing the architecture)

In [None]:
# Create a confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Turn each row of predicted probabilities into the index of its largest value
predictions = model_14.predict(test_data_norm).argmax(axis=1)

# Draw the confusion matrix straight from the true and predicted labels
cm = ConfusionMatrixDisplay.from_predictions(
    y_true=test_labels,
    y_pred=predictions,
    display_labels=class_names,
    cmap="Blues",
)

# Enlarge the figure to 10 x 10 inches
cm.figure_.set_size_inches(10, 10)

Create a function for:
* Plotting a random image
* Making a prediction on said image
* Label the plot with the truth label and the predicted label

In [None]:
import random

def plot_random_image(model, images, true_labels, classes):
    """
    Picks a random image, plots it and labels it with a prediction and truth label.

    Args:
        model: object whose `predict` method accepts a batch holding one image.
        images: array of images; one of them is picked at random by index.
        true_labels: integer labels, indexed the same way as `images`.
        classes: sequence of class names, indexed by integer label.
    """

    # Pick a random valid index. randrange excludes its upper bound, whereas
    # random.randint(0, len(images)) could return len(images) and go out of range.
    i = random.randrange(len(images))

    # Create predictions and targets
    target_image = images[i]
    # Add a leading batch axis of size 1 without hard-coding the image size
    pred_probs = model.predict(target_image.reshape((1, *target_image.shape)))
    pred_label = classes[pred_probs.argmax()]
    true_label = classes[true_labels[i]]

    # Plot the image
    plt.imshow(target_image, cmap=plt.cm.binary)

    # Green label for a correct prediction, red for a wrong one
    color = "green" if pred_label == true_label else "red"

    # Add xlabel information (predicted label, its probability in percent, true label)
    plt.xlabel("Pred: {} {:2.0f}% True: {}".format(pred_label,
                                                   100*tf.reduce_max(pred_probs),
                                                   true_label),
               color=color)


In [None]:
# Check out a random image as well as its prediction
plot_random_image(model_14, 
                  images=test_data_norm, 
                  true_labels=test_labels, 
                  classes=class_names)

## What patterns is the model learning?

In [None]:
# Find the layers of the most recent model
model_14.layers

In [None]:
# Extract a particular layer
model_14.layers[1]

In [None]:
# Get the patterns of a layer in the network
weights, biases = model_14.layers[1].get_weights()

# Shapes
weights, weights.shape

In [None]:
model_14.summary()

In [None]:
biases, biases.shape

Every neuron has a bias value, so each layer has one bias vector (one entry per neuron). Each bias vector is paired with a weights matrix.

The bias vector gets initialized to zeros (the default for a Keras `Dense` layer).

The bias vector dictates how much the patterns within the corresponding weights matrix should influence the next layer.

In [None]:
# Check out another way of viewing deep learning models
import os

from tensorflow.keras.utils import plot_model

# Make sure the output directory exists before the image is written to it
os.makedirs("models", exist_ok=True)

# See the inputs and outputs of each layer
plot_model(model_14, show_shapes=True, to_file="models/model_14_layers.png")