<a href="https://colab.research.google.com/github/AndrewDavidRatnam/HandsonWorkingML/blob/main/CustomModels_and_CustomTraining_with_tensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports and Checks

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from pathlib import Path
import sys
import pandas as pd

tf.random.set_seed(42)
np.random.seed(42)

if not (tf.config.list_physical_devices('TPU') or tf.config.list_physical_devices('GPU') ):
    print("No GPU was detected. Neural nets can be very slow without a GPU.")
    if "google.colab" in sys.modules:
        print("Go to Runtime > Change runtime and select a GPU hardware "
              "accelerator.")
    if "kaggle_secrets" in sys.modules:
        print("Go to Settings > Accelerator and select GPU.")
if tf.config.list_physical_devices("TPU"):
  print("LEZZZZZZ GOOOO  TPU") #ALL GOOD
else:
  print("Using GPU")


No GPU was detected. Neural nets can be very slow without a GPU.
Go to Runtime > Change runtime and select a GPU hardware accelerator.
Using GPU


In [None]:
train = pd.read_csv("/content/sample_data/california_housing_train.csv")
test = pd.read_csv("/content/sample_data/california_housing_test.csv")


In [None]:
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [None]:
from sklearn.model_selection import train_test_split

X_train_full = train.drop("median_house_value", axis=1)
y_train_full = train["median_house_value"]

X_test = test.drop("median_house_value", axis=1)
y_test = test["median_house_value"]

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

In [None]:
#convert dataframe/series to numpy
X_train = X_train.values
X_valid = X_valid.values
X_test = X_test.values
y_train = y_train.values
y_valid = y_valid.values
y_test = y_test.values

## Custom Layer Normalization

In [None]:
class LayerNormalization(tf.keras.layers.Layer):
  def __init__(self, eps=1e-3, **kwargs):
    super().__init__(**kwargs)
    self.eps = eps

  def build(self, batch_input_shape):
    self.alpha = self.add_weight(
        name="alpha",
        shape=batch_input_shape[-1:],
        initializer="ones")
    self.beta = self.add_weight(
        name="beta",
        shape=batch_input_shape[-1:],
        initializer="zeros")

  def call(self, X):
    mean, variance = tf.nn.moments(X, axes=-1, keepdims=True)
    return self.alpha * (X - mean) / tf.sqrt(variance + self.eps) + self.beta, mean, variance #changed this

  def get_config(self):
    base_config = super().get_config()
    return {**base_config, "eps":self.eps}


In [None]:
X = X_train.astype(np.float32)

custom_layer_norm = LayerNormalization()
keras_layer_norm = tf.keras.layers.LayerNormalization()

tf.reduce_mean(tf.keras.losses.mean_absolute_error(keras_layer_norm(X), custom_layer_norm(X)))

<tf.Tensor: shape=(), dtype=float32, numpy=4.9642363e-08>

In [None]:
#trust is in short supply these days and also double check
tf.keras.utils.set_random_seed(42)
random_alpha = np.random.rand(X.shape[-1])
random_beta = np.random.rand(X.shape[-1])

custom_layer_norm.set_weights([random_alpha, random_beta])
keras_layer_norm.set_weights([random_alpha, random_beta])

tf.reduce_mean(tf.keras.losses.mean_absolute_error(keras_layer_norm(X), custom_layer_norm(X)))

<tf.Tensor: shape=(), dtype=float32, numpy=2.9002763e-08>

## Custom Training Loop in Fashion MNIST

In [2]:
(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
X_train_full = X_train_full.astype(np.float32)/255.
X_valid, X_train = X_train_full[:5_000], X_train_full[5_000:]
y_valid, y_train = y_train_full[:5_000], y_train_full[5_000:]
X_test = X_test.astype(np.float32)/255.


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [3]:
X_train.shape

(55000, 28, 28)

In [4]:
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()

# X_train = scaler.fit_transform(tf.keras.layers.Flatten(input_shape=[28, 28])(X_train))
# X_valid = scaler.transform(tf.keras.layers.Flatten(input_shape=[28, 28])(X_valid))
# X_test = scaler.transform(tf.keras.layers.Flatten(input_shape=[28, 28])(X_test))

### Custom training Loop

In [5]:
tf.keras.utils.set_random_seed(42)

In [6]:
X_train.shape

(55000, 28, 28)

In [7]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]), #already flattened while scaling try out
    tf.keras.layers.Dense(300, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax")
])

In [8]:
def random_batch(X, y, batch_size=256):
  idx = np.random.randint(len(X), size=batch_size)
  return X[idx], y[idx]


In [9]:
def print_status_bar(step, total, loss, metrics=None):
  metrics = " - ".join([f"{m.name}: {m.result():.4f}" for m in loss + (metrics or [])]) #loss is a list of losses
  end = "" if step < total else "\n"
  print(f"\r{step}/{total} - " + metrics, end=end)


In [16]:
def print_status_validloss(loss, metrics=None):
  metrics = " - ".join([f"{m.name}: {m.result():.4f}" for m in loss + (metrics or [])]) #loss is a list of losses
  print("VALID LOSS AND ACCURACY")
  print(f"\r " + metrics)

In [10]:
tf.keras.utils.set_random_seed(42)


In [11]:
len(X_train)// 32

1718

In [12]:
n_epochs = 5
batch_size = 256 # was actually 32 that's why 1718 steps lol
n_steps = len(X_train) // batch_size
optimizer1 = tf.keras.optimizers.Nadam(learning_rate=1e-2)
optimizer2 = tf.keras.optimizers.Nadam(learning_rate=1e-3)
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
valid_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [17]:
for epoch in range(1, n_epochs + 1):
  print(f"Epoch {epoch}/{n_epochs}")
  for step in range(1, n_steps + 1):
    X_batch, y_batch = random_batch(X_train, y_train, batch_size=256)
    with tf.GradientTape() as tape:
      y_pred = model(X_batch, training=True) # can this be outside grad tape?
      main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
      loss = tf.add_n([main_loss] + model.losses)

    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer1.apply_gradients(zip(gradients, model.trainable_variables))

    for variable in model.variables:
      if variable.constraint is not None:
        variable.assign(variable.constraint(variable))

    mean_loss(loss)
    for metric in metrics:
      metric(y_batch, y_pred)

    print_status_bar(step, n_steps, [mean_loss], metrics)

  for metric in [mean_loss, valid_loss] + metrics:
    metric.reset_states()

  v_loss = tf.reduce_mean(loss_fn(y_valid, model(X_valid)))
  valid_loss(v_loss)
  for metric in metrics:
    metric(y_valid, model(X_valid))
  print_status_validloss([valid_loss], metrics)
  for metric in [mean_loss, valid_loss] + metrics:
    metric.reset_states()


Epoch 1/5
214/214 - mean: 0.2358 - sparse_categorical_accuracy: 0.9120
VALID LOSS AND ACCURACY
 mean: 0.3292 - sparse_categorical_accuracy: 0.8858
Epoch 2/5
214/214 - mean: 0.2309 - sparse_categorical_accuracy: 0.9146
VALID LOSS AND ACCURACY
 mean: 0.3341 - sparse_categorical_accuracy: 0.8898
Epoch 3/5
214/214 - mean: 0.2243 - sparse_categorical_accuracy: 0.9168
VALID LOSS AND ACCURACY
 mean: 0.3669 - sparse_categorical_accuracy: 0.8834
Epoch 4/5
214/214 - mean: 0.2319 - sparse_categorical_accuracy: 0.9138
VALID LOSS AND ACCURACY
 mean: 0.3509 - sparse_categorical_accuracy: 0.8786
Epoch 5/5
214/214 - mean: 0.2152 - sparse_categorical_accuracy: 0.9196
VALID LOSS AND ACCURACY
 mean: 0.3537 - sparse_categorical_accuracy: 0.8834


In [16]:
tf.keras.utils.set_random_seed(42)
model_2 = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(300, activation="relu"),
    tf.keras.layers.Dense(10, activation="softmax")
])

In [17]:
n_epochs = 5
batch_size = 512
n_steps = len(X_train) // batch_size
optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

In [18]:
@tf.function
def print_status_bar(step, total, loss, metrics=None):
  metrics = " - ".join([f"{m.name}: {m.result():.4f}" for m in loss + (metrics or [])]) #loss is a list of losses
  end = "" if step < total else "\n"
  print(f"\r{step}/{total} - " + metrics, end=end)

@tf.function
def print_status_validloss(loss, metrics=None):
  metrics = " - ".join([f"{m.name}: {m.result():.4f}" for m in loss + (metrics or [])]) #loss is a list of losses
  print("VALID LOSS AND ACCURACY")
  print(f"\r " + metrics)

@tf.function
def random_batch(X, y, batch_size=128):
  idx = tf.random.uniform(shape=[batch_size], maxval=len(X), dtype=tf.int32) # Use tf.random to generate indices not numpy indices
  return tf.gather(X, idx), tf.gather(y, idx) # Use tf.gather to index the tensors

In [19]:
from tqdm.notebook import trange
from collections import OrderedDict


with trange(1, n_epochs + 1, desc="All Epochs") as epochs:
  for epoch in epochs:
    with trange(1, n_steps + 1, desc=f"Epoch {epoch}/{n_epochs}") as steps:
      for step in steps:
        X_batch, y_batch = random_batch(X_train, y_train,batch_size=512)
        with tf.GradientTape() as tape:
          y_pred = model_2(X_batch)# training=True ?
          main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
          loss = tf.add_n([main_loss] + model_2.losses)

        gradients = tape.gradient(loss, model_2.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model_2.trainable_variables))
        for variable in model_2.variables:
          if variable.constraint is not None:
            variable.assign(variable.constraint(variable))

        status = OrderedDict()
        mean_loss(loss)
        status["loss"] = mean_loss.result().numpy()
        for metric in metrics:
          metric(y_batch, y_pred)
          status[metric.name] = metric.result().numpy()
        steps.set_postfix(status)

      y_pred = model(X_valid)
      status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
      status["val_accuracy"] = np.mean(tf.keras.metrics.sparse_categorical_accuracy(
            tf.constant(y_valid, dtype=np.float32), y_pred
        ))
      steps.set_postfix(status)
    for metric in [mean_loss] + metrics:
      metric.reset_states()

All Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/107 [00:00<?, ?it/s]

### Different Learning Rates for upper and lower layers

In [3]:
tf.keras.utils.set_random_seed(42)


In [4]:
lower_layers = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=[28, 28]),
    tf.keras.layers.Dense(300, activation="relu"),
])
upper_layers = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation="softmax")
])
model = tf.keras.Sequential([lower_layers, upper_layers])

In [5]:
lower_optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)
upper_optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)

In [11]:
n_epochs = 5
batch_size = 512
n_steps = len(X_train)//batch_size
loss_fn = tf.keras.losses.sparse_categorical_crossentropy
mean_loss = tf.keras.metrics.Mean()
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]


In [14]:
from tqdm.notebook import trange
from collections import OrderedDict

with trange(1, n_epochs + 1, desc="All Epochs") as epochs:
  for epoch in epochs:
    with trange(1, n_steps + 1, desc=f"Epoch {epoch}/{n_epochs}") as steps:
      for step in steps:
        X_batch, y_batch = random_batch(X_train, y_train, batch_size=512)
        with tf.GradientTape(persistent=True) as tape: # 2 optimizers so doing it twice
          y_pred = model(X_batch)
          main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
          loss = tf.add_n([main_loss] + model.losses)
        for layers, optimizer in ((lower_layers, lower_optimizer), (upper_layers, upper_optimizer)):
          gradients = tape.gradient(loss, layers.trainable_variables)
          optimizer.apply_gradients(zip(gradients, layers.trainable_variables))
        del tape #persistent = True

        for variable in model.variables:
          if variable.constraint is not None:
            variable.assign(variable.constraint(variable))

        status = OrderedDict()
        mean_loss(loss)
        status["loss"] = mean_loss.result().numpy()
        for metric in metrics:
          metric(y_batch, y_pred)
          status[metric.name] = metric.result().numpy()
        steps.set_postfix(status)
      y_pred = model(X_valid)
      status["val_loss"] = np.mean(loss_fn(y_valid, y_pred))
      status["val_accuracy"] = np.mean(tf.keras.metrics.sparse_categorical_accuracy(
          tf.constant(y_valid, dtype=np.float32), y_pred
      ))
      steps.set_postfix(status)
    for metric in [mean_loss] + metrics:
      metric.reset_states()

All Epochs:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1/5:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 2/5:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 3/5:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 4/5:   0%|          | 0/107 [00:00<?, ?it/s]

Epoch 5/5:   0%|          | 0/107 [00:00<?, ?it/s]