<a href="https://colab.research.google.com/github/DaisyLaw/Machine-Learning-practices/blob/main/dog_vision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet tf_keras

In [None]:
import os
# Set before TensorFlow is imported in a later cell.
# NOTE(review): presumably this makes tf.keras resolve to the tf_keras package installed above — confirm against the tf_keras docs.
os.environ['TF_USE_LEGACY_KERAS'] = '1'

In [None]:
import pandas as pd
import numpy as np

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# Report the library versions in use so runs can be compared.
print("TF version:", tf.__version__)
print("TF Hub version:", hub.__version__)

# list_physical_devices("GPU") returns an empty (falsy) list when no GPU is visible.
print("GPU available!" if tf.config.list_physical_devices("GPU") else "not available")

In [None]:
import pandas as pd
# Load the table that maps each image id to its breed.
labels_csv = pd.read_csv("/content/sample_data/Dog_Vision/labels.csv")
print(labels_csv.describe())
labels_csv.head()

In [None]:
# Bar chart of the number of rows per breed.
labels_csv["breed"].value_counts().plot.bar(figsize=(20, 10));

In [None]:
# Median number of rows per breed.
labels_csv["breed"].value_counts().median()

In [None]:
cp "/content/sample_data/Dog_Vision/train.zip" "."

In [None]:
# Extract the training archive into the Dog_Vision folder; later cells read images from its train/ subfolder.
!unzip /content/sample_data/Dog_Vision/train.zip -d /content/sample_data/Dog_Vision

In [None]:
from IPython.display import Image
# Render one extracted file to confirm the images are readable.
Image("/content/sample_data/Dog_Vision/train/001513dfcb2ffafc82cccf4d8bbaba97.jpg")

In [None]:
# Build the full path of every training image from the ids listed in labels.csv.
filenames = ["/content/sample_data/Dog_Vision/train/" + fname + ".jpg" for fname in labels_csv["id"]]
filenames[:10]

In [None]:
import os
# Sanity check: the number of entries in the train folder should equal the number of ids in the CSV.
if len(os.listdir("/content/sample_data/Dog_Vision/train/")) == len(filenames):
  print("Filenames match actual amount of files! Proceed.")
else:
  print("Filenames do not match actual amount of files, check the target directory.")

In [None]:
# Spot-check a single image by position.
Image(filenames[9000])

In [None]:
len(filenames)

In [None]:
# Breed recorded at the same position as the image displayed above.
labels_csv["breed"][9000]

In [None]:
# Breed names as a NumPy array, in the same row order as labels.csv (and therefore as `filenames`).
labels = np.array(labels_csv["breed"])
labels

In [None]:
len(labels)

In [None]:
# Every filename needs exactly one label.
len(labels) == len(filenames)

In [None]:
# Sorted array of the distinct breed names; its positions are used as class indices below.
unique_labels = np.unique(labels)
unique_labels

In [None]:
len(unique_labels)

In [None]:
# One boolean array per sample: True only at the position of that sample's breed in unique_labels.
boolean_labels = [label == unique_labels for label in labels]
boolean_labels[:2]

In [None]:
len(boolean_labels)

In [None]:
# Show the same label four ways: name, position in unique_labels, argmax of the boolean array, and 0/1 vector.
print(labels[0])
print(np.where(unique_labels == labels[0]))
print(boolean_labels[0].argmax())
print(boolean_labels[0].astype(int))

In [None]:
# Features are image paths; targets are the boolean label arrays.
X = filenames
y = boolean_labels

In [None]:
# Number of leading samples used for the experimental train/validation split below.
NUM_IMAGES = 1000 #@param {type: "slider", min: 1000, max: 10000, step: 1000}

In [None]:
from sklearn.model_selection import train_test_split

# Split the first NUM_IMAGES samples 80/20 into training and validation sets; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X[:NUM_IMAGES],
                                                  y[:NUM_IMAGES],
                                                  test_size=0.2,
                                                  random_state=42)

len(X_train), len(X_val), len(y_train), len(y_val)

In [None]:
# Peek at a few training paths and their boolean label arrays.
X_train[:5], y_train[:2]

In [None]:
from matplotlib.pyplot import imread
# Read one image as an array to inspect its shape.
image = imread(filenames[42])
image.shape

In [None]:
# The same array wrapped as a TensorFlow constant.
tf.constant(image)

In [None]:
IMG_SIZE = 224

def process_image(image_path, image_size=IMG_SIZE):
  """Read a JPEG file and turn it into a resized float image tensor.

  Args:
    image_path: Path of the JPEG file to read.
    image_size: Side length in pixels of the square output image.

  Returns:
    A float32 tensor of shape (image_size, image_size, 3).
  """
  raw_bytes = tf.io.read_file(image_path)
  # channels=3 forces a three-channel decode regardless of how the file is stored.
  image = tf.image.decode_jpeg(raw_bytes, channels=3)
  # Converting to float32 rescales the integer pixel values into the 0-1 range.
  image = tf.image.convert_image_dtype(image, tf.float32)
  # Resize with the caller-supplied size rather than the module-level constant.
  image = tf.image.resize(image, size=[image_size, image_size])

  return image

In [None]:
def get_image_label(image_path, label):
  """Pair the preprocessed image for `image_path` with its label.

  The path is run through `process_image`; the label is passed through
  unchanged. Returns the tuple (image, label).
  """
  return process_image(image_path), label

In [None]:
BATCH_SIZE = 32

def create_data_batches(X, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
  """Turn image paths (and optionally labels) into a batched tf.data.Dataset.

  Args:
    X: Sequence of image file paths.
    y: Sequence of labels aligned with X. Required unless test_data is True.
    batch_size: Number of samples per batch.
    valid_data: If True, build validation batches (not shuffled).
    test_data: If True, build unlabelled batches (not shuffled); takes
      precedence over valid_data.

  Returns:
    A batched dataset of images (test data) or of (image, label) pairs.

  Raises:
    ValueError: If labels are needed but y is None.
  """
  if test_data:
    print("Creating test data batches...")
    data = tf.data.Dataset.from_tensor_slices((tf.constant(X)))
    return data.map(process_image).batch(batch_size)

  # Training and validation data both need labels; fail with a clear message.
  if y is None:
    raise ValueError("y (labels) must be provided unless test_data=True")

  if valid_data:
    print("Creating validation data batches...")
  else:
    print("Creating training data batches...")

  data = tf.data.Dataset.from_tensor_slices((tf.constant(X),
                                             tf.constant(y)))
  if not valid_data:
    # Shuffle the (path, label) pairs before the images are loaded by map().
    data = data.shuffle(buffer_size=len(X))

  return data.map(get_image_label).batch(batch_size)

In [None]:
# Build the training batches (shuffled) and validation batches (unshuffled) from the split above.
train_data = create_data_batches(X_train, y_train)
val_data = create_data_batches(X_val, y_val, valid_data=True)

In [None]:
# Inspect the structure (shapes and dtypes) of the elements each dataset yields.
train_data.element_spec, val_data.element_spec

In [None]:
import matplotlib.pyplot as plt

def show_25_images(images, labels):
  """Plot up to 25 images in a 5x5 grid, each titled with its breed.

  Args:
    images: Indexable batch of images accepted by plt.imshow.
    labels: Indexable batch of label arrays aligned with `images`; the argmax
      of each one is looked up in `unique_labels` for the title.
  """
  plt.figure(figsize=(10, 10))

  # A batch may hold fewer than 25 samples, so never index past its end.
  for i in range(min(25, len(images))):
    plt.subplot(5, 5, i+1)
    plt.imshow(images[i])
    plt.title(unique_labels[labels[i].argmax()])
    plt.axis("off")

In [None]:
# Pull the first batch out of the training dataset as NumPy arrays and plot it.
train_images, train_labels = next(train_data.as_numpy_iterator())
show_25_images(train_images, train_labels)

In [None]:
# Same for the first validation batch.
val_images, val_labels = next(val_data.as_numpy_iterator())
show_25_images(val_images, val_labels)

In [None]:
# Input shape passed to model.build: [batch, height, width, colour channels]; None leaves the batch size open.
INPUT_SHAPE = [None, IMG_SIZE, IMG_SIZE, 3]
# One output unit per distinct breed.
OUTPUT_SHAPE = len(unique_labels)
# Location of the pretrained MobileNet V2 model loaded through hub.KerasLayer.
MODEL_URL = "https://www.kaggle.com/models/google/mobilenet-v2/tensorFlow2/130-224-classification/2"

In [None]:
import tf_keras

def create_model(input_shape=INPUT_SHAPE, output_shape=OUTPUT_SHAPE, model_url=MODEL_URL):
  """Build and compile a classifier on top of a pretrained hub model.

  Args:
    input_shape: Shape passed to model.build.
    output_shape: Number of units in the final softmax layer.
    model_url: Location of the pretrained model loaded via hub.KerasLayer.

  Returns:
    A compiled and built tf_keras Sequential model.
  """
  print("Building model with:", model_url)

  model = tf_keras.Sequential([
    hub.KerasLayer(model_url), # Layer 1 (input layer)
    tf_keras.layers.Dense(units=output_shape,
                          activation="softmax") # Layer 2 (output layer)
  ])

  # Categorical cross-entropy pairs with the softmax output and array-style labels.
  model.compile(
      loss=tf_keras.losses.CategoricalCrossentropy(),
      optimizer=tf_keras.optimizers.Adam(),
      metrics=["accuracy"]
  )

  model.build(input_shape)

  return model

In [None]:
# Build a model with the default settings and print its layer summary.
model = create_model()
model.summary()

In [None]:
# Load the TensorBoard notebook extension used by the %tensorboard cell later on.
%load_ext tensorboard

In [None]:
import datetime
import os

def create_tensorboard_callback():
  """Return a TensorBoard callback that logs to a fresh timestamped folder.

  The folder lives under the Dog_Vision logs directory and is named after the
  current date and time.
  """
  run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  logdir = os.path.join("/content/sample_data/Dog_Vision/logs", run_stamp)
  return tf_keras.callbacks.TensorBoard(logdir)

In [None]:
# Early-stopping callback watching validation accuracy with a patience of 3 epochs.
early_stopping = tf_keras.callbacks.EarlyStopping(monitor="val_accuracy",
                                                  patience=3)

In [None]:
# Upper bound on training epochs passed to fit().
NUM_EPOCHS = 100 #@param {type: "slider", min:10, max:100, step:10}

In [None]:
# Re-check GPU visibility right before training.
print("GPU available" if tf.config.list_physical_devices("GPU") else "not available")

In [None]:
def train_model():
  """Create a new model, fit it on `train_data` and return it.

  Validation runs on `val_data` after every epoch. A new TensorBoard callback
  and the shared `early_stopping` callback are attached to the fit.
  """
  fresh_model = create_model()
  run_callbacks = [create_tensorboard_callback(), early_stopping]

  fresh_model.fit(
      x=train_data,
      epochs=NUM_EPOCHS,
      validation_data=val_data,
      validation_freq=1,
      callbacks=run_callbacks,
  )

  return fresh_model

In [None]:
# Train on the NUM_IMAGES subset; this replaces the untrained `model` created earlier.
model = train_model()

In [None]:
# Open TensorBoard on the folder the training callback writes to.
%tensorboard --logdir /content/sample_data/Dog_Vision/logs

In [None]:
# One row of class probabilities per validation sample.
predictions = model.predict(val_data, verbose=1)
predictions

In [None]:
predictions.shape

In [None]:
# Inspect the first prediction: probabilities, highest value, their sum, and the winning class.
print(predictions[0])
print(f"Max value (probability of prediction): {np.max(predictions[0])}")
print(f"Sum: {np.sum(predictions[0])}")
print(f"Max index: {np.argmax(predictions[0])}")
print(f"Predicted label: {unique_labels[np.argmax(predictions[0])]}")

In [None]:
def get_pred_label(prediction_probabilities):
  """Return the breed name with the highest predicted probability."""
  best_index = np.argmax(prediction_probabilities)
  return unique_labels[best_index]

# Try it on the first validation prediction.
pred_label = get_pred_label(predictions[0])
pred_label

In [None]:
def unbatchify(data):
  """Flatten a batched (image, label) dataset into two parallel lists.

  Returns:
    (images, labels), where images holds the image arrays and labels holds the
    breed name found by taking the argmax of each label array.
  """
  samples = list(data.unbatch().as_numpy_iterator())

  images = [picture for picture, _ in samples]
  labels = [unique_labels[np.argmax(encoded)] for _, encoded in samples]

  return images, labels

# Validation images and their true breed names, in dataset order.
val_images, val_labels = unbatchify(val_data)
val_images[0], val_labels[0]

In [None]:
def plot_pred(prediction_probabilities, labels, images, n=1):
  """Show sample `n` with its predicted breed, confidence and true breed.

  The title reads "<predicted> <confidence>% (<true>)" and is green when the
  prediction matches the true label, red otherwise.
  """
  pred_prob = prediction_probabilities[n]
  true_label = labels[n]
  image = images[n]

  pred_label = get_pred_label(pred_prob)

  # Draw the picture without axis ticks.
  plt.imshow(image)
  plt.xticks([])
  plt.yticks([])

  colour = "green" if pred_label == true_label else "red"
  caption = "{} {:2.0f}% ({})".format(pred_label,
                                      np.max(pred_prob)*100,
                                      true_label)
  plt.title(caption, color=colour)

In [None]:
# Plot the prediction for the default sample (n=1).
plot_pred(prediction_probabilities=predictions,
          labels=val_labels,
          images=val_images)

In [None]:
def plot_pred_conf(prediction_probabilities, labels, n=1):
  """Bar-plot the ten highest predicted probabilities for sample `n`.

  Args:
    prediction_probabilities: Array of per-sample class probabilities.
    labels: True breed names aligned with the predictions.
    n: Index of the sample to plot.

  The bar belonging to the true label is coloured green when that label is
  among the top ten; all other bars stay grey.
  """
  pred_prob, true_label = prediction_probabilities[n], labels[n]

  # argsort is ascending, so take the last ten indices and reverse them.
  top_10_pred_indexes = pred_prob.argsort()[-10:][::-1]
  top_10_pred_values = pred_prob[top_10_pred_indexes]
  top_10_pred_labels = unique_labels[top_10_pred_indexes]

  top_plot = plt.bar(np.arange(len(top_10_pred_labels)),
                     top_10_pred_values,
                     color="grey")
  plt.xticks(np.arange(len(top_10_pred_labels)),
             labels=top_10_pred_labels,
             rotation="vertical")

  # Highlight the true label only if it made the top ten.
  if np.isin(true_label, top_10_pred_labels):
    top_plot[np.argmax(top_10_pred_labels == true_label)].set_color("green")

In [None]:
# Top-ten confidence plot for validation sample 199.
plot_pred_conf(prediction_probabilities=predictions,
               labels=val_labels,
               n=199)

In [None]:
# Offset of the first validation sample shown in the grid.
i_multiplier = 18
num_rows = 3
num_cols = 2
num_images = num_rows * num_cols

# Each sample takes two adjacent subplots (image + confidence bars), hence the doubled column count.
plt.figure(figsize=(5*2*num_cols, 5*num_rows))

for i in range(num_images):
  # Odd subplot positions hold the image with its prediction title.
  plt.subplot(num_rows, 2*num_cols, 2*i+1)
  plot_pred(prediction_probabilities=predictions,
            labels=val_labels,
            images=val_images,
            n=i+i_multiplier)
  # Even subplot positions hold the matching top-ten confidence bars.
  plt.subplot(num_rows, 2*num_cols, 2*i+2)
  plot_pred_conf(prediction_probabilities=predictions,
                 labels=val_labels,
                 n=i+i_multiplier)

plt.tight_layout(h_pad=1.0)
plt.show()

In [None]:
def save_model(model, suffix=None):
  """Save `model` under the Dog_Vision models folder and return the path.

  Args:
    model: Object exposing a `save(path)` method.
    suffix: Optional text appended to the timestamp in the file name.

  Returns:
    The path the model was saved to, of the form
    ".../models/<YYYYmmdd-HHMMSS>[-<suffix>].keras".
  """
  # %S is seconds; the lowercase %s is a platform-specific extension and is not portable.
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  modeldir = os.path.join("/content/sample_data/Dog_Vision/models", timestamp)
  # Only append the separator and suffix when a suffix was actually given.
  suffix_part = "-" + suffix if suffix else ""
  model_path = modeldir + suffix_part + ".keras"
  print(f"Saving model to: {model_path}")
  model.save(model_path)

  return model_path

In [None]:
def load_model(model_path):
  """Load a saved model from `model_path` and return it.

  "KerasLayer" is mapped to hub.KerasLayer through custom_objects so the
  saved hub layer can be reconstructed on load.
  """
  # f-string so the actual path is printed rather than the literal placeholder.
  print(f"Loading saved model from: {model_path}")
  model = tf_keras.models.load_model(model_path,
                                     custom_objects={"KerasLayer": hub.KerasLayer})

  return model

In [None]:
save_model(model, suffix="1000-images-Adam")

In [None]:
model_1000_images = load_model("/content/sample_data/Dog_Vision/models/20240829-03351724902514-1000-images-Adam.keras")

In [None]:
# Evaluate the in-memory model and the reloaded copy on the same validation data; the results should match.
model.evaluate(val_data)

In [None]:
model_1000_images.evaluate(val_data)

In [None]:
# Training batches built from every labelled image (no validation split).
full_data = create_data_batches(X, y)

In [None]:
# Fresh, untrained model for the full-data run.
full_model = create_model()

In [None]:
full_model_tensorboard = create_tensorboard_callback()

# The full-data fit is called without validation data, so this callback watches training accuracy.
full_model_early_stopping = tf_keras.callbacks.EarlyStopping(monitor="accuracy",
                                                            patience=3)

In [None]:
# Fit on all labelled images for at most NUM_EPOCHS epochs, with the two callbacks attached.
full_model.fit(x=full_data,
               epochs=NUM_EPOCHS,
               callbacks=[full_model_tensorboard,
                           full_model_early_stopping])

In [None]:
save_model(full_model, suffix="all_images_Adam")

In [None]:
load_full_model = load_model("/content/sample_data/Dog_Vision/models/20240829-04041724904296-all_images_Adam.keras")

In [None]:
cp "/content/sample_data/Dog_Vision/test.zip" "."

In [None]:
# Extract the test archive into the Dog_Vision folder; the next cell reads from its test/ subfolder.
!unzip /content/sample_data/Dog_Vision/test.zip -d /content/sample_data/Dog_Vision

In [None]:
test_path = "/content/sample_data/Dog_Vision/test/"
# Full path of every entry in the test folder, in the order os.listdir returns them.
test_filenames = [test_path + fname for fname in os.listdir(test_path)]

test_filenames[:10]

In [None]:
# Unlabelled, unshuffled batches, so prediction rows follow the order of test_filenames.
test_data = create_data_batches(test_filenames, test_data=True)

In [None]:
# Predict class probabilities for every test image with the reloaded full model.
test_predictions = load_full_model.predict(test_data,
                                             verbose=1)

In [None]:
# Preview the first ten rows of predicted probabilities.
test_predictions[:10]

In [None]:
# Empty frame with an "id" column followed by one column per breed.
preds_df = pd.DataFrame(columns = ["id"] + list(unique_labels))
preds_df

In [None]:
# Derive the ids from the same list that produced the predictions, so each id
# lines up with its prediction row (a second os.listdir call is not guaranteed
# to return the entries in the same order).
preds_df["id"] = [os.path.splitext(os.path.basename(path))[0] for path in test_filenames]
preds_df.head()

In [None]:
# Fill the breed columns with the predicted probabilities; column order follows unique_labels.
preds_df[list(unique_labels)] = test_predictions
preds_df.head()

In [None]:
# Write the predictions to CSV without the DataFrame index column.
preds_df.to_csv("/content/sample_data/Dog_Vision/prediction.csv",
                index=False)