# Classification by a neural network using Keras -- Penguins Classification

## 0. Import packages and modules

In [None]:
# Shell escape: print the current working directory of the notebook process.
# The relative paths used further down (dataset CSV, saved model) are resolved against it.
!pwd

In [None]:
# Shell escape: print the name of the machine the kernel is running on.
!hostname

In [None]:
import os

# Reduce the amount of log output produced by TensorFlow.
# Meaning of the levels:
#   '0' -> everything is logged (default)
#   '1' -> INFO is hidden
#   '2' -> INFO and WARNING are hidden
#   '3' -> INFO, WARNING, and ERROR are hidden
# This cell runs before the cell that imports tensorflow.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '1'})

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import sklearn
import tensorflow
import keras

# Record the library versions this notebook was run with, one per line:
# scikit-learn, TensorFlow, Keras.
print(sklearn.__version__, tensorflow.__version__, keras.__version__, sep="\n")

In [None]:
# Query the GPUs visible to TensorFlow once, then report the devices and how many there are.
gpu_devices = tensorflow.config.list_physical_devices('GPU')

print(gpu_devices)

print("Num GPUs Available: ", len(gpu_devices))

## 1. Formulate/outline the problem: penguin classification

## 2. Identify inputs and outputs

In [None]:
# Load the penguins table from a CSV file (path is relative to the working
# directory) and preview its first rows.
penguins = pd.read_csv('./penguins_dataset.csv')
penguins.head()

In [None]:
# (number of rows, number of columns) of the raw table.
penguins.shape

In [None]:
# Pairwise scatter plots of the four numeric measurements, coloured by species.
# The columns are selected by name. (An earlier variant sliced `penguins.iloc[:, 1:8]`,
# i.e. everything except the first (rowid) and the last (year) column.)
pairplot_columns = ["species", "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
sns.pairplot(penguins[pairplot_columns], hue="species", height=2.5)

## 3. Prepare data

In [None]:
# Remove the categorical columns; `drop` returns a new DataFrame, so `penguins`
# itself is left unchanged.
categorical_columns = ['island', 'sex']
penguins_filtered = penguins.drop(categorical_columns, axis="columns")
penguins_filtered.head(7)

In [None]:
# Keep only complete rows: any row with at least one NaN is discarded
# (`axis="index"` and `how="any"` are the defaults, spelled out for clarity).
penguins_filtered = penguins_filtered.dropna(axis="index", how="any")
penguins_filtered.head(7)

In [None]:
# Extract the columns used as model inputs: the four numeric body measurements.
# They are selected by name rather than as "everything except `species`", so that
# bookkeeping columns never end up in the feature matrix. The comment in the
# pairplot cell indicates the CSV also carries `rowid` and `year` columns; a row
# identifier is not a property of the penguin and could leak the label if the
# file happens to be ordered by species.
feature_columns = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]
features = penguins_filtered[feature_columns]
features

In [None]:
# One-hot encode the species label: one indicator column per species.
target = pd.get_dummies(penguins_filtered['species'])

print(target.head(5))   # print out the top 5 to see what it looks like.

# Randomly pick 7 examples from the dataset. `random_state` pins the draw so the
# printed sample is the same on every run (the global seeds are only set in a
# later cell, so an unseeded sample would differ between runs).
print(target.sample(7, random_state=0))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed and stratified on the one-hot target.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=0,
)

## 4. Build an architecture from scratch

In [None]:
# Fix the random seeds so that weight initialisation and training are repeatable.
# `keras.utils.set_random_seed` seeds Python's `random`, NumPy and the backend
# (TensorFlow) in a single call. A separate NumPy seed set right before it would
# be overwritten immediately, so only this one call is needed.

keras.utils.set_random_seed(2)

In [None]:
# Input layer: one value per column of the training feature matrix.
inputs = keras.Input(shape=(X_train.shape[1],))

In [None]:
# Single hidden layer: 10 fully connected units with ReLU activation, fed by `inputs`.
hidden_layer = keras.layers.Dense(10, activation="relu")(inputs)

In [None]:
# Output layer: 3 softmax units, fed by the hidden layer.
# NOTE(review): 3 is presumably the number of species columns in the one-hot
# target (`target.shape[1]`) -- confirm against the dataset.
output_layer = keras.layers.Dense(3, activation="softmax")(hidden_layer)

In [None]:
# Assemble the layers into a model that maps `inputs` to `output_layer`,
# then print an overview of its layers.
model = keras.Model(inputs=inputs, outputs=output_layer)
model.summary()

## 5. Choose a loss function and optimizer

In [None]:
# Configure training: Adam optimizer with categorical cross-entropy as the loss
# (the target is one-hot encoded and the output layer uses softmax).
loss_function = keras.losses.CategoricalCrossentropy()
model.compile(loss=loss_function, optimizer='adam')

In [None]:
# Train the model for 100 epochs on the training split and keep the object
# returned by `fit` in `history`.
# (The section headings jump from 5 to 7; presumably this cell is step 6, training.)
# NOTE(review): the features are passed in as-is -- no scaling/normalisation
# step appears anywhere before this cell.
history = model.fit(X_train, y_train, epochs=100)

## 7. Perform a prediction/classification

In [None]:
# Run the model on the test features and label each output column with the
# corresponding species name taken from the one-hot target.
y_pred = model.predict(X_test)
species_names = target.columns
prediction = pd.DataFrame(data=y_pred, columns=species_names)
prediction

In [None]:
# For every row, take the name of the column holding the largest value,
# i.e. the species the model considers most likely.
predicted_species = prediction.idxmax(axis=1)
predicted_species

## 8. Measure performance

In [None]:
from sklearn.metrics import confusion_matrix

# Decode the one-hot test labels back into species names.
true_species = y_test.idxmax(axis="columns")

# Rows are the true species, columns the predicted species.
# `labels` pins the row/column order to the one-hot column order, which is also
# what is used to label the matrix when it is turned into a DataFrame. It also
# guarantees one row/column per species even if a species does not occur in
# either the true or the predicted labels.
matrix = confusion_matrix(true_species, predicted_species, labels=y_test.columns.values)
print(matrix)

In [None]:
# Wrap the raw matrix in a DataFrame whose rows and columns carry the species
# names, and name both axes -- this helps with the readability of the heatmap.
species_labels = y_test.columns.values
confusion_df = (
    pd.DataFrame(matrix, index=species_labels, columns=species_labels)
    .rename_axis(index='True Label', columns='Predicted Label')
)
confusion_df.head()

In [None]:
# Draw the confusion matrix as a heatmap; `annot=True` writes each cell's value inside it.
sns.heatmap(confusion_df, annot=True)

## 9. Refine the model

## 10. Share model

In [None]:
# Save the trained model to a `.keras` file (path relative to the working directory).
model.save('penguins_classification.keras')

In [None]:
# Load the model back from the file written above into a separate variable.
pretrained_model = keras.models.load_model('penguins_classification.keras')

In [None]:
# Run the reloaded model on the test features and label the output columns
# with the species names from the one-hot target.
y_pretrained_pred = pretrained_model.predict(X_test)
pretrained_prediction = pd.DataFrame(data=y_pretrained_pred, columns=target.columns.values)

# For each row, idxmax picks the column with the highest value
pretrained_predicted_species = pretrained_prediction.idxmax(axis=1)
print(pretrained_predicted_species)