In [1]:
### Intro to Deep Learning

## 1. A Single Neuron

# Deep learning is an approach to machine learning characterized by deep stacks of computations.
# The power of a neural network comes from the complexity of the connections neurons form.
# Neural networks are composed of neurons, where each neuron individually performs only a simple computation.

# The Linear Unit: y = wx + b
# w: weight, b: bias
# Multiple Inputs: y = w0x0 + w1x1 + w2x2 + b

In [2]:
# Linear Units in Keras

from tensorflow import keras
from tensorflow.keras import layers
# Create a network with 1 linear unit.
# The input shape is declared with an explicit `keras.Input` object instead of
# the `input_shape=` layer argument, which Keras warns against in Sequential models.
model = keras.Sequential([
    keras.Input(shape=(3,)),  # three input features
    layers.Dense(units=1),    # a single linear unit: y = w0x0 + w1x1 + w2x2 + b
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [3]:
## 2. Deep Neural Networks
# When we collect together linear units having a common set of inputs we get a dense layer.
# A layer can be any kind of data transformation.
# Many layers, like the convolutional and recurrent layers, transform data through use of neurons and differ primarily in the pattern of connections they form.

# The Activation Function
# However two dense layers with nothing in between are no better than a single dense layer by itself.
# What we need are activation functions that move us out of the world of lines and planes.
# An activation function is simply some function we apply to each of a layer's outputs.
# ex) rectified linear unit = ReLU

# Stacking Dense Layers
# The layers before the final(output) layer are hidden layers. We never see their outputs directly.
# In regression tasks, the final layer is a linear unit(=no activation function)
# Other tasks like classification might require an activation function on the output.

In [4]:
# Building Sequential Models
from tensorflow import keras
from tensorflow.keras import layers

# Two hidden ReLU layers followed by a linear output unit.
# The input shape is declared with `keras.Input` rather than `input_shape=` on the
# first Dense layer, which Keras warns against in Sequential models.
model = keras.Sequential([
    keras.Input(shape=(2,)),  # two input features
    # the hidden ReLU layers
    layers.Dense(units=4, activation='relu'),
    layers.Dense(units=3, activation='relu'),
    # the linear output layer (no activation)
    layers.Dense(units=1),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [5]:
## 3. Stochastic Gradient Descent
# The Loss Function
# The loss function measures the disparity between the target's true value and the value the model predicts.
# A common loss function for regression problems is the mean absolute error, MAE.
# MAE measures the disparity from the true target y_true by an absolute difference abs(y_true - y_pred).
# During training, the model will use the loss function as a guide for finding the correct values of its weights.

# The Optimizer - Stochastic Gradient Descent
# The optimizer is an algorithm that adjusts the weights to minimize the loss.
# Adam is a great general-purpose optimizer.
# The learning rate and the size of the minibatches are the two parameters that have the largest effect on how the SGD training proceeds.

In [6]:
# Adding the Loss and Optimizer
# Attach the Adam optimizer and the mean-absolute-error loss to the model
model.compile(loss='mae', optimizer='adam')

In [None]:
# Example - Red Wine Quality
import pandas as pd
from IPython.display import display

# Read the red wine data from the course dataset directory
red_wine = pd.read_csv('../input/dl-course-data/red-wine.csv')

# Seeded 70/30 split: sampled rows train the model, the remainder validate it
df_train = red_wine.sample(frac=0.7, random_state=0)
df_valid = red_wine.drop(index=df_train.index)
display(df_train.head(4))

# Min-max scaling with statistics taken from the training split only,
# so the training columns land in [0, 1]
min_ = df_train.min(axis=0)
max_ = df_train.max(axis=0)
df_valid = (df_valid - min_) / (max_ - min_)
df_train = (df_train - min_) / (max_ - min_)

# 'quality' is the target; every other column is a feature
y_train = df_train['quality']
y_valid = df_valid['quality']
X_train = df_train.drop(columns='quality')
X_valid = df_valid.drop(columns='quality')

print(X_train.shape)

from tensorflow import keras
from tensorflow.keras import layers

# Three hidden ReLU layers of 512 units each and a linear output unit.
# The 11 input features are declared with `keras.Input` instead of `input_shape=`
# on the first Dense layer, which Keras warns against in Sequential models.
model = keras.Sequential([
    keras.Input(shape=(11,)),
    layers.Dense(512, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(1),
])

# Configure training: Adam optimizer minimizing mean absolute error
model.compile(loss='mae', optimizer='adam')

# Train for 10 epochs in minibatches of 256 rows, evaluating on the
# validation split after each epoch
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=10,
    validation_data=(X_valid, y_valid),
)

import pandas as pd

# Put the per-epoch values recorded by `fit` into a DataFrame
history_df = pd.DataFrame(data=history.history)
# Draw the training-loss curve with pandas' built-in plotting
history_df.loc[:, 'loss'].plot();

In [None]:
## 4. Overfitting and Underfitting

# Interpreting the Learning Curves
# Underfitting the training set is when the loss is not as low as it could be b/c the model hasn't learned enough signal.
# Overfitting the training set is when the loss is not as low as it could be b/c the model learned too much noise.

# A model's capacity refers to the size and complexity of the patterns it is able to learn.
# underfitted -> try increasing its capacity
# Increase capacity of a network by making it wider(more units to existing layers) or deeper(adding more layers)
# Baseline: one hidden layer of 16 units and a linear output
model = keras.Sequential([layers.Dense(16, activation='relu'), layers.Dense(1)])

# Wider: same depth, but the hidden layer has 32 units
wider = keras.Sequential([layers.Dense(32, activation='relu'), layers.Dense(1)])

# Deeper: a second 16-unit hidden layer is added
deeper = keras.Sequential([
    layers.Dense(16, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1),
])

# Early Stopping
# When a model is too eagerly learning noise, the validation loss may start to increase during training.
# To prevent this, early stopping can stop training whenever it seems the validation loss isn't decreasing anymore.
from tensorflow.keras.callbacks import EarlyStopping

# Callback that halts training once the monitored loss stops improving
early_stopping = EarlyStopping(
    restore_best_weights=True,  # keep the weights from the best epoch
    patience=20,                # how many epochs to wait before stopping
    min_delta=0.001,            # minimum amount of change to count as an improvement
)

In [None]:
# Example - Train a Model with Early Stopping

import pandas as pd
from IPython.display import display

# Read the red wine data from the course dataset directory
red_wine = pd.read_csv('../input/dl-course-data/red-wine.csv')

# Seeded 70/30 split into training and validation rows
df_train = red_wine.sample(frac=0.7, random_state=0)
df_valid = red_wine.drop(index=df_train.index)
display(df_train.head(4))

# Min-max scaling driven by training-split statistics only
min_ = df_train.min(axis=0)
max_ = df_train.max(axis=0)
df_valid = (df_valid - min_) / (max_ - min_)
df_train = (df_train - min_) / (max_ - min_)

# Separate the 'quality' target from the feature columns
y_train = df_train['quality']
y_valid = df_valid['quality']
X_train = df_train.drop(columns='quality')
X_valid = df_valid.drop(columns='quality')

from tensorflow import keras
from tensorflow.keras import layers, callbacks

# Callback that halts training once the monitored loss stops improving
early_stopping = callbacks.EarlyStopping(
    restore_best_weights=True,  # keep the weights from the best epoch
    patience=20,                # how many epochs to wait before stopping
    min_delta=0.001,            # minimum amount of change to count as an improvement
)

# Three hidden ReLU layers of 512 units each and a linear output unit.
# The 11 input features are declared with `keras.Input` instead of `input_shape=`
# on the first Dense layer, which Keras warns against in Sequential models.
model = keras.Sequential([
    keras.Input(shape=(11,)),
    layers.Dense(512, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.Dense(1),
])
# Configure training: Adam optimizer minimizing mean absolute error
model.compile(loss='mae', optimizer='adam')

# Allow up to 500 epochs; the early-stopping callback ends the run sooner
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=500,
    verbose=0,                   # turn off training log
    callbacks=[early_stopping],  # put your callbacks in a list
    validation_data=(X_valid, y_valid),
)

# Tabulate the per-epoch metrics and plot training vs. validation loss
history_df = pd.DataFrame(data=history.history)
history_df[['loss', 'val_loss']].plot();
# Report the lowest validation loss seen during training
print("Minimum validation loss: {}".format(history_df['val_loss'].min()))

In [None]:
## 5. Dropout and Batch Normalization
# Dropout layer can help correct overfitting.
keras.Sequential([
    # ...
    layers.Dropout(rate=0.3), # apply 30% dropout to the next layer
    layers.Dense(16),
    # ...
])

# Batch Normalization
# A batch normalization layer looks at each batch as it comes in, first normalizing the batch with its own mean and standard deviation,
# and then also putting the data on a new scale with two trainable rescaling parameters.
# Most often, batchnorm is added as an aid to the optimization process.
# The two placements below are wrapped in Sequential containers (like the dropout
# snippet above) so they are real layer stacks rather than dangling one-element tuples.
# after a layer:
keras.Sequential([
    layers.Dense(16, activation='relu'),
    layers.BatchNormalization(),
])
# between a layer and its activation function:
keras.Sequential([
    layers.Dense(16),
    layers.BatchNormalization(),
    layers.Activation('relu'),
]);  # trailing semicolon keeps the notebook from echoing the last expression

In [None]:
# Example - Using Dropout and Batch Normalization
from tensorflow import keras
from tensorflow.keras import layers

# Three 1024-unit ReLU layers, each followed by 30% dropout and batch
# normalization, then a linear output unit.
# The 11 input features are declared with `keras.Input` instead of `input_shape=`
# on the first Dense layer, which Keras warns against in Sequential models.
model = keras.Sequential([
    keras.Input(shape=(11,)),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.3),
    layers.BatchNormalization(),
    layers.Dense(1),
])

# Configure training: Adam optimizer minimizing mean absolute error
model.compile(loss='mae', optimizer='adam')

# Train silently for 100 epochs, evaluating on the validation split each epoch
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=100,
    verbose=0,
    validation_data=(X_valid, y_valid),
)


# Plot training and validation loss per epoch (the learning curves)
history_df = pd.DataFrame(data=history.history)
history_df[['loss', 'val_loss']].plot();

In [None]:
## 6. Binary Classification
# Accuracy is the ratio of correct predictions to total predictions:
# accuracy = number_correct / total
# Cross-entropy is a sort of measure for the distance from one probability distribution to another.
# The further the probability predicted for the correct class is from 1.0, the greater the cross-entropy loss will be.

import pandas as pd
from IPython.display import display

# Load the ion dataset; the first CSV column becomes the row index
ion = pd.read_csv('../input/dl-course-data/ion.csv', index_col=0)
display(ion.head())

# Work on a copy and encode the label as 0/1 for binary classification
df = ion.copy()
df['Class'] = df['Class'].map({'good': 0, 'bad': 1})

# Seeded 70/30 split into training and validation rows
df_train = df.sample(frac=0.7, random_state=0)
df_valid = df.drop(df_train.index)

# Min-max scale using statistics from the training split only
max_ = df_train.max(axis=0)
min_ = df_train.min(axis=0)

df_train = (df_train - min_) / (max_ - min_)
df_valid = (df_valid - min_) / (max_ - min_)

# A column that is constant in the training split scales to NaN (0 / 0), e.g. the
# empty feature in column 2. Pick those columns from the training split and drop
# the SAME set from both splits: dropping NaN columns independently could leave
# train and validation with different feature sets.
nan_cols = df_train.columns[df_train.isna().any(axis=0)]
df_train = df_train.drop(columns=nan_cols)
df_valid = df_valid.drop(columns=nan_cols)

# Separate the 'Class' target from the feature columns
X_train = df_train.drop('Class', axis=1)
X_valid = df_valid.drop('Class', axis=1)
y_train = df_train['Class']
y_valid = df_valid['Class']

from tensorflow import keras
from tensorflow.keras import layers

# Small binary classifier: two 4-unit ReLU layers and a sigmoid output unit.
# The 33 input features are declared with `keras.Input` instead of `input_shape=`
# on the first Dense layer, which Keras warns against in Sequential models.
model = keras.Sequential([
    keras.Input(shape=(33,)),
    layers.Dense(4, activation='relu'),
    layers.Dense(4, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss with accuracy tracked as an extra metric
model.compile(
    metrics=['binary_accuracy'],
    loss='binary_crossentropy',
    optimizer='adam',
)

# Halt once the monitored loss stops improving and keep the best weights
early_stopping = keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    min_delta=0.001,
    patience=10,
)

# Allow up to 1000 epochs; early stopping is expected to end the run sooner
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=512,
    epochs=1000,
    verbose=0,  # hide the output because we have so many epochs
    callbacks=[early_stopping],
    validation_data=(X_valid, y_valid),
)

# Tabulate the per-epoch metrics recorded during training
history_df = pd.DataFrame(data=history.history)
# Skip the first five epochs so the early values do not dominate the plots
history_df[['loss', 'val_loss']].loc[5:].plot()
history_df[['binary_accuracy', 'val_binary_accuracy']].loc[5:].plot()

# Summarize the best validation loss and accuracy reached
print(
    ("Best Validation Loss: {:0.4f}"
     "\nBest Validation Accuracy: {:0.4f}").format(
        history_df['val_loss'].min(),
        history_df['val_binary_accuracy'].max(),
    )
)