In [None]:
# %% Deep learning - Section 14.140
#    FFN project 3: FFN for missing data interpolation
#    1) Use the wine quality dataset
#    2) Choose 10 random data values in the 'residual sugar' column to replace
#       with NaN (save removed)
#    3) Split data into Train and Test and use the data with missing residual
#       sugar as test
#    4) Fit a model to predict the missing values
#    5) Plot model performance
#    6) Plot model-predicted data against ground truth for train and test (use a
#       rank correlation, data likely not normal)

# This code pertains to a deep learning course provided by Mike X. Cohen on Udemy:
#   > https://www.udemy.com/course/deeplearning_x
# The "base" code in this repository is adapted (with very minor modifications)
# from code developed by the course instructor (Mike X. Cohen), while the
# "exercises" and the "code challenges" contain more original solutions and
# creative input from my side. If you are interested in DL (and if you are
# reading this statement, chances are that you are), go check out the course, it
# is singularly good.


In [2]:
# %% Libraries and modules
import numpy               as np
import matplotlib.pyplot   as plt
import torch
import torch.nn            as nn
import seaborn             as sns
import copy
import torch.nn.functional as F
import pandas              as pd
import scipy.stats         as stats
import sklearn.metrics     as skm
import time

from torch.utils.data                 import DataLoader,TensorDataset
from sklearn.model_selection          import train_test_split
from google.colab                     import files
from torchsummary                     import summary
from IPython                          import display
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('svg')
plt.style.use('default')


In [151]:
# %% Function to get the data

def get_data(url='https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',drop_vals=10,batch_size=32):
    """
    Load the red-wine quality data and set up a 'residual sugar' imputation task.

    `drop_vals` randomly chosen 'residual sugar' entries are replaced with NaN.
    Rows that keep their sugar value form the training set, rows with a removed
    value form the test set. Every column is z-scored; 'residual sugar' is the
    target and is excluded from the model inputs.

    Parameters
    ----------
    url : str or file-like
        Source of the semicolon-separated CSV (passed straight to `pd.read_csv`).
    drop_vals : int
        Number of 'residual sugar' values to remove (drawn without replacement).
    batch_size : int
        Mini-batch size of the training loader.

    Returns
    -------
    train_loader : DataLoader
        Shuffled mini-batches of (features, z-scored sugar); an incomplete last
        batch is dropped.
    test_loader : DataLoader
        A single batch holding all test rows; its labels are NaN by construction.
    true_missing_sugar : np.ndarray
        The removed sugar values, z-scored with the same mean/std as the training
        labels and ordered like the rows of the test set.
    train_data_df : pd.DataFrame
        The z-scored training rows (all columns).
    """

    # Get data
    data = pd.read_csv(url,sep=';')

    # Remove some outliers (see lec. 82 for why); copy so that the assignments
    # below act on an independent frame rather than on a slice of the original
    data = data[data['total sulfur dioxide']<200].copy()

    # Keep the untouched sugar column, then blank out `drop_vals` random entries
    sugar_raw = data['residual sugar'].copy()
    miss_idx  = np.random.choice(data.index,drop_vals,replace=False)
    data.loc[miss_idx,'residual sugar'] = np.nan

    # Mean and std of the remaining sugar values (pandas skips NaNs). The removed
    # values must be normalised with these same statistics, otherwise they are
    # not on the scale the model is trained to predict
    sugar_mean = data['residual sugar'].mean()
    sugar_std  = data['residual sugar'].std(ddof=0)

    # Z-score all the variables (population std, NaNs ignored)
    cols2zscore = data.keys()

    for col in cols2zscore:
        mean_val  = data[col].mean()
        std_val   = data[col].std(ddof=0)
        data[col] = (data[col] - mean_val) / std_val

    cols2zscore = cols2zscore.drop('residual sugar')

    # Split data based on residual sugar vals
    has_sugar     = data['residual sugar'].notna()
    train_data_df = data[has_sugar].copy()
    test_data_df  = data[~has_sugar].copy()

    # Ground truth for the test rows, fetched through the test-set index so that
    # entry i belongs to test row i (the random draw order of miss_idx does not
    # match the row order of the test set)
    true_missing_sugar = ((sugar_raw.loc[test_data_df.index] - sugar_mean) / sugar_std).values

    # Convert from pandas dataframe to PyTorch tensor
    train_data   = torch.tensor(train_data_df[cols2zscore].values).float()
    train_labels = torch.tensor(train_data_df['residual sugar'].values).float().view(-1,1)

    test_data    = torch.tensor(test_data_df[cols2zscore].values).float()
    test_labels  = torch.tensor(test_data_df['residual sugar'].values).float().view(-1,1)

    print(f'Train data shape:   {train_data.shape}')
    print(f'Train labels shape: {train_labels.shape}')
    print(f'Test data shape:    {test_data.shape}')
    print(f'Test labels shape:  {test_labels.shape}')

    # Convert into PyTorch datasets
    train_data = TensorDataset(train_data,train_labels)
    test_data  = TensorDataset(test_data,test_labels)

    # Convert into DataLoader objects (the test batch size is kept >= 1 so that
    # an empty test set, i.e. drop_vals=0, does not make DataLoader raise)
    train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True,drop_last=True)
    test_loader  = DataLoader(test_data,batch_size=max(1,test_data.tensors[0].shape[0]))

    return train_loader,test_loader,true_missing_sugar,train_data_df


In [None]:
# %% Test data function

# Rely on get_data's default URL: `url` only exists as a parameter of get_data,
# so passing it from here raises a NameError on a fresh kernel
train_loader,test_loader,_,_ = get_data(drop_vals=10)


In [21]:
# %% Model class

# Optional parametrised metaparameters:
#  > number of layers and of units per layer
#  > starting learning rate
#  > optimizer (e.g. 'SGD', 'RMSprop', or 'Adam')
#  > L2 regularisation
#  > activation function (e.g., 'ReLU', 'LeakyReLU', 'ReLU6', or 'GELU')

def gen_model(n_units=16,n_layers=2,lr=0.01,optim='SGD',L2_lambda=0,act_fun='ReLU',n_inputs=11):
    """
    Build a feed-forward regression network with its loss function and optimizer.

    Parameters
    ----------
    n_units : int
        Number of units in the input layer and in every hidden layer.
    n_layers : int
        Number of hidden layers (n_units -> n_units) between input and output.
    lr : float
        Learning rate handed to the optimizer.
    optim : str
        Name of an optimizer class in `torch.optim` (e.g. 'SGD', 'RMSprop', 'Adam').
    L2_lambda : float
        Passed to the optimizer as `weight_decay`.
    act_fun : str
        Name of an activation class in `torch.nn` (e.g. 'ReLU', 'LeakyReLU',
        'ReLU6', 'GELU'), built with its default arguments.
    n_inputs : int
        Number of input features (11 by default).

    Returns
    -------
    ANN : nn.Module
        The network, mapping (batch, n_inputs) to (batch, 1).
    loss_fun : nn.MSELoss
    optimizer : torch.optim.Optimizer
        Optimizer bound to the parameters of `ANN`.
    """

    class model(nn.Module):
        def __init__(self,n_inputs,n_units,n_layers,act_fun):
            super().__init__()

            # Dictionary to store the layers, plus the activation function
            self.layers  = nn.ModuleDict()
            self.nLayers = n_layers
            self.act_fun = act_fun

            # Build the activation module once instead of at every forward pass
            self.activation = getattr(nn,act_fun)()

            # Architecture (input, hidden, output)
            self.layers['input'] = nn.Linear(n_inputs,n_units)

            for i in range(n_layers):
                self.layers[f'hidden{i}'] = nn.Linear(n_units,n_units)

            self.layers['output'] = nn.Linear(n_units,1)

        def forward(self,x):

            # Input layer; the nonlinearity is needed here too, otherwise the
            # input layer and the first hidden layer reduce to one affine map
            x = self.activation(self.layers['input'](x))

            # Hidden layers
            for i in range(self.nLayers):
                x = self.activation(self.layers[f'hidden{i}'](x))

            # Output layer (linear, regression target)
            return self.layers['output'](x)

    # Model instance, loss function, and optimizer
    ANN       = model(n_inputs,n_units,n_layers,act_fun)
    loss_fun  = nn.MSELoss()
    opti_fun  = getattr( torch.optim,optim )
    optimizer = opti_fun(ANN.parameters(),lr=lr,weight_decay=L2_lambda)

    return ANN,loss_fun,optimizer


In [None]:
# %% Test model function

# Metaparameters for a quick sanity check of the model factory
n_units,n_layers  = 16,2
lr,L2_decay       = 0.01,0.01
optim_alg,act_fun = 'Adam','ReLU'

# Build one model and show the network, the loss, and the optimizer settings
ANN,loss_fun,optimizer = gen_model( n_units   = n_units,
                                    n_layers  = n_layers,
                                    lr        = lr,
                                    optim     = optim_alg,
                                    L2_lambda = L2_decay,
                                    act_fun   = act_fun )
print(ANN,loss_fun,optimizer,sep='\n')


In [77]:
# %% Function to train the model

# Optional parametrised metaparameters:
#  > number of epochs
#  > tol is hard-coded; it sets the tolerance within which a prediction is considered correct

def train_model(num_epochs=50):
    """
    Train a fresh model and track loss and pseudo-accuracy on train and test data.

    Only the number of epochs is an argument. The model settings (n_units,
    n_layers, lr, optim, L2_lambda, act_fun) and the data (train_loader,
    test_loader) are read from the enclosing notebook namespace and must be
    defined before this function is called.

    Parameters
    ----------
    num_epochs : int
        Number of passes over the training loader.

    Returns
    -------
    train_loss, train_psacc : torch.Tensor, shape (num_epochs,)
        Per-epoch mean over batches of the training loss and of the percentage
        of predictions within `tol` of the target.
    test_loss, test_psacc : torch.Tensor, shape (num_epochs,)
        Same quantities on the first batch of the test loader.
    ANN : nn.Module
        The trained model (left in training mode).
    """

    # Fresh model instance
    ANN,loss_fun,optimizer = gen_model(n_units,n_layers,lr,optim,L2_lambda,act_fun)

    # Tolerance within which a prediction counts as correct (pseudo-accuracy)
    tol = 0.2

    # Preallocate vars
    train_loss  = torch.zeros(num_epochs)
    train_psacc = torch.zeros(num_epochs)
    test_loss   = torch.zeros(num_epochs)
    test_psacc  = torch.zeros(num_epochs)

    # Loop over epochs
    for epoch_i in range(num_epochs):

        # Loop over training data batches
        batch_loss = []
        batch_acc  = []

        for X,y in train_loader:

            # Forward pass, backpropagation, and optimizer step
            yHat = ANN(X)
            loss = loss_fun(yHat,y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Loss and pseudo-accuracy from this batch, stored as plain Python
            # floats so that np.mean does not have to convert a list of tensors
            batch_loss.append(loss.item())
            batch_acc.append( 100*torch.mean((torch.abs(yHat-y)<tol).float()).item() )

        train_loss[epoch_i]  = float(np.mean(batch_loss))
        train_psacc[epoch_i] = float(np.mean(batch_acc))

        # Test loss and pseudo-accuracy (losses should be NaNs here)
        ANN.eval()

        with torch.no_grad():
            X,y  = next(iter(test_loader))
            yHat = ANN(X)
            test_loss[epoch_i]  = loss_fun(yHat,y)
            test_psacc[epoch_i] = 100*torch.mean((torch.abs(yHat-y)<tol).float())

        ANN.train()

    return train_loss,train_psacc,test_loss,test_psacc,ANN


In [None]:
# %% Test the whole setting

# Generate data
n_to_drop = 10
# Rely on get_data's default URL: `url` only exists as a parameter of get_data,
# so passing it from here raises a NameError on a fresh kernel
train_loader,test_loader,true_missing_sugar,train_data_df = get_data(drop_vals=n_to_drop)

# Set parameters and generate model
# (train_model reads these names from the notebook namespace)
n_units    = 32
n_layers   = 2
lr         = 0.01
optim      = 'Adam'
L2_lambda  = 0
act_fun    = 'ReLU'
num_epochs = 500

# Note: this instance is not the one that gets trained; train_model builds its
# own fresh model from the same parameters and ANN is overwritten below
ANN,loss_fun,optimizer = gen_model( n_units   = n_units,
                                    n_layers  = n_layers,
                                    lr        = lr,
                                    optim     = optim,
                                    L2_lambda = L2_lambda,
                                    act_fun   = act_fun )

# Train model
train_loss,_,test_loss,_,ANN = train_model(num_epochs=num_epochs)


In [None]:
# %% Correlations between predicted and true values

# Full train and test tensors, taken straight from the datasets behind the loaders
X_train, y_train = train_loader.dataset.tensors
X_test, y_test   = test_loader.dataset.tensors

# Model predictions on both sets (no gradient tracking needed)
with torch.no_grad():
    y_train_pred = ANN(X_train).squeeze().numpy()
    y_test_pred  = ANN(X_test).squeeze().numpy()

# Ground truth: stored labels for train, separately saved values for test
# (the test labels inside the loader are NaN)
y_train_true = y_train.squeeze().numpy()
y_test_true  = true_missing_sugar

# Kolmogorov–Smirnov test for Gaussianity of the training-set sugar values,
# compared against a normal with the sample mean and std
sugar     = train_data_df['residual sugar'].values
mu, sigma = sugar.mean(), sugar.std()

ks_sugar = stats.kstest(sugar,'norm',args=(mu,sigma))
print(f"Kolmogorov–Smirnov test on raw sugar: ks = {ks_sugar.statistic:.3f}, p = {ks_sugar.pvalue:.3g}")

# Same test on the standardised training residuals (true minus predicted)
train_resid   = y_train_true - y_train_pred
train_resid_z = (train_resid - train_resid.mean()) / train_resid.std()
train_ks      = stats.kstest(train_resid_z,'norm')
print(f"Kolmogorov–Smirnov test on training residuals: ks = {train_ks.statistic:.3f}, p = {train_ks.pvalue:.3g}")

# Rank (Spearman) and linear (Pearson) correlations, train and test
train_spear,train_spear_p = stats.spearmanr(y_train_true,y_train_pred)
train_pear,train_pear_p   = stats.pearsonr(y_train_true,y_train_pred)

test_spear,test_spear_p   = stats.spearmanr(y_test_true,y_test_pred)
test_pear,test_pear_p     = stats.pearsonr(y_test_true,y_test_pred)

# Report
print()
print(f"Spearman correlation (train): r = {train_spear:.3f}, p = {train_spear_p:.3g}")
print(f"Spearman correlation (test):  r = {test_spear:.3f}, p = {test_spear_p:.3g}")
print()
print(f"Pearson correlation (train):  r = {train_pear:.3f}, p = {train_pear_p:.3g}")
print(f"Pearson correlation (test):   r = {test_pear:.3f}, p = {test_pear_p:.3g}")


In [114]:
# %% Functions for 1D smoothing filter

# Improved for edge effects - adaptive window
def smooth_adaptive(x,k):
    """
    Moving-average filter whose window shrinks at the edges of the signal.

    Each output sample is the mean of the input over indices
    [i - k//2, i + k//2], clipped to the valid range, so no padding is needed
    and the edges are averaged over fewer points. For even `k` the window
    therefore spans k+1 samples.

    Parameters
    ----------
    x : array-like
        Signal to smooth (smoothing runs along the first axis).
    k : int
        Nominal window length.

    Returns
    -------
    np.ndarray
        Smoothed signal with the shape of `x`. Floating-point (and complex)
        inputs keep their dtype; any other dtype is returned as float64 so that
        the window means are not truncated to integers.
    """
    x = np.asarray(x)

    # An integer output buffer would silently truncate the means
    out_dtype = x.dtype if np.issubdtype(x.dtype,np.inexact) else np.float64
    smoothed  = np.zeros(x.shape,dtype=out_dtype)
    half_k    = k // 2
    n         = len(x)

    for i in range(n):
        start       = max(0, i-half_k)
        end         = min(n, i+half_k + 1)
        smoothed[i] = np.mean(x[start:end])

    return smoothed


In [None]:
# %% Plotting

# Two panels side by side, figure width scaled by the golden ratio
phi = (1 + np.sqrt(5)) / 2
fig, ax = plt.subplots(1, 2, figsize=(1.5*phi*6, 6))

# Left panel: training loss per epoch, smoothed with a 5-sample window
ax[0].plot(smooth_adaptive(train_loss.numpy(),5), label="Train loss")
ax[0].set(title="Training loss", xlabel="Epoch", ylabel="Loss")
ax[0].legend()
ax[0].grid(alpha=0.5)

# Right panel: predictions against ground truth for train and test
ax[1].scatter(y_train_true,y_train_pred,alpha=0.6,label=f"Train (ρ={train_spear:.2f})",color="tab:blue")
ax[1].scatter(y_test_true,y_test_pred,alpha=0.8,label=f"Test (ρ={test_spear:.2f})",color="tab:red",marker="^")

# Common axis range: overall min/max of all four arrays with a 0.25 margin
lims = [ min(vals.min()-.25 for vals in (y_train_true,y_test_true,y_train_pred,y_test_pred)),
         max(vals.max()+.25 for vals in (y_train_true,y_test_true,y_train_pred,y_test_pred)) ]

# Identity line (perfect prediction) and matching limits on both axes
ax[1].plot(lims,lims,"k--",alpha=0.7)
ax[1].set_xlim(lims)
ax[1].set_ylim(lims)

ax[1].set(title=f"Predicted vs. true residual sugar (dropped vals = {n_to_drop})",
          xlabel="True values",
          ylabel="Predicted values")
ax[1].legend()
ax[1].grid(alpha=0.5)

plt.tight_layout()

# Save to disk, show inline, then trigger the Colab browser download
plt.savefig('figure8_ffn_project_3.png')

plt.show()

files.download('figure8_ffn_project_3.png')
