# Fully connected neural net to qualify wine according to chemical analysis. #

Wine data-set downloaded from [csv-file](http://mng.bz/90Ol)

* Two hidden layers
* ADAM optimizer: learning rate: 1e-4
* epochs = 600
* accuracy: 85.20%

Training:

----- Epoch: 100 -----
Epoch loss: 0.0298558915147976
Epoch accuracy: 0.6173469387755102

----- Epoch: 200 -----
Epoch loss: 0.02590804458880911
Epoch accuracy: 0.6816326530612244

----- Epoch: 300 -----
Epoch loss: 0.021330454793511606
Epoch accuracy: 0.7357142857142858

----- Epoch: 400 -----
Epoch loss: 0.018781860081516968
Epoch accuracy: 0.7857142857142857

----- Epoch: 500 -----
Epoch loss: 0.015203852860295042
Epoch accuracy: 0.8397959183673469

----- Epoch: 600 -----
Epoch loss: 0.013255607230322701
Epoch accuracy: 0.8520408163265306
----- Finished Training -----


### Training with kFold cross-validation: ###

* Two hidden layers
* ADAM optimizer: learning rate: 1e-4
* number of folds: 5
* epochs = 600
* accuracy: 87.08%
* runtime: 1969.8s

----- Epoch: 100 -----
Epoch loss (avg. all folds): 0.025227703386630017
Epoch accuracy (avg. all folds): 0.7076557482645764

----- Epoch: 200 -----
Epoch loss (avg. all folds): 0.018748069187298207
Epoch accuracy (avg. all folds): 0.7878191282233015

----- Epoch: 300 -----
Epoch loss (avg. all folds): 0.015001535402682996
Epoch accuracy (avg. all folds): 0.8273287242987082

----- Epoch: 400 -----
Epoch loss (avg. all folds): 0.012860514906850948
Epoch accuracy (avg. all folds): 0.8486841930541368

----- Epoch: 500 -----
Epoch loss (avg. all folds): 0.01152177774778462
Epoch accuracy (avg. all folds): 0.8618339396718852

----- Epoch: 600 -----
Epoch loss (avg. all folds): 0.010597546124541527
Epoch accuracy (avg. all folds): 0.8708151622855476
----- Finished Training -----

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

data_path = Path("./winequality-white.csv")
wine_df = pd.read_csv(data_path, header=0, names=["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", 
"pH", "sulphates", "alcohol", "quality"], sep=";")

# data overview:
print(wine_df.describe())

# which quality classes do we have? :
qualities = wine_df["quality"].unique()
print(f"Number of unique 'qualities': {len(qualities)}")
print(f"Qualities: {sorted(qualities)}")
binc = np.bincount([q for q in wine_df["quality"]])
no_inst = len(wine_df)
print(f"\nClass counts: {binc}")
print(f"\nNumber of instances: {no_inst} ")
print(f"\nClass fractions: {np.round(binc/no_inst,4) * 100}")


In [None]:
import torch
from torch import nn

# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))

# Define model
class WineNetwork(nn.Module):
    def __init__(self):
        super(WineNetwork, self).__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(11, 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            #nn.Dropout(p=0.2),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 7),
            nn.ReLU()
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits

model = WineNetwork().to(device)
print(model)

In [3]:
# define torch.dataset: __init__(), __len__(), __getitem__()
from torch.utils.data import Dataset

class WineDataSet(Dataset):
    def __init__(self, data_df, transform=None, target_transform=None):
        self.wine_df = data_df
        self.transform = transform
        self.target_transform = target_transform
        self.X = np.asarray(self.wine_df.iloc[:,:-1].values, dtype=np.float32)
        self.Y = np.asarray(self.wine_df["quality"].values, dtype= np.int32)
    def __len__(self):
        return len(self.Y)
    def __getitem__(self,idx):
        self.x = self.X[idx,:]
        self.y = self.Y[idx]
        if self.transform != None:
            self.x = self.transform(self.x)
        if self.target_transform != None:
            self.y = self.target_transform(self.y)
        return self.x, self.y
            

In [4]:
from sklearn.preprocessing import StandardScaler

def normalize_dataframe(data_df, column_names_to_normalize):
    """
        Normalizes all given columns of a given data frame with a StandardScaler from Sklearn. 
        Input:
            data_df: dataframe with numerical values to normalize
            column_names_to_normalize: list of the names of the columns to be normalized
        Output:
            dataframe with columns normalized
    """
    scaler = StandardScaler()
    data_to_norm = data_df[column_names_to_normalize].values
    data_normed = scaler.fit_transform(data_to_norm)
    df_temp = pd.DataFrame(data_normed, columns=column_names_to_normalize, index=data_df.index)
    data_df[column_names_to_normalize]= df_temp
    return data_df


In [5]:
# Target transformation of the labels:
from torchvision.transforms import Lambda # might be overkill to call these just for OHE...

# OHE encoding supposing, that the labels y are integer-encoded:
transform_ohe = Lambda(lambda y: torch.zeros(7, dtype=torch.float).scatter_(dim=0, index=torch.tensor(y - 3), value=1))

# "quality" - encoding starting with 0 instead of 3:
classes_zero_based = { c : c-3 for c in wine_df["quality"].unique()}

target_transform = classes_zero_based.get

In [6]:
# test and train loops:
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

def train_loop(dataloader, model, loss_fn, optimizer):
    losses, no_correct = 0,0
    for n_batch, (X, y) in enumerate(dataloader):
        pred = model(X)
        loss = loss_fn(pred, y)
        losses += loss.item()
        no_correct +=(pred.argmax(1)==y).sum().item()

        optimizer.zero_grad()        
        loss.backward()
        optimizer.step()
    
    return losses, no_correct
        

def test_loop(dataloader, model, loss_fn):
    losses, no_correct = 0, 0
    with torch.no_grad():
        for (X,y) in dataloader:
            pred = model(X)
            losses += loss_fn(pred, y).item()
            no_correct += (pred.argmax(1)== y).sum().item()
     
    return losses, no_correct
    

In [None]:
# Train the model:
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split, StratifiedKFold

# writer for tensorboard:
writer = SummaryWriter()

# create new model instance:
model = WineNetwork().to(device) 

# loss function:
# cross-entropy:
loss_fn = nn.CrossEntropyLoss()

# optimizer:
# adam:
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# sgd:
#learning_rate = 1e-5
#optimizer = torch.optim.SGD(model.parameters(), lr= learning_rate, momentum=0.9)

# training parameters:
from pathlib import Path
data_path = Path("./winequality-white.csv")
column_names = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", 
"pH", "sulphates", "alcohol", "quality"]
column_names_to_normalize = column_names[:-1]
wine_df = pd.read_csv(data_path, header=0, names=column_names, sep=";")

apply_stratified_kfold = True
no_folds = 5

epochs = 200
write_log_after_epochs = 100

epoch_losses = []
epoch_no_corrects = []
train_losses = []
train_acc = []
test_losses = []
test_acc = []
for ep in range(1, epochs+1):

    if apply_stratified_kfold:
        X = wine_df[column_names_to_normalize]
        y = wine_df["quality"].values
        skf = StratifiedKFold(n_splits=no_folds, shuffle=True, random_state=42)
        losses, acc = 0, 0

        for fold, (train_ids, test_ids) in enumerate(skf.split(X, y)):
            train_df = wine_df.iloc[train_ids].copy()
            train_df = normalize_dataframe(train_df, column_names_to_normalize)
            test_df = wine_df.iloc[test_ids].copy()
            test_df = normalize_dataframe(test_df, column_names_to_normalize)
            train_ds = WineDataSet(data_df=train_df, target_transform=target_transform)
            test_ds = WineDataSet(data_df=test_df, target_transform=target_transform)
            train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
            test_dl = DataLoader(test_ds, batch_size=32, shuffle=True)

            # put model in train mode:
            model.train()
            (train_loss, train_no_correct) = train_loop(train_dl, model, loss_fn, optimizer)
            if ep % write_log_after_epochs == 0:
                train_losses.append(train_loss/ len(train_ds))
                train_acc.append(train_no_correct/ len(train_ds))

            # switch model to to evaluation mode:
            model.eval()
            (test_loss, test_no_correct) = test_loop(test_dl, model, loss_fn)
            if ep % write_log_after_epochs == 0:
                test_losses.append(test_loss/len(test_ds))
                test_acc.append(test_no_correct/ len(test_ds))
                    
        if ep % write_log_after_epochs == 0:
            print(f"\n----- Epoch: {ep} -----")
            print(f"Epoch loss (avg. all folds): {np.average(test_losses)}")
            print(f"Epoch accuracy (avg. all folds): {np.average(test_acc)}")
            writer.add_scalar("Loss/train", np.average(train_losses), ep)
            writer.add_scalar("Accuracy/train", np.average(train_acc), ep)
            writer.add_scalar("Loss/test", np.average(test_losses), ep)
            writer.add_scalar("Accuracy/test", np.average(test_acc), ep)
            train_losses.clear()
            test_losses.clear()
            train_acc.clear()
            test_acc.clear()
    
    else:
        train_df, test_df = train_test_split(wine_df, test_size=0.2)
        train_df = train_df.copy()
        test_df = test_df.copy()
        train_df = normalize_dataframe(train_df, column_names_to_normalize)
        test_df = normalize_dataframe(test_df, column_names_to_normalize)
        train_ds = WineDataSet(data_df=train_df, target_transform=target_transform)
        test_ds = WineDataSet(data_df=test_df, target_transform=target_transform)
        train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
        test_dl = DataLoader(test_ds, batch_size=32, shuffle=True)
       
        # put model in train mode:
        model.train()
        (train_loss, train_no_correct) = train_loop(train_dl, model, loss_fn, optimizer)
              
        # switch model to to evaluation mode:
        model.eval()
        (test_loss, test_no_correct) = test_loop(test_dl, model, loss_fn)
       
        if ep % write_log_after_epochs == 0:
            print(f"\n----- Epoch: {ep} -----")
            print(f"Epoch loss: {test_loss/ len(test_ds)}")
            print(f"Epoch accuracy: {test_no_correct/ len(test_ds)}")
            writer.add_scalar("Loss/test", test_loss/ len(test_ds), ep)
            writer.add_scalar("Accuracy/test", test_no_correct/ len(test_ds), ep)
            writer.add_scalar("Loss/train", train_loss/ len(train_ds), global_step=ep)
            writer.add_scalar("Accuracy/train", train_no_correct/ len(train_ds), global_step=ep)
        
writer.flush()
writer.close()
print("----- Finished Training -----")