In [9]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load the white-wine data set; the file is ';'-separated and its header row is
# replaced by the explicit column names below.
data_path = Path("./winequality-white.csv")
wine_df = pd.read_csv(
    data_path,
    sep=";",
    header=0,
    names=[
        "fixed acidity",
        "volatile acidity",
        "citric acid",
        "residual sugar",
        "chlorides",
        "free sulfur dioxide",
        "total sulfur dioxide",
        "density",
        "pH",
        "sulphates",
        "alcohol",
        "quality",
    ],
)

# data overview:
#print(wine_df.describe())

# Which quality classes occur, and how often?
quality_column = wine_df["quality"]
qualities = quality_column.unique()
# bincount index i holds the number of rows whose quality equals i
binc = np.bincount(quality_column.to_numpy())
no_inst = len(wine_df)

print(f"Number of unique 'qualities': {len(qualities)}")
print(f"Qualities: {sorted(qualities)}")
print(f"\nClass counts: {binc}")
print(f"\nNumber of instances: {no_inst} ")
print(f"\nClass fractions: {np.round(binc / no_inst, 4) * 100}")


Number of unique 'qualities': 7
Qualities: [3, 4, 5, 6, 7, 8, 9]

Class counts: [   0    0    0   20  163 1457 2198  880  175    5]

Number of instances: 4898 

Class fractions: [ 0.    0.    0.    0.41  3.33 29.75 44.88 17.97  3.57  0.1 ]


In [84]:
import torch
from torch import nn

# Run on the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Length of the stacked feature vector: 10 values from the network output plus
# 7 values from the tree model's decision_function(X) (see WineStackedData).
input_dim = 10 + 7

class WineStackedVoter(nn.Module):
    """Small fully connected network that maps stacked features to class scores.

    Args:
        input_dim: length of the feature vector fed to the first linear layer.
        hidden_dim: width of the single hidden layer (default 32).
        num_classes: number of scores produced per sample (default 10).
    """

    def __init__(self, input_dim, hidden_dim=32, num_classes=10):
        super().__init__()
        # No activation after the last layer: the returned values are used as
        # logits, and a trailing ReLU would clamp every score to >= 0 and give
        # zero gradient to any class whose pre-activation is negative.
        # Dropping it leaves the parameter names (indices 0 and 2) unchanged.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, x):
        """Return the unnormalised class scores for the batch ``x``."""
        return self.linear_relu_stack(x)


Using device: cpu


In [81]:
from torch.utils.data import Dataset

class WineStackedData(Dataset):
    """Dataset whose items are the concatenated outputs of two base models.

    For one row ``x`` of the frame, the item is
    ``cat(net_transform(x), tree_transform.decision_function(x))`` flattened to
    one vector, paired with that row's ``quality`` value.

    Args:
        data_df: frame with a "quality" column; every column except the last
            one is used as a model input.
        net_transform: callable taking a ``(1, n_features)`` float32 tensor and
            returning a 2-D tensor.
        tree_transform: object with a ``decision_function`` method that returns
            a 2-D array for a ``(1, n_features)`` array.
        target_transform: optional callable applied to the target before it is
            returned.
    """

    def __init__(self, data_df, net_transform, tree_transform, target_transform=None):
        self.data_df = data_df
        self.net_transform = net_transform
        self.tree_transform = tree_transform
        self.target_transform = target_transform
        self.X = torch.tensor(self.data_df.iloc[:, :-1].values, dtype=torch.float32)
        self.Y = torch.tensor(self.data_df["quality"].values, dtype=torch.long)

    def __len__(self):
        """Return the number of samples."""
        return len(self.Y)

    def __getitem__(self, idx):
        """Return ``(stacked_features, target)`` for the sample at ``idx``."""
        # Keep the batch dimension: both base models are called on a 2-D input.
        x = self.X[idx, :].unsqueeze(dim=0)

        # The base network only supplies features here. Without no_grad every
        # item would carry an autograd graph through that network, and the
        # stacker's backward pass would needlessly traverse it.
        with torch.no_grad():
            net_out = self.net_transform(x)

        # Hand the tree model a plain NumPy array rather than a torch tensor.
        tree_scores = self.tree_transform.decision_function(x.numpy())
        tree_out = torch.tensor(tree_scores, dtype=torch.float32)

        # Local variables instead of attributes: per-item scratch values do not
        # belong on the shared dataset object.
        xx = torch.cat((net_out, tree_out), dim=1).squeeze()
        y = self.Y[idx]
        if self.target_transform is not None:
            y = self.target_transform(y)
        return xx, y

In [82]:
# Smoke test for WineStackedData: build the training dataset and show one item.
# Needs train_df, net_model and tree_model to exist in the kernel already.
idx = 0
train_ds = WineStackedData(
    data_df=train_df,
    net_transform=net_model,
    tree_transform=tree_model,
)
train_ds[idx]

(tensor([  0.0000,   0.0000,   0.0000,   0.0000,   9.3640,   5.7459,   3.6224,
           0.0000,   0.0000,   0.0000, -35.2011,  38.0797,  65.9575,  15.8541,
         -19.7279, -26.0759, -38.8864], grad_fn=<SqueezeBackward0>),
 tensor(4))

In [65]:
from sklearn.preprocessing import StandardScaler

def normalize_dataframe(data_df, column_names_to_normalize):
    """
        Normalizes the given columns of a data frame with a StandardScaler from Sklearn.

        The scaler is fitted on the passed frame itself. The input frame is left
        untouched; the result is a new frame.

        Input:
            data_df: dataframe with numerical values to normalize
            column_names_to_normalize: list of the names of the columns to be normalized
        Output:
            copy of data_df (same index, same column order) with the listed
            columns replaced by their normalized values
    """
    columns = list(column_names_to_normalize)

    # Work on a copy: writing into the caller's frame mutates its data and
    # raises chained-assignment warnings when that frame is a slice of another.
    normalized_df = data_df.copy()
    if not columns:
        # Nothing to scale.
        return normalized_df

    scaler = StandardScaler()
    normalized_values = scaler.fit_transform(data_df[columns].values)
    # Wrapping the values with the original index keeps the assignment
    # row-aligned.
    normalized_df[columns] = pd.DataFrame(
        normalized_values, columns=columns, index=data_df.index
    )
    return normalized_df


In [66]:
# Model definition copied from the WineDataset notebook -- I don't know how to import it from another Jupyter notebook...
class WineNetwork(nn.Module):
    """Fully connected network mapping 11 input features to 10 output values.

    The layers live in one ``nn.Sequential`` named ``linear_relu_stack``; their
    order fixes the parameter names a saved state dict must match.
    """

    def __init__(self):
        super().__init__()
        layers = [
            # 11 -> 64, the only block with dropout
            nn.Linear(11, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(p=0.2),
            # 64 -> 128
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            # 128 -> 256
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            # 256 -> 10; the outputs pass through a final ReLU, so they are >= 0
            nn.Linear(256, 10),
            nn.ReLU(),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.linear_relu_stack(x)

In [87]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: iterable yielding ``(inputs, targets)`` batches.
        model: module being trained; each batch is moved to the device of its
            first parameter before the forward pass.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        optimizer: optimizer that updates the parameters of ``model``.

    Returns:
        ``(losses, nof_correct)``: the sum of the per-batch loss values and the
        number of samples whose arg-max prediction equals the target. Both are
        measured before the parameter update of the respective batch.
    """
    # The model may live on another device than the batches the loader yields;
    # a model without parameters has no device to move data to.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    losses, nof_correct = 0, 0
    for xx, y_true in dataloader:
        if target_device is not None:
            xx, y_true = xx.to(target_device), y_true.to(target_device)

        y_pred = model(xx)
        loss = loss_fn(y_pred, y_true)
        losses += loss.item()
        nof_correct += (y_pred.argmax(1) == y_true).sum().item()

        # Clear the gradients of the previous batch, then take one step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return losses, nof_correct

def eval_loop(dataloader, model, loss_fn, optimizer=None):
    """Evaluate ``model`` on every batch of ``dataloader`` without updating it.

    Args:
        dataloader: iterable yielding ``(inputs, targets)`` batches.
        model: module to evaluate; each batch is moved to the device of its
            first parameter before the forward pass.
        loss_fn: callable ``loss_fn(predictions, targets)`` returning a scalar
            tensor.
        optimizer: unused; accepted only so the signature mirrors
            ``train_loop``.

    Returns:
        ``(losses, nof_correct)``: the sum of the per-batch loss values and the
        number of samples whose arg-max prediction equals the target.
    """
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    losses, nof_correct = 0, 0
    # no_grad must be *called*: the bare class is not a context manager.
    with torch.no_grad():
        for xx, y_true in dataloader:
            if target_device is not None:
                xx, y_true = xx.to(target_device), y_true.to(target_device)

            y_pred = model(xx)
            loss = loss_fn(y_pred, y_true)
            losses += loss.item()
            nof_correct += (y_pred.argmax(1) == y_true).sum().item()

    return losses, nof_correct


In [89]:
# Train the model:
from torch.utils.data import DataLoader
import os
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import train_test_split

# writer for tensorboard:
writer = SummaryWriter()

# create new model instance:
# 10 outputs of the pretrained network + 7 decision_function scores of the tree model
input_dim = 10 + 7
model = WineStackedVoter(input_dim=input_dim).to(device)

# loss function:
# cross-entropy (default "mean" reduction: one averaged value per batch)
loss_fn = nn.CrossEntropyLoss()

# optimizer:
# adam (alternative, disabled):
# optimizer_name = "ADAM"
# learning_rate = 1e-4
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# sgd:
optimizer_name = "SGD"
learning_rate = 1e-3
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

# data:
from pathlib import Path
data_path = Path("./winequality-white.csv")
column_names = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
"pH", "sulphates", "alcohol", "quality"]
column_names_to_normalize = column_names[:-1]
wine_df = pd.read_csv(data_path, header=0, names=column_names, sep=";")

# training parameters:
epochs = 100
write_log_after_epochs = 20

epoch_losses = []
epoch_no_corrects = []
train_losses = []
train_acc = []
test_losses = []
test_acc = []

test_size = 0.2
# NOTE(review): no random_state is passed, so the split differs on every run.
train_df, test_df = train_test_split(wine_df, test_size=test_size)

# NOTE(review): each split is standardised with statistics fitted on that split
# itself; confirm this matches how the pretrained base models were trained.
train_df = normalize_dataframe(train_df, column_names_to_normalize)
test_df = normalize_dataframe(test_df, column_names_to_normalize)

# Pretrained base network, used purely as a fixed feature extractor.
# map_location keeps the weights on the CPU, where the dataset tensors live.
net_model = WineNetwork()
net_model.load_state_dict(torch.load("model_640_0.001_369_64_SGD.pt", map_location="cpu"))
net_model.eval()
# Freeze it so the stacker's backward pass never runs through the base network.
net_model.requires_grad_(False)

import pickle

# SECURITY: unpickling executes arbitrary code - only load model files that
# come from a trusted source.
filename = 'AdaBoost_071_model.dct'
with open(filename, 'rb') as model_file:
    tree_model = pickle.load(model_file)

train_ds = WineStackedData(data_df=train_df, net_transform=net_model, tree_transform=tree_model)
test_ds = WineStackedData(data_df=test_df, net_transform=net_model, tree_transform=tree_model)

batch_size = 64
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Evaluation totals do not depend on sample order, so no shuffling is needed.
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

best_model_name = ""
max_correct = -torch.inf

try:
    for ep in range(1, epochs + 1):

        # put model in train mode:
        model.train()
        train_loss, train_no_correct = train_loop(train_dl, model, loss_fn, optimizer)

        # switch model to evaluation mode:
        model.eval()
        test_loss, test_no_correct = eval_loop(test_dl, model, loss_fn, optimizer)

        if test_no_correct > max_correct:
            max_correct = test_no_correct
            previous_best = best_model_name
            best_model_name = "./model_" + str(test_no_correct) + "_" + str(learning_rate) + "_" + str(ep) + "_" + str(batch_size) + "_" + optimizer_name + "_sv.pt"
            # Save the new checkpoint first, so a failed save never leaves the
            # run without any best model on disk.
            torch.save(model.state_dict(), best_model_name)
            if previous_best and os.path.exists(previous_best):
                os.remove(previous_best)

        # The loops return the SUM of per-batch mean losses, so the average
        # loss is obtained by dividing by the number of batches; the number
        # of correct predictions is divided by the number of samples.
        mean_test_loss = test_loss / len(test_dl)
        mean_train_loss = train_loss / len(train_dl)
        test_accuracy = test_no_correct / len(test_ds)
        train_accuracy = train_no_correct / len(train_ds)

        writer.add_scalar("Loss/test", mean_test_loss, ep)
        writer.add_scalar("Accuracy/test", test_accuracy, ep)
        writer.add_scalar("Loss/train", mean_train_loss, global_step=ep)
        writer.add_scalar("Accuracy/train", train_accuracy, global_step=ep)

        if ep % write_log_after_epochs == 0:
            print(f"\n----- Epoch: {ep} -----")
            print(f"Epoch loss: {mean_test_loss}")
            print(f"Epoch accuracy: {test_accuracy}")
finally:
    # Make sure the logged scalars reach disk even if the run is interrupted.
    writer.flush()
            

KeyboardInterrupt: 