In [1]:
#env image-processing
import pandas as pd
from sklearn import metrics
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torch.nn.functional as F
import math
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

### Read the data

In [2]:
df=pd.read_csv('../input/winequality_red.csv')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
df.quality.value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

### Make three classes 

In [4]:
quality = df["quality"].values
df["wclass"] = 0

for i, num in enumerate(quality):
    if num <=8 and num >=7:
        df.loc[i,"wclass"] = "high"
    if num <=6 and num >=5:
        df.loc[i,"wclass"] = "mid"
    if num <=4 and num >=3:
        df.loc[i,"wclass"] = "low"

### Add a column changing  class values to number 

In [5]:
quality = df["wclass"].values
df["wclass_num"] = 0

for i, num in enumerate(quality):
    if num=='high':
        df.loc[i,"wclass_num"] = 2
    if num=='mid':
        df.loc[i,"wclass_num"] = 1
    if num=='low':
        df.loc[i,"wclass_num"] = 0

### Save the data

In [6]:
df.to_csv("../input/winequality.csv", index = False)  

### Look at the data and see 

In [7]:
df1=pd.read_csv("../input/winequality.csv")

In [8]:
df1.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wclass,wclass_num
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,mid,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,mid,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,mid,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,mid,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,mid,1


### Create a custom dataset class for pytorch 

In [9]:
#One has to make sure the class index starts at 0, so that we can use cross entropy loss
class WineDataset(Dataset):
    def __init__(self):
        # data loading
        xy = pd.read_csv('../input/winequality.csv')
        self.x = torch.tensor((xy.drop(['quality','wclass','wclass_num'], axis =1)).values).float()
        self.y = torch.tensor((df['wclass_num']-df['wclass_num'].min()).astype('category'))
        self.n_samples = xy.shape[0]
  
    def __getitem__(self, index):
        return self.x[index], self.y[index]
    
    def __len__(self):    
        return self.n_samples
classes =("high","mid","low")

### Look at what the custom dataset returns

In [10]:
dataset = WineDataset()
first_data = dataset[0]
features_label = first_data
print(features_label)

(tensor([ 7.4000,  0.7000,  0.0000,  1.9000,  0.0760, 11.0000, 34.0000,  0.9978,
         3.5100,  0.5600,  9.4000]), tensor(1))


### Create a Dataloader for pytorch 

In [11]:
dataloader = DataLoader(dataset=dataset, batch_size =10, shuffle=False, num_workers=0)
# Make iterator
dataiter = iter(dataloader)
data = dataiter.next()
features, labels = data
#print(features, labels)
print(features.shape)
print(labels)
print([classes[i] for i in labels])

torch.Size([10, 11])
tensor([1, 1, 1, 1, 1, 1, 1, 2, 2, 1])
['mid', 'mid', 'mid', 'mid', 'mid', 'mid', 'mid', 'low', 'low', 'mid']


### Create a neural network

In [12]:
class Network(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=11, out_features=100)
        self.fc2 = nn.Linear(in_features=100, out_features=100)
        self.out = nn.Linear(in_features=100, out_features=3)

    
    def forward(self,t):
        t = self.fc1(t)
        t = F.relu(t)
        t = self.fc2(t)
        t = F.relu(t)
        t = self.out(t)
        return t  

### Create functions to qualify the predictions from the model

In [13]:
def get_num_correct(preds,labels):
    return preds.argmax(dim=1).eq(labels).sum().item()
def get_balanced_accuracy(preds,labels):
    return metrics.balanced_accuracy_score(labels, preds.argmax(dim=1))

In [14]:
dataloader = DataLoader(dataset=dataset, batch_size =10, shuffle=False, num_workers=0)
# Make iterator
dataiter = iter(dataloader)
data = dataiter.next()
features, labels = data
#print(features, labels)
print(features.shape)
print(labels)

torch.Size([10, 11])
tensor([1, 1, 1, 1, 1, 1, 1, 2, 2, 1])



### Check dimensions of predictions of the nn model

In [15]:
network = Network()
lambda_l2 = 1e-5
learning_rate = 0.0009
pred = network(features)
pred

tensor([[ 0.6584, -1.7469,  0.6928],
        [ 1.6747, -3.6838,  1.0420],
        [ 1.3354, -2.9418,  1.0588],
        [ 1.3747, -3.2077,  1.1540],
        [ 0.6584, -1.7469,  0.6928],
        [ 0.8466, -2.1055,  0.7932],
        [ 1.4386, -3.2121,  1.1394],
        [-0.0185, -1.1520,  0.3291],
        [ 0.0472, -0.8985,  0.4027],
        [ 2.9753, -6.1572,  2.1157]], grad_fn=<AddmmBackward>)

### All looks good, now run a training loop 

In [16]:
network = Network()
lambda_l2 = 1e-5
learning_rate = 0.0009

train_loader = torch.utils.data.DataLoader(dataset, batch_size=100)
optimizer = optim.Adam(network.parameters(), lr=learning_rate, weight_decay=1e-5)

for epoch in range(10):
    total_loss = 0
    total_correct = 0

    for batch in train_loader: # Get Batch
        features, labels = batch 
        
        preds = network(features)# Pass Batch

        loss = F.cross_entropy(preds, labels) # Calculate Loss

        optimizer.zero_grad()
        loss.backward() # Calculate Gradients
        optimizer.step() # Update Weights using Adam

        total_loss += loss.item()
        total_correct += get_num_correct(preds, labels) 
    
    bal_acc = get_balanced_accuracy(preds,labels)
    
    if epoch >4:
        print("epoch", epoch)
        print(    
            "total_correct:", total_correct,
            "total_pcntg: ",total_correct/1599,
            "loss:", total_loss,
            "balancd_acc",bal_acc
        )

epoch 5
total_correct: 1319 total_pcntg:  0.8248905565978737 loss: 8.886488109827042 balancd_acc 0.3333333333333333
epoch 6
total_correct: 1319 total_pcntg:  0.8248905565978737 loss: 8.847149312496185 balancd_acc 0.3333333333333333
epoch 7
total_correct: 1319 total_pcntg:  0.8248905565978737 loss: 8.801017224788666 balancd_acc 0.3333333333333333
epoch 8
total_correct: 1319 total_pcntg:  0.8248905565978737 loss: 8.729695439338684 balancd_acc 0.3333333333333333
epoch 9
total_correct: 1319 total_pcntg:  0.8248905565978737 loss: 8.71617916226387 balancd_acc 0.3333333333333333


### This all runs. Now try another model

In [17]:
import random
from IPython import display
from tqdm.notebook import tqdm
seed = 12345
random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fad1ac1b930>

In [18]:
X = torch.tensor((df.drop(['quality','wclass','wclass_num'], axis =1)).values).float()
y = torch.tensor((df['wclass_num']-df['wclass_num'].min()).astype('category'))

In [19]:
D = 11  # dimensions
C = 3  # num_classes
H1 = 100  # num_hidden_units
H2 = 100  # num_hidden_units
learning_rate = 5e-3
lambda_l2 = 1e-5
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [20]:
# nn package to create our linear model
# each Linear module has a weight and bias

model = nn.Sequential(
    nn.Linear(D, H1),
    nn.ReLU(),
    nn.Linear(H1, H2),
    nn.ReLU(),
    nn.Linear(H2,C)
)
model.to(device)

# nn package also has different loss functions.
# we use cross entropy loss for our classification task
criterion = torch.nn.CrossEntropyLoss()

# we use the optim package to apply
# ADAM for our parameter updates
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=lambda_l2) # built-in L2

# e = 1.  # plotting purpose

# Training
for t in tqdm(range(100)):
    
    # Feed forward to get the logits
    y_pred = model(X)
    
    # Compute the loss and accuracy
    loss = criterion(y_pred, y)
    score, predicted = torch.max(y_pred, 1)
    acc = (y == predicted).sum().float() / len(y)
    if t>90:
        tqdm.write("[EPOCH]    : %i, [LOSS]: %.6f, [ACCURACY]: %.3f" % (t, loss.item(), acc))
    #display.clear_output(wait=True)
    
    # zero the gradients before running
    # the backward pass.
    optimizer.zero_grad()
    
    # Backward pass to compute the gradient
    # of loss w.r.t our learnable params. 
    loss.backward()
    
    # Update params
    optimizer.step()

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))

[EPOCH]    : 91, [LOSS]: 0.419184, [ACCURACY]: 0.843
[EPOCH]    : 92, [LOSS]: 0.418786, [ACCURACY]: 0.846
[EPOCH]    : 93, [LOSS]: 0.420002, [ACCURACY]: 0.841
[EPOCH]    : 94, [LOSS]: 0.417826, [ACCURACY]: 0.850
[EPOCH]    : 95, [LOSS]: 0.415325, [ACCURACY]: 0.842
[EPOCH]    : 96, [LOSS]: 0.411437, [ACCURACY]: 0.849
[EPOCH]    : 97, [LOSS]: 0.409786, [ACCURACY]: 0.849
[EPOCH]    : 98, [LOSS]: 0.410212, [ACCURACY]: 0.841
[EPOCH]    : 99, [LOSS]: 0.410683, [ACCURACY]: 0.852



In [23]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wclass,wclass_num
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,mid,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,mid,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,mid,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,mid,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,mid,1


### This runs too, so we go ahead and write  the python files

In [24]:
quality_mapping = {
    3: 0,
    4: 1,
    5: 2,
    6: 3,
    7: 4,
    8: 5
}

# you can use the map function of pandas with
# any dictionary to convert the values in a given
# column to values in the dictionary
df.loc[:, "quality"] = df.quality.map(quality_mapping)

In [25]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wclass,wclass_num
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2,mid,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,2,mid,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,2,mid,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,3,mid,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,2,mid,1
