In [1]:
import csv
import os
import shutil
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

from google.colab import drive
from google.colab.patches import cv_imshow

# Mount Google Drive at /content/drive so the dataset archive stored under
# MyDrive can be read from this Colab runtime.
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Zipped wine dataset on the mounted Drive, and the local directory it is
# unpacked into.
FILE_PATH = "/content/drive/MyDrive/dataset/pytorch/tabular-wine.zip"
EXTRACTOR_DIR = "/content/data/"

# The archive format is inferred from the file extension.
shutil.unpack_archive(filename=FILE_PATH, extract_dir=EXTRACTOR_DIR)

In [4]:
# Build the CSV path from the extraction directory so loading does not depend
# on the notebook's current working directory.
WINE_CSV_PATH = os.path.join(EXTRACTOR_DIR, "winequality-white.csv")

# Read only the header row for the column names; the context manager makes
# sure the file handle is closed again.
with open(WINE_CSV_PATH, newline="") as csv_file:
    col_list = next(csv.reader(csv_file, delimiter=";"))

# Load the numeric body (header skipped) as float32.
wine_numpy = np.loadtxt(WINE_CSV_PATH, dtype=np.float32, delimiter=";", skiprows=1)

# Every data column must have a name.
assert wine_numpy.shape[1] == len(col_list), "header / data column count mismatch"
wine_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [7]:
# Wrap the NumPy array as a torch tensor and inspect its values, size and dtype.
wineq = torch.from_numpy(wine_numpy)
wineq, wineq.size(), wineq.dtype

(tensor([[ 7.0000,  0.2700,  0.3600,  ...,  0.4500,  8.8000,  6.0000],
         [ 6.3000,  0.3000,  0.3400,  ...,  0.4900,  9.5000,  6.0000],
         [ 8.1000,  0.2800,  0.4000,  ...,  0.4400, 10.1000,  6.0000],
         ...,
         [ 6.5000,  0.2400,  0.1900,  ...,  0.4600,  9.4000,  6.0000],
         [ 5.5000,  0.2900,  0.3000,  ...,  0.3800, 12.8000,  7.0000],
         [ 6.0000,  0.2100,  0.3800,  ...,  0.3200, 11.8000,  6.0000]]),
 torch.Size([4898, 12]),
 torch.float32)

In [20]:
# Split the table: all columns but the last are the input features; the last
# column (the quality score) becomes the target, cast to int64.
data, target = wineq[:, :-1], wineq[:, -1].to(torch.long)
data.shape, target.shape

(torch.Size([4898, 11]), torch.Size([4898]))

In [21]:
# One-hot encoding: one row per sample, one column per possible quality score.
# Scores are used directly as column indices when the tensor is filled, so
# every score must lie in [0, N_QUALITY_CLASSES).
N_QUALITY_CLASSES = 10
assert 0 <= target.min().item() and target.max().item() < N_QUALITY_CLASSES, \
    "quality score outside the one-hot column range"
target_onehot = torch.zeros(target.shape[0], N_QUALITY_CLASSES)
target_onehot.shape

torch.Size([4898, 10])

In [22]:
# Fill in place: for each row, write 1.0 into the column whose index equals
# that sample's quality score. The index must have the same number of
# dimensions as the destination, hence the added trailing axis.
target_onehot.scatter_(dim=1, index=target[:, None], value=1.0)
target_onehot[0]

tensor([0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])

In [23]:
# Per-feature mean, reducing over the sample axis (dim 0).
data_mean = data.mean(dim=0)
data_mean

tensor([6.8548e+00, 2.7824e-01, 3.3419e-01, 6.3914e+00, 4.5772e-02, 3.5308e+01,
        1.3836e+02, 9.9403e-01, 3.1883e+00, 4.8985e-01, 1.0514e+01])

In [24]:
# Per-feature variance, reducing over the sample axis (dim 0).
data_var = data.var(dim=0)
data_var

tensor([7.1211e-01, 1.0160e-02, 1.4646e-02, 2.5726e+01, 4.7733e-04, 2.8924e+02,
        1.8061e+03, 8.9455e-06, 2.2801e-02, 1.3025e-02, 1.5144e+00])

In [25]:
# Standardize each feature: subtract its mean and divide by its standard
# deviation (square root of the variance). Broadcasting applies the
# per-column statistics to every row.
data_normalized = (data - data_mean) / data_var.sqrt()
data_normalized, data_normalized.shape

(tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
          -3.4915e-01, -1.3930e+00],
         [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
           1.3422e-03, -8.2419e-01],
         [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
          -4.3677e-01, -3.3663e-01],
         ...,
         [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
          -2.6153e-01, -9.0545e-01],
         [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
          -9.6251e-01,  1.8574e+00],
         [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
          -1.4882e+00,  1.0448e+00]]), torch.Size([4898, 11]))

In [26]:
# Finding thresholds
# Boolean mask selecting the samples whose quality score is 3 or lower.
bad_indexes = target.le(3)
bad_indexes.shape, bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [27]:
# Keep only the rows flagged by the boolean mask; all feature columns are retained.
bad_data = data[bad_indexes, :]
bad_data.size()

torch.Size([20, 11])

In [30]:
# Partition the samples into three quality tiers:
#   bad: score <= 3, mid: 3 < score < 7, good: score >= 7.
bad_data = data[target.le(3)]
mid_data = data[target.gt(3) & target.lt(7)]
good_data = data[target.ge(7)]

# Per-feature mean of each tier.
bad_mean, mid_mean, good_mean = (
    tier.mean(dim=0) for tier in (bad_data, mid_data, good_data)
)

# One table row per feature: index, name, then the bad / mid / good means.
# zip stops at the shortest input, so the extra name in col_list is not printed.
row_format = "{:2} {:20} {:6.2f} {:6.2f} {:6.2f}"
for i, (name, bad, mid, good) in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print(row_format.format(i, name, bad, mid, good))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [31]:
# Threshold on total sulfur dioxide; 141.83 is the mid-tier mean of that
# feature from the table printed above. Samples below the threshold are
# flagged in predicted_indexes.
total_sulfur_threshold = 141.83

# Look the column up by name instead of hard-coding its position, so the
# selection stays correct if the column order ever changes.
total_sulfur_col = col_list.index("total sulfur dioxide")
total_sulfur_Data = data[:, total_sulfur_col]

predicted_indexes = total_sulfur_Data < total_sulfur_threshold
predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [32]:
# Ground-truth mask: samples whose quality score is above 5.
actual_indexes = target.gt(5)
actual_indexes, actual_indexes.dtype, actual_indexes.sum()

(tensor([True, True, True,  ..., True, True, True]), torch.bool, tensor(3258))

In [33]:
# Counts as plain Python ints:
#   n_matches   - samples flagged by both the prediction and the ground truth
#   n_predicted - samples flagged by the prediction
#   n_actual    - samples flagged by the ground truth
n_matches = (actual_indexes & predicted_indexes).sum().item()
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()

In [None]:
# Report the match count with both ratios: the fraction of predicted samples
# that are actual matches (precision) and the fraction of actual samples that
# were predicted (recall). n_actual was computed but previously unused.
n_matches, n_matches / n_predicted, n_matches / n_actual