In [1]:
import numpy as np
import torch
torch.set_printoptions(edgeitems=2,precision=2,linewidth=75)

In [2]:
import pandas as pd
import csv
wine_path="../data/p1ch4/tabular-wine/winequality-white.csv"

In [3]:
wineq_numpy = np.loadtxt(wine_path,dtype=np.float32,delimiter=";",
                         skiprows=1)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

In [4]:
col_list = next(csv.reader(open(wine_path),delimiter=";"))
wineq_numpy.shape,col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

In [5]:
wineq = torch.from_numpy(wineq_numpy)
wineq.shape,wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [6]:
data = wineq[:,:-1]
data,data.shape

(tensor([[ 7.00,  0.27,  ...,  0.45,  8.80],
         [ 6.30,  0.30,  ...,  0.49,  9.50],
         ...,
         [ 5.50,  0.29,  ...,  0.38, 12.80],
         [ 6.00,  0.21,  ...,  0.32, 11.80]]),
 torch.Size([4898, 11]))

In [7]:
target = wineq[:,-1]
target,target.shape

(tensor([6., 6.,  ..., 7., 6.]), torch.Size([4898]))

In [8]:
target = wineq[:,-1].long()
target

tensor([6, 6,  ..., 7, 6])

In [9]:
target.min(),target.max()

(tensor(3), tensor(9))

In [10]:
target_onehot = torch.zeros(target.shape[0],10)

In [11]:
temp = target.unsqueeze(1)
target_onehot.scatter_(1,temp,1.0),temp

(tensor([[0., 0.,  ..., 0., 0.],
         [0., 0.,  ..., 0., 0.],
         ...,
         [0., 0.,  ..., 0., 0.],
         [0., 0.,  ..., 0., 0.]]),
 tensor([[6],
         [6],
         ...,
         [7],
         [6]]))

In [12]:
target_onehot[0:3],target[0:3]

(tensor([[0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]]),
 tensor([6, 6, 6]))

In [13]:
target_unsqueezed = target.unsqueeze(1)
target_unsqueezed,target,target.shape,target_unsqueezed.shape

(tensor([[6],
         [6],
         ...,
         [7],
         [6]]),
 tensor([6, 6,  ..., 7, 6]),
 torch.Size([4898]),
 torch.Size([4898, 1]))

In [15]:
data_mean = torch.mean(data,dim=0)
data_mean.shape,data_mean

(torch.Size([11]),
 tensor([6.85e+00, 2.78e-01, 3.34e-01, 6.39e+00, 4.58e-02, 3.53e+01,
         1.38e+02, 9.94e-01, 3.19e+00, 4.90e-01, 1.05e+01]))

In [16]:
data_std = torch.std(data,dim=0) # torch.std 计算标准差
data_std

tensor([8.44e-01, 1.01e-01, 1.21e-01, 5.07e+00, 2.18e-02, 1.70e+01,
        4.25e+01, 2.99e-03, 1.51e-01, 1.14e-01, 1.23e+00])

In [17]:
data_var = torch.var(data,dim=0)
data_var

tensor([7.12e-01, 1.02e-02, 1.46e-02, 2.57e+01, 4.77e-04, 2.89e+02,
        1.81e+03, 8.95e-06, 2.28e-02, 1.30e-02, 1.51e+00])

In [18]:
data_normalized = data - data_mean /torch.sqrt(data_var)
data_normalized

tensor([[-1.12, -2.49,  ..., -3.84,  0.26],
        [-1.82, -2.46,  ..., -3.80,  0.96],
        ...,
        [-2.62, -2.47,  ..., -3.91,  4.26],
        [-2.12, -2.55,  ..., -3.97,  3.26]])

In [20]:
bad_indexs = target<=3
bad_indexs.shape,bad_indexs.dtype,bad_indexs.sum(),bad_indexs,target

(torch.Size([4898]),
 torch.bool,
 tensor(20),
 tensor([False, False,  ..., False, False]),
 tensor([6, 6,  ..., 7, 6]))

In [22]:
bad_data = data[bad_indexs]
bad_data.shape,bad_data,data

(torch.Size([20, 11]),
 tensor([[8.50e+00, 2.60e-01, 2.10e-01, 1.62e+01, 7.40e-02, 4.10e+01,
          1.97e+02, 9.98e-01, 3.02e+00, 5.00e-01, 9.80e+00],
         [5.80e+00, 2.40e-01, 4.40e-01, 3.50e+00, 2.90e-02, 5.00e+00,
          1.09e+02, 9.91e-01, 3.53e+00, 4.30e-01, 1.17e+01],
         [9.10e+00, 5.90e-01, 3.80e-01, 1.60e+00, 6.60e-02, 3.40e+01,
          1.82e+02, 9.97e-01, 3.23e+00, 3.80e-01, 8.50e+00],
         [7.10e+00, 3.20e-01, 3.20e-01, 1.10e+01, 3.80e-02, 1.60e+01,
          6.60e+01, 9.94e-01, 3.24e+00, 4.00e-01, 1.15e+01],
         [6.90e+00, 3.90e-01, 4.00e-01, 4.60e+00, 2.20e-02, 5.00e+00,
          1.90e+01, 9.92e-01, 3.31e+00, 3.70e-01, 1.26e+01],
         [1.03e+01, 1.70e-01, 4.70e-01, 1.40e+00, 3.70e-02, 5.00e+00,
          3.30e+01, 9.94e-01, 2.89e+00, 2.80e-01, 9.60e+00],
         [7.90e+00, 6.40e-01, 4.60e-01, 1.06e+01, 2.44e-01, 3.30e+01,
          2.27e+02, 9.98e-01, 2.87e+00, 7.40e-01, 9.10e+00],
         [8.30e+00, 3.30e-01, 4.20e-01, 1.15e+00, 3.30e-02, 

In [26]:
bad_data = data[target<=3]
min_data = data[(target>3)&(target<7)]
good_data = data[target>=7]
bad_data.shape,min_data.shape,good_data.shape,data.shape,target.shape,target,target_unsqueezed.shape

(torch.Size([20, 11]),
 torch.Size([3818, 11]),
 torch.Size([1060, 11]),
 torch.Size([4898, 11]),
 torch.Size([4898]),
 tensor([6, 6,  ..., 7, 6]),
 torch.Size([4898, 1]))

In [27]:
bad_mean =torch.mean(bad_data,dim=0)
mid_mean = torch.mean(min_data,dim=0)
good_mean = torch.mean(good_data,dim=0)

In [28]:
bad_mean

tensor([7.60e+00, 3.33e-01, 3.36e-01, 6.39e+00, 5.43e-02, 5.33e+01,
        1.71e+02, 9.95e-01, 3.19e+00, 4.75e-01, 1.03e+01])

In [29]:
mid_mean

tensor([6.89e+00, 2.82e-01, 3.36e-01, 6.71e+00, 4.78e-02, 3.54e+01,
        1.42e+02, 9.94e-01, 3.18e+00, 4.87e-01, 1.03e+01])

In [30]:
good_mean

tensor([6.73e+00, 2.65e-01, 3.26e-01, 5.26e+00, 3.82e-02, 3.46e+01,
        1.25e+02, 9.92e-01, 3.22e+00, 5.00e-01, 1.14e+01])

In [32]:
for i,args in enumerate(zip(col_list,bad_mean,mid_mean,good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i,*args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


In [33]:
total_sulfur_threshold =141.83
total_sulfur_data = data[:,6]
predicted_indexes = torch.lt(total_sulfur_data,total_sulfur_threshold)

In [34]:
predicted_indexes.shape,predicted_indexes.dtype,predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [35]:
actual_index = target >5
actual_index.shape, actual_index.dtype,actual_index.sum()#actual_index为布尔类型，则.sum()返回True元素数量

(torch.Size([4898]), torch.bool, tensor(3258))

In [36]:
n_matches = torch.sum(actual_index & predicted_indexes).item()#.
n_matches

2018

In [38]:
n_predicted = torch.sum(predicted_indexes).item()
n_predicted

2727

In [39]:
n_actual =torch.sum(actual_index).item()
n_actual

3258

In [40]:
n_matches / n_predicted,n_matches/n_actual  #precision and recall

(0.74000733406674, 0.6193984039287906)