In [3]:
import torch
import numpy as np
import csv

In [6]:
# Location of the white-wine quality CSV (';'-separated, header on the first row).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
wine_path = r'D:\InterestingProgramming\python\Pytorch深度学习实战\data\p1ch4\tabular-wine\winequality-white.csv'
# Skip the header row and parse every remaining field as float32.
wine_numpy = np.loadtxt(
    fname=wine_path,
    delimiter=';',
    skiprows=1,
    dtype=np.float32,
)
wine_numpy.shape

(4898, 12)

In [8]:
# Read only the first row of the CSV (the header) to get the column names.
# The file is opened in a `with` block so the handle is closed deterministically
# once the header has been read, and with newline='' as the csv module
# recommends for file objects passed to csv.reader.
with open(wine_path, newline='') as wine_file:
    col_list = next(csv.reader(wine_file, delimiter=';'))
col_list

['fixed acidity',
 'volatile acidity',
 'citric acid',
 'residual sugar',
 'chlorides',
 'free sulfur dioxide',
 'total sulfur dioxide',
 'density',
 'pH',
 'sulphates',
 'alcohol',
 'quality']

In [9]:
# Convert the NumPy array into a PyTorch tensor, then show its shape and dtype.
wineq = torch.from_numpy(wine_numpy)
wineq.size(), wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [10]:
# Features: every column except the last one.
data = wineq[:, :-1]
# Target: the last column of the table.
target = wineq[:, -1]
target.shape, data.shape

(torch.Size([4898]), torch.Size([4898, 11]))

In [17]:
# Build a one-hot encoding of the target.
# The scores are cast to integers (long) so they can be used as scatter indices.
target = target.long()
target_onehot = torch.zeros(len(target), 10)
# For each row i, write 1.0 into column target[i]; the index tensor gets an
# extra trailing dimension so it has the same rank as target_onehot.
target_onehot.scatter_(1, target.unsqueeze(1), 1.0)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 1., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [18]:
# Standardize each column: subtract the per-column mean and divide by the
# per-column standard deviation (the square root of the variance).
data_mean = data.mean(dim=0)
data_var = data.var(dim=0)
data_normalized = (data - data_mean) / data_var.sqrt()
data_normalized

tensor([[ 1.7208e-01, -8.1761e-02,  2.1326e-01,  ..., -1.2468e+00,
         -3.4915e-01, -1.3930e+00],
        [-6.5743e-01,  2.1587e-01,  4.7996e-02,  ...,  7.3995e-01,
          1.3422e-03, -8.2419e-01],
        [ 1.4756e+00,  1.7450e-02,  5.4378e-01,  ...,  4.7505e-01,
         -4.3677e-01, -3.3663e-01],
        ...,
        [-4.2043e-01, -3.7940e-01, -1.1915e+00,  ..., -1.3130e+00,
         -2.6153e-01, -9.0545e-01],
        [-1.6054e+00,  1.1666e-01, -2.8253e-01,  ...,  1.0049e+00,
         -9.6251e-01,  1.8574e+00],
        [-1.0129e+00, -6.7703e-01,  3.7852e-01,  ...,  4.7505e-01,
         -1.4882e+00,  1.0448e+00]])

In [20]:
# Explore whether the data can tell good wines from bad ones.
# Wines with a quality score of 3 or lower are treated as bad.
bad_indices = torch.le(target, 3)
# Number of wines flagged as bad.
float(bad_indices.sum())

20.0

In [21]:
# Keep only the feature rows selected by the bad-wine boolean mask.
bad_data = data[bad_indices, :]
bad_data.size()

torch.Size([20, 11])

In [23]:
# Select the mid-range (score above 3 and below 7) and good (score 7 or higher)
# wines with boolean masks.
good_data = data[target >= 7]
mid_data = data[(target > 3) & (target < 7)]

# Per-column mean of each quality group.
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

# One line per feature column: index, column name, then the bad / mid / good means.
for idx, (name, bad, mid, good) in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:25} {:6.2f} {:6.2f} {:6.2f}'.format(idx, name, bad, mid, good))

 0 fixed acidity               7.60   6.89   6.73
 1 volatile acidity            0.33   0.28   0.27
 2 citric acid                 0.34   0.34   0.33
 3 residual sugar              6.39   6.71   5.26
 4 chlorides                   0.05   0.05   0.04
 5 free sulfur dioxide        53.33  35.42  34.55
 6 total sulfur dioxide      170.60 141.83 125.25
 7 density                     0.99   0.99   0.99
 8 pH                          3.19   3.18   3.22
 9 sulphates                   0.47   0.49   0.50
10 alcohol                    10.34  10.26  11.42


In [24]:
# The column at index 6 (total sulfur dioxide) differs noticeably between the
# good and bad groups, so the mid-group mean of that column is used as a threshold.
total_sulfur_data = data[:, 6]
total_sulfur_threshold = mid_mean[6]
# Flag the wines whose total sulfur dioxide is below the threshold.
predicted_indices = total_sulfur_data < total_sulfur_threshold

predicted_indices.shape, predicted_indices.dtype, predicted_indices.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [25]:
# Mask of the wines whose quality score is above 5.
actual_indices = torch.gt(target, 5)

actual_indices.shape, actual_indices.dtype, actual_indices.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [None]:
# Count the wines selected by both masks, and the size of each mask on its own.
n_matches = (actual_indices & predicted_indices).sum().item()
n_predicted = predicted_indices.sum().item()
n_actual = actual_indices.sum().item()

# Matches, then matches as a fraction of the predicted set and of the actual set.
n_matches, n_matches / n_predicted, n_matches / n_actual