# Read table

In [153]:
import numpy as np
import torch
torch.set_printoptions(edgeitems=2, precision=2, linewidth=75)

In [154]:
import pandas as pd
# Path of the semicolon-separated wine table; preview the first rows after loading
wine_path = "../data/p1ch4/tabular-wine/winequality-white.csv"
df = pd.read_csv(wine_path, delimiter=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [155]:
# Convert the whole table into a single float32 NumPy array
wineq_numpy = df.to_numpy(dtype=np.float32)
wineq_numpy

array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  6.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  6.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  6.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  6.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  7.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  6.  ]], dtype=float32)

👆

值得一提的是, 以下是书中的代码:

```python
import csv
wine_path = "../data/p1ch4/tabular-wine/winequality-white.csv"
wineq_numpy = np.loadtxt(wine_path, dtype=np.float32, delimiter=";",
                         skiprows=1)
col_list = next(csv.reader(open(wine_path), delimiter=';'))
print(wineq_numpy.shape, col_list)
```


In [156]:
# Column headers of the table as a plain Python list
col_list = list(df.columns)

wineq_numpy.shape, col_list

((4898, 12),
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol',
  'quality'])

👆

原代码:

```python
col_list = next(csv.reader(open(wine_path), delimiter=';'))

wineq_numpy.shape, col_list
```

In [157]:
# Wrap the NumPy array in a torch tensor and check its shape and dtype
wineq = torch.from_numpy(wineq_numpy)

wineq.size(), wineq.dtype

(torch.Size([4898, 12]), torch.float32)

In [158]:
# Feature matrix: every column except the last one
data = wineq[:, :-1]
data, data.size()

(tensor([[ 7.00,  0.27,  ...,  0.45,  8.80],
         [ 6.30,  0.30,  ...,  0.49,  9.50],
         ...,
         [ 5.50,  0.29,  ...,  0.38, 12.80],
         [ 6.00,  0.21,  ...,  0.32, 11.80]]),
 torch.Size([4898, 11]))

👆

因为我们需要预测的是最后一列, 即 `quality`, 所以需要把它和特征分开: 上面的 `data` 只保留前 11 列特征, 而 `quality` 这一列会在下面单独取出作为 `target`.

In [159]:
# Labels: the last column, cast to 64-bit integers
target = wineq[:, -1].to(torch.long)
target, target.size()

(tensor([6, 6,  ..., 7, 6]), torch.Size([4898]))

In [160]:
# Find how many distinct target values there are
unique_target = target.unique(sorted=True)
unique_target

tensor([3, 4, 5, 6, 7, 8, 9])

In [161]:
# Shift the labels so that the smallest class becomes 0.
# The smallest label is 3 here (see the unique values shown above), so the
# first run is equivalent to `target - 3`. Subtracting the current minimum
# instead of a literal keeps this cell idempotent: running it again
# subtracts 0 and leaves `target` unchanged.
target = target - target.min()
unique_target = torch.unique(target, sorted=True)
unique_target

tensor([0, 1, 2, 3, 4, 5, 6])

```python
target = target - 3
```

`target - 3`的操作是在确定了`target`的最大值为`9`而最小值为`3`的情况下, 将其平移为从`0`开始的连续类别索引 (`0`到`6`). 这样后面的 one-hot 编码只需要`7`列 (而不是`10`列), 既减少了所需的内存空间, 也保证了`scatter_`使用的索引落在`[0, 7)`的范围内.

In [162]:
# All-zero matrix with one row per sample and one column per class.
# The labels are later used as column indices by scatter_, so the width is
# derived from the largest label instead of a hard-coded 7.
num_classes = int(target.max()) + 1
target_onehot = torch.zeros(target.shape[0], num_classes)
target_onehot[:3, :]

tensor([[0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0.]])

In [163]:
# Add a trailing axis so the labels form a single column: (N,) -> (N, 1)
target_unsqueezed = target[:, None]
target_unsqueezed

tensor([[3],
        [3],
        ...,
        [4],
        [3]])

In [164]:
# In place: for every row, write 1.0 into the column selected by that row's label
target_onehot.scatter_(dim=1, index=target.unsqueeze(1), value=1.0)

tensor([[0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.],
        ...,
        [0., 0.,  ..., 0., 0.],
        [0., 0.,  ..., 0., 0.]])

```python
.scatter_(a, b, c)
```

其中:

- `a`: 指定维度
- `b`: 指定索引
- `c`: 指定值

即, 沿着维度 `a`, 把索引 `b` 所指向位置的值赋为 `c`. 例如上面的调用中 `a = 1`, 所以对每一行 `i`, 都会把 `target_onehot[i][target[i]]` 置为 `1.0`, 从而得到 one-hot 编码.

## 标准化

In [165]:
# Mean of each column, taken over all rows
data_mean = data.mean(dim=0)
data_mean

tensor([6.85e+00, 2.78e-01, 3.34e-01, 6.39e+00, 4.58e-02, 3.53e+01,
        1.38e+02, 9.94e-01, 3.19e+00, 4.90e-01, 1.05e+01])

In [166]:
# Variance of each column, taken over all rows
data_var = data.var(dim=0)
data_var

tensor([7.12e-01, 1.02e-02, 1.46e-02, 2.57e+01, 4.77e-04, 2.89e+02,
        1.81e+03, 8.95e-06, 2.28e-02, 1.30e-02, 1.51e+00])

也可以用:

```python
>>> data_std = torch.std(data, dim=0)
>>> print(data_std)
tensor([8.44e-01, 1.01e-01, 1.21e-01, 5.07e+00, 2.18e-02, 1.70e+01,
        4.25e+01, 2.99e-03, 1.51e-01, 1.14e-01, 1.23e+00])
```

对此我们可以进行简要的检查:

```python
>>> 8.44 * 8.44 / 100
0.712336
```

其结果 (约 `0.712`) 与`data_var`的第一个元素 (`7.12e-01`) 一致, 即标准差的平方等于方差. 所以我们可以判断: 该标准化的操作也可以用std来实现, 也就是直接除以`data_std`, 而不必写`torch.sqrt(data_var)`.

In [167]:
# Standardize each column: subtract its mean, then divide by its standard deviation
data_normalized = (data - data_mean) / data_var.sqrt()
data_normalized

tensor([[ 1.72e-01, -8.18e-02,  ..., -3.49e-01, -1.39e+00],
        [-6.57e-01,  2.16e-01,  ...,  1.34e-03, -8.24e-01],
        ...,
        [-1.61e+00,  1.17e-01,  ..., -9.63e-01,  1.86e+00],
        [-1.01e+00, -6.77e-01,  ..., -1.49e+00,  1.04e+00]])

## 预测

In [172]:
# Boolean mask of the samples whose label equals 0
bad_indexes = torch.eq(target, 0)
bad_indexes.size(), bad_indexes.dtype, bad_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(20))

In [171]:
# Keep only the rows selected by the boolean mask
bad_data = data[bad_indexes]
bad_data.size()

torch.Size([20, 11])

In [174]:
# Split the samples into three groups according to their label
bad_data = data[torch.eq(target, 0)]
mid_data = data[torch.logical_and(target > 0, target < 4)]
good_data = data[torch.ge(target, 4)]

bad_data.size(), mid_data.size(), good_data.size()

(torch.Size([20, 11]), torch.Size([3818, 11]), torch.Size([1060, 11]))

In [176]:
# Per-column mean of each of the three groups
bad_mean = bad_data.mean(dim=0)
mid_mean = mid_data.mean(dim=0)
good_mean = good_data.mean(dim=0)

bad_mean.size(), mid_mean.size(), good_mean.size()

(torch.Size([11]), torch.Size([11]), torch.Size([11]))

In [177]:
# Print one row per feature: its index, its name, and the bad / mid / good group means.
# zip stops at its shortest input, so iteration ends with the 11 entries of the mean tensors.
for i, args in enumerate(zip(col_list, bad_mean, mid_mean, good_mean)):
    print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))

 0 fixed acidity          7.60   6.89   6.73
 1 volatile acidity       0.33   0.28   0.27
 2 citric acid            0.34   0.34   0.33
 3 residual sugar         6.39   6.71   5.26
 4 chlorides              0.05   0.05   0.04
 5 free sulfur dioxide   53.33  35.42  34.55
 6 total sulfur dioxide 170.60 141.83 125.25
 7 density                0.99   0.99   0.99
 8 pH                     3.19   3.18   3.22
 9 sulphates              0.47   0.49   0.50
10 alcohol               10.34  10.26  11.42


这段代码同时遍历四个序列: 列表`col_list`, 以及张量`bad_mean`，`mid_mean`和`good_mean`，并打印出每个位置上的信息。由于`zip`会在最短的序列耗尽时停止, `col_list`中多出的最后一项`quality`不会被打印。

`enumerate`函数会返回一个迭代器，每次产生一个`(索引, 值)`元组：第一个元素是当前元素的索引`i`，第二个元素是当前元素的值。在这个例子中，这个值就是`args`，它本身是一个元组，包含了`col_list`，`bad_mean`，`mid_mean`和`good_mean`在当前索引下的元素。

`zip`函数会将多个列表的元素按照相同的索引组合在一起，返回一个元组的迭代器。在这个例子中，`zip(col_list, bad_mean, mid_mean, good_mean)`会返回一个元组的迭代器，每个元组包含了四个列表在相同索引下的元素。

`print`函数用于打印信息。在这个例子中，`print('{:2} {:20} {:6.2f} {:6.2f} {:6.2f}'.format(i, *args))`会打印出一个格式化的字符串，其中`{:2}`，`{:20}`，`{:6.2f}`，`{:6.2f}`和`{:6.2f}`是占位符，它们会被`format(i, *args)`中的参数替换。`*args`是一个解包操作，它会将`args`元组中的元素解包成单独的参数。

总的来说，这段代码的作用是打印出每个特征的名称（来自`col_list`），以及在`bad_mean`，`mid_mean`和`good_mean`中对应的值。

---

接下来, 书中选择了用`total sulfur dioxide`来进行预测.

👇

In [178]:
# Threshold on total sulfur dioxide; 141.83 is the mid-group mean printed in the table above.
total_sulfur_threshold = 141.83
# Look the column up by name instead of hard-coding position 6.
total_sulfur_day = data[:, col_list.index('total sulfur dioxide')]
# Flag the samples whose value lies strictly below the threshold
predicted_indexes = torch.lt(total_sulfur_day, total_sulfur_threshold)

predicted_indexes.shape, predicted_indexes.dtype, predicted_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(2727))

In [180]:
# Ground-truth mask: samples whose label is greater than 2
actual_indexes = torch.gt(target, 2)
actual_indexes.size(), actual_indexes.dtype, actual_indexes.sum()

(torch.Size([4898]), torch.bool, tensor(3258))

In [181]:
# Number of samples flagged by both masks, as a plain Python int
n_matches = torch.sum(actual_indexes & predicted_indexes).item()

- `&`操作符在PyTorch中逐元素执行按位与操作。这里两个操作数都是布尔张量 (`torch.bool`)，因此它等价于逐元素的逻辑与：只有当一对对应的元素都为真时，结果才为真；否则结果为假。
- `torch.sum`函数计算张量中所有元素的总和。在这个例子中，它计算的是按位与操作的结果中的真值（即1）的总数。
- `.item()`方法将一个只包含一个元素的张量转换为一个Python标量。在这个例子中，它将总和的张量转换为一个Python的整数。

所以，`n_matches`的值是`actual_indexes`和`predicted_indexes`中对应位置都为真的元素的数量，即被预测为真且实际也为真的样本数 (真正例, true positives)，其中`actual_indexes`是实际的标签，`predicted_indexes`是基于阈值得到的预测。注意它并不等于全部预测正确的数量，因为两者同时为假的样本没有被计入。

In [182]:
# Number of True entries in each mask, as plain Python ints
n_predicted = predicted_indexes.sum().item()
n_actual = actual_indexes.sum().item()

In [183]:
# Matches, then matches as a share of the predictions (precision)
# and as a share of the actual positives (recall)
precision = n_matches / n_predicted
recall = n_matches / n_actual
n_matches, precision, recall

(2018, 0.74000733406674, 0.6193984039287906)