In [2]:
"""
pandas data converts readily to tensors, so pandas is commonly used for preprocessing in torch workflows.
"""
import torch

In [15]:
import os

"""
Create a small CSV file to serve as the raw dataset.
"""
# Make sure the output directory exists before writing into it.
os.makedirs('data', exist_ok=True)
data_file = os.path.join('data', 'house_tiny.csv')
with open(data_file, 'w') as f:
    f.writelines([
        'NumRooms,Alley,Price\n',  # header row with the column names
        'NA,Pave,127500\n',  # every following row is one data sample
        '2,NA,106000\n',
        '4,NA,178100\n',
        'NA,NA,140000\n',
    ])

In [16]:
import pandas as pd

# Load the CSV written by the previous cell; the literal 'NA' fields come back as NaN.
data = pd.read_csv(filepath_or_buffer=os.path.join('data', 'house_tiny.csv'))
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


In [18]:
# The first two columns (NumRooms, Alley) are the features;
# the third column (Price) is the target.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2]
inputs, outputs

(   NumRooms Alley
 0       NaN  Pave
 1       2.0   NaN
 2       4.0   NaN
 3       NaN   NaN,
 0    127500
 1    106000
 2    178100
 3    140000
 Name: Price, dtype: int64)

In [19]:
"""
For a numeric column such as NumRooms, the mean skips NaN entries, so the result is 3 = (4 + 2) / 2.
A string column such as Alley has no mean.
"""
# Restrict the reduction to numeric columns explicitly. Without numeric_only=True,
# older pandas silently drops the string column Alley with a FutureWarning, and
# pandas >= 2.0 raises TypeError because strings cannot be averaged.
inputs.mean(numeric_only=True)

  inputs.mean()


NumRooms    3.0
dtype: float64

In [21]:
"""
fillna leaves the Alley column untouched, because the column means used as fill values contain no Alley entry.
"""
# Fill missing numeric values with their column mean. numeric_only=True keeps the
# string column Alley out of the mean computation; without it pandas >= 2.0 raises
# TypeError (older versions only warned). Alley therefore keeps its NaN values.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
inputs

  inputs=inputs.fillna(inputs.mean())


Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [22]:
"""
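For categorical or discrete values in inputs, we treat "NaN" as a category of its own. The "Alley" column
only takes two kinds of values, "Pave" and "NaN", so pandas can automatically convert it into two columns,
"Alley_Pave" and "Alley_nan". A row whose alley type is "Pave" gets Alley_Pave = 1 and Alley_nan = 0;
a row with a missing alley type gets Alley_Pave = 0 and Alley_nan = 1.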
"""
# One-hot encode the categorical columns, with NaN as its own category (dummy_na=True).
# The dtype is pinned to uint8 (the default before pandas 2.0) so the indicator
# columns stay numeric 0/1. The newer bool default would make `inputs.values` an
# object array, which torch.tensor cannot convert.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype='uint8')
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


In [38]:
"""
Drop the column that has the most missing values.
"""
inputs2 = data.iloc[:, 0:2]
# isna() marks every missing (NaN) cell with True.
print(inputs2.isna())
# Number of missing cells per column. Deliberately not named `sum`: that would
# shadow the builtin sum() for every cell executed afterwards in this kernel.
na_counts = inputs2.isna().sum()
# idxmax() returns the label of the first column holding the largest count.
sparsest_column = na_counts.idxmax()
print(na_counts, "---->", sparsest_column)
# drop() returns a new frame (shown as the cell output); inputs2 itself is unchanged.
inputs2.drop(columns=sparsest_column)

   NumRooms  Alley
0      True  False
1     False   True
2     False   True
3      True   True
NumRooms    2
Alley       3
dtype: int64 ----> Alley


Unnamed: 0,NumRooms
0,
1,2.0
2,4.0
3,


In [3]:
"""
Convert the preprocessed data to tensors.
"""
# Requires `inputs` and `outputs` from the preprocessing cells; running this cell
# on a fresh kernel before them raises NameError.
print(type(inputs), type(outputs))
# to_numpy(dtype=float) guarantees a float array even when the frame mixes dtypes
# (e.g. float features next to indicator columns); a plain `.values` can yield an
# object array in that case, which torch.tensor rejects.
x, y = torch.tensor(inputs.to_numpy(dtype=float)), torch.tensor(outputs.to_numpy())
x, y

NameError: name 'inputs' is not defined

In [5]:
import random

"""
Minibatch data iterator: yields one batch of data at a time.
"""
def data_iter(batch_size, features, labels):
    """Yield ``(features, labels)`` minibatches in a random sample order.

    The order is shuffled with ``random.shuffle`` when iteration starts. The
    final batch is shorter when ``len(features)`` is not a multiple of
    ``batch_size``.

    Args:
        batch_size: Number of examples per batch.
        features: Container indexable by an index tensor (e.g. a
            ``torch.Tensor``); its length sets the number of examples.
        labels: Indexed with the same index tensor as ``features``.

    Yields:
        ``(features[idx], labels[idx])`` for each batch of shuffled indices.
    """
    total = len(features)
    order = list(range(total))
    # Visit the samples in random order rather than in their stored order.
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # Slicing clamps at the end of the list, so the last batch may be short.
        picked = torch.tensor(order[start:start + batch_size])
        yield features[picked], labels[picked]

# Quick check: nine samples drawn in batches of three.
x, y = torch.arange(1, 10), torch.randn((9,))
for batch, label in data_iter(batch_size=3, features=x, labels=y):
    print(batch, label)

tensor([7, 8, 1]) tensor([-0.2986,  0.0565, -0.5831])
tensor([2, 9, 5]) tensor([-1.8615,  1.4220,  0.7200])
tensor([3, 6, 4]) tensor([ 0.0987, -1.5014, -1.6756])


In [12]:
from torch.utils import data
# Minibatch iteration using the utilities that ship with PyTorch.
# Note: `data` here is the torch.utils.data module imported above, which rebinds
# the name used for the DataFrame in the earlier preprocessing cells.
def load_array(data_arrays, batch_size, is_train=True):  #@save
    """Construct a PyTorch data iterator.

    Args:
        data_arrays: Sequence of tensors, unpacked into a ``TensorDataset``.
        batch_size: Batch size handed to the ``DataLoader``.
        is_train: Whether the loader should shuffle the data on every pass
            over it (forwarded as ``shuffle``).

    Returns:
        A ``DataLoader`` over the wrapped dataset.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(tensor_dataset, batch_size=batch_size, shuffle=is_train)
    return loader

batch_size = 3
# Peek at the first batch of a freshly built (shuffling) loader.
print(next(iter(load_array((x, y), batch_size))))
# One full pass in shuffled order.
for batch, label in load_array((x, y), batch_size, is_train=True):
    print(batch, label)
# One full pass in the stored order.
for batch, label in load_array((x, y), batch_size, is_train=False):
    print(batch, label)

[tensor([2, 5, 4]), tensor([-1.8615,  0.7200, -1.6756])]
tensor([5, 8, 4]) tensor([ 0.7200,  0.0565, -1.6756])
tensor([7, 2, 3]) tensor([-0.2986, -1.8615,  0.0987])
tensor([6, 9, 1]) tensor([-1.5014,  1.4220, -0.5831])
tensor([1, 2, 3]) tensor([-0.5831, -1.8615,  0.0987])
tensor([4, 5, 6]) tensor([-1.6756,  0.7200, -1.5014])
tensor([7, 8, 9]) tensor([-0.2986,  0.0565,  1.4220])
