# Data Preprocessing

Create a small CSV file on disk, then load it back with pandas.

In [1]:
import os

import pandas as pd

# Make sure the ../data directory exists before writing into it.
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')

# Header row followed by four samples; "NA" marks a missing value.
csv_rows = [
    'NumRooms,RoofType,Price',
    'NA,NA,127500',
    '2,NA,106000',
    '4,Slate,178100',
    'NA,NA,140000',
]
with open(data_file, 'w') as f:
    # Rows are joined without a trailing newline after the last sample.
    f.write('\n'.join(csv_rows))

# Read the file back; pandas parses the "NA" fields as NaN (see printed output).
data = pd.read_csv(data_file)
print(data)

   NumRooms RoofType   Price
0       NaN      NaN  127500
1       2.0      NaN  106000
2       4.0    Slate  178100
3       NaN      NaN  140000


For categorical input fields,
we can treat `NaN` as a category of its own.

In [2]:
# Split the frame: the first two columns are the input features,
# the third column (Price) is the prediction target.
inputs, targets = data.iloc[:, 0:2], data.iloc[:, 2]

# One-hot encode the categorical column. dummy_na=True adds an explicit
# indicator column for missing values, so NaN becomes its own category.
# dtype=int forces 0/1 integer indicators: pandas >= 2.0 returns bool
# columns by default, and a frame mixing float and bool columns yields an
# object-dtype array from `.values`, which torch.tensor cannot convert.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
print(inputs)

   NumRooms  RoofType_Slate  RoofType_nan
0       NaN               0             1
1       2.0               0             1
2       4.0               1             0
3       NaN               0             1


Replace the remaining `NaN` entries (in the numerical column) with
the mean value of the corresponding column.

In [3]:
# Compute the per-column means first (NaN entries are skipped by default),
# then use them to fill each column's missing entries.
column_means = inputs.mean()
inputs = inputs.fillna(column_means)
print(inputs)

   NumRooms  RoofType_Slate  RoofType_nan
0       3.0               0             1
1       2.0               0             1
2       4.0               1             0
3       3.0               0             1


Now that all the entries in `inputs` and `targets` are numerical,
we can load them into tensors.

In [4]:
import torch

# Convert the features through an explicit float array. Plain `.values` on a
# frame that mixes float and bool columns (the default get_dummies output in
# pandas >= 2.0) is an object-dtype array, which torch.tensor rejects with a
# TypeError. to_numpy(dtype=float) always yields a float64 array, so X is a
# float64 tensor regardless of the indicator columns' dtype.
X = torch.tensor(inputs.to_numpy(dtype=float))
# The targets are a single integer column, so no dtype coercion is needed.
y = torch.tensor(targets.to_numpy())
X, y

(tensor([[3., 0., 1.],
         [2., 0., 1.],
         [4., 1., 0.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

## 练习
创建包含更多行和列的原始数据集。

删除缺失值最多的列。

将预处理后的数据集转换为张量格式。

In [6]:
import pandas as pd

# Exercise dataset: same three columns as before, but with more rows.
# "NA" marks a missing value.
csv_rows = [
    'NumRooms,RoofType,Price',
    'NA,NA,127500',
    'NA,NA,12750',
    'NA,NA,12750',
    'NA,NA,1275',
    'NA,NA,127',
    '2,NA,106000',
    '4,Slate,178100',
    'NA,NA,140000',
]

# Note: this overwrites the CSV at `data_file` and rebinds `data`.
with open(data_file, 'w') as f:
    # Rows are joined without a trailing newline after the last sample.
    f.write('\n'.join(csv_rows))

data = pd.read_csv(data_file)
print(data)

   NumRooms RoofType   Price
0       NaN      NaN  127500
1       NaN      NaN   12750
2       NaN      NaN   12750
3       NaN      NaN    1275
4       NaN      NaN     127
5       2.0      NaN  106000
6       4.0    Slate  178100
7       NaN      NaN  140000


In [21]:
# Count the missing values in every column and pick the column with the most
# (ties resolve to the first such column).
missing_per_column = data.isna().sum()
most_missing_column = missing_per_column.idxmax()

# Drop that column, then replace any remaining missing values with 0.
data_dropna = data.drop(columns=most_missing_column).fillna(0)

In [30]:
# Convert the preprocessed frame to a float64 tensor; the bare last
# expression is displayed as the cell output.
dropna_array = data_dropna.values
torch.tensor(dropna_array, dtype=torch.float64)

tensor([[0.0000e+00, 1.2750e+05],
        [0.0000e+00, 1.2750e+04],
        [0.0000e+00, 1.2750e+04],
        [0.0000e+00, 1.2750e+03],
        [0.0000e+00, 1.2700e+02],
        [2.0000e+00, 1.0600e+05],
        [4.0000e+00, 1.7810e+05],
        [0.0000e+00, 1.4000e+05]], dtype=torch.float64)