# Data Processing

### Reading the Dataset

In [1]:
import os

def mkdir_if_not_exist(path):
    """Create the directory ``path`` (and any missing parents) if it is absent.

    Args:
        path: Either a single path (``str`` or ``os.PathLike``) or an iterable
            of path components, which are combined with ``os.path.join``.

    A ``path`` that already exists is left untouched.
    """
    if not isinstance(path, (str, os.PathLike)):
        # A sequence of components, e.g. ('..', 'data') -> '../data'.
        path = os.path.join(*path)
    if not os.path.exists(path):
        # exist_ok=True closes the window between the check above and the
        # creation: a directory created concurrently no longer raises.
        os.makedirs(path, exist_ok=True)

In [2]:
data_file = '../data/house_tiny.csv'
mkdir_if_not_exist('../data')

# One string per CSV line: a header followed by four records.
# The literal 'NA' marks a missing entry.
csv_rows = [
    'NumRooms,Alley,Price\n',
    'NA,Pave,127500\n',
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,14000\n',
]

with open(data_file, 'w') as csv_file:
    csv_file.writelines(csv_rows)

In [3]:
import pandas as pd

# Load the CSV written above into a DataFrame. The 'NA' fields come back as
# missing values (NaN), as the printed table shows.
data = pd.read_csv(data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pave  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN   14000


### Handling Missing Data

In [4]:
# Split by column position: the first two columns (NumRooms, Alley) are the
# features, the third column (Price) is the target.
inputs = data.iloc[:, :2]
outputs = data.iloc[:, 2]

In [5]:
# Feature columns (NumRooms, Alley); missing values are still present.
inputs

Unnamed: 0,NumRooms,Alley
0,,Pave
1,2.0,
2,4.0,
3,,


In [6]:
# Target column (Price).
outputs

0    127500
1    106000
2    178100
3     14000
Name: Price, dtype: int64

In [7]:
# Impute missing numeric entries with their column mean.
# numeric_only=True restricts the mean to numeric columns: without it,
# pandas >= 2.0 raises TypeError when it tries to average the string column
# 'Alley' (older versions silently dropped it, with a FutureWarning in 1.x).
inputs = inputs.fillna(inputs.mean(numeric_only=True))

In [8]:
# NumRooms gaps are now filled with the column mean (3.0); the non-numeric
# Alley column still has its missing entries.
inputs

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [9]:
# One-hot encode the categorical column; dummy_na=True gives missing values
# their own indicator column (Alley_nan).
# dtype=int keeps the indicators numeric 0/1: pandas >= 2.0 would otherwise
# emit bool columns, which turns `inputs.values` into an object array that
# cannot be converted to a tensor.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)

In [10]:
# Alley is now represented by the indicator columns Alley_Pave and Alley_nan.
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


### Conversion to the Tensor Format

In [11]:
import torch

In [13]:
# Features as a NumPy array. An explicit float dtype yields a homogeneous
# numeric array even if the indicator columns are bool (the pandas >= 2.0
# default), where `.values` would fall back to an object array.
inputs.to_numpy(dtype=float)

array([[3., 1., 0.],
       [2., 0., 1.],
       [4., 0., 1.],
       [3., 0., 1.]])

In [14]:
# Targets as a NumPy array; `to_numpy()` is the accessor pandas recommends
# over `.values`.
outputs.to_numpy()

array([127500, 106000, 178100,  14000])

In [17]:
# Build tensors from the processed frames. The features are converted with an
# explicit float dtype: with bool indicator columns (the pandas >= 2.0
# default) `inputs.values` is an object array, which torch.tensor rejects.
# The resulting X is float64 and y keeps the integer dtype of Price.
X = torch.tensor(inputs.to_numpy(dtype=float))
y = torch.tensor(outputs.to_numpy())
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500, 106000, 178100,  14000]))