Preprocess raw data with pandas and convert it into tensor format

In [7]:
import os

import pandas as pd
import torch

### Create dataset

In [9]:
file_name = '../data/house_tiny.csv'

# Create the target directory first: open(..., 'w') raises
# FileNotFoundError when '../data' does not exist yet.
os.makedirs(os.path.dirname(file_name), exist_ok=True)

# One entry per CSV line; 'NA' marks a missing value.
rows = [
    'NumRooms,Alley,Price',  # Column names
    'NA,Pave,127500',        # Each row represents a data example
    '2,NA,106000',
    '4,NA,178100',
    'NA,NA,140000',
]

with open(file_name, 'w') as f:
    f.write('\n'.join(rows) + '\n')

### Read dataset

In [10]:
# Load the CSV written by the previous cell. Reusing `file_name` keeps the
# path defined in one place instead of repeating the literal.
data = pd.read_csv(file_name)
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


### Handle missing data

In [12]:
# Per column: does it contain at least one missing value?
data.isna().any()


NumRooms     True
Alley        True
Price       False
dtype: bool

In [19]:
# Split the frame into features (NumRooms, Alley) and the target (Price).
# NOTE(review): `input` shadows Python's built-in input(); the name is kept
# because the following cells refer to it.
input = data[['NumRooms', 'Alley']]
output = data['Price']
input

Unnamed: 0,NumRooms,Alley
0,,Pave
1,2.0,
2,4.0,
3,,


In [20]:
# Fill missing numeric entries with the mean of their column.
# numeric_only=True skips the string-valued 'Alley' column; without it,
# pandas >= 2.0 raises a TypeError when it tries to average strings.
input = input.fillna(input.mean(numeric_only=True))
input

Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


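For categorical or discrete values in the inputs, treat NaN as a category of its own. Because `Alley` only takes the two values `Pave` and NaN, pandas can automatically convert it into two indicator columns, `Alley_Pave` and `Alley_nan`.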

In [22]:
# One-hot encode the categorical 'Alley' column with pd.get_dummies.
# dummy_na=True adds an indicator column for NaN; with False, NaNs are ignored.
# dtype='uint8' pins the indicator columns to 0/1 integers. Newer pandas
# defaults to bool, which turns input.values into an object array that
# torch.from_numpy cannot convert.
input = pd.get_dummies(input, dummy_na=True, dtype='uint8')
input

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,3.0,1,0
1,2.0,0,1
2,4.0,0,1
3,3.0,0,1


### Convert to tensor format

In [25]:
# NumPy view of the encoded feature table.
input.to_numpy()

array([[3., 1., 0.],
       [2., 0., 1.],
       [4., 0., 1.],
       [3., 0., 1.]])

In [27]:
# Convert the features and labels into tensors.
# to_numpy(dtype=float) forces a float64 array even when the one-hot columns
# are bool (the pandas >= 2.0 default); in that case input.values would be
# an object array, which torch.from_numpy rejects.
in_t = torch.from_numpy(input.to_numpy(dtype=float))
out_t = torch.from_numpy(output.to_numpy())

print(output)
print(input)

0    127500
1    106000
2    178100
3    140000
Name: Price, dtype: int64
   NumRooms  Alley_Pave  Alley_nan
0       3.0           1          0
1       2.0           0          1
2       4.0           0          1
3       3.0           0          1


In [28]:
# Row 0 of the feature tensor: every feature of the first example.
in_t[0]

tensor([3., 1., 0.], dtype=torch.float64)

In [29]:
# Column 0 of the feature tensor (NumRooms) across all examples.
in_t[:,0]

tensor([3., 2., 4., 3.], dtype=torch.float64)

## Exercise

In [32]:
# Reload the raw CSV so the exercise starts from a fresh frame.
# Reusing `file_name` keeps the path defined in one place.
data = pd.read_csv(file_name)
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


In [34]:
# Largest number of missing values found in any single column.
data.isna().sum().max()

3

In [42]:
# Number of missing values per column, indexed by column name.
nan_num = data.isna().sum()
nan_num

NumRooms    2
Alley       3
Price       0
dtype: int64

In [44]:
# Look up the missing-value count of a single column by its label.
nan_num.loc['NumRooms']

2

In [48]:
# Label (column name) of the entry with the most missing values.
nan_num.idxmax()

'Alley'

In [49]:
# Integer position (not label) of the entry with the most missing values.
nan_num.argmax()

1

In [50]:
# Show the frame before the column with the most missing values is removed.
data

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


In [52]:
# Drop the column with the most missing values.
# Rebinding `data` to the result of drop() (instead of `del data[...]`)
# avoids mutating the frame in place, and errors='ignore' keeps the cell
# re-runnable once the column is already gone (`del` would raise KeyError).
data = data.drop(columns=nan_num.idxmax(), errors='ignore')

In [53]:
# Show the frame after the column with the most missing values was removed.
data

Unnamed: 0,NumRooms,Price
0,,127500
1,2.0,106000
2,4.0,178100
3,,140000


In [54]:
# Replace each remaining NaN with the mean of its column.
column_means = data.mean()
data = data.fillna(column_means)

In [55]:
# Show the frame after missing values were filled with column means.
data

Unnamed: 0,NumRooms,Price
0,3.0,127500
1,2.0,106000
2,4.0,178100
3,3.0,140000


In [56]:
# Turn the cleaned, all-numeric frame into a tensor via its NumPy array.
tensor = torch.from_numpy(data.to_numpy())

In [57]:
# Display the resulting tensor (columns: NumRooms, Price).
tensor

tensor([[3.0000e+00, 1.2750e+05],
        [2.0000e+00, 1.0600e+05],
        [4.0000e+00, 1.7810e+05],
        [3.0000e+00, 1.4000e+05]], dtype=torch.float64)