# Create a raw dataset with more rows and columns.

In [89]:
import os

# Create the output directory (if needed) and derive the CSV path from it.
data_dir = os.path.join('../data/', 'data-for-2.2')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'raw_set.csv')

# Header row followed by seven houses; the literal 'NA' marks a missing entry.
csv_lines = [
    'NumRooms,Alley,Price,NorthFacing\n',
    '2,Pave,127500,Yes\n',
    '2,NA,106000,No\n',
    '4,NA,178100,Yes\n',
    '3,Pave,140000,NA\n',
    '5,NA,190000,No\n',
    '2,NA,110000,No\n',
    '3,NA,140000,NA\n',
]
with open(data_file, 'w') as f:
    f.writelines(csv_lines)
    

In [90]:
import pandas as pd
# Load the raw CSV written above; the 'NA' tokens are parsed as missing values (NaN).
df = pd.read_csv(filepath_or_buffer=data_file)
df

Unnamed: 0,NumRooms,Alley,Price,NorthFacing
0,2,Pave,127500,Yes
1,2,,106000,No
2,4,,178100,Yes
3,3,Pave,140000,
4,5,,190000,No
5,2,,110000,No
6,3,,140000,


# Q1. Delete the column with the most missing values.

In [91]:
# Count the missing entries in each column (sum the null mask down the rows).
df.isnull().sum(axis='index')

NumRooms       0
Alley          5
Price          0
NorthFacing    2
dtype: int64

The counts above show that `Alley` has the most missing values (5 of the 7 rows), so it is the column to delete.


In [92]:
# Q1: choose the column to delete from the data itself instead of hard-coding its name.
# idxmax() returns the label of the column with the highest missing-value count
# (the first such column if several are tied).
missing_counts = df.isnull().sum()
most_missing = missing_counts.idxmax()

# Rebind `df` to a new frame rather than mutating in place.
# NOTE: run this cell once per fresh load of `df`; running it again would remove
# the next most-incomplete column.
df = df.drop(columns=most_missing)
df

Unnamed: 0,NumRooms,Price,NorthFacing
0,2,127500,Yes
1,2,106000,No
2,4,178100,Yes
3,3,140000,
4,5,190000,No
5,2,110000,No
6,3,140000,


# Q2. Convert the preprocessed dataset to the tensor format.


In [93]:
# One-hot encode the string-valued columns; dummy_na=True gives missing values their
# own indicator column (e.g. NorthFacing_nan).
# dtype=int keeps the indicators as 0/1 integers. Newer pandas releases default to
# bool indicators, which mixed with the integer columns would make `inputs.values`
# an object array that torch.tensor() cannot convert.
df = pd.get_dummies(df, dummy_na=True, dtype=int)

# Features are every column except the target; the target is the Price column.
inputs = df.drop(columns='Price')
outputs = df['Price']

df

Unnamed: 0,NumRooms,Price,NorthFacing_No,NorthFacing_Yes,NorthFacing_nan
0,2,127500,0,1,0
1,2,106000,1,0,0
2,4,178100,0,1,0
3,3,140000,0,0,1
4,5,190000,1,0,0
5,2,110000,1,0,0
6,3,140000,0,0,1


In [94]:
import torch

# Pull the underlying NumPy arrays out of the feature frame and the target series,
# then wrap each one in a tensor.
feature_array = inputs.values
target_array = outputs.values

X = torch.tensor(feature_array)
y = torch.tensor(target_array)
X, y

(tensor([[2, 0, 1, 0],
         [2, 1, 0, 0],
         [4, 0, 1, 0],
         [3, 0, 0, 1],
         [5, 1, 0, 0],
         [2, 1, 0, 0],
         [3, 0, 0, 1]]),
 tensor([127500, 106000, 178100, 140000, 190000, 110000, 140000]))