In [1]:
import os

# Build a tiny housing dataset by hand and save it as CSV under ../data.
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'house_tiny.csv')

csv_lines = [
    'NumRooms,Alley,Price',  # header line: column names
    'NA,Pava,127500',        # one sample per line
    '2,NA,106000',
    '4,NA,178100',
    'NA,NA,140000',
]
with open(data_file, 'w') as f:
    # Every line, including the last one, is terminated by a newline.
    f.write(''.join(line + '\n' for line in csv_lines))

In [2]:
import pandas as pd

# Parse the CSV into a DataFrame and show it; pandas reads the 'NA' fields as NaN.
data = pd.read_csv(filepath_or_buffer=data_file)
print(data)

   NumRooms Alley   Price
0       NaN  Pava  127500
1       2.0   NaN  106000
2       4.0   NaN  178100
3       NaN   NaN  140000


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
# Split the columns: the first two are the inputs, the last one is the output.
# NOTE: the output keeps the (misspelled) name `outpus` because the tensor
# conversion cell further down reads it under that name.
inputs = data.iloc[:, :2]
outpus = data.iloc[:, 2]

# Replace NaNs with the column average. numeric_only=True is required here:
# a plain inputs.mean() raises
#   TypeError: can only concatenate str (not "int") to str
# so the non-numeric column has to be left out of the mean.
column_means = inputs.mean(numeric_only=True)
inputs = inputs.fillna(column_means)
print(inputs)

   NumRooms Alley
0       3.0  Pava
1       2.0   NaN
2       4.0   NaN
3       3.0   NaN


In [7]:
# One-hot encode the non-numeric column; dummy_na=True adds a separate
# indicator column for missing values.
inputs = pd.get_dummies(data=inputs, dummy_na=True)
print(inputs)

   NumRooms  Alley_Pava  Alley_nan
0       3.0        True      False
1       2.0       False       True
2       4.0       False       True
3       3.0       False       True


In [8]:
import torch

# Convert the preprocessed inputs and the outputs to float arrays first
# (this also turns the boolean indicator columns into 0./1.), then wrap
# them as tensors.
features_np = inputs.to_numpy(dtype=float)
targets_np = outpus.to_numpy(dtype=float)
X = torch.tensor(features_np)
y = torch.tensor(targets_np)
X, y

(tensor([[3., 1., 0.],
         [2., 0., 1.],
         [4., 0., 1.],
         [3., 0., 1.]], dtype=torch.float64),
 tensor([127500., 106000., 178100., 140000.], dtype=torch.float64))