In [7]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset

In [2]:
# load data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
train.info()
# fully existent attributes: id, title, city, postalCode, latitude, longitude, areaSqm, firstSeenAt, lastSeenAt, rawAvailability, propertyType, coverImageUrl, rent
# not full: isRoomActive (63 missing), postedAgo (6 missing), descriptionNonTranslated (111), descriptionTranslated (10.140), rentDetail (7896), furnish (214), energyLabel (63), gender (536), internet (63), roommates (536), shower (63), toilet (63), kitchen (63), living (63), pets (63), smokingInside (63), matchAge (63), matchGender (63), matchCapacity (63), matchLanguages (63), matchStatus (63)
# omitted_columns = ["id", "title", "city", "postalCode", "latitude", "longitude", "firstSeenAt", "lastSeenAt", "isRoomActive", "rawAvailability", "postedAgo", "descriptionNonTranslated", "descriptionTranslated", ""]...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27915 entries, 0 to 27914
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        27915 non-null  int64  
 1   title                     27915 non-null  object 
 2   city                      27915 non-null  object 
 3   postalCode                27915 non-null  object 
 4   latitude                  27915 non-null  float64
 5   longitude                 27915 non-null  float64
 6   areaSqm                   27915 non-null  int64  
 7   firstSeenAt               27915 non-null  object 
 8   lastSeenAt                27915 non-null  object 
 9   isRoomActive              27852 non-null  object 
 10  rawAvailability           27915 non-null  object 
 11  postedAgo                 27909 non-null  object 
 12  descriptionNonTranslated  27804 non-null  object 
 13  descriptionTranslated     17775 non-null  object 
 14  rentDe

In [4]:
train.head()

Unnamed: 0,id,title,city,postalCode,latitude,longitude,areaSqm,firstSeenAt,lastSeenAt,isRoomActive,...,living,pets,smokingInside,matchAge,matchGender,matchCapacity,matchLanguages,matchStatus,coverImageUrl,rent
0,0,West-Varkenoordseweg,Rotterdam,3074HN,51.896601,4.514993,14,2019-07-14 11:25:46.511000+00:00,2019-07-26 22:18:23.142000+00:00,True,...,,No,No,16 years - 99 years,Not important,1 person,Not important,Not important,https://resources.kamernet.nl/image/913b4b03-5...,500
1,3,Ruiterakker,Assen,9407BG,53.013494,6.561012,16,2019-07-14 11:25:46.988000+00:00,2019-07-18 22:00:31.174000+00:00,False,...,,No,Yes,18 years - 32 years,Female,1 person,Not important,"Student, Working student",https://resources.kamernet.nl/image/84e95365-6...,290
2,8,Brusselseweg,Maastricht,6217GX,50.860841,5.671673,16,2019-07-14 11:25:47.814000+00:00,2019-08-10 00:14:27.130000+00:00,True,...,,No,No,16 years - 40 years,Male,4 persons,Dutch English,Student,https://resources.kamernet.nl/image/6e625591-d...,425
3,10,Donkerslootstraat,Rotterdam,3074WL,51.893195,4.516478,25,2019-07-14 11:25:48.140000+00:00,2019-07-16 06:05:32.183000+00:00,False,...,,No,No,21 years - 99 years,Not important,4 persons,Dutch English Spanish French Italian German Po...,"Student, Working student, Working, Looking for...",https://resources.kamernet.nl/image/ea3aea77-0...,600
4,12,Vorselenburgstraat,Alphen aan den Rijn,2405XJ,52.122335,4.661434,10,2019-07-14 11:25:48.465000+00:00,2019-08-01 00:02:40.516000+00:00,True,...,,No,Yes,22 years - 40 years,Not important,1 person,Dutch English,"Student, Working student, Working",https://resources.kamernet.nl/image/d0780298-b...,425


In [5]:
train["rentDetail"].unique()
# this probably means NaN means utilities excl.

array([nan, 'Utilities incl.'], dtype=object)

In [6]:
train["propertyType"].unique()

array(['Room', 'Studio', 'Apartment', 'Anti-squat', 'Student residence'],
      dtype=object)

In [7]:
train["furnish"].unique()

array(['Unfurnished', 'Furnished', 'Uncarpeted', nan], dtype=object)

In [8]:
train["matchAge"].unique()

array(['16 years - 99 years', '18 years - 32 years',
       '16 years - 40 years', '21 years - 99 years',
       '22 years - 40 years', '18 years - 30 years',
       '16 years - 30 years', '18 years - 40 years',
       '16 years - 22 years', '16 years - 37 years',
       '16 years - 25 years', '24 years - 35 years',
       '16 years - 21 years', '18 years - 35 years',
       '25 years - 60 years', '16 years - 49 years',
       '26 years - 60 years', '19 years - 25 years',
       '16 years - 24 years', '17 years - 30 years', nan,
       '18 years - 28 years', '22 years - 99 years',
       '24 years - 99 years', '18 years - 29 years',
       '20 years - 40 years', '17 years - 22 years',
       '18 years - 27 years', 'Not important - Not important',
       '16 years - 28 years', '17 years - 35 years',
       '16 years - 35 years', '20 years - 46 years',
       '25 years - 99 years', '18 years - 36 years',
       '30 years - 99 years', '16 years - 50 years',
       '17 years - 20 years', '

In [9]:
# great features from informational value: areaSqm, rentDetail, propertyType, furnish, internet, roommates, shower, toilet, kitchen, living, pets, smokingInside, matchCapacity, matchStatus
# not many values filled in for rentDetail, but it is still should be useful
# good features: energyLabel, gender, matchGender, matchLanguages
# decent features: city, postalCode, latitude, longitude, firstSeenAt, lastSeenAt, isRoomActive, rawAvailability, postedAgo, matchAge
# to predict: rent

In [10]:
from sklearn.preprocessing import LabelBinarizer

df = pd.read_csv("train.csv")
lb = LabelBinarizer(sparse_output=True)

df = df.join(pd.DataFrame.sparse.from_spmatrix(lb.fit_transform(df.pop('propertyType')), index=df.index, columns=lb.classes_))

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27915 entries, 0 to 27914
Data columns (total 38 columns):
 #   Column                    Non-Null Count  Dtype           
---  ------                    --------------  -----           
 0   id                        27915 non-null  int64           
 1   title                     27915 non-null  object          
 2   city                      27915 non-null  object          
 3   postalCode                27915 non-null  object          
 4   latitude                  27915 non-null  float64         
 5   longitude                 27915 non-null  float64         
 6   areaSqm                   27915 non-null  int64           
 7   firstSeenAt               27915 non-null  object          
 8   lastSeenAt                27915 non-null  object          
 9   isRoomActive              27852 non-null  object          
 10  rawAvailability           27915 non-null  object          
 11  postedAgo                 27909 non-null  object      

In [12]:
df.head()

Unnamed: 0,id,title,city,postalCode,latitude,longitude,areaSqm,firstSeenAt,lastSeenAt,isRoomActive,...,matchCapacity,matchLanguages,matchStatus,coverImageUrl,rent,Anti-squat,Apartment,Room,Student residence,Studio
0,0,West-Varkenoordseweg,Rotterdam,3074HN,51.896601,4.514993,14,2019-07-14 11:25:46.511000+00:00,2019-07-26 22:18:23.142000+00:00,True,...,1 person,Not important,Not important,https://resources.kamernet.nl/image/913b4b03-5...,500,0,0,1,0,0
1,3,Ruiterakker,Assen,9407BG,53.013494,6.561012,16,2019-07-14 11:25:46.988000+00:00,2019-07-18 22:00:31.174000+00:00,False,...,1 person,Not important,"Student, Working student",https://resources.kamernet.nl/image/84e95365-6...,290,0,0,1,0,0
2,8,Brusselseweg,Maastricht,6217GX,50.860841,5.671673,16,2019-07-14 11:25:47.814000+00:00,2019-08-10 00:14:27.130000+00:00,True,...,4 persons,Dutch English,Student,https://resources.kamernet.nl/image/6e625591-d...,425,0,0,1,0,0
3,10,Donkerslootstraat,Rotterdam,3074WL,51.893195,4.516478,25,2019-07-14 11:25:48.140000+00:00,2019-07-16 06:05:32.183000+00:00,False,...,4 persons,Dutch English Spanish French Italian German Po...,"Student, Working student, Working, Looking for...",https://resources.kamernet.nl/image/ea3aea77-0...,600,0,0,1,0,0
4,12,Vorselenburgstraat,Alphen aan den Rijn,2405XJ,52.122335,4.661434,10,2019-07-14 11:25:48.465000+00:00,2019-08-01 00:02:40.516000+00:00,True,...,1 person,Dutch English,"Student, Working student, Working",https://resources.kamernet.nl/image/d0780298-b...,425,0,0,1,0,0


In [4]:
class FCGDataset(Dataset):
    def __init__(self, file_dir):
        data = pd.read_csv(file_dir)
        used_columns = ["areaSqm", "rentDetail", "propertyType", "furnish", "internet", "roommates", "shower", "toilet", "kitchen", "living", "pets", "smokingInside", "matchCapacity", "rent"]
        one_hot_columns = ["rentDetail", "propertyType", "furnish", "internet", "roommates", "shower", "toilet", "kitchen", "living", "pets", "smokingInside", "matchCapacity"]
        data = data[used_columns]
        ohe_df = pd.get_dummies(data[one_hot_columns], drop_first=True)
        num_df = data.drop(one_hot_columns, axis=1)
        data = pd.concat([ohe_df, num_df], axis=1)
        self.features = data.drop("rent", axis=1).values
        self.target = torch.tensor(data["rent"].values)

    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        return self.features[idx], self.target[idx]

In [5]:
# neural network class
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        # First fully connected layer
        # input features: areaSqm (1) + rentDetail (2) + propertyType (5) + furnish (4) + internet (3) + roommates (11) + shower (4) + toilet (4) + kitchen (4) + living (4) + pets (4) + smokingInside (4) + matchCapacity (8) (+ matchStatus (4)) = 58
        self.fc1 = nn.Linear(41, 512)
        # self.fc2 = nn.Linear(512, 512)
        # self.dropout1 = nn.Dropout()
        self.fc3 = nn.Linear(512, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)

        # x = self.fc2(x)
        # x = F.relu(x)

        # x = self.dropout1(x)

        x = self.fc3(x)
        x = F.relu(x)

        # x = self.dropout2(x)
        x = self.fc4(x)

        output = x
        return output

In [6]:
def train_loop(dataset, model, loss_fn, optimizer, batch_size):
    size = len(dataset)
    for batch in range(size // batch_size):
        X, y = dataset[batch * batch_size : (batch + 1) * batch_size]
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [7]:
learning_rate = 1e-3
batch_size = 2
epochs = 2
loss_fn = torch.nn.MSELoss(reduction='sum')
net = Net()
optimizer = optim.AdamW(net.parameters(), lr=learning_rate)

In [1]:
train_data = FCGDataset("train.csv")
print("Done!")
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_data, net, loss_fn, optimizer, batch_size)
print("Done!")

NameError: name 'FCGDataset' is not defined

In [5]:
import pandas as pd
import torch
import numpy as np
file_dir = "train.csv"
data = pd.read_csv(file_dir)
used_columns = ["areaSqm", "rentDetail", "propertyType", "furnish", "internet", "roommates", "shower", "toilet", "kitchen", "living", "pets", "smokingInside", "matchCapacity", "rent"]
one_hot_columns = ["rentDetail", "propertyType", "furnish", "internet", "roommates", "shower", "toilet", "kitchen", "living", "pets", "smokingInside", "matchCapacity"]
data = data[used_columns]
ohe_df = pd.get_dummies(data[one_hot_columns], drop_first=True)
num_df = data.drop(one_hot_columns, axis=1)
data = pd.concat([ohe_df, num_df], axis=1)

In [9]:
dropped = data.drop("rent", axis=1)
values = dropped.values
values

array([[ 0,  1,  0, ...,  0,  0, 14],
       [ 0,  1,  0, ...,  0,  0, 16],
       [ 0,  1,  0, ...,  0,  0, 16],
       ...,
       [ 0,  1,  0, ...,  1,  0, 28],
       [ 0,  1,  0, ...,  0,  0, 35],
       [ 0,  0,  0, ...,  0,  0, 25]])

In [12]:
# features = torch.tensor(data.drop("rent", axis=1).values)

In [10]:
targets = torch.tensor(data["rent"].values)

In [1]:
targets

NameError: name 'targets' is not defined

In [6]:
torch.from_numpy(values)

tensor([[ 0,  1,  0,  ...,  0,  0, 14],
        [ 0,  1,  0,  ...,  0,  0, 16],
        [ 0,  1,  0,  ...,  0,  0, 16],
        ...,
        [ 0,  1,  0,  ...,  1,  0, 28],
        [ 0,  1,  0,  ...,  0,  0, 35],
        [ 0,  0,  0,  ...,  0,  0, 25]])

In [4]:
type(values)

numpy.ndarray

In [12]:
values_first_40 = np.interp(values[:40], (0, 1), (0.0001, 0.9999))
values_last = np.interp(values[40], (np.min(values[40]), np.max(values[40])), (0.0001, 0.9999))

In [12]:
value_tensor = torch.Tensor(values)

In [13]:
values_first_40

array([[1.000e-04, 9.999e-01, 1.000e-04, ..., 1.000e-04, 1.000e-04,
        9.999e-01],
       [1.000e-04, 9.999e-01, 1.000e-04, ..., 1.000e-04, 1.000e-04,
        9.999e-01],
       [1.000e-04, 9.999e-01, 1.000e-04, ..., 1.000e-04, 1.000e-04,
        9.999e-01],
       ...,
       [1.000e-04, 9.999e-01, 1.000e-04, ..., 1.000e-04, 1.000e-04,
        9.999e-01],
       [1.000e-04, 9.999e-01, 1.000e-04, ..., 1.000e-04, 1.000e-04,
        9.999e-01],
       [1.000e-04, 9.999e-01, 1.000e-04, ..., 1.000e-04, 1.000e-04,
        9.999e-01]])

In [14]:
values_last

array([1.34306667e-02, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
       1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.34306667e-02,
       1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
       1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
       1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.34306667e-02,
       1.00000000e-04, 1.34306667e-02, 1.00000000e-04, 1.00000000e-04,
       1.00000000e-04, 1.34306667e-02, 1.00000000e-04, 1.34306667e-02,
       1.00000000e-04, 1.00000000e-04, 1.34306667e-02, 1.00000000e-04,
       1.00000000e-04, 1.34306667e-02, 1.34306667e-02, 1.00000000e-04,
       1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
       9.99900000e-01])