In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import os
from matplotlib import pyplot as plt
import time

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [2]:
def get_file_names(folder):
    #Listing entries present in given folder
    entries = os.listdir(folder)
    for i in entries:
        if 'csv' not in i:
            entries.remove(i)
    return sorted(entries, reverse=True)[0:12]

train_path = "../data/train/"
test_path = "../data/test/"
val_path = "../data/validation/"

train_files = get_file_names(train_path)
test_files = get_file_names(test_path)
val_files = get_file_names(val_path)

frames = []
for i in train_files:
    frames.append(pd.read_csv(train_path+i))
train_df = pd.concat(frames)
train_df = train_df.reset_index(drop=True)

frames = []
for i in test_files:
    frames.append(pd.read_csv(test_path+i))
test_df = pd.concat(frames)
test_df = test_df.reset_index(drop=True)

frames = []
for i in val_files:
    frames.append(pd.read_csv(val_path+i))
val_df = pd.concat(frames)
val_df = val_df.reset_index(drop=True)

In [3]:
class airbnb_dataset (Dataset):
    def __init__(self, df, purpose):
        self.df = df
        self.price = self.df["price"]
        self.df = self.df.drop(columns="price")
        self.purpose = purpose
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        # Returns specific sample as a dict
        if torch.is_tensor(index):
            index = index.tolist()
        sample = torch.tensor(self.df.iloc[index]).float()
        label = torch.tensor([self.price.iloc[index]]).float()
        return {"sample": sample, "label": label}

In [9]:
train_ds = airbnb_dataset(train_df, "train")
test_ds = airbnb_dataset(test_df, "test")
val_ds = airbnb_dataset(val_df, "test")

train_loader = DataLoader(train_ds, batch_size=1000, num_workers=4)
test_loader = DataLoader(test_ds, batch_size=1000, num_workers=4)
val_loader = DataLoader(val_ds, batch_size=1000, num_workers=4)

In [10]:
class airbnb_net (nn.Module):
    
    def __init__(self):
        super(airbnb_net, self).__init__()
        self.layer1 = nn.Linear(29, 100)
        self.layer2 = nn.Linear(100, 20)
        self.layer3 = nn.Linear(20, 1)
    
    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        return self.layer3(x)

In [26]:
def train(net, train_loader, val_loader):
    criterion = nn.MSELoss()
    optimizer = optim.SGD(net.parameters(), lr=0.000001, momentum = 0.9)
    
    print("Starting Training...")
    start_time = time.time()
    
    for epoch in range(200):
        epoch_loss = 0
        epoch_time = time.time()
        batch_loss = 0
        batch_time = time.time()
        for i, data in enumerate(train_loader):
            sample = data["sample"]
            label = data["label"]
            output = net(sample)
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            batch_loss += loss
            epoch_loss += loss
            break
            
            if i % 10 == 0:
                batch_loss = batch_loss/10
                print("---[ITER %d] loss: %.3f  time: %.3f" % (i, batch_loss, time.time()-batch_time))
                batch_loss = 0
                batch_time = time.time()
        epoch_loss /= len(train_loader)
        
        print("[EPOCH %d] loss: %.3f  time: %.3f" % (epoch+1, epoch_loss, time.time()-epoch_time))
    
    print("=========================================")
    print("Training Completed...")
    print("[FINAL] loss: %.3f  time: %.3f" % (epoch_loss, time.time()-start_time()))
            

In [27]:
net = airbnb_net()

In [28]:
train(net, train_loader, val_loader)

Starting Training...
[EPOCH 1] loss: 251.529  time: 3.160
[EPOCH 2] loss: 251.458  time: 3.846
[EPOCH 3] loss: 251.324  time: 2.911
[EPOCH 4] loss: 251.136  time: 3.433
[EPOCH 5] loss: 250.905  time: 3.160
[EPOCH 6] loss: 250.643  time: 2.979
[EPOCH 7] loss: 250.359  time: 2.922
[EPOCH 8] loss: 250.061  time: 3.503
[EPOCH 9] loss: 249.756  time: 3.191
[EPOCH 10] loss: 249.462  time: 4.277
[EPOCH 11] loss: 249.202  time: 2.958
[EPOCH 12] loss: 248.940  time: 2.718
[EPOCH 13] loss: 248.664  time: 3.093
[EPOCH 14] loss: 248.371  time: 2.814
[EPOCH 15] loss: 248.058  time: 3.454
[EPOCH 16] loss: 247.722  time: 3.636
[EPOCH 17] loss: 247.362  time: 3.447
[EPOCH 18] loss: 246.974  time: 2.958
[EPOCH 19] loss: 246.557  time: 2.833
[EPOCH 20] loss: 246.107  time: 3.153
[EPOCH 21] loss: 245.621  time: 3.633
[EPOCH 22] loss: 245.095  time: 3.242
[EPOCH 23] loss: 244.526  time: 3.073
[EPOCH 24] loss: 243.909  time: 3.181
[EPOCH 25] loss: 243.239  time: 3.103
[EPOCH 26] loss: 242.509  time: 2.846


NameError: name 'start' is not defined

In [29]:
train_ds[0]

{'sample': tensor([ 1.,  1.,  1.,  1.,  1., 22.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,
          1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.]), 'label': tensor([40.])}

In [47]:
accum = 0
accum1 = 0
for i in range(1000):
    error = net(train_ds[i]["sample"]).item() - train_ds[i]["label"].item()
    accum += error
    accum1 += abs(error)
print("Error:", accum/1000)
print("Abs Error:", accum1/1000)

Error: -1.1141770362854004
Abs Error: 52.260437908172605


In [42]:
net(train_ds[4]["sample"]).item()

181.83848571777344

In [43]:
train_ds[4]["label"].item()

135.0