In [2]:
from lib.utils import get_train_split_data, load_all_resale_data, get_cleaned_normalized_data
from lib.eval import get_regression_metrics

In [3]:
# Load the resale CSV files into a feature frame X and a target series y.
X, y = load_all_resale_data()

# Project helper from lib.utils; presumably cleans and scales the features
# (its implementation is not visible in this notebook).
X, y = get_cleaned_normalized_data(X, y)

# Unpack the splits in the order get_train_split_data returns them.
# NOTE(review): the training tensor built further down has 189,792 rows, roughly 20% of the
# 948,962 rows loaded above. Either 0.2 is the *training* fraction or the unpacking
# order here is swapped with the test split -- confirm against lib.utils.
X_train, X_test, y_train, y_test = get_train_split_data(X, y, 0.2)

Loading data from c:\users\wzaww\projects\nus-cs3244-assignment1\scripts\lib\../data\Resale Flat Prices (Based on Approval Date), 1990 - 1999.csv...
Loading data from c:\users\wzaww\projects\nus-cs3244-assignment1\scripts\lib\../data\Resale Flat Prices (Based on Approval Date), 2000 - Feb 2012.csv...
Loading data from c:\users\wzaww\projects\nus-cs3244-assignment1\scripts\lib\../data\Resale Flat Prices (Based on Registration Date), From Mar 2012 to Dec 2014.csv...
Loading data from c:\users\wzaww\projects\nus-cs3244-assignment1\scripts\lib\../data\Resale Flat Prices (Based on Registration Date), From Jan 2015 to Dec 2016.csv...
Loading data from c:\users\wzaww\projects\nus-cs3244-assignment1\scripts\lib\../data\Resale flat prices based on registration date from Jan-2017 onwards.csv...
Combined dataset shape: (948962, 11)
Features shape: (948962, 10)
Target shape: (948962,)
Selected features: month, town, flat_type, block, street_name, storey_range, floor_area_sqm, flat_model, lease_com

In [21]:
# Spot-check the first rows of the training features before converting them to tensors.
X_train.head()

Unnamed: 0,storey_range,floor_area_sqm,relative_month,flat_age,flat_type_ordered,town_BEDOK,town_BISHAN,town_BUKIT BATOK,town_BUKIT MERAH,town_BUKIT PANJANG,...,block_997B,block_997C,block_998A,block_998B,block_999B,block_99A,block_99B,block_99C,block_9A,block_9B
132286,0.0,0.115146,0.187204,0.216667,0.333333,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
670651,0.0,0.188958,0.64455,0.5,0.5,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
361721,0.25,0.265722,0.341232,0.316667,0.666667,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
452735,0.1875,0.135813,0.424171,0.316667,0.333333,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
476872,0.125,0.212578,0.445498,0.183333,0.5,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [31]:
# List the columns whose dtype is neither float64 nor bool.
# An empty result means every column can be cast to float without special handling.
X_train.dtypes.loc[lambda dtypes: (dtypes != 'float64') & (dtypes != 'bool')]

Series([], dtype: object)

Every column is either `float64` or `bool`. Cast the boolean columns to float so the whole frame becomes a single floating-point tensor, which is what the network's linear layers expect as input.

In [15]:
import pandas as pd
import torch

In [33]:
# determine the supported device
def get_device():
    """Return the torch device to compute on.

    Returns:
        torch.device: ``cuda:0`` when CUDA is available, otherwise ``cpu``.
    """
    use_cuda = torch.cuda.is_available()
    return torch.device('cuda:0' if use_cuda else 'cpu')

# convert a df to tensor to be used in pytorch
def df_to_tensor(df, device=None):
    """Convert a pandas DataFrame or Series into a float32 tensor on a device.

    Args:
        df: DataFrame or Series whose values can all be cast to float
            (e.g. float and bool columns).
        device: Target torch device (or device string). When ``None`` the
            device chosen by ``get_device()`` is used.

    Returns:
        torch.Tensor: float32 tensor with the same shape as ``df``.
    """
    if device is None:
        device = get_device()
    # Cast straight to float32. Going through float64 first would materialise an
    # intermediate array twice as large, only to be downcast again by torch.
    values = df.to_numpy(dtype='float32')
    return torch.from_numpy(values).to(device)

# Convert the training features to a float tensor on the selected device.
X_train_tensor = df_to_tensor(X_train)
# Reshape the targets into a single column so there is one target value per row.
y_train_tensor = df_to_tensor(y_train).reshape(-1,1)

In [35]:
# Inspect the converted feature tensor (torch truncates the printout).
X_train_tensor

tensor([[0.0000, 0.1151, 0.1872,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.1890, 0.6445,  ..., 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2657, 0.3412,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.2500, 0.1860, 0.4408,  ..., 0.0000, 0.0000, 0.0000],
        [0.1250, 0.1151, 0.4621,  ..., 0.0000, 0.0000, 0.0000],
        [0.0625, 0.1594, 0.1825,  ..., 0.0000, 0.0000, 0.0000]])

In [39]:
# Idempotent: y_train_tensor is already reshaped to one column where it is created,
# so this changes nothing and is safe to re-run.
y_train_tensor = y_train_tensor.reshape(-1,1)

In [41]:
# Inspect the target tensor; each row holds one target value.
y_train_tensor

tensor([[162000.],
        [410000.],
        [302000.],
        ...,
        [258000.],
        [153000.],
        [315000.]])

In [43]:
# Sanity check: features and targets must have the same number of rows.
X_train_tensor.shape, y_train_tensor.shape

(torch.Size([189792, 2800]), torch.Size([189792, 1]))

In [45]:
import torch.nn as nn

class HousePricePredictor(nn.Module):
    """Fully connected regression network that outputs one value per input row.

    The network is a stack of ``Linear -> ReLU`` hidden layers followed by a
    final ``Linear`` layer with a single output unit.

    Args:
        input_size: Number of input features per row.
        hidden_sizes: Width of each hidden layer, in order. The default
            ``(64, 32)`` reproduces the original fixed architecture.
    """

    def __init__(self, input_size, hidden_sizes=(64, 32)):
        super().__init__()
        modules = []
        in_features = input_size
        for width in hidden_sizes:
            modules.append(nn.Linear(in_features, width))  # Input / hidden layer
            modules.append(nn.ReLU())                      # Activation
            in_features = width
        # Output layer (single value for price)
        modules.append(nn.Linear(in_features, 1))
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Run a batch of shape (batch, input_size) through the network."""
        return self.layers(x)

In [47]:
# Seed PyTorch's global RNG so the weight initialisation below is repeatable
# after Restart Kernel -> Run All.
SEED = 42
torch.manual_seed(SEED)

device = get_device()

# Get input size from the training data
input_size = X_train_tensor.shape[1]

# Initialize model
model = HousePricePredictor(input_size).to(device)

# Loss function (MSE for regression)
# NOTE(review): y_train_tensor holds values such as 162000, so the squared errors
# will be very large; consider scaling the target before training.
criterion = nn.MSELoss()

# Optimizer (Adam is a good default)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [49]:
from torch.utils.data import TensorDataset, DataLoader

# Create dataset and loader
# TensorDataset pairs row i of the features with row i of the targets.
dataset = TensorDataset(X_train_tensor, y_train_tensor)
# Reshuffle the rows every epoch. With batch_size=32 and the 189,792 training rows
# shown above, one epoch is 5,931 optimizer steps.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [51]:
num_epochs = 100

# Mini-batch training: one optimizer step per batch, progress reported every 10th epoch.
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0.0   # sum of per-sample losses over the epoch
    samples_seen = 0

    for batch_X, batch_y in dataloader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass
        predictions = model(batch_X)
        loss = criterion(predictions, batch_y)

        # Backpropagation
        loss.backward()
        optimizer.step()

        # criterion returns the batch *mean*; weight it by the batch size so a
        # smaller final batch does not skew the epoch average.
        n_batch = batch_X.size(0)
        total_loss += loss.item() * n_batch
        samples_seen += n_batch

    # Print progress
    avg_loss = total_loss / samples_seen
    if (epoch+1) % 10 == 0:
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

KeyboardInterrupt: 