In [2]:
#Importing all the different python modules needed

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader, random_split
from ffModel import FeedForward
from torch.optim.lr_scheduler import ExponentialLR, StepLR, MultiStepLR, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler

In [5]:
# ---- Load the raw Kaggle CSV files into pandas data frames ----
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# The submission file needs the test Ids, so keep them aside before the
# column is removed from the feature frame.
savedID = test_data['Id'].astype('int32')
test_data = test_data.drop(columns=['Id'])

# ---- Split the training frame into target and features ----
Ytrain = train_data['SalePrice']
Xtrain = train_data.drop(columns=['SalePrice', 'Id'])
n_train_rows = len(Xtrain)  # used below to cut the combined frame apart again

# ---- Encode train and test together ----
# One-hot encoding the stacked frame gives both splits the same dummy columns.
combinedData = pd.get_dummies(pd.concat([Xtrain, test_data], axis=0))

# Replace remaining missing values with the per-column mean of the stacked frame.
combinedData = combinedData.fillna(combinedData.mean())

# ---- Undo the stacking: first n_train_rows rows are train, the rest are test ----
Xtrain = combinedData.iloc[:n_train_rows, :]
Xtest = combinedData.iloc[n_train_rows:, :]

# ---- Standardize with scikit-learn ----
# The scaler is fitted on the training rows only and then applied to the test rows.
scaler = StandardScaler()
Xtrain = scaler.fit_transform(Xtrain)
Xtest = scaler.transform(Xtest)

# ---- Convert everything to float32 torch tensors ----
Xtrain = torch.tensor(Xtrain, dtype=torch.float32)
Xtest = torch.tensor(Xtest, dtype=torch.float32)
# Targets are reshaped into a single column: (N,) -> (N, 1).
Ytrain = torch.tensor(Ytrain.values, dtype=torch.float32).view(-1, 1)

# ---- Wrap the tensors in DataLoaders ----
# Training data: shuffled mini-batches of 28 samples.
train_dataset = TensorDataset(Xtrain, Ytrain)
trainValuesFinal = DataLoader(train_dataset, batch_size=28, shuffle=True)

# Test data: original order is preserved so predictions line up with savedID.
test_dataset = TensorDataset(Xtest)
testValuesFinal = DataLoader(test_dataset, shuffle=False)

In [3]:
'''
Now it's time to set up the model, the optimizer, and the loss function to actually perform the training.
For the loss function, since I'm doing univariate regression to predict a numerical value, I use mean squared error (MSE).
I will first use default hyperparameters, assess how well the model is doing, and then adjust from there using Optuna in a separate notebook.
'''
# Build the network, move it to the GPU, and run its weight initialisation.
# NOTE(review): the single constructor argument is presumably a dropout
# rate — confirm against ffModel.FeedForward.
model = FeedForward(0.1262981918417337)
model = model.to('cuda')
model.init_weights()

# Mean-squared-error loss for the regression target.
lossf = nn.MSELoss()

# AdamW optimizer; the hyperparameter values are hard-coded here.
adamw_kwargs = {
    'lr': 7.216877087985235e-05,
    'betas': (0.3765401693820542, 0.5171372098061646),
    'eps': 1e-9,
    'weight_decay': 1.6014659044365067e-05,
}
optimizer = optim.AdamW(model.parameters(), **adamw_kwargs)

# Multiply the learning rate by `factor` once the monitored value has not
# decreased ('min' mode) for more than `patience` scheduler steps.
scheduler = ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=81,
    factor=0.0032423913478675614,
)

In [4]:
# Train the model for 174 epochs.
# After each epoch the mean training loss of THAT epoch is handed to the
# ReduceLROnPlateau scheduler, which decides whether to lower the learning rate.

# Holds the mean batch loss of the most recently finished epoch.
average_loss = 0

# Set model to training mode
model.train()

# Actual training loop
for epoch in range(174):
    # Reset the accumulator at the start of every epoch. Previously the sum
    # was carried over from epoch to epoch, so the value given to the scheduler
    # grew monotonically and the scheduler could never observe an improvement.
    # NOTE(review): the scheduler's patience/factor may have been tuned against
    # the old accumulating metric — confirm they are still appropriate.
    running_loss = 0.0

    for inputs, targets in trainValuesFinal:
        inputs = inputs.to('cuda')
        targets = targets.to('cuda')

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = lossf(outputs, targets)
        loss.backward()

        # Clip the total gradient norm to 5.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()

        running_loss += loss.item()

    # Mean batch loss for this epoch only.
    average_loss = running_loss / len(trainValuesFinal)

    # Step the scheduler once per epoch on the per-epoch mean loss.
    scheduler.step(average_loss)

In [5]:
# Run the trained model over the test set and write the Kaggle submission file.

# Switch the model to evaluation mode before predicting.
model.eval()

predictions = []
# No gradients are needed for inference.
with torch.no_grad():
    for item in testValuesFinal:
        # Each batch from a single-tensor TensorDataset is a 1-element sequence.
        features = item[0].to('cuda')
        # Call the module itself rather than .forward() so that any registered
        # hooks are run as well.
        prediction = model(features)
        predictions.append(prediction.cpu().numpy())

# Stack the per-batch outputs into a single (N, 1) column of predictions.
submission = np.concatenate(predictions).reshape(-1, 1)
submission = pd.DataFrame(submission, columns=['SalePrice'])

# Put the saved test Ids next to the predictions and save without the index.
final = pd.concat([savedID, submission], axis=1)
final.to_csv('submission.csv', index=False)