In [None]:
# kaggle competition: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

In [25]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


In [4]:
def load_array(data_arrays, batch_size, is_train=True):
    """Construct a PyTorch data loader"""
    dataset = torch.utils.data.TensorDataset(*data_arrays)
    return torch.utils.data.DataLoader(dataset, batch_size, shuffle=is_train)

In [8]:
class NeuralNet(nn.Module): # Fully connected neural network with one hidden layer
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size) 
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)  
    
    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [124]:
def train_and_pred(train, test, k, learning_rate, batch_size, model, num_epochs):
    
    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # pre-process the datasets
    # features can not include SalePrice
    
    train_all_features = train.iloc[:, 1:-1]
    test_all_features = test.iloc[:, 1:]
    
    
    numeric_features = train_all_features.describe().columns

    train_all_features[numeric_features] = train_all_features[numeric_features].apply(
        lambda x: (x - x.mean()) / (x.std()))

    test_all_features[numeric_features] = test_all_features[numeric_features].apply(
        lambda x: (x - x.mean()) / (x.std()))
    
    
    # After standardizing the data all means vanish, hence we can set missing
    # values to 0

    # Dummy_na=True refers to a missing value being a legal eigenvalue, and
    # creates an indicative feature for it

    all_features = pd.concat((train_all_features, test_all_features))

    all_features[numeric_features] = all_features[numeric_features].fillna(0)

    all_features = pd.get_dummies(all_features, dummy_na=True)
    
    real_test_features_tensor = torch.tensor(all_features[train_all_features.shape[0]:].values,dtype=torch.float32)
    
    # k-fold validations
    train_logrmse = []
    test_logrmse = []
    for f in range(0,k):
        # split the original train dataset into k folds
        X_train, X_test, y_train, y_test = train_test_split(all_features[:train_all_features.shape[0]], 
                                                            train.SalePrice, test_size = 1/k, 
                                                            random_state = f)
        
        # use torch.any(torch.isnan(x)) to check wheter have nan value or not 
        train_features_tensor = torch.tensor(X_train.values,dtype=torch.float32)
        train_labels = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)
        test_features_tensor = torch.tensor(X_test.values,dtype=torch.float32)
        #test_labels = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)
        
        # training the one hidden layer nueral network model------------------------------------------------
        # transform dataframes into iterator which is used as inputs in neural network
        train_iter = load_array((train_features_tensor, train_labels), batch_size)
        
        # criterion and optimizer
        criterion = nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)  
        
        # train the model
        total_step = len(train_iter)
        for epoch in range(num_epochs):
            for i, (data,label) in enumerate(train_iter):  
                # Move tensors to the configured device
                data = data.to(device)
                label = label.to(device)

                # Forward pass
                outputs = model(data)
                #print(outputs)
                loss = criterion(outputs, label)
                log_loss = criterion(torch.log(outputs),torch.log(label))

                # Backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                logrmse = torch.sqrt(torch.mean(log_loss))

                '''if ((i+1) % 10 == 0) | ((i+1) % 23 == 0):
                    print ('Epoch [{}/{}], Step [{}/{}], train_Log_rmse: {:.4f}' 
                           .format(epoch+1, num_epochs, i+1, total_step, logrmse))'''
                if (epoch == num_epochs -1) & ((i+1) % 17 == 0):
                    
                    train_logrmse.append(logrmse.item())
                    
        # get the prediciton of X_test (or y_test_hat)
        y_test_hat = model(test_features_tensor.cuda()).detach()
        y_test_hat = y_test_hat.cpu().numpy()
        
        #temp = torch.nn.MSELoss(torch.log(test_labels) - torch.log(y_test_hat.cpu()))
        #test_logrmse.append(torch.sqrt(torch.mean(temp)))
        mse = mean_squared_error(np.log(y_test), np.log(y_test_hat))
        test_logrmse.append(np.sqrt(mse))
    
        print('fold %d, train rmse: %f, test rmse: %f' % (f, train_logrmse[-1], test_logrmse[-1]))
    print('total train rmse: %f, total test rmse: %f' % (np.mean(train_logrmse), np.mean(test_logrmse)))

    '''preds = model(real_test_features_tensor.cuda()).detach()
    preds = preds.cpu().numpy()
    test['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test['Id'], test['SalePrice']], axis=1)
    submission.to_csv('/home/dm/Downloads/submission.csv', index=False)'''
        

In [125]:
def main():
    # load the datasets
    train_dataset = pd.read_csv('/home/dm/Downloads/house_price/train.csv')
    test_dataset = pd.read_csv('/home/dm/Downloads/house_price/test.csv')

    # Device configuration
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # train_features_tensor.shape = [1460,331]
    input_size = 331
    # tune this hidden_size
    hidden_size = 100
    num_classes = 1 
    learning_rate = 5
    batch_size = 64
    k = 4
    num_epochs = 1000

    #model = NeuralNet1(input_size, num_classes).to(device)
    model = NeuralNet(input_size, hidden_size, num_classes).to(device)
    
    train_and_pred(train_dataset, test_dataset, k, learning_rate, batch_size, model, num_epochs)



In [126]:
main()

fold 0, train rmse: 0.087998, test rmse: 0.223163
fold 1, train rmse: 0.084573, test rmse: 0.154721
fold 2, train rmse: 0.098414, test rmse: 0.156710
fold 3, train rmse: 0.085395, test rmse: 0.165029
total train rmse: 0.089095, total test rmse: 0.174906
