# Imports

In [150]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler
import utils.preprocessing as pr
from sklearn.model_selection import train_test_split

# Load data

In [151]:
df_train = pd.read_csv('data/ready/train.csv')
df_train

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,0,65.0,8450,0,0,0,0,0,...,0,0,0,0,0,2,2008,0,0,208500
1,2,20,0,80.0,9600,0,0,0,0,0,...,0,0,0,0,0,5,2007,0,0,181500
2,3,60,0,68.0,11250,0,0,1,0,0,...,0,0,0,0,0,9,2008,0,0,223500
3,4,70,0,60.0,9550,0,0,1,0,0,...,0,0,0,0,0,2,2006,0,1,140000
4,5,60,0,84.0,14260,0,0,1,0,0,...,0,0,0,0,0,12,2008,0,0,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,0,62.0,7917,0,0,0,0,0,...,0,0,0,0,0,8,2007,0,0,175000
1456,1457,20,0,85.0,13175,0,0,0,0,0,...,0,0,1,0,0,2,2010,0,0,210000
1457,1458,70,0,66.0,9042,0,0,0,0,0,...,0,0,3,1,2500,5,2010,0,0,266500
1458,1459,20,0,68.0,9717,0,0,0,0,0,...,0,0,0,0,0,4,2010,0,0,142125


# Prepare

In [152]:
cols_x = list(df_train.drop(columns=['Id', 'SalePrice']).columns)
cols_y = 'SalePrice'
cols_x[:5]

['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street']

## Scale X

In [153]:
scaler = MinMaxScaler()
df_train_new = pd.DataFrame(scaler.fit_transform(df_train[cols_x])
                            , columns=df_train[cols_x].columns
                            , index=df_train.index)



In [154]:
df_train_new[['Id', 'SalePrice']] = df_train[['Id', 'SalePrice']]
df_train_new.describe()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Id,SalePrice
count,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,0.217043,0.085103,0.186133,0.04308,0.00411,0.045205,0.136073,0.065068,0.000685,0.149144,...,0.003425,0.083219,0.010788,0.002806,0.483811,0.453938,0.029366,0.07,730.5,180921.19589
std,0.248827,0.199577,0.111338,0.046653,0.063996,0.186075,0.194099,0.214003,0.026171,0.25259,...,0.052972,0.198329,0.061496,0.032008,0.245784,0.332024,0.100569,0.177574,421.610009,79442.502883
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,34900.0
25%,0.0,0.0,0.136943,0.029229,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.363636,0.25,0.0,0.0,365.75,129975.0
50%,0.176471,0.0,0.203822,0.038227,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.454545,0.5,0.0,0.0,730.5,163000.0
75%,0.294118,0.0,0.254777,0.04815,0.0,0.0,0.333333,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.636364,0.75,0.0,0.0,1095.25,214000.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1460.0,755000.0


## Log target

In [155]:
df_train_new.loc[:, 'SalePrice'] = df_train_new['SalePrice'].apply(pr.one_plus_log)
df_train_new['SalePrice']

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1455    12.072547
1456    12.254868
1457    12.493133
1458    11.864469
1459    11.901590
Name: SalePrice, Length: 1460, dtype: float64

# Make data loader

## Train-val split

In [156]:
df_train, df_val = train_test_split(df_train_new, test_size=0.2, random_state=16)
print(df_train.shape, df_val.shape)
df_val.head(3)

(1168, 81) (292, 81)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Id,SalePrice
1346,0.0,0.0,0.0,0.091056,0.0,0.0,0.666667,0.0,0.0,0.75,...,0.0,0.0,0.0,0.0,0.454545,0.0,0.0,0.0,1347,12.47801
1218,0.176471,0.25,0.16879,0.02309,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.545455,0.0,0.0,0.0,1219,11.296025
397,0.235294,0.0,0.22293,0.0294,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.545455,0.25,0.0,0.0,398,12.040614


## Make data loaders

In [157]:
class MyDataset(Dataset):

    def __init__(self,df, x_cols, y_cols):
        

        x=df[x_cols]
        y=df[y_cols]

        self.x_train=torch.tensor(x.values
                                  , dtype=torch.float32)
        self.y_train=torch.tensor(y.values
                                  , dtype=torch.float32)

    def __len__(self):
        return len(self.y_train)
  
    def __getitem__(self,idx):
        return self.x_train[idx],self.y_train[idx]

In [158]:
ds_train = MyDataset(df_train, cols_x, 'SalePrice')
train_loader=DataLoader(ds_train,batch_size=10,shuffle=False)

ds_val = MyDataset(df_val, cols_x, cols_y)
val_loader = DataLoader(ds_val, batch_size=10, shuffle=False)

# Train model

In [159]:
model = nn.Sequential(nn.Linear(79, 480),
                      nn.ReLU(),
                      nn.Linear(480, 240),
                      nn.ReLU(),
                      nn.Linear(240, 64),
                      nn.ReLU(),
                      nn.Linear(64, 1))

In [160]:
def RMSELoss(yhat,y):
    return torch.sqrt(torch.mean((yhat-y)**2))

criterion = RMSELoss

train_dataiter = iter(train_loader)

optimizer = optim.Adam(model.parameters(), lr=0.003)

In [161]:
epochs = 100

for e in range(epochs):
    running_loss = 0
    for x, y in train_loader:
        
        optimizer.zero_grad()
        
        y_hat = model(x)
        loss = criterion(y_hat, y)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        ##
        #print(y[:10])
        #print(y_hat[:10])
        ##
        
    else:
        print(f"Training loss: {running_loss/len(train_dataiter)}")

Training loss: 1.905118124352561
Training loss: 0.7483076521983514
Training loss: 0.6853323543173635
Training loss: 0.6273826347966479
Training loss: 0.61325538973523
Training loss: 0.5695919486192557
Training loss: 0.5483001829721988
Training loss: 0.4847000299865364
Training loss: 0.4745110302654087
Training loss: 0.4743799228444059
Training loss: 0.46991775586054874
Training loss: 0.5255972985019032
Training loss: 0.5685535468097426
Training loss: 0.5476770564022228
Training loss: 0.5261263776029277
Training loss: 0.4793789710244562
Training loss: 0.5236467950873904
Training loss: 0.5634657845028446
Training loss: 0.5309885281782883
Training loss: 0.46058787163506204
Training loss: 0.4783708516858582
Training loss: 0.488475571712877
Training loss: 0.4822581710978451
Training loss: 0.5819262378236167
Training loss: 0.6617676776189071
Training loss: 0.5834257875242804
Training loss: 0.485757193249515
Training loss: 0.4960193504125644
Training loss: 0.5997159358782645
Training loss: 0.

# Premature nn prediction

In [163]:
df_test = pd.read_csv('data/ready/test.csv')
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,4,80.0,11622,0,-1,0,0,0,...,120,0,-1,1,-1,0,6,2010,0,0
1,1462,20,0,81.0,14267,0,-1,1,0,0,...,0,0,-1,-1,2,12500,6,2010,0,0
2,1463,60,0,74.0,13830,0,-1,1,0,0,...,0,0,-1,1,-1,0,3,2010,0,0
3,1464,60,0,78.0,9978,0,-1,1,0,0,...,0,0,-1,-1,-1,0,6,2010,0,0
4,1465,120,0,43.0,5005,0,-1,1,3,0,...,144,0,-1,-1,-1,0,1,2010,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,1,21.0,1936,0,-1,0,0,0,...,0,0,-1,-1,-1,0,6,2006,0,0
1455,2916,160,1,21.0,1894,0,-1,0,0,0,...,0,0,-1,-1,-1,0,4,2006,0,1
1456,2917,20,0,160.0,20000,0,-1,0,0,0,...,0,0,-1,-1,-1,0,9,2006,0,1
1457,2918,85,0,62.0,10441,0,-1,0,0,0,...,0,0,-1,1,1,700,7,2006,0,0


In [166]:
df_test[df_test.drop(columns=['Id'], inplace=False).columns]
scaler.transform(df_test[cols_x])

array([[0.        , 1.        , 0.25796178, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.2611465 , ..., 1.        , 0.        ,
        0.        ],
       [0.23529412, 0.        , 0.2388535 , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.51273885, ..., 0.        , 0.        ,
        0.2       ],
       [0.38235294, 0.        , 0.20063694, ..., 0.        , 0.        ,
        0.        ],
       [0.23529412, 0.        , 0.2388535 , ..., 0.        , 0.        ,
        0.        ]])

In [168]:
df_test[cols_x] = scaler.transform(df_test[cols_x])
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,0.000000,1.00,0.257962,0.048246,0.0,-0.5,0.000000,0.0,0.0,...,0.25,0.0,-0.333333,0.25,-0.25,0.000000,0.454545,1.0,0.0,0.0
1,1462,0.000000,0.00,0.261146,0.060609,0.0,-0.5,0.333333,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,0.50,0.806452,0.454545,1.0,0.0,0.0
2,1463,0.235294,0.00,0.238854,0.058566,0.0,-0.5,0.333333,0.0,0.0,...,0.00,0.0,-0.333333,0.25,-0.25,0.000000,0.181818,1.0,0.0,0.0
3,1464,0.235294,0.00,0.251592,0.040562,0.0,-0.5,0.333333,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,-0.25,0.000000,0.454545,1.0,0.0,0.0
4,1465,0.588235,0.00,0.140127,0.017318,0.0,-0.5,0.333333,1.0,0.0,...,0.30,0.0,-0.333333,-0.25,-0.25,0.000000,0.000000,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,0.823529,0.25,0.070064,0.002973,0.0,-0.5,0.000000,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,-0.25,0.000000,0.454545,0.0,0.0,0.0
1455,2916,0.823529,0.25,0.070064,0.002776,0.0,-0.5,0.000000,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,-0.25,0.000000,0.272727,0.0,0.0,0.2
1456,2917,0.000000,0.00,0.512739,0.087406,0.0,-0.5,0.000000,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,-0.25,0.000000,0.727273,0.0,0.0,0.2
1457,2918,0.382353,0.00,0.200637,0.042726,0.0,-0.5,0.000000,0.0,0.0,...,0.00,0.0,-0.333333,0.25,0.25,0.045161,0.545455,0.0,0.0,0.0


In [169]:
df_test_new = pd.DataFrame(
                scaler.fit_transform(df_test[df_test.drop(columns='Id').columns])
                , columns=df_test.drop(columns='Id').columns
                , index=df_test.index)
df_test_new['Id'] = df_test['Id']
df_test_new

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,Id
0,0.000000,1.0,0.402985,0.184147,0.0,0.0,0.000000,0.0,1.0,0.0,...,0.0,0.0,0.4,0.00,0.000000,0.454545,1.0,0.111111,0.0,1461
1,0.000000,0.2,0.407960,0.232124,0.0,0.0,0.333333,0.0,1.0,0.5,...,0.0,0.0,0.0,0.75,0.735294,0.454545,1.0,0.111111,0.0,1462
2,0.235294,0.2,0.373134,0.224197,0.0,0.0,0.333333,0.0,1.0,0.0,...,0.0,0.0,0.4,0.00,0.000000,0.181818,1.0,0.111111,0.0,1463
3,0.235294,0.2,0.393035,0.154326,0.0,0.0,0.333333,0.0,1.0,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.454545,1.0,0.111111,0.0,1464
4,0.588235,0.2,0.218905,0.064121,0.0,0.0,0.333333,1.0,1.0,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.000000,1.0,0.111111,0.0,1465
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,0.823529,0.4,0.109453,0.008453,0.0,0.0,0.000000,0.0,1.0,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.454545,0.0,0.111111,0.0,2915
1455,0.823529,0.4,0.109453,0.007691,0.0,0.0,0.000000,0.0,1.0,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.272727,0.0,0.111111,0.2,2916
1456,0.000000,0.2,0.800995,0.336115,0.0,0.0,0.000000,0.0,1.0,0.0,...,0.0,0.0,0.0,0.00,0.000000,0.727273,0.0,0.111111,0.2,2917
1457,0.382353,0.2,0.313433,0.162724,0.0,0.0,0.000000,0.0,1.0,0.0,...,0.0,0.0,0.4,0.50,0.041176,0.545455,0.0,0.111111,0.0,2918


In [174]:
df_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,0.000000,1.00,0.257962,0.048246,0.0,-0.5,0.000000,0.0,0.0,...,0.25,0.0,-0.333333,0.25,-0.25,0.000000,0.454545,1.0,0.0,0.0
1,1462,0.000000,0.00,0.261146,0.060609,0.0,-0.5,0.333333,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,0.50,0.806452,0.454545,1.0,0.0,0.0
2,1463,0.235294,0.00,0.238854,0.058566,0.0,-0.5,0.333333,0.0,0.0,...,0.00,0.0,-0.333333,0.25,-0.25,0.000000,0.181818,1.0,0.0,0.0
3,1464,0.235294,0.00,0.251592,0.040562,0.0,-0.5,0.333333,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,-0.25,0.000000,0.454545,1.0,0.0,0.0
4,1465,0.588235,0.00,0.140127,0.017318,0.0,-0.5,0.333333,1.0,0.0,...,0.30,0.0,-0.333333,-0.25,-0.25,0.000000,0.000000,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,0.823529,0.25,0.070064,0.002973,0.0,-0.5,0.000000,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,-0.25,0.000000,0.454545,0.0,0.0,0.0
1455,2916,0.823529,0.25,0.070064,0.002776,0.0,-0.5,0.000000,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,-0.25,0.000000,0.272727,0.0,0.0,0.2
1456,2917,0.000000,0.00,0.512739,0.087406,0.0,-0.5,0.000000,0.0,0.0,...,0.00,0.0,-0.333333,-0.25,-0.25,0.000000,0.727273,0.0,0.0,0.2
1457,2918,0.382353,0.00,0.200637,0.042726,0.0,-0.5,0.000000,0.0,0.0,...,0.00,0.0,-0.333333,0.25,0.25,0.045161,0.545455,0.0,0.0,0.0


In [175]:
test = torch.tensor(df_test.drop(['Id'], axis=1, inplace=False).values.astype(np.float32))

In [177]:
submission = model(test)

In [181]:
df_test['SalePrice'] = submission.data

In [184]:
df_test['SalePrice'] = df_test['SalePrice'].apply(pr.one_plus_log_reverse)

In [185]:
df_test[['Id', 'SalePrice']].to_csv('data/results/nn.csv', index=False)