# Deep neural network application

Este notebook contiene un ejemplo del uso de la arquitectura previamente creada en el dataset: [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course/data) para predecir el precio de ventas "La variable SalePrice".

## Preprocessing the data

Loading imports

In [17]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder
from model import cost_function as cost
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [18]:
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

data_train = pd.read_csv(TRAIN_PATH, index_col= 'Id')
data_test = pd.read_csv(TEST_PATH, index_col= 'Id')

Exploring data

In [26]:
print("Train size: {}".format(data_train.shape))
print("Test size: {} \n\n".format(data_test.shape))

#Dropping NA values in SalePrice
data_train.dropna(subset = ["SalePrice"], inplace = True)

y_train_full = data_train.SalePrice
x_train_full = data_train.drop(labels = ["SalePrice"], axis = 1)

x_test_full = data_test

print("X train head \n\n{}".format(x_train_full.head))
print("\nX test head \n\n{}".format(x_test_full.head))

print("Data info")
x_train_full.info()

print("\n")

numerical_cols = [col for col in x_train_full.columns if x_train_full[col].dtype in ["float64", "int64"]]
categorical_cols = [col for col in x_train_full.columns if x_train_full[col].dtype == "object"]

print("Categorical columns: \n{} \n".format(categorical_cols))
print("Numerical columns: \n{}".format(numerical_cols))

#Splitting x_train_full in categorical and numerical parts

x_train_numerical = x_train_full.drop(labels = categorical_cols, axis = 1)
x_test_numerical = x_test_full.drop(labels = categorical_cols, axis = 1)

train_numerical_missing = [col for col in x_train_numerical.columns if x_train_numerical[col].isnull().any()]
test_numerical_missing = [col for col in x_test_numerical.columns if x_test_numerical[col].isnull().any()]

print("\nDifferent numerical missing: \n")
print(set(train_numerical_missing).symmetric_difference(test_numerical_missing))
print("\nDifferent numerical missing in train: \n")
print(set(train_numerical_missing).difference(test_numerical_missing))
print("\nDifferent numerical missing in test: \n")
print(set(test_numerical_missing).difference(train_numerical_missing))

Train size: (1460, 80)
Test size: (1459, 79) 


X train head 

<bound method NDFrame.head of       MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
Id                                                                      
1             60       RL         65.0     8450   Pave   NaN      Reg   
2             20       RL         80.0     9600   Pave   NaN      Reg   
3             60       RL         68.0    11250   Pave   NaN      IR1   
4             70       RL         60.0     9550   Pave   NaN      IR1   
5             60       RL         84.0    14260   Pave   NaN      IR1   
...          ...      ...          ...      ...    ...   ...      ...   
1456          60       RL         62.0     7917   Pave   NaN      Reg   
1457          20       RL         85.0    13175   Pave   NaN      Reg   
1458          70       RL         66.0     9042   Pave   NaN      Reg   
1459          20       RL         68.0     9717   Pave   NaN      Reg   
1460          20       RL      

Preproccesing with imputator modified

In [27]:
imputer = SimpleImputer() #Imputer with mean strategy

for col in train_numerical_missing:
    x_train_numerical[col + "_was_missing"] = x_train_numerical[col].isnull()
    x_test_numerical[col + "_was_missing"] = x_test_numerical[col].isnull()
    
imputed_train = pd.DataFrame(imputer.fit_transform(x_train_numerical))
imputed_test= pd.DataFrame(imputer.transform(x_test_numerical))

imputed_train.columns = x_train_numerical.columns
imputed_test.columns = x_test_numerical.columns

print("\nimputed_train shape: {}".format(imputed_train.shape))
print("\nimputed_test shape: {}".format(imputed_test.shape))
print("numerical columns: \n{}".format(imputed_train.columns))


imputed_train shape: (1460, 39)

imputed_test shape: (1459, 39)
numerical columns: 
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'LotFrontage_was_missing', 'MasVnrArea_was_missing',
       'GarageYrBlt_was_missing'],
      dtype='object')
