# Deep neural network application

Este notebook contiene un ejemplo del uso de la arquitectura previamente creada en el dataset: [Housing Prices Competition for Kaggle Learn Users](https://www.kaggle.com/c/home-data-for-ml-course/data) para predecir el precio de ventas "La variable SalePrice".

## Preprocessing the data

##### Loading imports

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder
from model import cost_function as cost
from modelArchitecture import L_layer_model as Lmodel
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [2]:
TRAIN_PATH = "data/train.csv"
TEST_PATH = "data/test.csv"

data_train = pd.read_csv(TRAIN_PATH, index_col= 'Id')
data_test = pd.read_csv(TEST_PATH, index_col= 'Id')

##### Exploring data

In [3]:
print("Train size: {}".format(data_train.shape))
print("Test size: {} \n".format(data_test.shape))

#Dropping NA values in SalePrice
data_train.dropna(subset = ["SalePrice"], inplace = True)

y_train_full = data_train.SalePrice
x_train_full = data_train.drop(labels = ["SalePrice"], axis = 1)

x_test_full = data_test

print("X train head \n\n{}\n".format(x_train_full.head(3)))

print("Data info\n")
x_train_full.info()

print("\n")

numerical_cols = [col for col in x_train_full.columns if x_train_full[col].dtype in ["float64", "int64"]]
categorical_cols = [col for col in x_train_full.columns if x_train_full[col].dtype == "object"]

print("Categorical columns: \n{} \n".format(categorical_cols))
print("Numerical columns: \n{}".format(numerical_cols))

#Splitting x_train_full in categorical and numerical parts

x_train_numerical = x_train_full.drop(labels = categorical_cols, axis = 1)
x_test_numerical = x_test_full.drop(labels = categorical_cols, axis = 1)

train_numerical_missing = [col for col in x_train_numerical.columns if x_train_numerical[col].isnull().any()]
test_numerical_missing = [col for col in x_test_numerical.columns if x_test_numerical[col].isnull().any()]
print("\nNumerical missing in train: \n")
print(train_numerical_missing)
print("\nNumerical missing in test: \n")
print(test_numerical_missing)
print("\nDifferent numerical missing: \n")
print(set(train_numerical_missing).symmetric_difference(test_numerical_missing))
print("\nDifferent numerical missing in train: \n")
print(set(train_numerical_missing).difference(test_numerical_missing))
print("\nDifferent numerical missing in test: \n")
print(set(test_numerical_missing).difference(train_numerical_missing))

Train size: (1460, 80)
Test size: (1459, 79) 

X train head 

    MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
Id                                                                    
1           60       RL         65.0     8450   Pave   NaN      Reg   
2           20       RL         80.0     9600   Pave   NaN      Reg   
3           60       RL         68.0    11250   Pave   NaN      IR1   

   LandContour Utilities LotConfig  ... ScreenPorch PoolArea PoolQC Fence  \
Id                                  ...                                     
1          Lvl    AllPub    Inside  ...           0        0    NaN   NaN   
2          Lvl    AllPub       FR2  ...           0        0    NaN   NaN   
3          Lvl    AllPub    Inside  ...           0        0    NaN   NaN   

   MiscFeature MiscVal  MoSold  YrSold  SaleType  SaleCondition  
Id                                                               
1          NaN       0       2    2008        WD         Normal 

##### Preproccesing numerical columns

In [4]:
imputer = SimpleImputer() #Imputer with mean strategy

for col in train_numerical_missing:
    x_train_numerical[col + "_was_missing"] = x_train_numerical[col].isnull()
    x_test_numerical[col + "_was_missing"] = x_test_numerical[col].isnull()
    
imputed_train = pd.DataFrame(imputer.fit_transform(x_train_numerical))
imputed_test= pd.DataFrame(imputer.transform(x_test_numerical))

imputed_train.columns = x_train_numerical.columns
imputed_test.columns = x_test_numerical.columns

print("\nimputed_train shape: {}".format(imputed_train.shape))
print("\nimputed_test shape: {}".format(imputed_test.shape))
print("\nnumerical columns in train: \n{}".format(imputed_train.columns))
print("\nnumerical columns in test: \n{}".format(imputed_test.columns))


imputed_train shape: (1460, 39)

imputed_test shape: (1459, 39)

numerical columns in train: 
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'LotFrontage_was_missing', 'MasVnrArea_was_missing',
       'GarageYrBlt_was_missing'],
      dtype='object')

numerical columns in test: 
Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrS

##### Preprocessing categorical columns

In [5]:
x_train_categorical = x_train_full.drop(labels = numerical_cols, axis = 1)
x_test_categorical = x_test_full.drop(labels = numerical_cols, axis = 1)

categorical_train_missing = [col for col in x_train_categorical.columns if x_train_categorical[col].isnull().any()]
categorical_test_missing = [col for col in x_test_categorical.columns if x_test_categorical[col].isnull().any()]
print("\nCategorical missing in train: \n")
print(categorical_train_missing)
print("\nCategorical missin in test: \n")
print(categorical_test_missing)
print("\nDifferent categorical missing: \n")
print(set(categorical_train_missing).symmetric_difference(categorical_test_missing))
print("\nDifferent categorical missing in train: \n")
print(set(categorical_train_missing).difference(categorical_test_missing))
print("\nDifferent categorical missing in test: \n")
print(set(categorical_test_missing).difference(categorical_train_missing))

#Dropping missing and differences to make things easier

x_train_categorical.drop(labels = categorical_train_missing, axis = 1, inplace = True)
x_test_categorical.drop(labels = categorical_test_missing, axis = 1, inplace = True)
x_train_categorical.drop(labels = set(categorical_test_missing).difference(categorical_train_missing), axis = 1, inplace = True)
x_test_categorical.drop(labels = set(categorical_train_missing).difference(categorical_test_missing), axis = 1, inplace = True)

#Verifying that doesn't contain any Nan values and have the same col count
print("\nNaN values for categorical in train]:\n")
[print(col) for col in x_train_categorical.columns if x_train_categorical[col].isnull().any()]
print("\nNaN values for categorical in test]:\n")
[print(col) for col in x_test_categorical.columns if x_test_categorical[col].isnull().any()]
print("\ncategorical train shape: {}\n".format(x_train_categorical.shape))
print("\ncategorical test shape: {}\n".format(x_test_categorical.shape))
print("\ncategorical columns in train:")
print(x_train_categorical.columns)
print("\ncategorical columns in test:")
print(x_test_categorical.columns)


Categorical missing in train: 

['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']

Categorical missin in test: 

['MSZoning', 'Alley', 'Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType']

Different categorical missing: 

{'MSZoning', 'Exterior2nd', 'Functional', 'Utilities', 'SaleType', 'KitchenQual', 'Exterior1st', 'Electrical'}

Different categorical missing in train: 

{'Electrical'}

Different categorical missing in test: 

{'MSZoning', 'Exterior2nd', 'Utilities', 'SaleType', 'KitchenQual', 'Functional', 'Exterior1st'}

NaN values for categorical in train]:


NaN values for categorical in test]:


Getting cardinality in each categorical column

In [6]:
categorical_unique = list(map(lambda col: x_train_categorical[col].nunique(), x_train_categorical.columns))
dict_nunique = dict(zip(categorical_unique, x_train_categorical.columns))
print(sorted(dict_nunique.items(), key= lambda x: x[1]))

#Selecting only < 10 cardinality

low_cardinality = [col for col in x_train_categorical.columns if x_train_categorical[col].nunique() < 10]
high_cardinality = list(set(x_train_categorical.columns).difference(set(low_cardinality)))
print("\nLow cardinality columns:\n{}".format(low_cardinality))
print("\nHigh cardinality columns: \n{}".format(high_cardinality))

#Dropping high cardinality
x_train_categorical.drop(labels= high_cardinality, axis = 1, inplace = True)
x_test_categorical.drop(labels = high_cardinality, axis = 1, inplace = True)

[(2, 'CentralAir'), (9, 'Condition1'), (4, 'ExterQual'), (5, 'HeatingQC'), (25, 'Neighborhood'), (3, 'PavedDrive'), (8, 'RoofMatl'), (6, 'SaleCondition')]

Low cardinality columns:
['Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'PavedDrive', 'SaleCondition']

High cardinality columns: 
['Neighborhood']


Aplying one hot encoding

In [7]:
oH_encoder=  OneHotEncoder(sparse = False)

oH_train = pd.DataFrame(oH_encoder.fit_transform(x_train_categorical))
oH_test = pd.DataFrame(oH_encoder.transform(x_test_categorical))

training_data = pd.concat([oH_train, imputed_train], axis = 1)
test_data = pd.concat([oH_test, imputed_test], axis = 1)

In [8]:
def load_data(train_size):
    '''
    Separa los datos de entrenamiento en train y test
    Arguments:
    train_size -- Porcentaje de los datos que serán asignados a la sección de entrenamiento
    
    Returns:
    X_t -- input para training
    X_e -- input para evaluación
    y_t -- true label para X_t
    y_e -- true label para X_e
    '''
    
    print("\ntraining_data shape: {}".format(training_data.shape))
    print("\ny_train_full.shape: {}\n".format(y_train_full.shape))
    
    #print("\ntraining_data e.g.:\n{}\n".format(training_data.loc[[192]]))
    #print("\ny_train_full e.g.:\n{}\n\n".format(y_train_full.loc[[192]]))
    
    X_t, X_e, y_t, y_e = train_test_split(training_data, y_train_full, train_size = train_size, random_state = 1)
    
    '''
    X_t = np.array(X_t).T
    X_e = np.array(X_e).T
    y_t = np.array(y_t)
    y_e = np.array(y_e)
    
    y_t = np.expand_dims(y_t, axis = 0)
    y_e = np.expand_dims(y_e, axis = 0)
    '''
    
    X_t = X_t.to_numpy().T
    X_e = X_e.to_numpy().T
    y_t = y_t.to_numpy()
    y_e = y_e.to_numpy()
    
    y_t = np.expand_dims(y_t, axis = 0)
    y_e = np.expand_dims(y_e, axis = 0)
    
    return X_t, X_e, y_t, y_e   
    

In [9]:
X_train, X_evaluate, y_train, y_evaluate = load_data(0.80)

index = 192

print("X_train size: {}\n".format(X_train.shape))
print("y_train size: {}\n".format(y_train.shape))
print("X_evaluate size: {}\n".format(X_evaluate.shape))
print("y_evaluate size: {}\n".format(y_evaluate.shape))

print("Example # {}:\n\ndata: {}\nSalePrice: {}".format(index, X_train[:,index], y_train[0, index]))


training_data shape: (1460, 138)

y_train_full.shape: (1460,)

X_train size: (138, 1168)

y_train size: (1, 1168)

X_evaluate size: (138, 292)

y_evaluate size: (1, 292)

Example # 192:

data: [0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00
 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 1.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00
 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+00
 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00
 1.000e+00 0.000e+00 0.000e+00 1.000e+00 0.000e+00 0.000e+00 0.000e+00
 0.000e+00 1.000e+00 0.00

##### Creating the model

In [10]:
#CONSTANTS

layers_dims = [138, 20, 10, 5, 1]

In [19]:
#model

#parameters = Lmodel(X_train, y_train, layers_dims, "Mse", "Relu", learning_rate= 0.002, num_iterations = 2500, print_cost = True)

#Falta arreglar backpropagation