## Red neuronal utilizando Keras

Ahora construiremos una red neuronal utilizando la libreria de Keras para implementar tensorflow.
Utilizaremos la base de datos sin valores faltantes para entrenar el modelo, y mediante validación cruzada probaremos su efectividad.

### Cargando librerias necesarias

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
tf.enable_eager_execution()
import numpy as np # linear algebra
import pandas as pd
import keras
import os
import qgrid
import functools


In [2]:
train = pd.read_csv('data/dataset_tae_final_no_na_mod.csv')
test = pd.read_csv('data/test_tae_no_na_mod.csv')

In [3]:
qqview = qgrid.show_grid(train)
qqview

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

Definimos nuestra variable objetivo y nuestras caracteristicas

In [4]:
column = list(train.columns)
features = column[:-1]
label = column[-1]
#Chequeo de tipos de las variables
train.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
marital_status    object
ocupation         object
ethnicity         object
gender            object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

Vemos que tenemos variables categoricas, debemos transformarlas a variables dummies par apoder crear la red neuronal:

In [5]:
catego_columns = ['education', 'workclass', 'marital_status', 
             'ethnicity', 'income','gender',
                  'native_country', 'ocupation']
#Transformamos variables object a categoricas
for col in catego_columns:
    train[col] = pd.Categorical(train[col])
    test[col] = pd.Categorical(test[col])

#Transformamos nuestra variable de objetivo para la clasificacion
train[label] = train[label].cat.codes
test[label] = test[label].cat.codes

#Creamos variables dummies con las categoricas
train_dataset=pd.get_dummies(train)
test_dataset=pd.get_dummies(test)

Chequeamos que las variables de nuestro conjunto de entrenamiento sean las mismas que las de validación

In [6]:
for elem in list(train_dataset.columns):
    if not((elem in list(test_dataset.columns))):
        print(elem)
#Everything is fine

In [7]:
columns = train_dataset.columns
columns
#target = train_dataset.pop("income")

Index(['age', 'fnlwgt', 'capital_gain', 'capital_loss', 'hours_per_week',
       'income', 'workclass_ Federal-gov', 'workclass_ Local-gov',
       'workclass_ Private', 'workclass_ Self-emp-inc',
       'workclass_ Self-emp-not-inc', 'workclass_ State-gov',
       'workclass_ Without-pay', 'education_ 10th', 'education_ 11th',
       'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th',
       'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm',
       'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate',
       'education_ HS-grad', 'education_ Masters', 'education_ Preschool',
       'education_ Prof-school', 'education_ Some-college',
       'marital_status_ Divorced', 'marital_status_ Never-married',
       'marital_status_ Separated', 'marital_status_ Widowed',
       'marital_status_Married', 'ocupation_ Adm-clerical',
       'ocupation_ Armed-Forces', 'ocupation_ Craft-repair',
       'ocupation_ Exec-managerial', 'ocupation_ Farming-fishi

Finalmente convertimos nuestra base de datos al formato de tensorflow.

In [8]:
list(test_dataset)

['age',
 'fnlwgt',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'income',
 'workclass_ Federal-gov',
 'workclass_ Local-gov',
 'workclass_ Private',
 'workclass_ Self-emp-inc',
 'workclass_ Self-emp-not-inc',
 'workclass_ State-gov',
 'workclass_ Without-pay',
 'education_ 10th',
 'education_ 11th',
 'education_ 12th',
 'education_ 1st-4th',
 'education_ 5th-6th',
 'education_ 7th-8th',
 'education_ 9th',
 'education_ Assoc-acdm',
 'education_ Assoc-voc',
 'education_ Bachelors',
 'education_ Doctorate',
 'education_ HS-grad',
 'education_ Masters',
 'education_ Preschool',
 'education_ Prof-school',
 'education_ Some-college',
 'marital_status_ Divorced',
 'marital_status_ Never-married',
 'marital_status_ Separated',
 'marital_status_ Widowed',
 'marital_status_Married',
 'ocupation_ Adm-clerical',
 'ocupation_ Armed-Forces',
 'ocupation_ Craft-repair',
 'ocupation_ Exec-managerial',
 'ocupation_ Farming-fishing',
 'ocupation_ Handlers-cleaners',
 'ocupation_ Machine-op-insp

In [9]:
train_dataset2 = train_dataset.copy(deep=True)
target = train_dataset2.pop('income')
final_train = tf.data.Dataset.from_tensor_slices((train_dataset2.values, target.values))
final_train = final_train.shuffle(len(train)).batch(64)
type(final_train)

tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter

In [21]:
train_dataset.to_csv('train_dummies.csv')

### Creando la red neuronal

In [17]:
from keras.optimizers import adam
from keras.models import Sequential
def get_compiled_model():
    model = tf.keras.Sequential([
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dense(1, activation='softmax')
    ])
    #opt = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
    

    model.compile(optimizer="SGD",
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [18]:
x_train = train_dataset.copy(deep=True)
x_val = test_dataset.copy(deep=True)
y_train = x_train.pop('income').values
y_val = x_val.pop('income').values

#val_data = tf.data.Dataset.from_tensor_slices((x_test.values, y_test.values))

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
#xtrain = scaler.fit_transform(x_train.values)

model = get_compiled_model()

model.fit(x=x_val.values, y=y_val,epochs = 64,
          batch_size=1,
         shuffle=True)

Epoch 1/64
 3437/15059 [=====>........................] - ETA: 18s - loss: 2.9065e-08 - acc: 0.2438

KeyboardInterrupt: 

In [None]:
# find columns of type int
mask = df.dtypes==int
# select columns for for the same
cols = df.dtypes[mask].index
# select these columns and convert to float
new_cols_df = df[cols].apply(lambda x: x.astype(float), axis=1)
# Replace these columns in original df
df[new_cols_df.columns] = new_cols_df

In [36]:
sgd = keras.optimizers.SGD(lr=0.01, clipvalue=0.5)

In [16]:
x_val = test_dataset.copy(deep=True)

y_pred=model(train_dataset.copy(deep=True).values)
y_pred =(y_pred>0.5)

InvalidArgumentError: cannot compute MatMul as input #0(zero-based) was expected to be a float tensor but is a int64 tensor [Op:MatMul]

## Data cross validation

In [None]:
model.predict()

In [28]:
print("hola")

hola


In [25]:
model = tf.keras.Sequential([
  tf.keras.layers.Dense(10, activation=tf.nn.relu, input_shape=(4,)),  # input shape required
  tf.keras.layers.Dense(10, activation=tf.nn.relu),
  tf.keras.layers.Dense(3)
])
model.fit(final_train, epochs = 20, steps_per_epoch=472)

Epoch 1/20


RuntimeError: You must compile a model before training/testing. Use `model.compile(optimizer, loss)`.

## Entrenando el modelo

In [20]:
def loss(model, x, y):
  y_ = model(x)
  return tf.losses.sparse_softmax_cross_entropy(labels=y, logits=y_)


l = loss(model, features, target)
print("Loss test: {}".format(l))

NameError: name 'model' is not defined

In [50]:
model = tf.keras.Sequential([
  tf.keras.layers.Dense(10, activation=tf.nn.relu, input_shape=(4,)),  # input shape required
  tf.keras.layers.Dense(10, activation=tf.nn.relu),
  tf.keras.layers.Dense(3)
])

['age',
 'workclass',
 'fnlwgt',
 'education',
 'marital_status',
 'ocupation',
 'ethnicity',
 'gender',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country']