In [7]:
# example of one hot encoding for a neural network
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense

# load the dataset
def load_dataset(filename):
	# load the dataset as a pandas DataFrame
	data = read_csv(filename, header=None)
	print(data.info())
	# retrieve numpy array
	dataset = data.values
	# split into input (X) and output (y) variables
	X = dataset[:, :-1]
	y = dataset[:,-1]
	# format all fields as string
	X = X.astype(str)
	# reshape target to be a 2d array
	y = y.reshape((len(y), 1))
	return X, y

# prepare input data
def prepare_inputs(X_train, X_test):
	ohe = OneHotEncoder()
	ohe.fit(X_train)
	X_train_enc = ohe.transform(X_train)
	X_test_enc = ohe.transform(X_test)
	return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

# load the dataset
X, y = load_dataset('breast-cancer.csv')
print(X.shape, y.shape)
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
print(X_train_enc.shape, X_test_enc.shape)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
print(y_train_enc.shape, y_test_enc.shape)
# define the  model
model = Sequential()
model.add(Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       286 non-null    object
 1   1       286 non-null    object
 2   2       286 non-null    object
 3   3       286 non-null    object
 4   4       278 non-null    object
 5   5       286 non-null    object
 6   6       286 non-null    object
 7   7       285 non-null    object
 8   8       286 non-null    object
 9   9       286 non-null    object
dtypes: object(10)
memory usage: 22.5+ KB
None
(286, 9) (286, 1)
(191, 9) (95, 9) (191, 1) (95, 1)
(191, 43) (95, 43)
(191,) (95,)
Epoch 1/100


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


12/12 - 1s - loss: 0.6784 - accuracy: 0.5393 - 653ms/epoch - 54ms/step
Epoch 2/100
12/12 - 0s - loss: 0.6442 - accuracy: 0.6649 - 31ms/epoch - 3ms/step
Epoch 3/100
12/12 - 0s - loss: 0.6141 - accuracy: 0.7225 - 17ms/epoch - 1ms/step
Epoch 4/100
12/12 - 0s - loss: 0.5926 - accuracy: 0.7330 - 20ms/epoch - 2ms/step
Epoch 5/100
12/12 - 0s - loss: 0.5736 - accuracy: 0.7277 - 18ms/epoch - 2ms/step
Epoch 6/100
12/12 - 0s - loss: 0.5614 - accuracy: 0.7277 - 27ms/epoch - 2ms/step
Epoch 7/100
12/12 - 0s - loss: 0.5504 - accuracy: 0.7330 - 21ms/epoch - 2ms/step
Epoch 8/100
12/12 - 0s - loss: 0.5414 - accuracy: 0.7330 - 21ms/epoch - 2ms/step
Epoch 9/100
12/12 - 0s - loss: 0.5344 - accuracy: 0.7330 - 23ms/epoch - 2ms/step
Epoch 10/100
12/12 - 0s - loss: 0.5277 - accuracy: 0.7330 - 27ms/epoch - 2ms/step
Epoch 11/100
12/12 - 0s - loss: 0.5220 - accuracy: 0.7330 - 23ms/epoch - 2ms/step
Epoch 12/100
12/12 - 0s - loss: 0.5164 - accuracy: 0.7330 - 24ms/epoch - 2ms/step
Epoch 13/100
12/12 - 0s - loss: 0.5