In [106]:
# https://machinelearningmastery.com/how-to-prepare-categorical-data-for-deep-learning-in-python/

In [108]:
# 유일한 "올바른" 접근 방식은 다른 접근 방식보다 더 나은 성능을 발휘하도록 시도하는 접근 방식입니다.
# 다양한 접근 방식의 프로토타입을 만들고 특정 데이터와 모델 선택에 가장 적합한 것이 무엇인지 알아보는 것이 좋습니다.

In [109]:
# 먼저 Ordinalencoding과 Labelencoding을 사용하여 데이터를 변환한 다음 훈련 세트와 테스트 세트로 분할할 수 있습니까? 미리 감사드립니다
# 예. 그러나 이러한 종류의 변환은 데이터가 유출되지 않도록 주의해야 합니다. 
# 이는 출력을 입력의 일부로 만들지 않고, 전처리 파이프라인을 생성하기 위해 테스트와 훈련 세트를 혼합하지 않는다는 것을 의미합니다. 출력의 힌트가 입력으로 *누출*되어서는 안 됩니다.

# 범주형 -> 순서형 임베딩

In [1]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from keras.models import Sequential
from keras.layers import Dense

In [83]:
# load the dataset
def load_dataset(filename):
	# load the dataset as a pandas DataFrame
	data = read_csv(filename, header=None)
	# retrieve numpy array
	dataset = data.values
	# split into input (X) and output (y) variables
	X = dataset[:, :-1]
	y = dataset[:,-1]
	# format all fields as string
	X = X.astype(str)
	# reshape target to be a 2d array
	y = y.reshape((len(y), 1))
	return X, y

# prepare input data
def prepare_inputs(X_train, X_test):
	oe = OrdinalEncoder()
	oe.fit(X_train)
	X_train_enc = oe.transform(X_train)
	X_test_enc = oe.transform(X_test)
	return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

In [84]:
# load the dataset
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# define the  model
model = Sequential()
model.add(Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/100


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


12/12 - 0s - loss: 0.8367 - accuracy: 0.6597 - 374ms/epoch - 31ms/step
Epoch 2/100
12/12 - 0s - loss: 0.8089 - accuracy: 0.6387 - 9ms/epoch - 750us/step
Epoch 3/100
12/12 - 0s - loss: 0.7894 - accuracy: 0.6387 - 10ms/epoch - 833us/step
Epoch 4/100
12/12 - 0s - loss: 0.7727 - accuracy: 0.6387 - 8ms/epoch - 667us/step
Epoch 5/100
12/12 - 0s - loss: 0.7588 - accuracy: 0.6440 - 9ms/epoch - 750us/step
Epoch 6/100
12/12 - 0s - loss: 0.7467 - accuracy: 0.6387 - 9ms/epoch - 750us/step
Epoch 7/100
12/12 - 0s - loss: 0.7333 - accuracy: 0.6597 - 8ms/epoch - 667us/step
Epoch 8/100
12/12 - 0s - loss: 0.7207 - accuracy: 0.6649 - 9ms/epoch - 750us/step
Epoch 9/100
12/12 - 0s - loss: 0.7095 - accuracy: 0.6754 - 9ms/epoch - 750us/step
Epoch 10/100
12/12 - 0s - loss: 0.6988 - accuracy: 0.6859 - 9ms/epoch - 750us/step
Epoch 11/100
12/12 - 0s - loss: 0.6877 - accuracy: 0.6859 - 9ms/epoch - 750us/step
Epoch 12/100
12/12 - 0s - loss: 0.6785 - accuracy: 0.7016 - 9ms/epoch - 750us/step
Epoch 13/100
12/12 - 0s

In [10]:
X_train[0]

array(["'50-59'", "'ge40'", "'25-29'", "'0-2'", "'no'", "'1'", "'left'",
       "'right_low'", "'no'"], dtype='<U11')

In [87]:
X_train_enc[0]

array([3., 0., 4., 0., 0., 0., 0., 3., 0.])

# 범주형 -> 원핫인코딩

In [1]:
# example of one hot encoding for a neural network
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense

In [2]:
# load the dataset
def load_dataset(filename):
	# load the dataset as a pandas DataFrame
	data = read_csv(filename, header=None)
	# retrieve numpy array
	dataset = data.values
	# split into input (X) and output (y) variables
	X = dataset[:, :-1]
	y = dataset[:,-1]
	# format all fields as string
	X = X.astype(str)
	# reshape target to be a 2d array
	y = y.reshape((len(y), 1))
	return X, y

# prepare input data
def prepare_inputs(X_train, X_test):
	ohe = OneHotEncoder()
	ohe.fit(X_train)
	X_train_enc = ohe.transform(X_train)
	X_test_enc = ohe.transform(X_test)
	return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

In [4]:
# load the dataset
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# define the  model
model = Sequential()
model.add(Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/100


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


12/12 - 0s - loss: 0.7224 - accuracy: 0.4764 - 307ms/epoch - 26ms/step
Epoch 2/100
12/12 - 0s - loss: 0.6772 - accuracy: 0.5445 - 11ms/epoch - 917us/step
Epoch 3/100
12/12 - 0s - loss: 0.6419 - accuracy: 0.6126 - 8ms/epoch - 667us/step
Epoch 4/100
12/12 - 0s - loss: 0.6194 - accuracy: 0.6545 - 9ms/epoch - 750us/step
Epoch 5/100
12/12 - 0s - loss: 0.6000 - accuracy: 0.7068 - 9ms/epoch - 750us/step
Epoch 6/100
12/12 - 0s - loss: 0.5891 - accuracy: 0.7330 - 9ms/epoch - 750us/step
Epoch 7/100
12/12 - 0s - loss: 0.5780 - accuracy: 0.7382 - 8ms/epoch - 667us/step
Epoch 8/100
12/12 - 0s - loss: 0.5708 - accuracy: 0.7330 - 8ms/epoch - 667us/step
Epoch 9/100
12/12 - 0s - loss: 0.5642 - accuracy: 0.7330 - 10ms/epoch - 833us/step
Epoch 10/100
12/12 - 0s - loss: 0.5583 - accuracy: 0.7330 - 8ms/epoch - 667us/step
Epoch 11/100
12/12 - 0s - loss: 0.5531 - accuracy: 0.7330 - 9ms/epoch - 750us/step
Epoch 12/100
12/12 - 0s - loss: 0.5487 - accuracy: 0.7382 - 8ms/epoch - 667us/step
Epoch 13/100
12/12 - 0

In [5]:
X_train

array([["'50-59'", "'ge40'", "'25-29'", ..., "'left'", "'right_low'",
        "'no'"],
       ["'30-39'", "'premeno'", "'5-9'", ..., "'left'", "'right_low'",
        "'no'"],
       ["'50-59'", "'premeno'", "'50-54'", ..., "'right'", "'left_up'",
        "'yes'"],
       ...,
       ["'60-69'", "'ge40'", "'10-14'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'60-69'", "'ge40'", "'40-44'", ..., "'right'", "'left_low'",
        "'no'"],
       ["'60-69'", "'ge40'", "'45-49'", ..., "'left'", "'central'",
        "'no'"]], dtype='<U11')

In [103]:
# ohe = OneHotEncoder()
# ohe.fit(X_train)

In [104]:
X_train[0]

array(["'50-59'", "'ge40'", "'25-29'", "'0-2'", "'no'", "'1'", "'left'",
       "'right_low'", "'no'"], dtype='<U11')

In [78]:
ohe.get_feature_names_out()

array(["x0_'20-29'", "x0_'30-39'", "x0_'40-49'", "x0_'50-59'",
       "x0_'60-69'", "x0_'70-79'", "x1_'ge40'", "x1_'lt40'",
       "x1_'premeno'", "x2_'0-4'", "x2_'10-14'", "x2_'15-19'",
       "x2_'20-24'", "x2_'25-29'", "x2_'30-34'", "x2_'35-39'",
       "x2_'40-44'", "x2_'45-49'", "x2_'5-9'", "x2_'50-54'", "x3_'0-2'",
       "x3_'12-14'", "x3_'15-17'", "x3_'24-26'", "x3_'3-5'", "x3_'6-8'",
       "x3_'9-11'", "x4_'no'", "x4_'yes'", 'x4_nan', "x5_'1'", "x5_'2'",
       "x5_'3'", "x6_'left'", "x6_'right'", "x7_'central'",
       "x7_'left_low'", "x7_'left_up'", "x7_'right_low'", "x7_'right_up'",
       'x7_nan', "x8_'no'", "x8_'yes'"], dtype=object)

In [79]:
print(X_train_enc[0].toarray())

[[0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0.]]


In [81]:
print(X_train_enc[0])

  (0, 3)	1.0
  (0, 6)	1.0
  (0, 13)	1.0
  (0, 20)	1.0
  (0, 27)	1.0
  (0, 30)	1.0
  (0, 33)	1.0
  (0, 38)	1.0
  (0, 41)	1.0


In [105]:
ohe.inverse_transform(X_train_enc[0])

array([["'50-59'", "'ge40'", "'25-29'", "'0-2'", "'no'", "'1'", "'left'",
        "'right_low'", "'no'"]], dtype='<U11')

# 범주형 단어 임베딩

In [65]:
# example of learned embedding encoding for a neural network
from numpy import unique
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Embedding
from tensorflow.keras.layers import concatenate
from keras.utils import plot_model

In [93]:
# load the dataset
def load_dataset(filename):
	# load the dataset as a pandas DataFrame
	data = read_csv(filename, header=None)
	# retrieve numpy array
	dataset = data.values
	# split into input (X) and output (y) variables
	X = dataset[:, :-1]
	y = dataset[:,-1]
	# format all fields as string
	X = X.astype(str)
	# reshape target to be a 2d array
	y = y.reshape((len(y), 1))
	return X, y

# prepare input data
def prepare_inputs(X_train, X_test):
	X_train_enc, X_test_enc = list(), list()
	# label encode each column
	for i in range(X_train.shape[1]):
		le = LabelEncoder()
		le.fit(X_train[:, i])
		# encode
		train_enc = le.transform(X_train[:, i])
		test_enc = le.transform(X_test[:, i])
		# store
		X_train_enc.append(train_enc)
		X_test_enc.append(test_enc)
	return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train, y_test):
	le = LabelEncoder()
	le.fit(y_train)
	y_train_enc = le.transform(y_train)
	y_test_enc = le.transform(y_test)
	return y_train_enc, y_test_enc

In [94]:
# load the dataset
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# make output 3d
y_train_enc = y_train_enc.reshape((len(y_train_enc), 1, 1))
y_test_enc = y_test_enc.reshape((len(y_test_enc), 1, 1))
# prepare each input head
in_layers = list()
em_layers = list()
for i in range(len(X_train_enc)):
	# calculate the number of unique inputs
	n_labels = len(unique(X_train_enc[i]))
	# define input layer
	in_layer = Input(shape=(1,))
	# define embedding layer
	em_layer = Embedding(n_labels, 10)(in_layer)
	# store layers
	in_layers.append(in_layer)
	em_layers.append(em_layer)
# concat all embeddings
merge = concatenate(em_layers)
dense = Dense(10, activation='relu', kernel_initializer='he_normal')(merge)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=in_layers, outputs=output)
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# plot graph
plot_model(model, show_shapes=True, to_file='embeddings.png')
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


Epoch 1/20
12/12 - 1s - loss: 0.6817 - accuracy: 0.6806 - 1s/epoch - 91ms/step
Epoch 2/20
12/12 - 0s - loss: 0.6568 - accuracy: 0.7277 - 12ms/epoch - 1000us/step
Epoch 3/20
12/12 - 0s - loss: 0.6263 - accuracy: 0.7277 - 12ms/epoch - 1ms/step
Epoch 4/20
12/12 - 0s - loss: 0.6008 - accuracy: 0.7277 - 12ms/epoch - 1ms/step
Epoch 5/20
12/12 - 0s - loss: 0.5801 - accuracy: 0.7277 - 12ms/epoch - 1ms/step
Epoch 6/20
12/12 - 0s - loss: 0.5601 - accuracy: 0.7277 - 12ms/epoch - 1000us/step
Epoch 7/20
12/12 - 0s - loss: 0.5473 - accuracy: 0.7277 - 13ms/epoch - 1ms/step
Epoch 8/20
12/12 - 0s - loss: 0.5363 - accuracy: 0.7277 - 12ms/epoch - 1ms/step
Epoch 9/20
12/12 - 0s - loss: 0.5268 - accuracy: 0.7330 - 12ms/epoch - 1ms/step
Epoch 10/20
12/12 - 0s - loss: 0.5168 - accuracy: 0.7592 - 11ms/epoch - 917us/step
Epoch 11/20
12/12 - 0s - loss: 0.5087 - accuracy: 0.7644 - 11ms/epoch - 917us/step
Epoch 12/20
12/12 - 0s - loss: 0.5026 - accuracy: 0.7749 - 11ms/epoch - 917us/step
Epoch 13/20
12/12 - 0s - l

In [97]:
X_train[0]

array(["'50-59'", "'ge40'", "'25-29'", "'0-2'", "'no'", "'1'", "'left'",
       "'right_low'", "'no'"], dtype='<U11')

In [98]:
X_train_enc[0]

array([3, 1, 3, 1, 2, 2, 3, 3, 3, 1, 2, 2, 2, 2, 4, 2, 1, 4, 1, 4, 4, 3,
       2, 3, 2, 4, 3, 2, 4, 2, 1, 2, 1, 4, 3, 1, 5, 3, 2, 2, 3, 3, 3, 2,
       4, 3, 1, 4, 4, 5, 3, 2, 3, 3, 4, 2, 2, 3, 2, 2, 2, 4, 1, 2, 3, 3,
       2, 2, 3, 3, 4, 2, 2, 3, 2, 1, 2, 1, 4, 2, 3, 1, 3, 3, 3, 4, 2, 4,
       3, 2, 3, 3, 3, 3, 5, 4, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 3, 4, 3, 1,
       3, 4, 4, 1, 3, 2, 3, 4, 3, 3, 4, 4, 3, 3, 4, 2, 3, 2, 4, 1, 2, 3,
       2, 1, 2, 1, 4, 4, 4, 3, 3, 2, 1, 2, 2, 3, 2, 3, 2, 3, 3, 3, 4, 1,
       1, 2, 3, 3, 3, 2, 0, 2, 4, 3, 4, 2, 4, 2, 3, 3, 2, 2, 2, 5, 5, 3,
       5, 1, 2, 2, 2, 2, 4, 4, 3, 2, 2, 3, 4, 4, 4])

In [99]:
X_train_enc[1]

array([0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 2,
       0, 2, 2, 0, 0, 2, 1, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2,
       0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0,
       2, 2, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 0, 2, 0,
       1, 2, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 2, 2,
       1, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 0, 2, 0, 0,
       2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 0, 2,
       2, 2, 0, 0, 0, 2, 2, 2, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 2, 0, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 2, 2, 2, 0, 0, 0])