
Commit

fix entity embedding error
AxeldeRomblay committed Aug 2, 2017
1 parent 682f09c commit e2d47dc
Showing 1 changed file with 159 additions and 153 deletions.
312 changes: 159 additions & 153 deletions python-package/mlbox/encoding/categorical_encoder.py
@@ -91,220 +91,226 @@ def fit(self, df_train, y_train):
self.__Lnum = df_train.dtypes[df_train.dtypes != 'object'].index

if (len(self.__Lcat) == 0):
    pass

else:

    #################################################
    # Label Encoding
    #################################################

    if (self.strategy == 'label_encoding'):

        for col in self.__Lcat:

            d = dict()
            levels = list(df_train[col].unique())
            nan = False

            if np.NaN in levels:
                nan = True
                levels.remove(np.NaN)

            for enc, level in enumerate([np.NaN]*nan + sorted(levels)):
                d[level] = enc  # TODO: Optimize loop?

            self.__Enc[col] = d

        self.__fitOK = True
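A quick illustration of the mapping this branch builds: `[np.NaN]*nan` multiplies a list by a boolean, so a NaN slot is prepended only when the column actually had missing values, and NaN is then always encoded as 0. A minimal standalone sketch of the loop above on a toy Series:

    import numpy as np
    import pandas as pd

    col = pd.Series(['b', 'a', np.NaN, 'c', 'a'])

    d = dict()
    levels = list(col.unique())   # membership test and remove() match np.NaN by identity
    nan = False

    if np.NaN in levels:
        nan = True
        levels.remove(np.NaN)

    for enc, level in enumerate([np.NaN] * nan + sorted(levels)):
        d[level] = enc

    print(d)  # {nan: 0, 'a': 1, 'b': 2, 'c': 3}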

    #################################################
    # Dummification
    #################################################

    elif (self.strategy == 'dummification'):

        for col in self.__Lcat:
            # TODO: Optimize?
            self.__Enc[col] = list(df_train[col].dropna().unique())

        self.__fitOK = True
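For dummification, fit only records the non-null levels seen at train time; the transform side is outside this hunk, so the following is an assumption about how such a stored level list is typically applied (one indicator column per stored level, unseen levels mapping to all zeros), not code from this file:

    import pandas as pd

    levels = ['a', 'b']                  # what fit() stored for one column
    col = pd.Series(['b', 'c', 'a'])     # 'c' was never seen at fit time

    # One dummy column per stored level; the 'c' row is all zeros
    dummies = pd.DataFrame({lvl: (col == lvl).astype(int) for lvl in levels})
    print(dummies)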

    #################################################
    # Entity Embedding
    #################################################

    elif (self.strategy == 'entity_embedding'):

        # Parameters
        A = 10  # 15 : more complex
        B = 5  # 2 or 3 : more complex

        # computing interactions
        self.__K = {}
        for col in self.__Lcat:
            exp_ = np.exp(-df_train[col].nunique() * 0.05)
            self.__K[col] = np.int(5 * (1 - exp_) + 1)

        sum_ = sum([1. * np.log(k) for k in self.__K.values()])
        # TODO: Add reference for this formula?

        # Number of neurons for layer 1 and 2
        n_layer1 = min(1000,
                       int(A * (len(self.__K) ** 0.5) * sum_ + 1))
        n_layer2 = int(n_layer1 / B) + 2

        # Dropouts
        dropout1 = 0.1
        dropout2 = 0.1

        # Learning parameters
        epochs = 20  # 25 : more iterations
        batch_size = 128  # 256 : gradient more stable

        # Creating the neural network

        embeddings = []
        inputs = []

        for col in self.__Lcat:

            d = dict()
            levels = list(df_train[col].unique())
            nan = False

            if np.NaN in levels:
                nan = True
                levels.remove(np.NaN)

            for enc, level in enumerate([np.NaN]*nan + sorted(levels)):
                d[level] = enc  # TODO: Optimize loop?

            self.__Enc[col] = d

            var = Input(shape=(1,))
            inputs.append(var)

            emb = Embedding(input_dim=len(self.__Enc[col]),
                            output_dim=self.__K[col],
                            input_length=1)(var)
            emb = Reshape(target_shape=(self.__K[col],))(emb)

            embeddings.append(emb)

        if (len(self.__Lcat) > 1):
            emb_layer = concatenate(embeddings)
        else:
            emb_layer = embeddings[0]

        lay1 = Dense(n_layer1,
                     kernel_initializer='uniform',
                     activation='relu')(emb_layer)
        lay1 = Dropout(dropout1)(lay1)

        lay2 = Dense(n_layer2,
                     kernel_initializer='uniform',
                     activation='relu')(lay1)
        lay2 = Dropout(dropout2)(lay2)

        # Learning the weights

        if ((y_train.dtype == object) | (y_train.dtype == 'int')):

            # Classification
            if (y_train.nunique() == 2):

                outputs = Dense(1,
                                kernel_initializer='normal',
                                activation='sigmoid')(lay2)

                model = Model(inputs=inputs, outputs=outputs)
                model.compile(loss='binary_crossentropy',
                              optimizer='adam')
                model.fit(
                    [df_train[col].apply(lambda x: self.__Enc[col][x]).values
                     for col in self.__Lcat],
                    pd.get_dummies(y_train,
                                   drop_first=True).astype(int).values,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=int(self.verbose)
                )

            else:

                outputs = Dense(y_train.nunique(),
                                kernel_initializer='normal',
                                activation='sigmoid')(lay2)

                model = Model(inputs=inputs, outputs=outputs)
                model.compile(loss='binary_crossentropy',
                              optimizer='adam')
                model.fit(
                    [df_train[col].apply(lambda x: self.__Enc[col][x]).values
                     for col in self.__Lcat],
                    pd.get_dummies(y_train,
                                   drop_first=False).astype(int).values,
                    epochs=epochs,
                    batch_size=batch_size,
                    verbose=int(self.verbose)
                )

        else:

            # Regression
            outputs = Dense(1, kernel_initializer='normal')(lay2)
            model = Model(inputs=inputs, outputs=outputs)
            model.compile(loss='mean_squared_error', optimizer='adam')
            model.fit(
                [df_train[col].apply(lambda x: self.__Enc[col][x]).values
                 for col in self.__Lcat],
                y_train.values,
                epochs=epochs,
                batch_size=batch_size,
                verbose=int(self.verbose)
            )

        self.__weights = model.get_weights()

        self.__fitOK = True
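The reordering above is the actual fix: `self.__K` is now filled for every categorical column before `sum_` and the layer widths are derived from it, whereas the previous revision computed `sum_` from a `self.__K` that had not yet been filled. A numeric check of the two sizing heuristics, using only the formulas in the hunk (note that `np.int` in the 2017 code is just the built-in int; the alias was removed in NumPy 1.24):

    import numpy as np

    # Embedding size per column: K = int(5 * (1 - exp(-0.05 * nunique)) + 1)
    for n in [2, 10, 50, 500]:
        print(n, int(5 * (1 - np.exp(-n * 0.05)) + 1))
    # 2 -> 1, 10 -> 2, 50 -> 5, 500 -> 5  (the heuristic saturates just under 6)

    # Hidden-layer widths for, say, three columns with K = 2, 5, 5:
    A, B = 10, 5
    Ks = [2, 5, 5]
    sum_ = sum(1. * np.log(k) for k in Ks)
    n_layer1 = min(1000, int(A * (len(Ks) ** 0.5) * sum_ + 1))   # 68
    n_layer2 = int(n_layer1 / B) + 2                             # 15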

    #################################################
    # Random Projection
    #################################################

    elif(self.strategy == 'random_projection'):

        for col in self.__Lcat:

            exp_ = np.exp(-df_train[col].nunique() * 0.05)
            # TODO: Add reference to formula used here below?
            self.__K[col] = np.int(5 * (1 - exp_)) + 1

            d = dict()
            levels = list(df_train[col].unique())
            nan = False

            if np.NaN in levels:
                nan = True
                levels.remove(np.NaN)

            for k in range(self.__K[col]):

                if (k == 0):
                    levels = sorted(levels)

                else:
                    np.random.seed(k)
                    np.random.shuffle(levels)

                for enc, level in enumerate([np.NaN] * nan + levels):
                    if(k == 0):
                        d[level] = [enc]
                    else:
                        d[level] = d[level] + [enc]

            self.__Enc[col] = d

        self.__fitOK = True
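Each level therefore ends up with a list of `self.__K[col]` integer codes: component 0 comes from the sorted order, and every later component from a shuffle seeded with its index k, so the projection is deterministic across fits. A toy run of the same loop without the NaN bookkeeping:

    import numpy as np

    levels = ['a', 'b', 'c', 'd']
    K = 3
    d = {}

    for k in range(K):
        if k == 0:
            levels = sorted(levels)
        else:
            np.random.seed(k)        # same seed -> same shuffle on every run
            np.random.shuffle(levels)
        for enc, level in enumerate(levels):
            d.setdefault(level, []).append(enc)

    print(d)  # e.g. {'a': [0, ...], 'b': [1, ...], ...} -- one length-K code per level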

    else:

        raise ValueError("Strategy for categorical encoding is not valid")

return self
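For context, a usage sketch of the encoder this file defines, assuming (as elsewhere in the MLBox package, since neither appears in this hunk) that the class is `Categorical_encoder` with a scikit-learn-style `fit`/`transform` pair, running against a Keras/NumPy environment contemporary with the 2017 code:

    import pandas as pd
    from mlbox.encoding.categorical_encoder import Categorical_encoder

    df_train = pd.DataFrame({'city': ['paris', 'lyon', None, 'paris'],
                             'age': [23, 45, 31, 52]})
    y_train = pd.Series([1, 0, 0, 1])

    ce = Categorical_encoder(strategy='entity_embedding', verbose=False)
    ce.fit(df_train, y_train)        # trains the small embedding network above
    df_enc = ce.transform(df_train)  # categorical columns replaced by learned embeddings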

