### FFM
FFM用のDatasetを作成し、kerasで実行する

In [5]:
import sys
import pandas as pd
import numpy as np
import datetime
import glob
import gc
import os

#========================================================================
# Args
#========================================================================
key = 'card_id'
target = 'target'
ignore_list = [key, target, 'merchant_id', 'first_active_month']

win_path = f'../features/4_winner/*.gz'
fname=''
# submit = pd.read_csv('../input/sample_submission.csv')
submit = []

HOME = os.path.expanduser('~')

sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from preprocessing import get_ordinal_mapping, get_dummies
from utils import logger_func
try:
    if not logger:
        logger=logger_func()
except NameError:
    logger=logger_func()

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

2019-01-12 06:10:03,607 utils 366 [INFO]    [logger_func] start 


In [30]:
#========================================================================
# Data Load
base = utils.read_df_pkl('../input/base*')
win_path_list = glob.glob(win_path)
train_path_list = []
test_path_list = []
for path in win_path_list:
    if path.count('train'):
        train_path_list.append(path)
    elif path.count('test'):
        test_path_list.append(path)

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
train_feature_list = utils.parallel_load_data(path_list=train_path_list)
test_feature_list = utils.parallel_load_data(path_list=test_path_list)
train = pd.concat(train_feature_list, axis=1)
train = pd.concat([base_train, train], axis=1)
test = pd.concat(test_feature_list, axis=1)
test = pd.concat([base_test, test], axis=1)

y = train[[key, target]]
train.drop(target, axis=1, inplace=True)
test.drop(target, axis=1, inplace=True)
train.fillna(train.median(), inplace=True)
test.fillna(test.median(), inplace=True)

# FFMは最後の列がラベルになる
train.sort_index(axis=1, inplace=True)
test.sort_index(axis=1, inplace=True)

train = train.merge(y, how='inner', on=key)
train.head()
#========================================================================


100%|██████████| 3/3 [00:00<00:00, 89.19it/s]


In [1]:
from sklearn.base import BaseEstimator
from keras.layers import Input, Embedding, Dense,Flatten, Activation, dot, add
from keras.models import Model
from keras.regularizers import l2 as l2_reg
from keras import initializers
import itertools
from sklearn.model_selection import train_test_split


def make_batches(size, batch_size):
    nb_batch = int(np.ceil(size/float(batch_size)))
    return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)]


def batch_generator(X,y,batch_size=128,shuffle=True):
    sample_size = X[0].shape[0]
    index_array = np.arange(sample_size)
    while 1:
        if shuffle:
            np.random.shuffle(index_array)
        batches = make_batches(sample_size, batch_size)
        for batch_index, (batch_start, batch_end) in enumerate(batches):
            batch_ids = index_array[batch_start:batch_end]
            X_batch = [X[i][batch_ids] for i in range(len(X))]
            y_batch = y[batch_ids]
            yield X_batch,y_batch


def test_batch_generator(X,y,batch_size=128):
    sample_size = X[0].shape[0]
    index_array = np.arange(sample_size)
    batches = make_batches(sample_size, batch_size)
    for batch_index, (batch_start, batch_end) in enumerate(batches):
        batch_ids = index_array[batch_start:batch_end]
        X_batch = [X[i][batch_ids] for i in range(len(X))]
        y_batch = y[batch_ids]
        yield X_batch,y_batch


def predict_batch(model,X_t,batch_size=128):
    outcome = []
    for X_batch,y_batch in test_batch_generator(X_t,np.zeros(X_t[0].shape[0]),batch_size=batch_size):
        outcome.append(model.predict(X_batch,batch_size=batch_size))
    outcome = np.concatenate(outcome).ravel()
    return outcome



def build_model(input_len, max_features,K=8,solver='adam',l2=0.0,l2_fm = 0.0):

    inputs = []
    flatten_layers=[]
    columns = range(len(max_features))
    for c in columns:
        inputs_c = Input(shape=(1,), dtype='int32',name = 'input_%s'%c)
        num_c = max_features[c]

        embed_c = Embedding(
                        input_dim=num_c, # 埋め込む特徴の次元
                        output_dim=K, # 何次元に埋め込むか
                        input_length=1,
#                         input_length=1,
                        name = 'embed_%s'%c,
                        W_regularizer=l2_reg(l2_fm)
                        )(inputs_c)

              
        flatten_c = Flatten()(embed_c)

        inputs.append(inputs_c)
        flatten_layers.append(flatten_c)

    fm_layers = []

    for emb1,emb2 in itertools.combinations(flatten_layers, 2):
        
#         dot_layer = merge([emb1,emb2], mode='dot', dot_axes=1)
        dot_layer = dot(inputs=[emb1, emb2], axes=1)
        
        fm_layers.append(dot_layer)

        
    for c in columns:
        num_c = max_features[c]
        
        embed_c = Embedding(
                        num_c,
                        1,
                        input_length=1,
#                         input_length=input_len,
                        name = 'linear_%s'%c,
                        W_regularizer=l2_reg(l2)
                        )(inputs[c])

        flatten_c = Flatten()(embed_c)

        fm_layers.append(flatten_c)
        
#     flatten = merge(fm_layers, mode='sum')
    flatten = add(fm_layers) 
    outputs = Activation('sigmoid',name='outputs')(flatten)
    
    model = Model(input=inputs, output=outputs)

    model.compile(
                optimizer=solver,
                loss= 'binary_crossentropy'
              )

    return model


class KerasFM(BaseEstimator):
    def __init__(self, input_len, max_features=[], K=8, solver='adam', l2=0.0, l2_fm=0.0):
        self.model = build_model(input_len, max_features,K,solver,l2=l2,l2_fm = l2_fm)

    def fit(self, X, y, batch_size=128, nb_epoch=10, shuffle=True, verbose=1, validation_data=None):
        self.model.fit(X,y,batch_size=batch_size,nb_epoch=nb_epoch,shuffle=shuffle,verbose=verbose,validation_data=None)

    def fit_generator(self,X,y,batch_size=128,nb_epoch=10,shuffle=True,verbose=1,validation_data=None,callbacks=None):
        tr_gen = batch_generator(X,y,batch_size=batch_size,shuffle=shuffle)
        if validation_data:
            X_test,y_test = validation_data
            te_gen = batch_generator(X_test,y_test,batch_size=batch_size,shuffle=False)
            nb_val_samples = X_test[-1].shape[0]
        else:
            te_gen = None
            nb_val_samples = None

        self.model.fit_generator(
                tr_gen, 
                samples_per_epoch=X[-1].shape[0], 
                nb_epoch=nb_epoch, 
                verbose=verbose, 
                callbacks=callbacks, 
                validation_data=te_gen, 
                nb_val_samples=nb_val_samples, 
                max_q_size=10
                )

    def predict(self,X,batch_size=128):
        y_preds = predict_batch(self.model,X,batch_size=batch_size)
        return y_preds

Using TensorFlow backend.


### Data Load & FFM Build

In [3]:
import pandas as pd
train = pd.read_csv('../../house_prise/input/train.csv')
test = pd.read_csv('../../house_prise/input/test.csv')

num_list = [col for col in train.columns if str(train[col].dtype).count('int') or str(train[col].dtype).count('float') ]

train = train[num_list]
num_list.remove('SalePrice')
test = test[num_list]
train.drop('Id', axis=1, inplace=True)
test.drop('Id', axis=1, inplace=True)

train, valid = train_test_split(train, test_size=0.2)
tmp_y_train = train['SalePrice'].map(lambda x: 1 if x<120000 else 0)
tmp_y_valid = valid['SalePrice'].map(lambda x: 1 if x<120000 else 0)
train.drop('SalePrice', axis=1, inplace=True)
valid.drop('SalePrice', axis=1, inplace=True)
print(train.shape)
print(valid.shape)
print(test.shape)
# sys.exit()

max_features = [len(train[col]) for col in train.columns]
model = KerasFM(input_len=len(train), max_features=max_features)

(1168, 36)
(292, 36)
(1459, 36)




In [5]:
len_train = len(train)
len_valid = len(valid)
len_test = len(test)

train_val = train.values
y_train_val = tmp_y_train.values
valid_val = valid.values
y_valid_val = tmp_y_valid.values
test_val = test.values

x_train = train_val.reshape(len_train, 36)
y_train = y_train_val.reshape(len_train, 1)
x_valid = valid_val.reshape(len_valid, 36)
y_valid = y_valid_val.reshape(len_valid, 1)
x_test = test_val.reshape(len_test, 36)

print(x_train.shape)
print(y_train.shape)
print(x_valid.shape)
print(y_valid.shape)
# (1168, 36)
# (1168, 1)
# (292, 36)
# (292, 1)

x_train = [i for i in x_train.T]
x_valid = [i for i in x_valid.T]
x_test = [i for i in x_test.T]

print(len(x_train))
print(len(x_valid))
print(len(x_test))
# 36
# 36
# 36

model.fit(X=x_train, y=y_train, validation_data=(x_valid, y_valid))
pred = model.predict(X=x_test)

(1168, 36)
(1168, 1)
(292, 36)
(292, 1)
36
36
36
Epoch 1/10
 128/1168 [==>...........................] - ETA: 0s - loss: 2.3788



Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

model = Sequential()

model.add(Embedding(1, 8, input_length=36))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer="rmsprop", loss='binary_crossentropy', metrics=['acc'])
model.summary()

histotry = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 36, 8)             8         
_________________________________________________________________
flatten_74 (Flatten)         (None, 288)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 289       
Total params: 297
Trainable params: 297
Non-trainable params: 0
_________________________________________________________________


ValueError: Error when checking model input: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 36 arrays: [array([[50.],
       [20.],
       [20.],
       ...,
       [20.],
       [50.],
       [20.]]), array([[51.],
       [80.],
       [75.],
       ...,
       [40.],
       [60.],
       [98.]]), arr...

In [23]:
y_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])