### FFM
FFM用のDatasetを作成し、kerasで実行する

In [2]:
import sys
import pandas as pd
import numpy as np
import datetime
import glob
import gc
import os

#========================================================================
# Notebook-wide settings
#========================================================================
key = 'card_id'
target = 'target'
# NOTE(review): presumably the columns to exclude from the feature set;
# the name is not referenced in the cells visible here - confirm usage.
ignore_list = [
    key,
    target,
    'merchant_id',
    'first_active_month',
]

HOME = os.path.expanduser('~')

# The helper library lives outside this project, so its directory has to be
# on the import path before the modules below can be imported.
sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from preprocessing import get_ordinal_mapping, get_dummies
from utils import logger_func

# Keep a logger that is already defined (and truthy) in this namespace;
# otherwise create a new one.
logger = globals().get('logger') or logger_func()

# Timestamp of this run, formatted as YYYYmmdd_HHMMSS.
start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

2019-01-12 17:25:36,940 utils 366 [INFO]    [logger_func] start 


In [12]:
#========================================================================
# Data Load
# Read the base table and up to 20 train / 20 test feature files, join them
# column-wise and fill missing values with per-column medians.
base = utils.read_df_pkl('../input/base*')
win_path = '../features/4_winner/*.gz'
win_path_list = glob.glob(win_path)

# A path containing 'train' is a train feature (even if it also contains
# 'test'); any other path containing 'test' is a test feature.
train_path_list = [path for path in win_path_list if 'train' in path]
test_path_list = [
    path for path in win_path_list
    if 'train' not in path and 'test' in path
]

# Sort for a deterministic order and keep at most 20 files per split.
train_path_list = sorted(train_path_list)[:20]
test_path_list = sorted(test_path_list)[:20]

# Rows with a known target are training rows; rows without one are test rows.
base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
train_feature_list = utils.parallel_load_data(path_list=train_path_list)
test_feature_list = utils.parallel_load_data(path_list=test_path_list)

# NOTE(review): concat(axis=1) aligns on the index - this assumes every loaded
# feature frame shares the 0..n-1 index and row order of the base split;
# confirm against utils.parallel_load_data.
train = pd.concat(train_feature_list, axis=1)
train = pd.concat([base_train, train], axis=1)
test = pd.concat(test_feature_list, axis=1)
test = pd.concat([base_test, test], axis=1)

# Impute with per-column medians. numeric_only=True skips string columns such
# as card_id, which would otherwise raise TypeError on recent pandas.
train.fillna(train.median(numeric_only=True), inplace=True)
test.fillna(test.median(numeric_only=True), inplace=True)

train.head()
#========================================================================

100%|██████████| 3/3 [00:00<00:00, 36.92it/s]


Unnamed: 0,card_id,target,110_ker_auth_category_2_1_0_mean,110_ker_auth_category_2_2_0_mean,110_ker_auth_category_2_3_0_mean,110_ker_auth_category_2_4_0_mean,110_ker_auth_category_2_5_0_mean,110_ker_auth_category_3_A_mean,110_ker_auth_category_3_B_mean,110_ker_hist_category_2_1_0_mean,...,110_ker_hist_category_2_4_0_mean,110_ker_hist_category_2_5_0_mean,110_ker_hist_category_3_A_mean,110_ker_hist_category_3_B_mean,110_ker_hist_category_3_C_mean,110_ker_new_category_2_1_0_mean,110_ker_new_category_2_2_0_mean,110_ker_new_category_2_3_0_mean,110_ker_new_category_2_4_0_mean,110_ker_new_category_2_5_0_mean
0,C_ID_92a2005557,-0.820283,0.987854,0.0,0.0,0.0,0.012146,1.0,0.0,1.0,...,0.0,0.0,0.692308,0.307692,0.0,1.0,0.0,0.0,0.0,0.0
1,C_ID_3d0044924f,0.392913,1.0,0.0,0.0,0.0,0.0,0.0059,0.80236,1.0,...,0.0,0.0,0.0,0.363636,0.636364,1.0,0.0,0.0,0.0,0.0
2,C_ID_d639edf6cd,0.688056,0.097561,0.0,0.0,0.0,0.902439,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,C_ID_186d6a6901,0.142495,0.311688,0.0,0.0,0.688312,0.0,0.025974,0.883117,1.0,...,0.0,0.0,0.333333,0.318182,0.0,0.142857,0.0,0.0,0.857143,0.0
4,C_ID_cdbd2c0db2,-0.159749,0.171875,0.0,0.0,0.820312,0.007812,0.0,0.96875,0.6,...,0.4,0.0,0.0,0.4,0.6,0.111111,0.0,0.194444,0.694444,0.0


### Data Load & FFM Build

In [14]:
from sklearn.model_selection import train_test_split

# Keep only columns whose dtype name contains 'int' or 'float'.
num_list = [
    col for col in train.columns
    if any(tag in str(train[col].dtype) for tag in ('int', 'float'))
]

train = train[num_list]
# The test frame keeps the same columns minus the target.
num_list.remove(target)
test = test[num_list]

# Random hold-out split: 20% of the training rows become the validation set.
train, valid = train_test_split(train, test_size=0.2)

# Binary label: 1 when the target is below -30, otherwise 0.
tmp_y_train = train[target].map(lambda score: int(score < -30))
tmp_y_valid = valid[target].map(lambda score: int(score < -30))

# The raw target must not remain among the model inputs.
train.drop(columns=[target], inplace=True)
valid.drop(columns=[target], inplace=True)

print(train.shape, valid.shape, test.shape, sep='\n')

(161533, 20)
(40384, 20)
(123623, 20)


In [15]:
# Row count of every split and the number of feature columns.
len_train, len_valid, len_test = len(train), len(valid), len(test)
len_feats = train.shape[1]

# Raw numpy views of the frames / label series.
train_val, valid_val, test_val = train.values, valid.values, test.values
y_train_val, y_valid_val = tmp_y_train.values, tmp_y_valid.values

# Features as (rows, features) matrices, labels as (rows, 1) column vectors.
x_train = train_val.reshape((len_train, len_feats))
y_train = y_train_val.reshape((len_train, 1))
x_valid = valid_val.reshape((len_valid, len_feats))
y_valid = y_valid_val.reshape((len_valid, 1))
x_test = test_val.reshape((len_test, len_feats))

print(x_train.shape, y_train.shape, x_valid.shape, y_valid.shape, sep='\n')

# Split every matrix into a list of 1-D arrays, one list entry per feature
# column.
x_train = list(x_train.T)
x_valid = list(x_valid.T)
x_test = list(x_test.T)

print(len(x_train), len(x_valid), len(x_test), sep='\n')

(161533, 20)
(161533, 1)
(40384, 20)
(40384, 1)
20
20
20


In [1]:
from sklearn.base import BaseEstimator
from keras.layers import Input, Embedding, Dense,Flatten, Activation, dot, add
from keras.models import Model
from keras.regularizers import l2 as l2_reg
from keras import initializers
import itertools


def make_batches(size, batch_size):
    """Return the (start, end) index pairs that split `size` items into batches.

    Each pair is a half-open range of at most `batch_size` items; the last
    pair is clipped to `size`.
    """
    n_batches = int(np.ceil(size / float(batch_size)))
    bounds = []
    for k in range(n_batches):
        upper = min(size, (k + 1) * batch_size)
        bounds.append((k * batch_size, upper))
    return bounds


def batch_generator(X,y,batch_size=128,shuffle=True):
    """Yield (X_batch, y_batch) mini-batches forever.

    `X` is a sequence of arrays that all share their first dimension; `y` is
    indexed with the same row ids. When `shuffle` is true the row order is
    reshuffled at the start of every pass over the data.
    """
    n_rows = X[0].shape[0]
    row_ids = np.arange(n_rows)
    while True:
        if shuffle:
            np.random.shuffle(row_ids)
        for lo, hi in make_batches(n_rows, batch_size):
            picked = row_ids[lo:hi]
            # Take the same rows from every input array.
            yield [X[col][picked] for col in range(len(X))], y[picked]


def test_batch_generator(X,y,batch_size=128):
    """Yield (X_batch, y_batch) mini-batches for one in-order pass over the data.

    `X` is a sequence of arrays that all share their first dimension; `y` is
    indexed with the same row ids. Unlike `batch_generator`, this generator
    stops after the last batch and never shuffles.
    """
    n_rows = X[0].shape[0]
    row_ids = np.arange(n_rows)
    for lo, hi in make_batches(n_rows, batch_size):
        picked = row_ids[lo:hi]
        # Take the same rows from every input array.
        yield [X[col][picked] for col in range(len(X))], y[picked]


def predict_batch(model,X_t,batch_size=128):
    """Run `model.predict` batch by batch and return one flat array.

    `X_t` is a sequence of input arrays sharing their first dimension. The
    per-batch predictions are concatenated in input order and flattened.
    """
    # test_batch_generator needs a target array; an all-zero dummy is passed
    # and ignored.
    dummy_y = np.zeros(X_t[0].shape[0])
    chunks = [
        model.predict(X_batch, batch_size=batch_size)
        for X_batch, _ in test_batch_generator(X_t, dummy_y, batch_size=batch_size)
    ]
    return np.concatenate(chunks).ravel()



def build_model(input_len, max_features, K=8, solver='adam', l2=0.0, l2_fm=0.0):
    """Build and compile a factorization-machine style Keras model.

    Every input column is an integer index that is looked up in two
    embeddings: a K-dimensional latent vector and a 1-dimensional linear
    weight. The model output is the sigmoid of the sum of all pairwise dot
    products of the latent vectors plus all linear weights.

    Args:
        input_len: Not used by the model; kept so existing callers keep
            working.
        max_features: Sequence with one entry per input column, giving the
            `input_dim` (number of distinct indices) of that column's
            embeddings.
        K: Dimension of the latent (interaction) embeddings.
        solver: Optimizer passed to `Model.compile`.
        l2: L2 regularization strength for the linear embeddings.
        l2_fm: L2 regularization strength for the latent embeddings.

    Returns:
        The compiled Keras `Model` with `binary_crossentropy` loss. It takes
        one `(batch, 1)` int32 input per column.

    Raises:
        ValueError: If `max_features` is empty.
    """
    if len(max_features) == 0:
        raise ValueError('max_features must contain at least one entry')

    inputs = []
    latent_vectors = []
    for c, num_c in enumerate(max_features):
        inputs_c = Input(shape=(1,), dtype='int32', name='input_%s' % c)

        embed_c = Embedding(
            input_dim=num_c,   # number of distinct indices of this feature
            output_dim=K,      # dimension of the latent vector
            input_length=1,
            name='embed_%s' % c,
            embeddings_regularizer=l2_reg(l2_fm),
        )(inputs_c)

        inputs.append(inputs_c)
        latent_vectors.append(Flatten()(embed_c))

    # Second-order terms: one dot product per pair of feature columns.
    fm_layers = [
        dot(inputs=[emb1, emb2], axes=1)
        for emb1, emb2 in itertools.combinations(latent_vectors, 2)
    ]

    # First-order terms: one scalar weight per feature index.
    for c, num_c in enumerate(max_features):
        linear_c = Embedding(
            num_c,
            1,
            input_length=1,
            name='linear_%s' % c,
            embeddings_regularizer=l2_reg(l2),
        )(inputs[c])

        fm_layers.append(Flatten()(linear_c))

    # add() requires at least two tensors; with a single feature there is
    # only its linear term, which is used directly.
    summed = add(fm_layers) if len(fm_layers) > 1 else fm_layers[0]
    outputs = Activation('sigmoid', name='outputs')(summed)

    model = Model(inputs=inputs, outputs=outputs)

    model.compile(
        optimizer=solver,
        loss='binary_crossentropy',
    )

    return model


class KerasFM(BaseEstimator):
    """Estimator-style wrapper around the Keras model from `build_model`.

    Inputs `X` are sequences of arrays (one array per model input) that all
    share their first dimension.
    """

    def __init__(self, input_len, max_features=None, K=8, solver='adam', l2=0.0, l2_fm=0.0):
        """Store the hyper-parameters and build the underlying Keras model.

        Args:
            input_len: Forwarded to `build_model`.
            max_features: One embedding `input_dim` per input column;
                `None` is treated as an empty list.
            K: Dimension of the latent embeddings.
            solver: Optimizer passed to `build_model`.
            l2: L2 regularization of the linear terms.
            l2_fm: L2 regularization of the latent embeddings.
        """
        # BaseEstimator.get_params() reads the constructor arguments back
        # from attributes of the same name.
        self.input_len = input_len
        self.max_features = max_features
        self.K = K
        self.solver = solver
        self.l2 = l2
        self.l2_fm = l2_fm
        feature_sizes = [] if max_features is None else max_features
        self.model = build_model(input_len, feature_sizes, K, solver, l2=l2, l2_fm=l2_fm)

    @staticmethod
    def _steps(n_samples, batch_size):
        """Return how many batches of `batch_size` cover `n_samples` rows."""
        return (n_samples + batch_size - 1) // batch_size

    def fit(self, X, y, batch_size=128, nb_epoch=10, shuffle=True, verbose=1, validation_data=None):
        """Train on in-memory data for `nb_epoch` epochs.

        `validation_data`, when given, is an `(X_valid, y_valid)` pair that is
        forwarded to Keras for per-epoch evaluation.
        """
        self.model.fit(
            X,
            y,
            batch_size=batch_size,
            epochs=nb_epoch,
            shuffle=shuffle,
            verbose=verbose,
            validation_data=validation_data,
        )

    def fit_generator(self,X,y,batch_size=128,nb_epoch=10,shuffle=True,verbose=1,validation_data=None,callbacks=None):
        """Train through `batch_generator`, one pass over `X` per epoch.

        `validation_data`, when given, is an `(X_valid, y_valid)` pair that is
        evaluated in order (no shuffling) after every epoch.
        """
        tr_gen = batch_generator(X,y,batch_size=batch_size,shuffle=shuffle)
        if validation_data:
            X_test,y_test = validation_data
            te_gen = batch_generator(X_test,y_test,batch_size=batch_size,shuffle=False)
            validation_steps = self._steps(X_test[-1].shape[0], batch_size)
        else:
            te_gen = None
            validation_steps = None

        # Keras 2 counts batches (steps) per epoch, not samples: passing the
        # sample count would make each epoch batch_size times too long.
        self.model.fit_generator(
                tr_gen,
                steps_per_epoch=self._steps(X[-1].shape[0], batch_size),
                epochs=nb_epoch,
                verbose=verbose,
                callbacks=callbacks,
                validation_data=te_gen,
                validation_steps=validation_steps,
                max_queue_size=10
                )

    def predict(self,X,batch_size=128):
        """Return the model predictions for `X` as a flat array."""
        y_preds = predict_batch(self.model,X,batch_size=batch_size)
        return y_preds
    
# One embedding size per feature column; every entry is the number of
# training rows.
# NOTE(review): the selected columns may hold floats (or negative values)
# while build_model looks each input up as an int32 embedding index -
# confirm the features are encoded as indices in [0, len(train)).
max_features = [len(train)] * len(train.columns)
model = KerasFM(input_len=len(train), max_features=max_features)

NameError: name 'sys' is not defined

In [None]:
# Train on the per-column training arrays, passing the hold-out split as
# validation data, then score the test rows.
model.fit(x_train, y_train, validation_data=(x_valid, y_valid))
pred = model.predict(x_test)