### FFM

In [None]:
import sys
import pandas as pd
import numpy as np
import datetime
import glob
import gc
import os

#========================================================================
# Args
#========================================================================
key = 'card_id'
target = 'target'
ignore_list = [key, target, 'merchant_id', 'first_active_month']

win_path = f'../features/4_winner/*.gz'
fname=''
# submit = pd.read_csv('../input/sample_submission.csv')
submit = []

HOME = os.path.expanduser('~')

sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from preprocessing import get_ordinal_mapping
from utils import logger_func
try:
    if not logger:
        logger=logger_func()
except NameError:
    logger=logger_func()

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

#========================================================================
# Data Load
base = utils.read_df_pkl('../input/base*')
win_path_list = glob.glob(win_path)
train_path_list = []
test_path_list = []
for path in win_path_list:
    if path.count('train'):
        train_path_list.append(path)
    elif path.count('test'):
        test_path_list.append(path)

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
train_feature_list = utils.parallel_load_data(path_list=train_path_list)
test_feature_list = utils.parallel_load_data(path_list=test_path_list)
train = pd.concat(train_feature_list, axis=1)
train = pd.concat([base_train, train], axis=1)
test = pd.concat(test_feature_list, axis=1)
test = pd.concat([base_test, test], axis=1)

y = train[[key, target]]

train.drop(target, axis=1, inplace=True)
test.drop(target, axis=1, inplace=True)

train.fillna(train.median(), inplace=True)
test.fillna(test.median(), inplace=True)

# FFMは最後の列がラベルになる
train.sort_index(axis=1, inplace=True)
test.sort_index(axis=1, inplace=True)

train = train.merge(y, how='inner', on=key)
train.head()

In [16]:
from sklearn.model_selection import StratifiedKFold
seed = 1208

train['outliers'] = train[target].map(lambda x: 1 if x<-30 else 0)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
outliers = train['outliers'].values
train.drop('outliers', axis=1, inplace=True)
kfold = folds.split(train, outliers)

train[target] =  train[target].map(lambda x: 1 if x<-30 else 0)
y = train[target]
test.head(10000).to_csv(f'../ffm_input/ffm_test.csv', index=False, header=False)

use_cols = [col for col in train.columns if col not in ignore_list]

for n_fold, (trn_idx, val_idx) in enumerate(kfold):
    x_train, y_train = train[use_cols].iloc[trn_idx, :].values, y.iloc[trn_idx].values
    x_val, y_val = train[use_cols].iloc[val_idx, :].values, y.iloc[val_idx].values
    
    y_train = y_train.astype('int8')
    y_val = y_val.astype('int8')
    
    tmp = np.hstack((x_train, y_train.reshape(len(y_train), 1)))
    tmp = pd.DataFrame(tmp).head(20000)
    tmp.to_csv(f'../ffm_input/ffm_train_{n_fold}.csv', index=False, header=False)
    del tmp
    gc.collect()
    
    tmp = np.hstack((x_val, y_val.reshape(len(y_val), 1)))
    tmp = pd.DataFrame(tmp).head(5000)
    tmp.to_csv(f'../ffm_input/ffm_val_{n_fold}.csv', index=False, header=False)
    del tmp
    gc.collect()
    sys.exit()
    
train.to_csv('../ffm_input/ffm_train.csv', index=False)
test.to_csv('../ffm_input/ffm_test.csv', index=False)

SystemExit: 

In [18]:
import xlearn as xl

ffm_model = xl.create_ffm()
ffm_model.setTrain("../ffm_input/ffm_train_0.csv")
ffm_model.setValidate("../ffm_input/ffm_val_0.csv")
ffm_model.setTest("../ffm_input/ffm_test.csv")

param = {
#     'task':'binary', # ‘binary’ for classification, ‘reg’ for Regression
    'task':'binary', # ‘binary’ for classification, ‘reg’ for Regression
    'k':2,           # Size of latent factor
    'lr':0.1,        # Learning rate for GD
    'lambda':0.0002, # L2 Regularization Parameter
    'metric':'auc',  # Metric for monitoring validation set performance
#     'metric':'rmse', # Metric for monitoring validation set performance
    'epoch':25       # Maximum number of Epochs
}

ffm_model.fit(param, "../ffm_output/model.out")

ffm_model.predict("../ffm_output/model.out", "../ffm_output/output.txt")