In [2]:
import sys
import pandas as pd
import numpy as np
import datetime
import glob
import gc
import os

#========================================================================
# Args
#========================================================================
key = 'card_id'
target = 'target'
ignore_list = [key, target, 'merchant_id', 'first_active_month']

win_path = f'../features/4_winner/*.gz'
fname=''
# submit = pd.read_csv('../input/sample_submission.csv')
submit = []

HOME = os.path.expanduser('~')

sys.path.append(f"{HOME}/kaggle/data_analysis/library/")
import utils
from preprocessing import get_ordinal_mapping
from utils import logger_func
try:
    if not logger:
        logger=logger_func()
except NameError:
    logger=logger_func()

start_time = "{0:%Y%m%d_%H%M%S}".format(datetime.datetime.now())

2018-12-31 00:18:54,125 utils 366 [INFO]    [logger_func] start 


In [None]:
#========================================================================
# Data Load
base = utils.read_df_pkl('../input/base*')
win_path_list = glob.glob(win_path)
train_path_list = []
test_path_list = []
for path in win_path_list:
    if path.count('train'):
        train_path_list.append(path)
    elif path.count('test'):
        test_path_list.append(path)

base_train = base[~base[target].isnull()].reset_index(drop=True)
base_test = base[base[target].isnull()].reset_index(drop=True)
train_feature_list = utils.parallel_load_data(path_list=train_path_list)
test_feature_list = utils.parallel_load_data(path_list=test_path_list)
train = pd.concat(train_feature_list, axis=1)
train = pd.concat([base_train, train], axis=1)
test = pd.concat(test_feature_list, axis=1)
test = pd.concat([base_test, test], axis=1)

train_id = train[key].values
test_id = test[key].values

y = train[[key, target]]
# y[target] = y[target].map(lambda x: 1 if x<-30 else 0)
train.drop(target, axis=1, inplace=True)

for col in train.columns:
    if len(train[train[col].isnull()])==0 or len(test[test[col].isnull()])==0:
        continue
        
    imp_train = train[col].median()
    imp_test = test[col].median()
    
    train[col].fillna(imp_train, inplace=True)
    test[col].fillna(imp_test, inplace=True)

print(train.shape)
# FFMは最後の列がラベルになる
train.sort_index(axis=1, inplace=True)
test.sort_index(axis=1, inplace=True)
train = train.merge(y, how='inner', on=key)

from sklearn.model_selection import StratifiedKFold
seed = 1208

train['outliers'] = train[target].map(lambda x: 1 if x<-30 else 0)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
outliers = train['outliers'].values
train.drop('outliers', axis=1, inplace=True)
kfold = folds.split(train, outliers)

train[target] =  train[target].map(lambda x: 1 if x<-30 else 0)
y = train[target]

use_cols = [col for col in train.columns if col not in ignore_list]

for n_fold, (trn_idx, val_idx) in enumerate(kfold):
    x_train, y_train = train[use_cols].iloc[trn_idx, :].values, y.iloc[trn_idx].values
    x_val, y_val = train[use_cols].iloc[val_idx, :].values, y.iloc[val_idx].values
    
    y_train = y_train.astype('int8')
    y_val = y_val.astype('int8')
    
    tmp = np.hstack((x_train, y_train.reshape(len(y_train), 1)))
    tmp = pd.DataFrame(tmp)
    tmp.to_csv(f'../input/ffm_train_{n_fold}.csv', index=False, header=False)
    del tmp
    gc.collect()
    
    tmp = np.hstack((x_val, y_val.reshape(len(y_val), 1)))
    tmp = pd.DataFrame(tmp)
    tmp.to_csv(f'../input/ffm_val_{n_fold}.csv', index=False, header=False)
    del tmp
    gc.collect()
    sys.exit()
    
train.to_csv('../input/ffm_train.csv', index=False)
test.to_csv('../input/ffm_test.csv', index=False)

In [4]:
import xlearn as xl

ffm_model = xl.FFMModel(task='binary', 
                        lr=0.2, 
                        epoch=10, 
                        reg_lambda=0.02,
                        metric='rmse')
# Start to train
# Directly use string to specify data source
ffm_model.fit('../input/ffm_train_0.csv', 
              eval_set='../input/ffm_val_0.csv'
             )

# print model weights
print(ffm_model.weights)

# Generate predictions
y_pred = ffm_model.predict('../input/ffm_test.csv')
sys.exit()


# Training task
ffm_model = xl.create_ffm()  # Use field-aware factorization machine
ffm_model.setTrain("../input/ffm_train.csv")   # Training data
# ffm_model.setValidate("../input/titanic_test.txt")  # Validation data

# param:
#  0. binary classification
#  1. learning rate : 0.2
#  2. regular lambda : 0.002
param = {'task':'binary', 'lr':0.1, 'lambda':0.02, 'metric':'auc'}
# param = {'task':'reg', 'lr':0.1, 'lambda':0.02}

# Train model
ffm_model.cv(param)

SystemExit: 