In [13]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import gc

# Automatic garbage collection is CPython's default; this call just makes it explicit.
gc.enable()

# LightGBM hyper-parameters shared by every training run in this notebook.
# `cv` overwrites individual entries in place while scanning a parameter grid.
params = dict(
    task='train',
    boosting_type='gbdt',    # gradient-boosted decision trees
    objective='binary',      # binary classification on the rating label
    metric={'auc'},          # evaluation metric reported per boosting round
    num_leaves=60,           # leaf budget per tree
    min_data_in_leaf=20,     # minimum number of samples per leaf
    feature_fraction=0.6,    # fraction of columns sampled per tree
    bagging_fraction=0.96,   # fraction of rows sampled when bagging
    bagging_freq=1,          # re-sample the bag every iteration
    lambda_l1=0,             # no L1 regularisation
    verbose=0,
)

#params = {
#    'task': 'train',
#    'boosting_type': 'rf',
#    'objective': 'binary',
#    'metric': {'auc'},
#    'num_leaves': 400,
#    'min_data_in_leaf': 2,
#    'feature_fraction': 0.3,
#    'bagging_fraction': 0.7,
#    'bagging_freq': 1,
#    'lambda_l1': 0,
#    'min_gain_to_split': 0,
#    'verbose': 0
#}

In [1]:
import pandas as pd
import numpy as np

import os

# Directory holding the dense per-fold CSVs (train_N.csv / valid_N.csv).
# Set MANGAKI_DATA_DIR to point the notebook at another machine's copy; the default
# is the original location. File names are appended with plain string concatenation,
# so os.path.join(..., "") guarantees the value ends with a path separator.
data = os.path.join(
    os.environ.get("MANGAKI_DATA_DIR", "/mnt/d/Data/mangaki-data-challenge/latest/"),
    "",
)

In [14]:
def training(train, valid):
    """Fit a LightGBM booster on one fold and score it with ROC AUC.

    Both frames must contain ``user_id``, ``work_id`` and ``rating`` columns:
    ``rating`` is used as the label and all three are excluded from the features.
    Hyper-parameters are read from the module-level ``params`` dict.

    Returns a ``(train_auc, valid_auc)`` tuple, both computed with the booster's
    best iteration as selected by early stopping.
    """
    non_feature_cols = ['user_id', 'work_id', 'rating']
    train_features = train.drop(non_feature_cols, axis=1)
    valid_features = valid.drop(non_feature_cols, axis=1)

    train_set = lgb.Dataset(train_features, train['rating'])
    valid_set = lgb.Dataset(valid_features, valid['rating'], reference=train_set)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments of
    # the LightGBM release this notebook was written against - confirm before upgrading.
    booster = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, valid_set],
        num_boost_round=200,
        early_stopping_rounds=20,
        verbose_eval=True,
    )

    best_round = booster.best_iteration
    valid_pred = booster.predict(valid_features, num_iteration=best_round)
    train_pred = booster.predict(train_features, num_iteration=best_round)

    train_auc = roc_auc_score(train['rating'].values, train_pred)
    valid_auc = roc_auc_score(valid['rating'].values, valid_pred)
    return (train_auc, valid_auc)

In [4]:
def cv(param, paramlst):
    """Scan one LightGBM hyper-parameter over the three pre-split folds.

    For every value in ``paramlst`` the global ``params[param]`` is overwritten
    (and stays overwritten after the call), ``training`` is run on folds 1-3 read
    from ``data``, and the mean training / validation AUC is recorded.

    Prints a table indexed by the tried values; returns ``None``.
    """
    mean_train_auc = []
    mean_valid_auc = []
    for value in paramlst:
        params[param] = value
        fold_scores = []
        for fold in (1, 2, 3):
            fold_train = pd.read_csv(data + 'train_{0}.csv'.format(str(fold)))
            fold_valid = pd.read_csv(data + 'valid_{0}.csv'.format(str(fold)))
            fold_scores.append(training(fold_train, fold_valid))
        # Transpose [(train, valid), ...] into one sequence per data split.
        train_scores, valid_scores = zip(*fold_scores)
        mean_train_auc.append(np.mean(train_scores))
        mean_valid_auc.append(np.mean(valid_scores))

    column_order = ['TrainingSet', 'ValidationSet']
    paramtable = pd.DataFrame(
        {'TrainingSet': mean_train_auc, 'ValidationSet': mean_valid_auc},
        columns=column_order,
        index=pd.Index(paramlst, name=param),
    )
    print(paramtable)

In [15]:
# Single-point run: 3-fold AUC for feature_fraction=0.6 (per-round log follows).
cv(param='feature_fraction', paramlst=[0.6])

[1]	training's auc: 0.784762	valid_1's auc: 0.720833
Training until validation scores don't improve for 20 rounds.
[2]	training's auc: 0.811098	valid_1's auc: 0.742044
[3]	training's auc: 0.830911	valid_1's auc: 0.756794
[4]	training's auc: 0.840179	valid_1's auc: 0.761628
[5]	training's auc: 0.84904	valid_1's auc: 0.768451
[6]	training's auc: 0.855168	valid_1's auc: 0.772217
[7]	training's auc: 0.859819	valid_1's auc: 0.773952
[8]	training's auc: 0.864501	valid_1's auc: 0.776772
[9]	training's auc: 0.868825	valid_1's auc: 0.779215
[10]	training's auc: 0.873385	valid_1's auc: 0.780585
[11]	training's auc: 0.876527	valid_1's auc: 0.781114
[12]	training's auc: 0.881223	valid_1's auc: 0.78335
[13]	training's auc: 0.884312	valid_1's auc: 0.782719
[14]	training's auc: 0.889617	valid_1's auc: 0.785207
[15]	training's auc: 0.893603	valid_1's auc: 0.787159
[16]	training's auc: 0.897321	valid_1's auc: 0.786861
[17]	training's auc: 0.901607	valid_1's auc: 0.787091
[18]	training's auc: 0.904693	v

[41]	training's auc: 0.961509	valid_1's auc: 0.80529
[42]	training's auc: 0.962834	valid_1's auc: 0.805481
[43]	training's auc: 0.964121	valid_1's auc: 0.805576
[44]	training's auc: 0.965524	valid_1's auc: 0.806144
[45]	training's auc: 0.966966	valid_1's auc: 0.806629
[46]	training's auc: 0.968115	valid_1's auc: 0.806559
[47]	training's auc: 0.969496	valid_1's auc: 0.807082
[48]	training's auc: 0.970793	valid_1's auc: 0.806851
[49]	training's auc: 0.971939	valid_1's auc: 0.807248
[50]	training's auc: 0.973029	valid_1's auc: 0.807514
[51]	training's auc: 0.974361	valid_1's auc: 0.807186
[52]	training's auc: 0.975376	valid_1's auc: 0.807217
[53]	training's auc: 0.976868	valid_1's auc: 0.807019
[54]	training's auc: 0.977683	valid_1's auc: 0.807309
[55]	training's auc: 0.978603	valid_1's auc: 0.80709
[56]	training's auc: 0.979804	valid_1's auc: 0.807117
[57]	training's auc: 0.980709	valid_1's auc: 0.807223
[58]	training's auc: 0.981874	valid_1's auc: 0.80746
[59]	training's auc: 0.982741	v

[107]	training's auc: 0.998276	valid_1's auc: 0.809761
[108]	training's auc: 0.998359	valid_1's auc: 0.810173
[109]	training's auc: 0.998492	valid_1's auc: 0.810527
[110]	training's auc: 0.998564	valid_1's auc: 0.810464
[111]	training's auc: 0.998633	valid_1's auc: 0.811064
[112]	training's auc: 0.998708	valid_1's auc: 0.811022
[113]	training's auc: 0.998753	valid_1's auc: 0.810849
[114]	training's auc: 0.998797	valid_1's auc: 0.810767
[115]	training's auc: 0.998854	valid_1's auc: 0.810819
[116]	training's auc: 0.998898	valid_1's auc: 0.810976
[117]	training's auc: 0.998951	valid_1's auc: 0.810921
[118]	training's auc: 0.998996	valid_1's auc: 0.810624
[119]	training's auc: 0.999077	valid_1's auc: 0.81074
[120]	training's auc: 0.999135	valid_1's auc: 0.81057
[121]	training's auc: 0.999198	valid_1's auc: 0.810472
[122]	training's auc: 0.999241	valid_1's auc: 0.810354
[123]	training's auc: 0.999291	valid_1's auc: 0.810562
[124]	training's auc: 0.999321	valid_1's auc: 0.810849
[125]	traini

LibFM

In [26]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_svmlight_file

import os

# Directory holding the svmlight/libFM-format fold files (train_N.csv / valid_N.csv).
# Set MANGAKI_LIBFM_DIR to relocate it; the default is the original location.
# File names are appended with plain string concatenation, so os.path.join(..., "")
# guarantees the value ends with a path separator.
path = os.path.join(
    os.environ.get("MANGAKI_LIBFM_DIR", "/mnt/d/Data/mangaki-data-challenge/libfm/"),
    "",
)

In [27]:
from sklearn.datasets import load_svmlight_files

# Load both splits in one call so the two matrices are guaranteed to have the same
# number of columns. Separate load_svmlight_file calls infer the width from each file
# and disagree whenever the highest feature index differs between train and valid.
train, ty, valid, vy = load_svmlight_files([path+"train_1.csv", path+"valid_1.csv"])

In [33]:
# Dense feature tables for fold 1, with missing values replaced by 0.
train_append, valid_append = (
    pd.read_csv(data + file_name).fillna(0)
    for file_name in ("train_1.csv", "valid_1.csv")
)

In [31]:
# Preview the dense feature columns from position 15 onwards.
# `.ix` was removed in pandas 1.0; `.iloc` is the explicit positional equivalent.
train_append.iloc[:, 15:].head()

Unnamed: 0,itemw2vpos_0_x,itemw2vpos_1_x,itemw2vpos_2_x,itemw2vpos_3_x,itemw2vpos_4_x,itemw2vpos_5_x,itemw2vpos_6_x,itemw2vpos_7_x,itemw2vpos_8_x,itemw2vpos_9_x,...,user_ldapos_10,user_ldapos_11,user_ldapos_12,user_ldapos_13,user_ldapos_14,user_ldapos_15,user_ldapos_16,user_ldapos_17,user_ldapos_18,user_ldapos_19
0,-0.158782,0.011554,-0.196929,-0.139314,-0.196677,-0.253198,0.124238,-0.289648,0.055328,0.41772,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.075337,0.0
1,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07727
2,,,,,,,,,,,...,0.0,0.0,0.0,0.285676,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.232903,0.074238,-0.257882,-0.262738,-0.26249,-0.365255,0.189333,-0.459551,0.111113,0.768426,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.044899,0.066113,-0.091652,-0.098183,-0.142543,-0.129436,0.072879,-0.166809,0.05712,0.27453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.422931


In [34]:
from scipy.sparse import hstack

# Append the dense columns (position 15 onwards) to the sparse svmlight matrix.
# `.ix` was removed in pandas 1.0; `.iloc` is the explicit positional equivalent.
hstack([train, train_append.iloc[:, 15:].values])

<7441x12015 sparse matrix of type '<class 'numpy.float64'>'
	with 711750 stored elements in COOrdinate format>

In [59]:
# fastFM SGD hyper-parameters, unpacked into sgd.FMClassification(**params).
# NOTE: this rebinds the same global name `params` that the LightGBM cells read,
# so `training` / `cv` will pick up these values if they are run after this cell.
params = dict(
    n_iter=200,        # number of SGD iterations
    init_stdev=0.001,  # spread of the initial factor values
    l2_reg_w=0.01,     # L2 penalty on the linear weights
    l2_reg_V=0.1,      # L2 penalty on the pairwise factors
    rank=10,           # rank of the factorisation
    step_size=0.1,     # SGD learning rate
)

In [8]:
from fastFM import sgd

# Build the SGD factorization-machine classifier from whatever `params` dict is
# currently bound in the kernel (the execution counters show this cell was run out of
# order relative to the cell that defines the fastFM settings).
fm = sgd.FMClassification(**params)

In [9]:
# Sanity check: show the `l2_reg_w` value stored on the estimator.
fm.l2_reg_w

0.01

In [56]:
# `ty*2-1` rescales the labels before fitting (0/1 -> -1/+1, assuming the svmlight
# labels are 0/1 - TODO confirm). `np.int` was only an alias for the builtin `int`
# and was removed in NumPy 1.24, so the builtin is used directly.
fm.fit(train, np.require(ty*2-1, dtype=int))
print("Training set AUC {0}; validation set AUC {1}.".format(roc_auc_score(ty, fm.predict_proba(train)), roc_auc_score(vy, fm.predict_proba(valid))))

Training set AUC 0.8856164958057352; validation set AUC 0.795728824146936.


In [24]:
%pdb

Automatic pdb calling has been turned ON


In [62]:
from scipy.sparse import hstack
from fastFM import sgd

def train_fm(train, valid, trainy, validy):
    """Fit a fastFM SGD classifier on one fold and score it with ROC AUC.

    The estimator is configured from the module-level ``params`` dict.

    Parameters
    ----------
    train, valid : feature matrices passed to ``fit`` / ``predict_proba``.
    trainy, validy : label arrays for the two splits; ``trainy`` is rescaled with
        ``trainy*2-1`` before fitting (0/1 -> -1/+1, assuming 0/1 labels - TODO confirm).

    Returns
    -------
    tuple
        ``(train_auc, valid_auc)``.
    """
    fm = sgd.FMClassification(**params)
    # `np.int` was an alias for the builtin `int` and was removed in NumPy 1.24;
    # using the builtin keeps the same dtype and works on every NumPy version.
    fm.fit(train, np.require(trainy*2-1, dtype=int))
    return (roc_auc_score(trainy, fm.predict_proba(train)), roc_auc_score(validy, fm.predict_proba(valid)))

def cv_fm(param, paramlst):
    """Scan one fastFM hyper-parameter over the three pre-split folds.

    For every value in ``paramlst`` the global ``params[param]`` is overwritten
    (and stays overwritten after the call), ``train_fm`` is run on folds 1-3 read
    from ``path``, and the mean training / validation AUC is recorded.

    Prints a table indexed by the tried values; returns ``None``.
    """
    # Imported locally so this cell does not depend on another cell's import.
    from sklearn.datasets import load_svmlight_files

    trainauc = [0.0]*len(paramlst)
    validauc = [0.0]*len(paramlst)
    for i, p in enumerate(paramlst):
        params[param]=p
        tv = [0,0,0]
        vv = [0,0,0]
        for fold in [1,2,3]:
            # Load both splits together so they share one feature dimension;
            # per-file loading infers the width from each file and can disagree.
            train, ty, valid, vy = load_svmlight_files([
                path+"train_{0}.csv".format(fold),
                path+"valid_{0}.csv".format(fold),
            ])
            # Dense side features are currently disabled, so the CSVs that feed them
            # are no longer read on every iteration. To re-enable, uncomment:
            #ta = pd.read_csv(data+'train_{0}.csv'.format(str(fold))).fillna(0)
            #va = pd.read_csv(data+'valid_{0}.csv'.format(str(fold))).fillna(0)
            #train = hstack([train, ta.iloc[:, 15:].values])
            #valid = hstack([valid, va.iloc[:, 15:].values])
            tv[fold-1], vv[fold-1] = train_fm(train, valid, ty, vy)
        trainauc[i]=np.mean(tv)
        validauc[i]=np.mean(vv)
    paramtable = pd.DataFrame({
        'TrainingSet': trainauc,
        'ValidationSet': validauc
    }, columns=['TrainingSet', 'ValidationSet'], index=pd.Index(paramlst, name=param))
    print(paramtable)

In [61]:
# Compare three SGD step sizes by mean 3-fold AUC.
cv_fm(param='step_size', paramlst=[0.1, 0.05, 0.01])

           TrainingSet  ValidationSet
step_size                            
0.10          0.633931       0.630944
0.05          0.639026       0.635908
0.01          0.611031       0.609930
