In [1]:
import pickle
import pandas as pd
import numpy as np



# Load the raw training table and encode the author label as an integer class id.
train_df = pd.read_csv("./input/train.csv")
author_mapping_dict = {'EAP':0, 'HPL':1, 'MWS':2}
train_y = train_df['author'].map(author_mapping_dict)

# Series.map yields NaN for any value that is not a key of the mapping; stop
# here instead of letting NaN labels flow into training.
if train_y.isna().any():
    unknown_authors = sorted(
        train_df.loc[train_y.isna(), 'author'].astype(str).unique()
    )
    raise ValueError(
        "authors missing from author_mapping_dict: {}".format(unknown_authors)
    )
train_Y = train_y  # second name for the same Series; cv_test reads train_Y


# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load feat.pkl if it was produced by a trusted pipeline.
with open('feat.pkl','rb') as fin:
    f_train_X,f_test_X = pickle.load(fin)

# The feature rows are indexed with the same positions as the labels later on,
# so a row-count mismatch would silently pair features with the wrong labels.
if f_train_X.shape[0] != len(train_Y):
    raise ValueError(
        "feature rows ({}) do not match label count ({})".format(
            f_train_X.shape[0], len(train_Y)
        )
    )
print(f_train_X.shape)

(19579, 1866)


In [2]:
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import log_loss
import lightgbm as lgb

def _write_submission(pred, out_path):
    """Fill the sample-submission template with class probabilities and save it.

    Columns 0, 1 and 2 of ``pred`` are written to the 'EAP', 'HPL' and 'MWS'
    columns of ./input/sample_submission.csv; the frame is saved to
    ``out_path`` without its index and its first five rows are printed.
    """
    submiss = pd.read_csv("./input/sample_submission.csv")
    submiss['EAP'] = pred[:, 0]
    submiss['HPL'] = pred[:, 1]
    submiss['MWS'] = pred[:, 2]
    submiss.to_csv(out_path, index=False)
    print(submiss.head(5))


def cv_test(k_cnt=3, s_flag = False):
    """Train one LightGBM multiclass model per CV fold and write three submissions.

    Reads the module-level ``f_train_X``, ``f_test_X`` and ``train_Y``.
    For every fold a booster is trained with early stopping on the held-out
    part, and its predictions for ``f_test_X`` are combined in three ways:

    * plain mean over folds        -> results/lgb_res_{k_cnt}_{s_flag}.csv
    * mean weighted by 1/valid loss -> results/weighted_lgb_res_{k_cnt}_{s_flag}.csv
    * fold with lowest valid loss   -> results/single_lgb_res_{k_cnt}_{s_flag}.csv

    The weighted and single-fold predictions are rounded to 4 decimals before
    being written; the plain mean is not.

    Parameters
    ----------
    k_cnt : int, default 3
        Number of folds.
    s_flag : bool, default False
        If true use ``StratifiedKFold``, otherwise ``KFold``. Both are
        shuffled with the same fixed random state.

    Returns
    -------
    None
        Results are printed and written to disk.
    """
    import os  # local import so the function does not depend on another cell

    # Create the output folder up front: discovering it is missing only at the
    # first to_csv would throw away every trained fold.
    os.makedirs("results", exist_ok=True)

    rnd = 420
    if s_flag:
        kf = StratifiedKFold(n_splits=k_cnt, shuffle=True, random_state=rnd)
    else:
        kf = KFold(n_splits=k_cnt, shuffle=True, random_state=rnd)

    # Identical for every fold, so built once.
    params = {
        'learning_rate':0.04,
        'max_depth':4,
        'objective':'multiclass',
        'num_class':3,
        'metric':{'multi_logloss'},
        'feature_fraction':0.8,
        'bagging_fraction':0.7,
        'lambda_l2':1.0
    }

    # The splitter yields positions, not index labels; a plain array makes the
    # fold selection positional whatever index train_Y carries.
    labels = np.asarray(train_Y)

    test_pred = None            # running sum of per-fold test predictions
    weighted_test_pred = None   # running sum weighted by 1 / valid loss
    org_train_pred = None       # running sum of predictions on all training rows
    avg_k_score = 0
    reverse_score = 0           # sum of the weights, used to normalise
    best_loss = 100
    best_single_pred = None
    for train_index, test_index in kf.split(f_train_X, labels):
        X_train, X_test = f_train_X[train_index], f_train_X[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        dtx = lgb.Dataset(X_train, label=y_train)
        dtv = lgb.Dataset(X_test, label=y_test)

        # A shallow copy keeps the shared dict untouched in case the library
        # edits the params it is given.
        # NOTE(review): early_stopping_rounds / verbose_eval are the keyword
        # forms this notebook was written against; confirm they are accepted
        # by the installed LightGBM version.
        m = lgb.train(dict(params), train_set=dtx, valid_sets=dtv, valid_names=['val'],
                      num_boost_round=1000,
                      early_stopping_rounds=50,
                      verbose_eval=50)

        # get res
        # NOTE(review): predict() is called without num_iteration; confirm the
        # installed LightGBM uses the best iteration by default.
        train_pred = m.predict(X_train)
        print(train_pred.shape,y_train.shape)
        valid_pred = m.predict(X_test)
        tmp_train_pred = m.predict(f_train_X)
        curr_pred = m.predict(f_test_X)

        # cal score
        train_score = log_loss(y_train,train_pred)
        valid_score = log_loss(y_test,valid_pred)
        print('train log loss',train_score,'valid log loss',valid_score)
        avg_k_score += valid_score
        rev_valid_score = 1.0/valid_score # use for weighted
        reverse_score += rev_valid_score # sum
        print('rev',rev_valid_score)

        if test_pred is None:
            # Copy: the accumulator is updated in place below, and it must not
            # share memory with curr_pred, which is kept as best_single_pred.
            test_pred = curr_pred.copy()
            weighted_test_pred = curr_pred*rev_valid_score
            org_train_pred = tmp_train_pred
            best_loss = valid_score
            best_single_pred = curr_pred
        else:
            test_pred += curr_pred
            weighted_test_pred += curr_pred*rev_valid_score
            org_train_pred += tmp_train_pred
            # find better single model
            if valid_score < best_loss:
                print('BETTER')
                best_loss = valid_score
                best_single_pred = curr_pred

    # avg
    test_pred = test_pred / k_cnt
    print(test_pred.shape)
    print(test_pred[:5])
    org_train_pred = org_train_pred / k_cnt
    avg_k_score = avg_k_score/k_cnt

    # plain average over folds
    _write_submission(test_pred,
                      "results/lgb_res_{}_{}.csv".format(k_cnt, s_flag))
    print('--------------')
    print(reverse_score)

    # weighted average: divide by the sum of weights, then round
    weighted_test_pred = np.round(weighted_test_pred / reverse_score, 4)
    _write_submission(weighted_test_pred,
                      "results/weighted_lgb_res_{}_{}.csv".format(k_cnt, s_flag))
    print('---------------')

    # best single fold
    _write_submission(np.round(best_single_pred, 4),
                      "results/single_lgb_res_{}_{}.csv".format(k_cnt, s_flag))
    print('---------------')

    # train log loss
    print('local average valid loss',avg_k_score)
    # Every training row was in the training part of all but one fold, so this
    # number is mostly in-sample.
    print('train log loss', log_loss(labels,org_train_pred))
    
# Marker line: shows in the cell output that the definition above was executed.
print("def done")

def done


In [3]:
# Run the cross-validation with five folds and the stratified splitter.
cv_test(k_cnt=5, s_flag=True)

Training until validation scores don't improve for 50 rounds.
[50]	val's multi_logloss: 0.360099
[100]	val's multi_logloss: 0.284331
[150]	val's multi_logloss: 0.269445
[200]	val's multi_logloss: 0.264686
[250]	val's multi_logloss: 0.262473
[300]	val's multi_logloss: 0.261332
[350]	val's multi_logloss: 0.260809
[400]	val's multi_logloss: 0.261061
Early stopping, best iteration is:
[357]	val's multi_logloss: 0.260706
(15663, 3) (15663,)
train log loss 0.152493228338 valid log loss 0.260819032209
rev 3.83407603168
Training until validation scores don't improve for 50 rounds.
[50]	val's multi_logloss: 0.365961
[100]	val's multi_logloss: 0.291184
[150]	val's multi_logloss: 0.276315
[200]	val's multi_logloss: 0.269672
[250]	val's multi_logloss: 0.267424
[300]	val's multi_logloss: 0.265944
[350]	val's multi_logloss: 0.265218
[400]	val's multi_logloss: 0.264875
[450]	val's multi_logloss: 0.264679
[500]	val's multi_logloss: 0.265054
Early stopping, best iteration is:
[463]	val's multi_logloss:

In [4]:
#cv_test(3)