In [1]:
import pickle
import pandas as pd
import numpy as np



# Load the raw training table and encode each author as an integer class id.
# The ids (0, 1, 2) double as the column order of every probability matrix
# produced later: column 0 -> EAP, column 1 -> HPL, column 2 -> MWS.
train_df = pd.read_csv("./input/train.csv")
author_mapping_dict = {'EAP':0, 'HPL':1, 'MWS':2}
train_y = train_df['author'].map(author_mapping_dict)

# Series.map leaves NaN for any value missing from the mapping; fail loudly
# here instead of handing NaN labels to the model later.
if train_y.isnull().any():
    unknown_authors = sorted(train_df.loc[train_y.isnull(), 'author'].astype(str).unique())
    raise ValueError("unmapped author labels in train.csv: {}".format(unknown_authors))

# Both spellings are kept because later cells refer to `train_Y`.
train_Y = train_y


# Pre-computed (train, test) feature matrices.
# NOTE(review): pickle.load can execute arbitrary code; only load a feat.pkl
# that was produced locally by a trusted pipeline.
with open('feat.pkl','rb') as fin:
    f_train_X,f_test_X = pickle.load(fin)

# The CV code indexes features and labels with the same row positions, so the
# two must have the same number of rows.
if f_train_X.shape[0] != len(train_Y):
    raise ValueError("feature rows ({}) do not match label count ({})".format(
        f_train_X.shape[0], len(train_Y)))
print(f_train_X.shape)

(19579, 781)


In [2]:
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import log_loss
import lightgbm as lgb

def _write_submission(pred, out_path, sample_path="./input/sample_submission.csv"):
    """Fill the sample-submission template with class probabilities and save it.

    Args:
        pred: 2-D array; columns 0, 1, 2 are written to the 'EAP', 'HPL' and
            'MWS' columns respectively (the order of author_mapping_dict).
        out_path: destination CSV path.
        sample_path: template CSV providing the remaining columns/row order.
    """
    submiss = pd.read_csv(sample_path)
    submiss['EAP'] = pred[:, 0]
    submiss['HPL'] = pred[:, 1]
    submiss['MWS'] = pred[:, 2]
    submiss.to_csv(out_path, index=False)
    print(submiss.head(5))


def cv_test(k_cnt=3, s_flag = False):
    """Train one LightGBM multiclass model per CV fold and write three submissions.

    Reads the module-level `f_train_X`, `f_test_X` and `train_Y`. For each fold
    a model is trained with early stopping on the held-out part, then used to
    predict the test set. Three CSV files are written under ``results/``:

    * ``lgb_res_{k_cnt}_{s_flag}.csv``          - plain mean of the fold predictions
    * ``weighted_lgb_res_{k_cnt}_{s_flag}.csv`` - mean weighted by 1/valid_loss,
      rounded to 4 decimals
    * ``single_lgb_res_{k_cnt}_{s_flag}.csv``   - prediction of the fold model with
      the lowest validation loss, rounded to 4 decimals

    Args:
        k_cnt: number of CV folds.
        s_flag: if True use StratifiedKFold, otherwise plain KFold.

    Returns:
        None. Progress and scores are printed.
    """
    import os  # local import: only needed to make sure the output dir exists

    rnd = 42
    splitter_cls = StratifiedKFold if s_flag else KFold
    kf = splitter_cls(n_splits=k_cnt, shuffle=True, random_state=rnd)

    # Fail before the expensive training rather than at the first to_csv().
    os.makedirs("results", exist_ok=True)

    # Identical for every fold, so built once outside the loop.
    params = {'learning_rate':0.05
     ,'max_depth':4
     ,'objective':'multiclass'
     ,'num_class':3
     ,'metric':{'multi_logloss'}
     ,'num_leaves':128
     ,'min_data_in_leaf':128
     ,'bagging_fraction':0.85
     ,'feature_fraction':0.85
     ,'lambda_l1':1.0}

    test_pred = None            # running sum of per-fold test predictions
    weighted_test_pred = None   # running sum weighted by 1/valid_loss
    org_train_pred = None       # running sum of full-train predictions
    avg_k_score = 0
    reverse_score = 0           # sum of the 1/valid_loss weights
    best_loss = float('inf')
    best_single_pred = None

    for train_index, test_index in kf.split(f_train_X,train_Y):
        X_train, X_test = f_train_X[train_index], f_train_X[test_index]
        y_train, y_test = train_Y[train_index], train_Y[test_index]

        dtx = lgb.Dataset(X_train, label=y_train)
        dtv = lgb.Dataset(X_test, label=y_test)

        m = lgb.train(params, train_set=dtx, valid_sets=dtv, valid_names=['val'],
                      num_boost_round=1000,
                      early_stopping_rounds=100,
                      verbose_eval=50)

        # get res
        train_pred = m.predict(X_train)
        print(train_pred.shape,y_train.shape)
        valid_pred = m.predict(X_test)
        tmp_train_pred = m.predict(f_train_X)
        curr_pred = m.predict(f_test_X)

        # cal score
        train_score = log_loss(y_train,train_pred)
        valid_score = log_loss(y_test,valid_pred)
        print('train log loss',train_score,'valid log loss',valid_score)
        avg_k_score += valid_score
        rev_valid_score = 1.0/valid_score # use for weighted
        reverse_score += rev_valid_score # sum
        print('rev',rev_valid_score)

        if test_pred is None:
            # Copy: the running sum is updated in place below, and curr_pred
            # may also be kept as best_single_pred. Sharing one array would
            # silently turn the "best single" prediction into the fold sum.
            test_pred = curr_pred.copy()
            weighted_test_pred = curr_pred*rev_valid_score
            org_train_pred = tmp_train_pred
        else:
            test_pred += curr_pred
            weighted_test_pred += curr_pred*rev_valid_score
            org_train_pred += tmp_train_pred

        # find better single model
        if valid_score < best_loss:
            if best_single_pred is not None:
                print('BETTER')
            best_loss = valid_score
            best_single_pred = curr_pred

    # avg
    test_pred = test_pred / k_cnt
    print(test_pred.shape)
    print(test_pred[:5])
    org_train_pred = org_train_pred / k_cnt
    avg_k_score = avg_k_score/k_cnt

    _write_submission(test_pred,
                      "results/lgb_res_{}_{}.csv".format(k_cnt, s_flag))
    print('--------------')
    print(reverse_score)

    # weighted: normalise by the sum of weights, then round
    weighted_test_pred = np.round(weighted_test_pred / reverse_score, 4)
    _write_submission(weighted_test_pred,
                      "results/weighted_lgb_res_{}_{}.csv".format(k_cnt, s_flag))
    print('---------------')

    # best single
    _write_submission(np.round(best_single_pred, 4),
                      "results/single_lgb_res_{}_{}.csv".format(k_cnt, s_flag))
    print('---------------')

    # train log loss
    # NOTE: org_train_pred averages every fold model's prediction on the FULL
    # training set, so each row was in-sample for all but one of the k_cnt
    # models; this is not an out-of-fold score.
    print('local average valid loss',avg_k_score)
    print('train log loss', log_loss(train_Y,org_train_pred))

print('def done')

def done


In [None]:
# 5-fold stratified run; output files are suffixed "_5_True.csv".
cv_test(k_cnt=5, s_flag=True)

Training until validation scores don't improve for 100 rounds.
[50]	val's multi_logloss: 0.330587
[100]	val's multi_logloss: 0.276405
[150]	val's multi_logloss: 0.264631
[200]	val's multi_logloss: 0.25943
[250]	val's multi_logloss: 0.257693
[300]	val's multi_logloss: 0.256492
[350]	val's multi_logloss: 0.255642
[400]	val's multi_logloss: 0.255138
[450]	val's multi_logloss: 0.254946
[500]	val's multi_logloss: 0.255105
[550]	val's multi_logloss: 0.255204
Early stopping, best iteration is:
[450]	val's multi_logloss: 0.254946
(15663, 3) (15663,)
train log loss 0.130383614505 valid log loss 0.254890412683
rev 3.92325466256
Training until validation scores don't improve for 100 rounds.
[50]	val's multi_logloss: 0.332671
[100]	val's multi_logloss: 0.282413
[150]	val's multi_logloss: 0.272987
[200]	val's multi_logloss: 0.269362
[250]	val's multi_logloss: 0.268323
[300]	val's multi_logloss: 0.267334
[350]	val's multi_logloss: 0.266816
[400]	val's multi_logloss: 0.267217
[450]	val's multi_loglos

In [None]:
# 3-fold run with plain (non-stratified) KFold, since s_flag defaults to False;
# output files are suffixed "_3_False.csv".
cv_test(k_cnt=3)

Training until validation scores don't improve for 100 rounds.
[50]	val's multi_logloss: 0.335845
[100]	val's multi_logloss: 0.285236
[150]	val's multi_logloss: 0.275847
[200]	val's multi_logloss: 0.27231
[250]	val's multi_logloss: 0.270957
[300]	val's multi_logloss: 0.270868
[350]	val's multi_logloss: 0.271914
Early stopping, best iteration is:
[277]	val's multi_logloss: 0.270739
(13052, 3) (13052,)
train log loss 0.151749886059 valid log loss 0.270773313197
rev 3.69312613638
Training until validation scores don't improve for 100 rounds.
[50]	val's multi_logloss: 0.327873
[100]	val's multi_logloss: 0.274952
[150]	val's multi_logloss: 0.262453
[200]	val's multi_logloss: 0.256936
[250]	val's multi_logloss: 0.254153
[300]	val's multi_logloss: 0.252878
[350]	val's multi_logloss: 0.252223
[400]	val's multi_logloss: 0.252018
[450]	val's multi_logloss: 0.252688
[500]	val's multi_logloss: 0.253508
Early stopping, best iteration is:
[403]	val's multi_logloss: 0.252008
(13053, 3) (13053,)
train