In [1]:
import os
import pickle

import numpy as np
import pandas as pd



# Load the training table and encode each author label as an integer class id.
train_df = pd.read_csv("./input/train.csv")
author_mapping_dict = {'EAP':0, 'HPL':1, 'MWS':2}
train_y = train_df['author'].map(author_mapping_dict)
# Series.map yields NaN for labels that are not keys of the dict; fail fast
# here instead of handing NaN targets to the classifier later.
unmapped_mask = train_y.isnull()
if unmapped_mask.any():
    unknown_labels = sorted(train_df.loc[unmapped_mask, 'author'].astype(str).unique())
    raise ValueError("unmapped author labels in train.csv: {}".format(unknown_labels))
train_Y = train_y  # alias: cv_test reads the labels through this name


# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load a feat.pkl that was produced locally or comes from a trusted source.
with open('feat.pkl','rb') as fin:
    f_train_X,f_test_X = pickle.load(fin)
# The features must line up row-for-row with the labels loaded above.
if f_train_X.shape[0] != len(train_Y):
    raise ValueError("feat.pkl has {} training rows but train.csv has {} labels".format(
        f_train_X.shape[0], len(train_Y)))
print(f_train_X.shape)

(19579, 762)


In [4]:
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import log_loss

def _write_submission(pred, out_path):
    """Write a class-probability array as a submission CSV and print its head.

    The id column is taken from ./input/sample_submission.csv; columns 0, 1
    and 2 of ``pred`` are stored as EAP, HPL and MWS respectively. The output
    directory is created if it does not exist yet.
    """
    submiss = pd.read_csv("./input/sample_submission.csv")
    submiss['EAP'] = pred[:, 0]
    submiss['HPL'] = pred[:, 1]
    submiss['MWS'] = pred[:, 2]
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    submiss.to_csv(out_path, index=False)
    print(submiss.head(5))


def cv_test(k_cnt=3, s_flag = False):
    """Cross-validate a multinomial logistic regression and write three submissions.

    Reads the notebook globals ``f_train_X``, ``f_test_X`` and ``train_Y``.
    For every fold a model is fitted on the training part, its train and
    validation log loss are printed, and its test-set probabilities are
    accumulated three ways:

    * plain average over folds       -> results/lr_res_{k_cnt}_{s_flag}.csv
    * average weighted by 1/val-loss -> results/weighted_lr_res_{k_cnt}_{s_flag}.csv
    * fold with the lowest val-loss  -> results/single_lr_res_{k_cnt}_{s_flag}.csv

    Parameters
    ----------
    k_cnt : int
        Number of folds.
    s_flag : bool
        If true use StratifiedKFold, otherwise KFold (both shuffled, seed 42).

    Returns
    -------
    None
        Results are reported through printed output and the CSV files.
    """
    rnd = 42
    if s_flag:
        kf = StratifiedKFold(n_splits=k_cnt, shuffle=True, random_state=rnd)
    else:
        kf = KFold(n_splits=k_cnt, shuffle=True, random_state=rnd)

    # split() yields positional indices; index a plain array so the lookup is
    # positional even if train_Y is a Series without a default RangeIndex.
    labels = np.asarray(train_Y)

    test_pred = None            # running sum of test probabilities
    weighted_test_pred = None   # running sum weighted by 1 / validation loss
    org_train_pred = None       # running sum of predictions on the full train set
    avg_k_score = 0
    reverse_score = 0
    best_loss = None
    best_single_pred = None
    for train_index, test_index in kf.split(f_train_X, labels):
        X_train, X_test = f_train_X[train_index], f_train_X[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        m = LogisticRegression(solver='newton-cg',multi_class='multinomial')
        m.fit(X_train,y_train)

        # get res
        train_pred = m.predict_proba(X_train)
        print(train_pred.shape,y_train.shape)
        valid_pred = m.predict_proba(X_test)
        full_train_pred = m.predict_proba(f_train_X)
        fold_test_pred = m.predict_proba(f_test_X)

        # cal score
        train_score = log_loss(y_train,train_pred)
        valid_score = log_loss(y_test,valid_pred)
        print('train log loss',train_score,'valid log loss',valid_score)
        avg_k_score += valid_score
        rev_valid_score = 1.0/valid_score # use for weighted
        reverse_score += rev_valid_score # sum
        print('rev',rev_valid_score)

        if test_pred is None:
            # Accumulate into copies: the in-place += on later folds must not
            # modify the array that is kept as this fold's own prediction.
            test_pred = fold_test_pred.copy()
            weighted_test_pred = fold_test_pred*rev_valid_score
            org_train_pred = full_train_pred.copy()
        else:
            test_pred += fold_test_pred
            weighted_test_pred += fold_test_pred*rev_valid_score
            org_train_pred += full_train_pred

        # find better single model (the first fold is the initial best)
        if best_single_pred is None:
            best_loss = valid_score
            best_single_pred = fold_test_pred
        elif valid_score < best_loss:
            print('BETTER')
            best_loss = valid_score
            best_single_pred = fold_test_pred

    # avg
    test_pred = np.round(test_pred / k_cnt, 4)
    print(test_pred.shape)
    print(test_pred[:5])
    org_train_pred = org_train_pred / k_cnt
    avg_k_score = avg_k_score/k_cnt

    _write_submission(test_pred, "results/lr_res_{}_{}.csv".format(k_cnt, s_flag))
    print('--------------')
    print(reverse_score)
    # weighted: normalise by the sum of the weights
    weighted_avg_pred = np.round(weighted_test_pred / reverse_score, 4)
    _write_submission(weighted_avg_pred, "results/weighted_lr_res_{}_{}.csv".format(k_cnt, s_flag))
    print('---------------')
    # best single
    single_pred = np.round(best_single_pred, 4)
    _write_submission(single_pred, "results/single_lr_res_{}_{}.csv".format(k_cnt, s_flag))
    print('---------------')

    # train log loss
    # NOTE: org_train_pred averages predictions on the full training set, and
    # every fold's model was fitted on part of those rows, so this figure is
    # not an out-of-fold estimate.
    print('local average valid loss',avg_k_score)
    print('train log loss', log_loss(labels,org_train_pred))
    
# Marker printed once this cell has finished executing, i.e. after cv_test is defined.
print('def done')

def done


In [5]:
# 5-fold run with s_flag=True (StratifiedKFold); writes the results/*_5_True.csv submissions.
cv_test(5,True)

(15663, 3) (15663,)
train log loss 0.232129534135 valid log loss 0.299637033651
rev 3.33737117811
(15663, 3) (15663,)
train log loss 0.234716557182 valid log loss 0.289458984507
rev 3.45472088801
BETTER
(15663, 3) (15663,)
train log loss 0.241864875343 valid log loss 0.255324307212
rev 3.91658753888
BETTER
(15663, 3) (15663,)
train log loss 0.235326548594 valid log loss 0.284167317336
rev 3.51905352584
(15664, 3) (15664,)
train log loss 0.233745003524 valid log loss 0.292577190391
rev 3.41790143881
(8392, 3)
[[ 0.0573  0.0247  0.918 ]
 [ 0.9754  0.0084  0.0162]
 [ 0.0045  0.9931  0.0023]
 [ 0.8792  0.1065  0.0143]
 [ 0.8476  0.101   0.0514]]
        id     EAP     HPL     MWS
0  id02310  0.0573  0.0247  0.9180
1  id24541  0.9754  0.0084  0.0162
2  id00134  0.0045  0.9931  0.0023
3  id27757  0.8792  0.1065  0.0143
4  id04081  0.8476  0.1010  0.0514
--------------
17.6456345697
        id     EAP     HPL     MWS
0  id02310  0.0577  0.0248  0.9175
1  id24541  0.9754  0.0083  0.0163
2  id0

In [6]:
# 3-fold run with the default s_flag=False (plain shuffled KFold); writes the results/*_3_False.csv submissions.
cv_test(3)

(13052, 3) (13052,)
train log loss 0.224110674763 valid log loss 0.309719085478
rev 3.22873225089
(13053, 3) (13053,)
train log loss 0.234339690543 valid log loss 0.285335997581
rev 3.50464017326
BETTER
(13053, 3) (13053,)
train log loss 0.234209614404 valid log loss 0.282613701358
rev 3.53839886457
BETTER
(8392, 3)
[[ 0.0561  0.0215  0.9223]
 [ 0.9726  0.0086  0.0189]
 [ 0.0039  0.9935  0.0026]
 [ 0.8756  0.1083  0.0161]
 [ 0.8316  0.1102  0.0581]]
        id     EAP     HPL     MWS
0  id02310  0.0561  0.0215  0.9223
1  id24541  0.9726  0.0086  0.0189
2  id00134  0.0039  0.9935  0.0026
3  id27757  0.8756  0.1083  0.0161
4  id04081  0.8316  0.1102  0.0581
--------------
10.2717712887
        id     EAP     HPL     MWS
0  id02310  0.0557  0.0215  0.9228
1  id24541  0.9725  0.0085  0.0191
2  id00134  0.0039  0.9935  0.0026
3  id27757  0.8750  0.1095  0.0156
4  id04081  0.8302  0.1114  0.0583
---------------
        id     EAP     HPL     MWS
0  id02310  0.0421  0.0127  0.9452
1  id24541 