In [7]:
import os
import pickle

import numpy as np
import pandas as pd



# --- Labels ---------------------------------------------------------------
# Each author's initials are encoded as an integer class id.
author_mapping_dict = {
    'EAP': 0,
    'HPL': 1,
    'MWS': 2,
}
train_df = pd.read_csv("./input/train.csv")
# Both spellings are kept bound to the same Series; later code reads `train_Y`.
train_Y = train_y = train_df['author'].map(author_mapping_dict)

# --- Features ---------------------------------------------------------------
# feat.pkl holds a (train matrix, test matrix) pair.
# NOTE(review): pickle.load can execute arbitrary code -- only load a feat.pkl
# that was produced locally / by a trusted source.
with open('feat.pkl', 'rb') as fin:
    f_train_X, f_test_X = pickle.load(fin)
print(f_train_X.shape)

# Bound every feature to [0, 10000].
# Presumably the lower bound exists because MultinomialNB (used in cv_test)
# expects non-negative features -- TODO confirm.
f_train_X, f_test_X = (
    np.clip(feature_matrix, a_min=0, a_max=10000)
    for feature_matrix in (f_train_X, f_test_X)
)

(19579, 762)


In [12]:
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.metrics import log_loss

def cv_test(k_cnt=3, s_flag = False):
    """Cross-validate a MultinomialNB model and write three test-set submissions.

    Reads the module-level ``f_train_X``, ``train_Y`` and ``f_test_X``. For each
    of the ``k_cnt`` folds a fresh ``MultinomialNB`` is fitted on the training
    part, scored with log loss on both parts, and used to predict the full
    training matrix and the test matrix.

    Files written under ``results/`` (each suffixed ``_{k_cnt}_{s_flag}.csv``):

    * ``nb_res``          -- plain mean of the per-fold test predictions
    * ``weighted_nb_res`` -- mean weighted by ``1 / valid log loss`` of each fold
    * ``single_nb_res``   -- test predictions of the fold with the lowest
      validation log loss

    Args:
        k_cnt: number of cross-validation folds.
        s_flag: use ``StratifiedKFold`` when true, plain ``KFold`` otherwise.

    Returns:
        None. Scores are printed and the submissions are written to disk.
    """
    rnd = 42
    splitter_cls = StratifiedKFold if s_flag else KFold
    kf = splitter_cls(n_splits=k_cnt, shuffle=True, random_state=rnd)

    # Index labels positionally. `train_Y[train_index]` on a pandas Series is a
    # label lookup and is only correct while the index is the default 0..n-1.
    labels = np.asarray(train_Y)

    test_pred = None           # running sum of per-fold test predictions
    weighted_test_pred = None  # running sum weighted by 1 / valid loss
    org_train_pred = None      # running sum of predictions on the full train set
    avg_k_score = 0
    reverse_score = 0
    best_loss = 100
    best_single_pred = None
    for train_index, test_index in kf.split(f_train_X, labels):
        X_train, X_test = f_train_X[train_index], f_train_X[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        m = MultinomialNB()
        m.fit(X_train, y_train)

        # get res
        train_pred = m.predict_proba(X_train)
        print(train_pred.shape, y_train.shape)
        valid_pred = m.predict_proba(X_test)
        tmp_train_pred = m.predict_proba(f_train_X)
        curr_pred = m.predict_proba(f_test_X)

        # cal score
        train_score = log_loss(y_train, train_pred)
        valid_score = log_loss(y_test, valid_pred)
        print('train log loss', train_score, 'valid log loss', valid_score)
        avg_k_score += valid_score
        rev_valid_score = 1.0 / valid_score  # fold weight: lower loss -> larger weight
        reverse_score += rev_valid_score     # normaliser for the weighted mean
        print('rev', rev_valid_score)

        if test_pred is None:
            # The accumulator gets its OWN array. Previously it shared one array
            # with best_single_pred, so the in-place `+=` of later folds
            # corrupted the "best single" predictions whenever fold 1 stayed best.
            test_pred = curr_pred.copy()
            weighted_test_pred = curr_pred * rev_valid_score
            # tmp_train_pred is not referenced anywhere else, so accumulating
            # into it in place is safe.
            org_train_pred = tmp_train_pred
            best_loss = valid_score
            best_single_pred = curr_pred
        else:
            test_pred += curr_pred
            weighted_test_pred += curr_pred * rev_valid_score
            org_train_pred += tmp_train_pred
            # find better single model
            if valid_score < best_loss:
                print('BETTER')
                best_loss = valid_score
                best_single_pred = curr_pred

    # avg
    # NOTE(review): rounding to 4 decimals can turn small probabilities into an
    # exact 0.0, which a log-loss metric penalises heavily -- confirm intended.
    test_pred = np.round(test_pred / k_cnt, 4)
    print(test_pred.shape)
    print(test_pred[:5])
    org_train_pred = org_train_pred / k_cnt
    avg_k_score = avg_k_score / k_cnt

    # to_csv does not create missing directories.
    os.makedirs("results", exist_ok=True)

    _write_submission(test_pred, "results/nb_res_{}_{}.csv".format(k_cnt, s_flag))
    print('--------------')
    print(reverse_score)

    # weighted
    weighted_test_pred = np.round(weighted_test_pred / reverse_score, 4)
    _write_submission(weighted_test_pred, "results/weighted_nb_res_{}_{}.csv".format(k_cnt, s_flag))
    print('---------------')

    # best single
    _write_submission(np.round(best_single_pred, 4), "results/single_nb_res_{}_{}.csv".format(k_cnt, s_flag))
    print('---------------')

    # train log loss
    print('local average valid loss', avg_k_score)
    print('train log loss', log_loss(labels, org_train_pred))


def _write_submission(pred, out_path):
    """Fill the sample submission with `pred` (columns EAP, HPL, MWS), save it to `out_path` and print its head."""
    submiss = pd.read_csv("./input/sample_submission.csv")
    submiss['EAP'] = pred[:, 0]
    submiss['HPL'] = pred[:, 1]
    submiss['MWS'] = pred[:, 2]
    submiss.to_csv(out_path, index=False)
    print(submiss.head(5))
    
# Cell-level marker: confirms in the output that cv_test has been (re)defined.
print("def done")

def done


In [13]:
# 5-fold stratified cross-validation run.
cv_test(k_cnt=5, s_flag=True)

(15663, 3) (15663,)
train log loss 2.3580688342 valid log loss 2.52497173399
rev 0.396044037459
(15663, 3) (15663,)
train log loss 2.3622972402 valid log loss 2.51919599821
rev 0.396952043712
BETTER
(15663, 3) (15663,)
train log loss 2.45722836598 valid log loss 2.09727259479
rev 0.476809739699
BETTER
(15663, 3) (15663,)
train log loss 2.40441217296 valid log loss 2.39455171799
rev 0.41761470111
(15664, 3) (15664,)
train log loss 2.36965466292 valid log loss 2.45405938453
rev 0.407488101675
(8392, 3)
[[ 0.      0.      1.    ]
 [ 1.      0.      0.    ]
 [ 0.      1.      0.    ]
 [ 0.9986  0.0014  0.    ]
 [ 1.      0.      0.    ]]
        id     EAP     HPL  MWS
0  id02310  0.0000  0.0000  1.0
1  id24541  1.0000  0.0000  0.0
2  id00134  0.0000  1.0000  0.0
3  id27757  0.9986  0.0014  0.0
4  id04081  1.0000  0.0000  0.0
--------------
2.09490862365
        id     EAP     HPL  MWS
0  id02310  0.0000  0.0000  1.0
1  id24541  1.0000  0.0000  0.0
2  id00134  0.0000  1.0000  0.0
3  id2775