In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

In [2]:
# Clean text
from tqdm import tqdm
tqdm.pandas()
punctuation = ['.', '..', '...', ',', ':', ';', '-', '*', '"', '!', '?']
alphabet = 'abcdefghijklmnopqrstuvwxyz'
def clean_text(x):
    x.lower()
    for p in punctuation:
        x.replace(p, '')
    return x

def extract_features(in_df, train_flag=False):
    df = in_df.copy()
    df['text_cleaned'] = df['text'].apply(lambda x: clean_text(x))
    df['n_.'] = df['text'].str.count('\.')
    df['n_...'] = df['text'].str.count('\...')
    df['n_,'] = df['text'].str.count('\,')
    df['n_:'] = df['text'].str.count('\:')
    df['n_;'] = df['text'].str.count('\;')
    df['n_-'] = df['text'].str.count('\-')
    df['n_?'] = df['text'].str.count('\?')
    df['n_!'] = df['text'].str.count('\!')
    df['n_\''] = df['text'].str.count('\'')
    df['n_"'] = df['text'].str.count('\"')

    # First words in a sentence
    df['n_The '] = df['text'].str.count('The ')
    df['n_I '] = df['text'].str.count('I ')
    df['n_It '] = df['text'].str.count('It ')
    df['n_He '] = df['text'].str.count('He ')
    df['n_Me '] = df['text'].str.count('Me ')
    df['n_She '] = df['text'].str.count('She ')
    df['n_We '] = df['text'].str.count('We ')
    df['n_They '] = df['text'].str.count('They ')
    df['n_You '] = df['text'].str.count('You ')
    df['n_the'] = df['text_cleaned'].str.count('the ')
    df['n_ a '] = df['text_cleaned'].str.count(' a ')
    df['n_appear'] = df['text_cleaned'].str.count('appear')
    df['n_little'] = df['text_cleaned'].str.count('little')
    df['n_was '] = df['text_cleaned'].str.count('was ')
    df['n_one '] = df['text_cleaned'].str.count('one ')
    df['n_two '] = df['text_cleaned'].str.count('two ')
    df['n_three '] = df['text_cleaned'].str.count('three ')
    df['n_ten '] = df['text_cleaned'].str.count('ten ')
    df['n_is '] = df['text_cleaned'].str.count('is ')
    df['n_are '] = df['text_cleaned'].str.count('are ')
    df['n_ed'] = df['text_cleaned'].str.count('ed ')
    df['n_however'] = df['text_cleaned'].str.count('however')
    df['n_ to '] = df['text_cleaned'].str.count(' to ')
    df['n_into'] = df['text_cleaned'].str.count('into')
    df['n_about '] = df['text_cleaned'].str.count('about ')
    df['n_th'] = df['text_cleaned'].str.count('th')
    df['n_er'] = df['text_cleaned'].str.count('er')
    df['n_ex'] = df['text_cleaned'].str.count('ex')
    df['n_an '] = df['text_cleaned'].str.count('an ')
    df['n_ground'] = df['text_cleaned'].str.count('ground')
    df['n_any'] = df['text_cleaned'].str.count('any')
    df['n_silence'] = df['text_cleaned'].str.count('silence')
    df['n_wall'] = df['text_cleaned'].str.count('wall')
    
    # Find numbers of different combinations
    for c in tqdm(alphabet.upper()):
        df['n_' + c] = df['text'].str.count(c)
        df['n_' + c + '.'] = df['text'].str.count(c + '\.')
        df['n_' + c + ','] = df['text'].str.count(c + '\,')

        for c2 in alphabet:
            df['n_' + c + c2] = df['text'].str.count(c + c2)
            df['n_' + c + c2 + '.'] = df['text'].str.count(c + c2 + '\.')
            df['n_' + c + c2 + ','] = df['text'].str.count(c + c2 + '\,')

    for c in tqdm(alphabet):
        df['n_' + c + '.'] = df['text'].str.count(c + '\.')
        df['n_' + c + ','] = df['text'].str.count(c + '\,')
        df['n_' + c + '?'] = df['text'].str.count(c + '\?')
        df['n_' + c + ';'] = df['text'].str.count(c + '\;')
        df['n_' + c + ':'] = df['text'].str.count(c + '\:')

        for c2 in alphabet:
            df['n_' + c + c2 + '.'] = df['text'].str.count(c + c2 + '\.')
            df['n_' + c + c2 + ','] = df['text'].str.count(c + c2 + '\,')
            df['n_' + c + c2 + '?'] = df['text'].str.count(c + c2 + '\?')
            df['n_' + c + c2 + ';'] = df['text'].str.count(c + c2 + '\;')
            df['n_' + c + c2 + ':'] = df['text'].str.count(c + c2 + '\:')
            df['n_' + c + ', ' + c2] = df['text'].str.count(c + '\, ' + c2)

    # And now starting processing of cleaned text
    for c in tqdm(alphabet):
        df['n_' + c] = df['text_cleaned'].str.count(c)
        df['n_' + c + ' '] = df['text_cleaned'].str.count(c + ' ')
        df['n_' + ' ' + c] = df['text_cleaned'].str.count(' ' + c)

        for c2 in alphabet:
            df['n_' + c + c2] = df['text_cleaned'].str.count(c + c2)
            df['n_' + c + c2 + ' '] = df['text_cleaned'].str.count(c + c2 + ' ')
            df['n_' + ' ' + c + c2] = df['text_cleaned'].str.count(' ' + c + c2)
            df['n_' + c + ' ' + c2] = df['text_cleaned'].str.count(c + ' ' + c2)

            for c3 in alphabet:
                df['n_' + c + c2 + c3] = df['text_cleaned'].str.count(c + c2 + c3)
                
    if train_flag:
        df.drop(['text_cleaned','text','author','id'], axis=1, inplace=True)
    else:
        df.drop(['text_cleaned','text','id'], axis=1, inplace=True)
    return df.values
    
print('Processing train...')
train_hand_features = extract_features(train_df,train_flag=True)
print('Processing test...')
test_hand_features = extract_features(test_df)
print(train_hand_features.shape,test_hand_features.shape)

Processing train...


100%|██████████| 26/26 [00:27<00:00,  1.05s/it]
100%|██████████| 26/26 [01:06<00:00,  2.55s/it]
100%|██████████| 26/26 [07:54<00:00, 18.26s/it]


Processing test...


100%|██████████| 26/26 [00:11<00:00,  2.33it/s]
100%|██████████| 26/26 [00:27<00:00,  1.05s/it]
100%|██████████| 26/26 [03:35<00:00,  8.28s/it]


(19579, 26685) (8392, 26685)


In [3]:
# https://spacy.io/usage/models#usage-import
# https://spacy.io/usage/models
import en_core_web_sm
spacy_nlp = en_core_web_sm.load()

# change ne to tag
def get_spacy_text(s):
    pos,tag,dep = '','',''
    for token in spacy_nlp(s):
        pos = pos + ' ' + token.pos_
        tag = tag + ' ' + token.tag_
        dep = dep + ' ' + token.dep_

    return pos,tag,dep

print(get_spacy_text('this is kaggle spooky games.'))

import time
start_t = time.time()
poss,tags,deps = [],[],[]
for s in train_df["text"].values:
    pos,tag,dep = get_spacy_text(s)
    poss.append(pos)
    tags.append(tag)
    deps.append(dep)
train_df['pos_txt'],train_df['tag_txt'],train_df['dep_txt'] = poss, tags, deps
print('train done',time.time() - start_t)


start_t = time.time()
poss,tags,deps = [],[],[]
for s in test_df["text"].values:
    pos,tag,dep = get_spacy_text(s)
    poss.append(pos)
    tags.append(tag)
    deps.append(dep)
test_df['pos_txt'],test_df['tag_txt'],test_df['dep_txt'] = poss, tags, deps
print('test done', time.time() - start_t)

(' DET VERB NOUN ADJ NOUN PUNCT', ' DT VBZ NN JJ NNS .', ' nsubj ROOT nmod amod attr punct')
train done 271.81710720062256
test done 117.49479866027832


In [4]:
# cnt on tag
c_vec3 = CountVectorizer(lowercase=False,ngram_range=(1,1))
c_vec3.fit(train_df['tag_txt'].values.tolist() + test_df['tag_txt'].values.tolist())
train_cvec3 = c_vec3.transform(train_df['tag_txt'].values.tolist()).toarray()
test_cvec3 = c_vec3.transform(test_df['tag_txt'].values.tolist()).toarray()
print(train_cvec3.shape,test_cvec3.shape)

# cnt on ne
c_vec4 = CountVectorizer(lowercase=False,ngram_range=(1,2))
c_vec4.fit(train_df['pos_txt'].values.tolist() + test_df['pos_txt'].values.tolist())
train_cvec4 = c_vec4.transform(train_df['pos_txt'].values.tolist()).toarray()
test_cvec4 = c_vec4.transform(test_df['pos_txt'].values.tolist()).toarray()
print(train_cvec4.shape,test_cvec4.shape)

# cnt on dep
c_vec7 = CountVectorizer(lowercase=False,ngram_range=(1,1))
c_vec7.fit(train_df['dep_txt'].values.tolist() + test_df['dep_txt'].values.tolist())
train_cvec7 = c_vec7.transform(train_df['dep_txt'].values.tolist()).toarray()
test_cvec7 = c_vec7.transform(test_df['dep_txt'].values.tolist()).toarray()
print(train_cvec7.shape,test_cvec7.shape)

# tfidf on tag
tf_vec5 = TfidfVectorizer(lowercase=False,ngram_range=(1,1))
tf_vec5.fit(train_df['tag_txt'].values.tolist() + test_df['tag_txt'].values.tolist())
train_tf5 = tf_vec5.transform(train_df['tag_txt'].values.tolist()).toarray()
test_tf5 = tf_vec5.transform(test_df['tag_txt'].values.tolist()).toarray()
print(train_tf5.shape,test_tf5.shape)

# tfidf on ne
tf_vec6 = TfidfVectorizer(lowercase=False,ngram_range=(1,2))
tf_vec6.fit(train_df['pos_txt'].values.tolist() + test_df['pos_txt'].values.tolist())
train_tf6 = tf_vec6.transform(train_df['pos_txt'].values.tolist()).toarray()
test_tf6 = tf_vec6.transform(test_df['pos_txt'].values.tolist()).toarray()
print(train_tf6.shape,test_tf6.shape)

# tfidf on dep
tf_vec8 = TfidfVectorizer(lowercase=False,ngram_range=(1,1))
tf_vec8.fit(train_df['dep_txt'].values.tolist() + test_df['dep_txt'].values.tolist())
train_tf8 = tf_vec8.transform(train_df['dep_txt'].values.tolist()).toarray()
test_tf8 = tf_vec8.transform(test_df['dep_txt'].values.tolist()).toarray()
print(train_tf8.shape,test_tf8.shape)

(19579, 38) (8392, 38)
(19579, 186) (8392, 186)
(19579, 45) (8392, 45)
(19579, 38) (8392, 38)
(19579, 186) (8392, 186)
(19579, 45) (8392, 45)


In [5]:
all_nlp_train = np.hstack([train_cvec3,train_cvec4,train_tf5,train_tf6,train_cvec7, train_tf8]) 
all_nlp_test = np.hstack([test_cvec3,test_cvec4,test_tf5,test_tf6, test_cvec7, test_tf8]) 
print('nlp feat done')

nlp feat done


In [6]:
eng_stopwords = [
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"]

In [7]:
# replace
# train_df['text'] = train_df['text'].str.replace('[^a-zA-Z0-9]', ' ')
# test_df['text'] =test_df['text'].str.replace('[^a-zA-Z0-9]', ' ')

## Number of words in the text ##
train_df["num_words"] = train_df["text"].apply(lambda x: len(str(x).split()))
test_df["num_words"] = test_df["text"].apply(lambda x: len(str(x).split()))

## Number of unique words in the text ##
train_df["num_unique_words"] = train_df["text"].apply(lambda x: len(set(str(x).split())))
test_df["num_unique_words"] = test_df["text"].apply(lambda x: len(set(str(x).split())))

## Number of characters in the text ##
train_df["num_chars"] = train_df["text"].apply(lambda x: len(str(x)))
test_df["num_chars"] = test_df["text"].apply(lambda x: len(str(x)))

## Number of stopwords in the text ##
train_df["num_stopwords"] = train_df["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))
test_df["num_stopwords"] = test_df["text"].apply(lambda x: len([w for w in str(x).lower().split() if w in eng_stopwords]))

## Number of punctuations in the text ##
import string
train_df["num_punctuations"] =train_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )
test_df["num_punctuations"] =test_df['text'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]) )

## Number of title case words in the text ##
train_df["num_words_upper"] = train_df["text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test_df["num_words_upper"] = test_df["text"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

## Number of title case words in the text ##
train_df["num_words_title"] = train_df["text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test_df["num_words_title"] = test_df["text"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

## Average length of the words in the text ##
train_df["mean_word_len"] = train_df["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test_df["mean_word_len"] = test_df["text"].apply(lambda x: np.mean([len(w) for w in str(x).split()]))

# add features
def add_feat(df):
    df['unique_r'] = df['num_unique_words'] / df['num_words']
    df['w_p'] = df['num_words'] - df['num_punctuations']
    df['w_p_r'] = df['w_p'] / df['num_words']
    df['stop_r'] = df['num_stopwords'] / df['num_words']
    df['w_p_stop'] = df['w_p'] - df['num_stopwords']
    df['w_p_stop_r'] = df['w_p_stop'] / df['num_words']
    df['num_words_upper_r'] = df['num_words_upper'] / df['num_words']
    df['num_words_title_r'] = df['num_words_title'] / df['num_words']

add_feat(train_df)
add_feat(test_df)
print(train_df.columns)

Index(['id', 'text', 'author', 'pos_txt', 'tag_txt', 'dep_txt', 'num_words',
       'num_unique_words', 'num_chars', 'num_stopwords', 'num_punctuations',
       'num_words_upper', 'num_words_title', 'mean_word_len', 'unique_r',
       'w_p', 'w_p_r', 'stop_r', 'w_p_stop', 'w_p_stop_r', 'num_words_upper_r',
       'num_words_title_r'],
      dtype='object')


In [None]:
## Prepare the data for modeling ###
author_mapping_dict = {'EAP':0, 'HPL':1, 'MWS':2}
train_y = train_df['author'].map(author_mapping_dict)
train_id = train_df['id'].values
test_id = test_df['id'].values

## add tfidf and svd
tfidf_vec = TfidfVectorizer(ngram_range=(1,3), max_df=0.8,lowercase=False, sublinear_tf=True)
full_tfidf = tfidf_vec.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
train_tfidf = tfidf_vec.transform(train_df['text'].values.tolist())
test_tfidf = tfidf_vec.transform(test_df['text'].values.tolist())

print(train_tfidf.shape,test_tfidf.shape)

# svd1
n_comp = 20
svd_obj = TruncatedSVD(n_components=n_comp)
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
print(train_svd.shape,test_svd.shape)

## add tfidf char
tfidf_vec = TfidfVectorizer(ngram_range=(3,7), analyzer='char',max_df=0.8, sublinear_tf=True)
full_tfidf2 = tfidf_vec.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
train_tfidf2 = tfidf_vec.transform(train_df['text'].values.tolist())
test_tfidf2 = tfidf_vec.transform(test_df['text'].values.tolist())
print(train_tfidf2.shape,test_tfidf2.shape)

## add svd2
svd_obj = TruncatedSVD(n_components=n_comp)
svd_obj.fit(full_tfidf2)
train_svd2 = pd.DataFrame(svd_obj.transform(train_tfidf2))
test_svd2 = pd.DataFrame(svd_obj.transform(test_tfidf2))
print(train_svd2.shape,test_svd2.shape)


## add cnt vec
c_vec = CountVectorizer(ngram_range=(1,3),max_df=0.8, lowercase=False)
full_cvec1 = c_vec.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
train_cvec = c_vec.transform(train_df['text'].values.tolist())
test_cvec = c_vec.transform(test_df['text'].values.tolist())
print(train_cvec.shape,test_cvec.shape)

## add svd3
svd_obj = TruncatedSVD(n_components=n_comp)
svd_obj.fit(full_cvec1)
train_svd3 = pd.DataFrame(svd_obj.transform(train_cvec))
test_svd3 = pd.DataFrame(svd_obj.transform(test_cvec))

# add cnt char
c_vec = CountVectorizer(ngram_range=(3,7), analyzer='char',max_df=0.8)
full_cvec2 = c_vec.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
train_cvec2 = c_vec.transform(train_df['text'].values.tolist())
test_cvec2 = c_vec.transform(test_df['text'].values.tolist())
print(train_cvec2.shape,test_cvec2.shape)

## add svd4
svd_obj = TruncatedSVD(n_components=n_comp)
svd_obj.fit(full_cvec2)
train_svd4 = pd.DataFrame(svd_obj.transform(train_cvec2))
test_svd4 = pd.DataFrame(svd_obj.transform(test_cvec2))

# add cnt char
c_vec = CountVectorizer(ngram_range=(1,1), analyzer='char',max_df=0.8)
full_cvec3 = c_vec.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
train_cvec3 = c_vec.transform(train_df['text'].values.tolist())
test_cvec3 = c_vec.transform(test_df['text'].values.tolist())
print(train_cvec3.shape,test_cvec3.shape)

In [10]:
all_svd_train = np.hstack([train_svd,train_svd2,train_svd3,train_svd4,train_cvec3.toarray()])
all_svd_test = np.hstack([test_svd,test_svd2,test_svd3,test_svd4,test_cvec3.toarray()])

In [11]:
# add naive feature
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

feat_cnt = 5
train_Y = train_y

def gen_nb_feats(rnd=1):
    help_tfidf_train,help_tfidf_test = np.zeros((19579,3)),np.zeros((8392,3))
    help_tfidf_train2,help_tfidf_test2 = np.zeros((19579,3)),np.zeros((8392,3))
    help_cnt1_train,help_cnt1_test = np.zeros((19579,3)),np.zeros((8392,3))
    help_cnt2_train,help_cnt2_test = np.zeros((19579,3)),np.zeros((8392,3))
    hand_train, hand_test = np.zeros((19579,3)),np.zeros((8392,3))

    kf = KFold(n_splits=feat_cnt, shuffle=True, random_state=23*rnd)
    for train_index, test_index in kf.split(train_tfidf):
        # tfidf to nb
        X_train, X_test = train_tfidf[train_index], train_tfidf[test_index]
        y_train, y_test = train_Y[train_index], train_Y[test_index]
        tmp_model = MultinomialNB(alpha=0.025,fit_prior=False)
        tmp_model.fit(X_train,y_train)
        tmp_train_feat = tmp_model.predict_proba(X_test)
        tmp_test_feat = tmp_model.predict_proba(test_tfidf)
        help_tfidf_train[test_index] = tmp_train_feat
        help_tfidf_test += tmp_test_feat/feat_cnt

        # tfidf to nb
        X_train, X_test = train_tfidf2[train_index], train_tfidf2[test_index]
        tmp_model = MultinomialNB(0.025,fit_prior=False)
        tmp_model.fit(X_train,y_train)
        tmp_train_feat = tmp_model.predict_proba(X_test)
        tmp_test_feat = tmp_model.predict_proba(test_tfidf2)
        help_tfidf_train2[test_index] = tmp_train_feat
        help_tfidf_test2 += tmp_test_feat/feat_cnt

        # count vec to nb
        X_train, X_test = train_cvec[train_index], train_cvec[test_index]
        tmp_model = MultinomialNB(0.025,fit_prior=False)
        tmp_model.fit(X_train,y_train)
        tmp_train_feat = tmp_model.predict_proba(X_test)
        tmp_test_feat = tmp_model.predict_proba(test_cvec)
        help_cnt1_train[test_index] = tmp_train_feat
        help_cnt1_test += tmp_test_feat/feat_cnt

        # count vec2 to nb 
        X_train, X_test = train_cvec2[train_index], train_cvec2[test_index]
        tmp_model = MultinomialNB(0.025,fit_prior=False)
        tmp_model.fit(X_train,y_train)
        tmp_train_feat = tmp_model.predict_proba(X_test)
        tmp_test_feat = tmp_model.predict_proba(test_cvec2)
        help_cnt2_train[test_index] = tmp_train_feat
        help_cnt2_test += tmp_test_feat/feat_cnt
        
        # hand feature to nb
        X_train, X_test = train_hand_features[train_index], train_hand_features[test_index]
        tmp_model = MultinomialNB(0.025,fit_prior=False)
        tmp_model.fit(X_train,y_train)
        tmp_train_feat = tmp_model.predict_proba(X_test)
        tmp_test_feat = tmp_model.predict_proba(test_hand_features)
        hand_train[test_index] = tmp_train_feat
        hand_test += tmp_test_feat/feat_cnt
    
    help_train_feat = np.hstack([help_tfidf_train,help_tfidf_train2,help_cnt1_train,help_cnt2_train,hand_train])
    help_test_feat = np.hstack([help_tfidf_test,help_tfidf_test2,help_cnt1_test,help_cnt2_test,hand_test])

    return help_train_feat,help_test_feat

help_train_feat,help_test_feat = gen_nb_feats(1)
print(help_train_feat.shape,help_test_feat.shape)
help_train_feat2,help_test_feat2 = gen_nb_feats(2)
help_train_feat3,help_test_feat3 = gen_nb_feats(3)

(19579, 15) (8392, 15)


In [12]:
# add cnn feat
from keras.layers import Embedding, CuDNNLSTM, Dense, Flatten, Dropout, LSTM
from keras.models import Sequential, load_model
from keras.callbacks import ModelCheckpoint
from keras.layers import Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from sklearn.metrics import log_loss
import gc
print('import keras done')

Using TensorFlow backend.


import keras done


In [13]:
def get_cnn_feats(rnd=1):
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    best_val_train_pred, best_val_test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    FEAT_CNT = 5
    NUM_WORDS = 30000
    N = 10
    MAX_LEN = 150
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'
    
    tmp_X = train_df['text']
    tmp_Y = train_df['author']
    tmp_X_test = test_df['text']
    
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    ttrain_x = tokenizer.texts_to_sequences(tmp_X)
    ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)
    
    ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
    ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

    lb = preprocessing.LabelBinarizer()
    lb.fit(tmp_Y)

    ttrain_y = lb.transform(tmp_Y)
    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=233*rnd)
    for train_index, test_index in kf.split(train_tfidf):
        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(Conv1D(16,
                         3,
                         padding='valid',
                         activation='relu',
                         strides=1))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(16, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        #model.summary()

        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        np.random.seed(42) # for model train
        model.fit(ttrain_x[train_index], ttrain_y[train_index], 
                  validation_split=0.1,
                  batch_size=64, epochs=10, 
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )
 
        # save feat
        train_pred[test_index] = model.predict(ttrain_x[test_index])
        test_pred += model.predict(ttest_x)/feat_cnt
        
        # best val model
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(ttrain_x[test_index])
        best_val_test_pred += model.predict(ttest_x)/feat_cnt
        
        # release
        del model
        gc.collect()
        print('------------------')
        
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def cnn done')

cnn_train1,cnn_test1,cnn_train2,cnn_test2 = get_cnn_feats(1)
cnn_train3,cnn_test3,cnn_train4,cnn_test4 = get_cnn_feats(2)
cnn_train5,cnn_test5,cnn_train6,cnn_test6 = get_cnn_feats(3)

def cnn done
Train on 14096 samples, validate on 1567 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 1.04313, saving model to /tmp/nn_model.h5
 - 3s - loss: 1.0787 - acc: 0.4097 - val_loss: 1.0431 - val_acc: 0.4710
Epoch 2/10
Epoch 00002: val_loss improved from 1.04313 to 0.76624, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.9084 - acc: 0.5828 - val_loss: 0.7662 - val_acc: 0.7045
Epoch 3/10
Epoch 00003: val_loss improved from 0.76624 to 0.56726, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.6142 - acc: 0.7701 - val_loss: 0.5673 - val_acc: 0.7951
Epoch 4/10
Epoch 00004: val_loss improved from 0.56726 to 0.47139, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.4174 - acc: 0.8587 - val_loss: 0.4714 - val_acc: 0.8258
Epoch 5/10
Epoch 00005: val_loss improved from 0.47139 to 0.43201, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2999 - acc: 0.8991 - val_loss: 0.4320 - val_acc: 0.8315
Epoch 6/10
Epoch 00006: val_loss improved from 0.43201 to 0.42545, saving model to

Epoch 3/10
Epoch 00003: val_loss improved from 0.81015 to 0.55608, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.6406 - acc: 0.7605 - val_loss: 0.5561 - val_acc: 0.7830
Epoch 4/10
Epoch 00004: val_loss improved from 0.55608 to 0.44470, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.4092 - acc: 0.8610 - val_loss: 0.4447 - val_acc: 0.8315
Epoch 5/10
Epoch 00005: val_loss improved from 0.44470 to 0.41083, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2840 - acc: 0.9052 - val_loss: 0.4108 - val_acc: 0.8405
Epoch 6/10
Epoch 00006: val_loss improved from 0.41083 to 0.40680, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2117 - acc: 0.9313 - val_loss: 0.4068 - val_acc: 0.8417
Epoch 7/10
Epoch 00007: val_loss did not improve
 - 1s - loss: 0.1606 - acc: 0.9537 - val_loss: 0.4205 - val_acc: 0.8411
Epoch 8/10
Epoch 00008: val_loss did not improve
 - 1s - loss: 0.1259 - acc: 0.9627 - val_loss: 0.4379 - val_acc: 0.8354
Epoch 9/10
Epoch 00009: val_loss did not improve
 - 1s - loss: 0.1040 

Epoch 6/10
Epoch 00006: val_loss did not improve
 - 1s - loss: 0.2013 - acc: 0.9344 - val_loss: 0.3931 - val_acc: 0.8494
Epoch 7/10
Epoch 00007: val_loss did not improve
 - 1s - loss: 0.1558 - acc: 0.9519 - val_loss: 0.4068 - val_acc: 0.8468
Epoch 8/10
Epoch 00008: val_loss did not improve
 - 1s - loss: 0.1279 - acc: 0.9629 - val_loss: 0.4249 - val_acc: 0.8468
Epoch 9/10
Epoch 00009: val_loss did not improve
 - 1s - loss: 0.1036 - acc: 0.9672 - val_loss: 0.4512 - val_acc: 0.8424
Epoch 10/10
Epoch 00010: val_loss did not improve
 - 1s - loss: 0.0874 - acc: 0.9754 - val_loss: 0.4791 - val_acc: 0.8379
------------------
Train on 14096 samples, validate on 1567 samples
Epoch 1/10
Epoch 00001: val_loss improved from inf to 1.04839, saving model to /tmp/nn_model.h5
 - 2s - loss: 1.0825 - acc: 0.4101 - val_loss: 1.0484 - val_acc: 0.4703
Epoch 2/10
Epoch 00002: val_loss improved from 1.04839 to 0.73035, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.8980 - acc: 0.6041 - val_loss: 0.7303 - va

In [14]:
# add lstm feat
def get_lstm_feats(rnd=1):
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    best_val_train_pred, best_val_test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    FEAT_CNT = 5
    NUM_WORDS = 16000
    N = 12
    MAX_LEN = 300
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'
    
    tmp_X = train_df['text']
    tmp_Y = train_df['author']
    tmp_X_test = test_df['text']
    
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    ttrain_x = tokenizer.texts_to_sequences(tmp_X)
    ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)
    
    ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
    ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

    lb = preprocessing.LabelBinarizer()
    lb.fit(tmp_Y)

    ttrain_y = lb.transform(tmp_Y)
    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=2333*rnd)
    for train_index, test_index in kf.split(train_tfidf):
        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(LSTM(N, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
        model.add(Flatten())
        model.add(Dense(64, activation='relu'))
        model.add(Dropout(0.2))
        model.add(Dense(NUM_CLASSES, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        #model.summary()

        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        np.random.seed(42) # for model train
        model.fit(ttrain_x[train_index], ttrain_y[train_index], 
                  validation_split=0.1,
                  batch_size=128, epochs=6, 
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )
        # save feat
        train_pred[test_index] = model.predict(ttrain_x[test_index])
        test_pred += model.predict(ttest_x)/feat_cnt
        
        # best val model
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(ttrain_x[test_index])
        best_val_test_pred += model.predict(ttest_x)/feat_cnt
        
        del model
        gc.collect()
        print('------------------')
        
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def lstm done')
lstm_train1,lstm_test1,lstm_train2,lstm_test2 = get_lstm_feats(1)

def lstm done
Train on 14096 samples, validate on 1567 samples
Epoch 1/6
Epoch 00001: val_loss improved from inf to 0.79683, saving model to /tmp/nn_model.h5
 - 40s - loss: 1.0284 - acc: 0.4729 - val_loss: 0.7968 - val_acc: 0.6682
Epoch 2/6
Epoch 00002: val_loss improved from 0.79683 to 0.48785, saving model to /tmp/nn_model.h5
 - 39s - loss: 0.5572 - acc: 0.7762 - val_loss: 0.4879 - val_acc: 0.8079
Epoch 3/6
Epoch 00003: val_loss improved from 0.48785 to 0.43386, saving model to /tmp/nn_model.h5
 - 40s - loss: 0.3120 - acc: 0.8851 - val_loss: 0.4339 - val_acc: 0.8315
Epoch 4/6
Epoch 00004: val_loss did not improve
 - 40s - loss: 0.1997 - acc: 0.9293 - val_loss: 0.4689 - val_acc: 0.8379
Epoch 5/6
Epoch 00005: val_loss did not improve
 - 39s - loss: 0.1420 - acc: 0.9496 - val_loss: 0.4953 - val_acc: 0.8366
Epoch 6/6
Epoch 00006: val_loss did not improve
 - 39s - loss: 0.1065 - acc: 0.9652 - val_loss: 0.5518 - val_acc: 0.8398
------------------
Train on 14096 samples, validate on 1567 sa

In [15]:
def get_nn_feats(rnd=1):
    # return train pred prob and test pred prob 
    train_pred, test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    best_val_train_pred, best_val_test_pred = np.zeros((19579,3)),np.zeros((8392,3))
    FEAT_CNT = 5
    NUM_WORDS = 30000
    N = 10
    MAX_LEN = 100
    NUM_CLASSES = 3
    MODEL_P = '/tmp/nn_model.h5'
    
    tmp_X = train_df['text']
    tmp_Y = train_df['author']
    tmp_X_test = test_df['text']
    
    tokenizer = Tokenizer(num_words=NUM_WORDS)
    tokenizer.fit_on_texts(tmp_X)

    ttrain_x = tokenizer.texts_to_sequences(tmp_X)
    ttrain_x = pad_sequences(ttrain_x, maxlen=MAX_LEN)
    
    ttest_x = tokenizer.texts_to_sequences(tmp_X_test)
    ttest_x = pad_sequences(ttest_x, maxlen=MAX_LEN)

    lb = preprocessing.LabelBinarizer()
    lb.fit(tmp_Y)

    ttrain_y = lb.transform(tmp_Y)
    kf = KFold(n_splits=FEAT_CNT, shuffle=True, random_state=233*rnd)
    for train_index, test_index in kf.split(train_tfidf):
        model = Sequential()
        model.add(Embedding(NUM_WORDS, N, input_length=MAX_LEN))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(30, activation='relu'))
        model.add(Dropout(0.1))
        model.add(Dense(NUM_CLASSES, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        #model.summary()

        model_chk = ModelCheckpoint(filepath=MODEL_P, monitor='val_loss', save_best_only=True, verbose=1)
        np.random.seed(42) # for model train
        model.fit(ttrain_x[train_index], ttrain_y[train_index], 
                  validation_split=0.3,
                  batch_size=64, epochs=20, 
                  verbose=2,
                  callbacks=[model_chk],
                  shuffle=False
                 )
 
        # save feat
        train_pred[test_index] = model.predict(ttrain_x[test_index])
        test_pred += model.predict(ttest_x)/feat_cnt
        
        # best val model
        model = load_model(MODEL_P)
        best_val_train_pred[test_index] = model.predict(ttrain_x[test_index])
        best_val_test_pred += model.predict(ttest_x)/feat_cnt
        
        # release
        del model
        gc.collect()
        print('------------------')
        
    return train_pred,test_pred,best_val_train_pred,best_val_test_pred

print('def cnn done')

nn_train1,nn_test1,nn_train2,nn_test2 = get_nn_feats(4)
nn_train3,nn_test3,nn_train4,nn_test4 = get_nn_feats(5)
nn_train5,nn_test5,nn_train6,nn_test6 = get_nn_feats(6)


all_nn_train = np.hstack([lstm_train1, lstm_train2, 
                          cnn_train1, cnn_train2,cnn_train3, cnn_train4,cnn_train5, cnn_train6,
                          nn_train1,nn_train2,nn_train3,nn_train4,nn_train5,nn_train6
                         ])
all_nn_test = np.hstack([lstm_test1, lstm_test2, 
                         cnn_test1, cnn_test2,cnn_test3, cnn_test4,cnn_test5, cnn_test6,
                         nn_test1,nn_test2,nn_test3,nn_test4,nn_test5,nn_test6
])

def cnn done
Train on 10964 samples, validate on 4699 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 1.07739, saving model to /tmp/nn_model.h5
 - 4s - loss: 1.0881 - acc: 0.3928 - val_loss: 1.0774 - val_acc: 0.4035
Epoch 2/20
Epoch 00002: val_loss improved from 1.07739 to 0.99465, saving model to /tmp/nn_model.h5
 - 1s - loss: 1.0473 - acc: 0.4470 - val_loss: 0.9946 - val_acc: 0.5482
Epoch 3/20
Epoch 00003: val_loss improved from 0.99465 to 0.79467, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.8818 - acc: 0.6535 - val_loss: 0.7947 - val_acc: 0.6995
Epoch 4/20
Epoch 00004: val_loss improved from 0.79467 to 0.64289, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.6579 - acc: 0.7761 - val_loss: 0.6429 - val_acc: 0.7529
Epoch 5/20
Epoch 00005: val_loss improved from 0.64289 to 0.55402, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.5033 - acc: 0.8332 - val_loss: 0.5540 - val_acc: 0.7851
Epoch 6/20
Epoch 00006: val_loss improved from 0.55402 to 0.50130, saving model to

Epoch 13/20
Epoch 00013: val_loss did not improve
 - 1s - loss: 0.1278 - acc: 0.9667 - val_loss: 0.4349 - val_acc: 0.8329
Epoch 14/20
Epoch 00014: val_loss did not improve
 - 1s - loss: 0.1121 - acc: 0.9709 - val_loss: 0.4409 - val_acc: 0.8317
Epoch 15/20
Epoch 00015: val_loss did not improve
 - 1s - loss: 0.0987 - acc: 0.9754 - val_loss: 0.4523 - val_acc: 0.8300
Epoch 16/20
Epoch 00016: val_loss did not improve
 - 1s - loss: 0.0867 - acc: 0.9774 - val_loss: 0.4610 - val_acc: 0.8295
Epoch 17/20
Epoch 00017: val_loss did not improve
 - 1s - loss: 0.0763 - acc: 0.9818 - val_loss: 0.4722 - val_acc: 0.8287
Epoch 18/20
Epoch 00018: val_loss did not improve
 - 1s - loss: 0.0677 - acc: 0.9845 - val_loss: 0.4830 - val_acc: 0.8278
Epoch 19/20
Epoch 00019: val_loss did not improve
 - 1s - loss: 0.0601 - acc: 0.9870 - val_loss: 0.4980 - val_acc: 0.8268
Epoch 20/20
Epoch 00020: val_loss did not improve
 - 1s - loss: 0.0524 - acc: 0.9881 - val_loss: 0.5062 - val_acc: 0.8259
------------------
Train

Epoch 00007: val_loss improved from 0.50758 to 0.47506, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.3385 - acc: 0.8933 - val_loss: 0.4751 - val_acc: 0.8074
Epoch 8/20
Epoch 00008: val_loss improved from 0.47506 to 0.45422, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2809 - acc: 0.9129 - val_loss: 0.4542 - val_acc: 0.8159
Epoch 9/20
Epoch 00009: val_loss improved from 0.45422 to 0.43940, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2359 - acc: 0.9303 - val_loss: 0.4394 - val_acc: 0.8221
Epoch 10/20
Epoch 00010: val_loss improved from 0.43940 to 0.43599, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2010 - acc: 0.9412 - val_loss: 0.4360 - val_acc: 0.8261
Epoch 11/20
Epoch 00011: val_loss improved from 0.43599 to 0.43303, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.1731 - acc: 0.9506 - val_loss: 0.4330 - val_acc: 0.8295
Epoch 12/20
Epoch 00012: val_loss did not improve
 - 1s - loss: 0.1497 - acc: 0.9578 - val_loss: 0.4386 - val_acc: 0.8283
Epoch 13/20
Epoch 00013: va

 - 4s - loss: 1.0853 - acc: 0.4052 - val_loss: 1.0738 - val_acc: 0.4126
Epoch 2/20
Epoch 00002: val_loss improved from 1.07375 to 0.99611, saving model to /tmp/nn_model.h5
 - 1s - loss: 1.0467 - acc: 0.4424 - val_loss: 0.9961 - val_acc: 0.5099
Epoch 3/20
Epoch 00003: val_loss improved from 0.99611 to 0.81897, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.9006 - acc: 0.6018 - val_loss: 0.8190 - val_acc: 0.6914
Epoch 4/20
Epoch 00004: val_loss improved from 0.81897 to 0.66220, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.6957 - acc: 0.7592 - val_loss: 0.6622 - val_acc: 0.7674
Epoch 5/20
Epoch 00005: val_loss improved from 0.66220 to 0.56738, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.5343 - acc: 0.8253 - val_loss: 0.5674 - val_acc: 0.7880
Epoch 6/20
Epoch 00006: val_loss improved from 0.56738 to 0.51313, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.4251 - acc: 0.8628 - val_loss: 0.5131 - val_acc: 0.7978
Epoch 7/20
Epoch 00007: val_loss improved from 0.51313 to 0.48126, s

Epoch 15/20
Epoch 00015: val_loss did not improve
 - 1s - loss: 0.1020 - acc: 0.9744 - val_loss: 0.4379 - val_acc: 0.8300
Epoch 16/20
Epoch 00016: val_loss did not improve
 - 1s - loss: 0.0906 - acc: 0.9786 - val_loss: 0.4479 - val_acc: 0.8306
Epoch 17/20
Epoch 00017: val_loss did not improve
 - 1s - loss: 0.0805 - acc: 0.9806 - val_loss: 0.4575 - val_acc: 0.8298
Epoch 18/20
Epoch 00018: val_loss did not improve
 - 1s - loss: 0.0702 - acc: 0.9844 - val_loss: 0.4644 - val_acc: 0.8323
Epoch 19/20
Epoch 00019: val_loss did not improve
 - 1s - loss: 0.0630 - acc: 0.9866 - val_loss: 0.4792 - val_acc: 0.8287
Epoch 20/20
Epoch 00020: val_loss did not improve
 - 1s - loss: 0.0553 - acc: 0.9869 - val_loss: 0.4943 - val_acc: 0.8289
------------------
Train on 10964 samples, validate on 4699 samples
Epoch 1/20
Epoch 00001: val_loss improved from inf to 1.07575, saving model to /tmp/nn_model.h5
 - 5s - loss: 1.0850 - acc: 0.4062 - val_loss: 1.0758 - val_acc: 0.4046
Epoch 2/20
Epoch 00002: val_loss

Epoch 9/20
Epoch 00009: val_loss improved from 0.47006 to 0.44913, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2508 - acc: 0.9226 - val_loss: 0.4491 - val_acc: 0.8215
Epoch 10/20
Epoch 00010: val_loss improved from 0.44913 to 0.44596, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.2152 - acc: 0.9372 - val_loss: 0.4460 - val_acc: 0.8238
Epoch 11/20
Epoch 00011: val_loss improved from 0.44596 to 0.44069, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.1853 - acc: 0.9448 - val_loss: 0.4407 - val_acc: 0.8274
Epoch 12/20
Epoch 00012: val_loss did not improve
 - 1s - loss: 0.1601 - acc: 0.9558 - val_loss: 0.4417 - val_acc: 0.8291
Epoch 13/20
Epoch 00013: val_loss improved from 0.44069 to 0.43350, saving model to /tmp/nn_model.h5
 - 1s - loss: 0.1388 - acc: 0.9619 - val_loss: 0.4335 - val_acc: 0.8315
Epoch 14/20
Epoch 00014: val_loss did not improve
 - 1s - loss: 0.1206 - acc: 0.9673 - val_loss: 0.4426 - val_acc: 0.8291
Epoch 15/20
Epoch 00015: val_loss did not improve
 - 1s - loss: 0

In [25]:
# combine feats
cols_to_drop = ['id','text','tag_txt','pos_txt','dep_txt']
train_X = train_df.drop(cols_to_drop+['author'], axis=1).values
test_X = test_df.drop(cols_to_drop, axis=1).values
train_X = np.hstack([train_X, all_svd_train, all_nlp_train])
test_X = np.hstack([test_X, all_svd_test, all_nlp_test])

f_train_X = np.hstack([train_X, help_train_feat,help_train_feat2,help_train_feat3,all_nn_train])
#f_train_X = np.round(f_train_X,4)
f_test_X = np.hstack([test_X, help_test_feat,help_test_feat2,help_test_feat3,all_nn_test])
#f_test_X = np.round(f_test_X,4)
print(f_train_X.shape, f_test_X.shape)

(19579, 762) (8392, 762)


In [26]:
import pickle
with open('feat.pkl','wb') as fout:
    pickle.dump([f_train_X,f_test_X],fout)
print('dump for xgb')

dump for xgb


In [27]:
from sklearn.model_selection import StratifiedKFold
def cv_test(k_cnt=3, s_flag = False):
    rnd = 42
    if s_flag:
        kf = StratifiedKFold(n_splits=k_cnt, shuffle=True, random_state=rnd)
    else:
        kf = KFold(n_splits=k_cnt, shuffle=True, random_state=rnd)
    test_pred = None
    weighted_test_pred = None
    org_train_pred = None
    avg_k_score = 0
    reverse_score = 0
    best_loss = 100
    best_single_pred = None
    for train_index, test_index in kf.split(f_train_X,train_Y):
        X_train, X_test = f_train_X[train_index], f_train_X[test_index]
        y_train, y_test = train_Y[train_index], train_Y[test_index]
        params = {
                'colsample_bytree': 0.7,
                'subsample': 0.8,
                'eta': 0.04,
                'max_depth': 3,
                'eval_metric':'mlogloss',
                'objective':'multi:softprob',
                'num_class':3,
                }
        
        # def mat
        d_train = xgb.DMatrix(X_train, y_train)
        d_valid = xgb.DMatrix(X_test, y_test)
        d_test = xgb.DMatrix(f_test_X)
        
        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        # train model
        m = xgb.train(params, d_train, 2000, watchlist, 
                        early_stopping_rounds=50,
                        verbose_eval=200)
        
        # get res
        train_pred = m.predict(d_train)
        valid_pred = m.predict(d_valid)
        tmp_train_pred = m.predict(xgb.DMatrix(f_train_X))
        
        # cal score
        train_score = log_loss(y_train,train_pred)
        valid_score = log_loss(y_test,valid_pred)
        print('train log loss',train_score,'valid log loss',valid_score)
        avg_k_score += valid_score
        rev_valid_score = 1.0/valid_score # use for weighted
        reverse_score += rev_valid_score # sum
        print('rev',rev_valid_score)
        
        if test_pred is None:
            test_pred = m.predict(d_test)
            weighted_test_pred = test_pred*rev_valid_score
            org_train_pred = tmp_train_pred
            best_loss = valid_score
            best_single_pred = test_pred
        else:
            curr_pred = m.predict(d_test)
            test_pred += curr_pred
            weighted_test_pred += curr_pred*rev_valid_score # fix bug here
            org_train_pred += tmp_train_pred
            # find better single model
            if valid_score < best_loss:
                print('BETTER')
                best_loss = valid_score
                best_single_pred = curr_pred

    # avg
    test_pred = test_pred / k_cnt
    test_pred = np.round(test_pred,4)
    org_train_pred = org_train_pred / k_cnt
    avg_k_score = avg_k_score/k_cnt

    submiss=pd.read_csv("./input/sample_submission.csv")
    submiss['EAP']=test_pred[:,0]
    submiss['HPL']=test_pred[:,1]
    submiss['MWS']=test_pred[:,2]
    submiss.to_csv("results/xgb_res_{}_{}.csv".format(k_cnt, s_flag),index=False)
    print(submiss.head(5))
    print('--------------')
    print(reverse_score)
    # weigthed
    submiss=pd.read_csv("./input/sample_submission.csv")
    weighted_test_pred = weighted_test_pred / reverse_score
    weighted_test_pred = np.round(weighted_test_pred,4)
    submiss['EAP']=weighted_test_pred[:,0]
    submiss['HPL']=weighted_test_pred[:,1]
    submiss['MWS']=weighted_test_pred[:,2]
    submiss.to_csv("results/weighted_xgb_res_{}_{}.csv".format(k_cnt, s_flag),index=False)
    print(submiss.head(5))
    print('---------------')
    # best single
    submiss=pd.read_csv("./input/sample_submission.csv")
    weighted_test_pred = np.round(best_single_pred,4)
    submiss['EAP']=weighted_test_pred[:,0]
    submiss['HPL']=weighted_test_pred[:,1]
    submiss['MWS']=weighted_test_pred[:,2]
    submiss.to_csv("results/single_xgb_res_{}_{}.csv".format(k_cnt, s_flag),index=False)
    print(submiss.head(5))
    print('---------------')
    
    # train log loss
    print('local average valid loss',avg_k_score)
    print('train log loss', log_loss(train_Y,org_train_pred))
    
print('def done')

def done


In [28]:
cv_test(5, True)

[0]	train-mlogloss:1.055	valid-mlogloss:1.05602
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[200]	train-mlogloss:0.22196	valid-mlogloss:0.280404
[400]	train-mlogloss:0.179357	valid-mlogloss:0.270178
Stopping. Best iteration:
[527]	train-mlogloss:0.160523	valid-mlogloss:0.268975

train log loss 0.153875230434 valid log loss 0.269190198116
rev 3.71484551442
[0]	train-mlogloss:1.05507	valid-mlogloss:1.05581
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[200]	train-mlogloss:0.22299	valid-mlogloss:0.272222


KeyboardInterrupt: 

In [None]:
cv_test(10, True)
# 276xx not good