In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

# Load the competition splits; paths are relative to the notebook's working directory.
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

# Disabled experiment: replace every non-alphanumeric character with a space
# before computing any feature.
# train_df['text'] = train_df['text'].str.replace('[^a-zA-Z0-9]', ' ')
# test_df['text'] =test_df['text'].str.replace('[^a-zA-Z0-9]', ' ')

# The same three size features are added to both frames, in the same column order.
for frame in (train_df, test_df):
    # Whitespace tokenisation, done once and shared by the two word-level counts.
    tokenized = frame["text"].apply(lambda raw: str(raw).split())

    ## Number of words in the text ##
    frame["num_words"] = tokenized.apply(len)

    ## Number of unique words in the text ##
    frame["num_unique_words"] = tokenized.apply(lambda toks: len(set(toks)))

    ## Number of characters in the text ##
    frame["num_chars"] = frame["text"].apply(lambda raw: len(str(raw)))

## Number of stopwords in the text ##
# Lower-case English stop words; kept as a list so any other cell that reads
# `eng_stopwords` still sees the same object type.
eng_stopwords = [
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
    "yourselves"]

# `tok in list` scans the whole list for every token of every row; a frozenset
# gives constant-time membership and produces exactly the same counts.
eng_stopword_set = frozenset(eng_stopwords)

# Tokens are lower-cased before the lookup because the stop-word list is lower-case.
for frame in (train_df, test_df):
    frame["num_stopwords"] = frame["text"].apply(
        lambda raw: sum(1 for tok in str(raw).lower().split() if tok in eng_stopword_set)
    )

import string

# The four features below are added to both frames, in the same column order.
for frame in (train_df, test_df):
    as_text = frame["text"].apply(str)
    words = as_text.apply(str.split)  # whitespace tokens

    ## Number of punctuation characters in the text ##
    frame["num_punctuations"] = as_text.apply(
        lambda s: sum(1 for ch in s if ch in string.punctuation)
    )

    ## Number of upper case words in the text ##
    frame["num_words_upper"] = words.apply(lambda ws: sum(1 for w in ws if w.isupper()))

    ## Number of title case words in the text ##
    frame["num_words_title"] = words.apply(lambda ws: sum(1 for w in ws if w.istitle()))

    ## Average length of the words in the text ##
    # np.mean of an empty token list yields NaN (with a RuntimeWarning).
    frame["mean_word_len"] = words.apply(lambda ws: np.mean([len(w) for w in ws]))

In [2]:
# Preview the first rows of the training frame, including the feature columns added above.
train_df.head()

Unnamed: 0,id,text,author,num_words,num_unique_words,num_chars,num_stopwords,num_punctuations,num_words_upper,num_words_title,mean_word_len
0,id26305,"This process, however, afforded me no means of...",EAP,41,35,231,23,7,2,3,4.658537
1,id17569,It never once occurred to me that the fumbling...,HPL,14,14,71,10,1,0,1,4.142857
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,36,32,200,16,5,0,1,4.583333
3,id27763,How lovely is spring As we looked from Windsor...,MWS,34,32,206,14,4,0,4,5.088235
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,27,25,174,13,4,0,2,5.481481


In [9]:
## Prepare the data for modeling ###
# Row identifiers of both splits, kept aside as plain arrays.
train_id, test_id = train_df['id'].values, test_df['id'].values

# Integer-encode the author label; an author missing from the mapping becomes NaN.
author_mapping_dict = dict(EAP=0, HPL=1, MWS=2)
train_y = train_df['author'].map(author_mapping_dict)

## add tfidf and svd
# Word-level TF-IDF over 1- to 3-grams; vocabulary and IDF weights are learned
# from the train and test text together.
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_df=0.8,lowercase=False, sublinear_tf=True)
n_train_rows = len(train_df)
full_tfidf = tfidf_vec.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
# full_tfidf holds the train documents followed by the test documents, so the
# two splits are row slices of it; calling transform() again would vectorise
# every document a second time for the same features.
train_tfidf = full_tfidf[:n_train_rows]
test_tfidf = full_tfidf[n_train_rows:]

print(train_tfidf.shape,test_tfidf.shape)

# svd1
# Project the sparse TF-IDF matrix onto n_comp dense components.
# random_state is fixed so a Restart & Run All reproduces the same components.
n_comp = 30
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack', random_state=2017)
svd_obj.fit(full_tfidf)
train_svd = pd.DataFrame(svd_obj.transform(train_tfidf))
test_svd = pd.DataFrame(svd_obj.transform(test_tfidf))
print(train_svd.shape,test_svd.shape)

## add tfidf char
# Character-level TF-IDF over 3- to 7-grams, fitted on train and test text together.
# stop_words is intentionally not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it had no effect on the features.
tfidf_vec2 = TfidfVectorizer(ngram_range=(3,7), analyzer='char',max_df=0.8, sublinear_tf=True)
n_train_rows = len(train_df)
full_tfidf2 = tfidf_vec2.fit_transform(train_df['text'].values.tolist() + test_df['text'].values.tolist())
# Train documents come first in full_tfidf2, test documents after; slice
# instead of vectorising every document a second time.
train_tfidf2 = full_tfidf2[:n_train_rows]
test_tfidf2 = full_tfidf2[n_train_rows:]
print(train_tfidf2.shape,test_tfidf2.shape)

## add svd2
# Same dimensionality reduction as for the word-level matrix; note that this
# rebinds `n_comp` and `svd_obj`. random_state is fixed for reproducibility.
n_comp = 30
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack', random_state=2017)
svd_obj.fit(full_tfidf2)
train_svd2 = pd.DataFrame(svd_obj.transform(train_tfidf2))
test_svd2 = pd.DataFrame(svd_obj.transform(test_tfidf2))
print(train_svd2.shape,test_svd2.shape)



(19579, 591993) (8392, 591993)
(19579, 30) (8392, 30)
(19579, 1354551) (8392, 1354551)
(19579, 30) (8392, 30)


In [None]:
## add cnt vec
# Raw word 1- to 3-gram counts; the vocabulary is learned from train and test text together.
all_text = train_df['text'].values.tolist() + test_df['text'].values.tolist()
c_vec = CountVectorizer(stop_words='english',ngram_range=(1,3),max_df=0.8,lowercase=False).fit(all_text)
train_cvec, test_cvec = (
    c_vec.transform(frame['text'].values.tolist()) for frame in (train_df, test_df)
)
print(train_cvec.shape,test_cvec.shape)

# add cnt char
# Raw character 3- to 7-gram counts, vocabulary learned from train and test text together.
# stop_words is not passed: scikit-learn only applies it when analyzer == 'word',
# so with analyzer='char' it never influenced the result.
c_vec2 = CountVectorizer(ngram_range=(3,7), analyzer='char',max_df=0.8)
c_vec2.fit(train_df['text'].values.tolist() + test_df['text'].values.tolist())
train_cvec2 = c_vec2.transform(train_df['text'].values.tolist())
test_cvec2 = c_vec2.transform(test_df['text'].values.tolist())
print(train_cvec2.shape,test_cvec2.shape)

In [10]:
# Dense feature matrix: every remaining train_df/test_df column (identifier,
# raw text and target removed), followed by the two SVD projections.
cols_to_drop = ['id', 'text']
train_counts = train_df.drop(cols_to_drop+['author'], axis=1)
test_counts = test_df.drop(cols_to_drop, axis=1)
train_X, test_X = train_counts.values, test_counts.values
print(train_X.shape, test_X.shape)

# Append the word-level and char-level SVD components column-wise.
train_X, test_X = (
    np.hstack(parts)
    for parts in ([train_X, train_svd, train_svd2], [test_X, test_svd, test_svd2])
)
print(train_X.shape, test_X.shape)

(19579, 8) (8392, 8)
(19579, 68) (8392, 68)


In [11]:
# add naive feature
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

feat_cnt = 5
train_Y = train_y


def _nb_oof_proba(train_mat, test_mat, labels, folds, alpha=0.05):
    """Out-of-fold MultinomialNB class probabilities for one feature matrix.

    For each (fit_idx, holdout_idx) pair in `folds`, a MultinomialNB
    (fit_prior=False) is fitted on train_mat[fit_idx]; its probabilities for
    train_mat[holdout_idx] fill the matching rows of the train output, and its
    probabilities for test_mat are averaged over all folds.

    Parameters
    ----------
    train_mat, test_mat : row-indexable feature matrices with the same columns.
    labels : 1-D numpy array of class labels aligned with train_mat rows.
    folds : list of (fit_idx, holdout_idx) positional index arrays.
    alpha : additive smoothing passed to MultinomialNB.

    Returns
    -------
    (train_proba, test_proba) : arrays of shape (n_rows, n_classes).
    """
    n_classes = np.unique(labels).size
    # Sized from the data instead of hard-coded row counts.
    train_proba = np.zeros((train_mat.shape[0], n_classes))
    test_proba = np.zeros((test_mat.shape[0], n_classes))
    for fit_idx, holdout_idx in folds:
        # alpha is passed by keyword: current scikit-learn rejects it positionally.
        model = MultinomialNB(alpha=alpha, fit_prior=False)
        model.fit(train_mat[fit_idx], labels[fit_idx])
        train_proba[holdout_idx] = model.predict_proba(train_mat[holdout_idx])
        test_proba += model.predict_proba(test_mat) / len(folds)
    return train_proba, test_proba


kf = KFold(n_splits=feat_cnt, shuffle=True, random_state=2017)
# Materialise the splits once so all four feature families use identical folds.
nb_folds = list(kf.split(train_tfidf))
# KFold yields positional indices; a plain array avoids label-based Series lookup.
nb_labels = np.asarray(train_Y)

# tfidf (word) to nb
help_tfidf_train, help_tfidf_test = _nb_oof_proba(train_tfidf, test_tfidf, nb_labels, nb_folds)
# tfidf (char) to nb
help_tfidf_train2, help_tfidf_test2 = _nb_oof_proba(train_tfidf2, test_tfidf2, nb_labels, nb_folds)
# count vec (word) to nb
help_cnt1_train, help_cnt1_test = _nb_oof_proba(train_cvec, test_cvec, nb_labels, nb_folds)
# count vec (char) to nb
help_cnt2_train, help_cnt2_test = _nb_oof_proba(train_cvec2, test_cvec2, nb_labels, nb_folds)

# Column order: word tf-idf, char tf-idf, word counts, char counts.
help_train_feat = np.round(np.hstack([help_tfidf_train,help_tfidf_train2,help_cnt1_train,help_cnt2_train]),3)
help_test_feat = np.round(np.hstack([help_tfidf_test,help_tfidf_test2,help_cnt1_test,help_cnt2_test]),3)

print(help_train_feat.shape,help_test_feat.shape)
print(help_train_feat[:5])
print(help_test_feat[:5])

(19579, 12) (8392, 12)
[[ 0.659  0.153  0.189  0.999  0.     0.     1.     0.     0.     1.     0.
   0.   ]
 [ 0.422  0.453  0.126  0.753  0.204  0.043  0.197  0.802  0.     1.     0.
   0.   ]
 [ 0.78   0.167  0.053  0.994  0.006  0.     1.     0.     0.     1.     0.
   0.   ]
 [ 0.041  0.054  0.905  0.     0.     1.     0.     0.     1.     0.     0.
   1.   ]
 [ 0.322  0.416  0.262  0.918  0.066  0.016  0.162  0.792  0.046  1.     0.
   0.   ]]
[[ 0.208  0.114  0.678  0.001  0.     0.999  0.     0.     1.     0.     0.
   1.   ]
 [ 0.488  0.243  0.27   1.     0.     0.     0.987  0.     0.013  1.     0.
   0.   ]
 [ 0.321  0.544  0.135  0.008  0.992  0.     0.162  0.838  0.     0.     1.
   0.   ]
 [ 0.531  0.385  0.084  0.32   0.68   0.     0.799  0.201  0.     0.     1.
   0.   ]
 [ 0.583  0.255  0.162  0.777  0.036  0.187  0.963  0.031  0.006  0.887  0.
   0.113]]


In [12]:
# Final model inputs: dense features followed by the Naive Bayes probability columns.
f_train_X = np.concatenate((train_X, help_train_feat), axis=1)
f_test_X = np.concatenate((test_X, help_test_feat), axis=1)
print(f_train_X.shape, f_test_X.shape)

(19579, 80) (8392, 80)


In [15]:
from sklearn.model_selection import StratifiedKFold
def cv_test(k_cnt=3, s_flag = False):
    """Cross-validate an XGBoost multi-class model and write a fold-averaged submission.

    Uses the notebook-level `f_train_X`, `f_test_X` and `train_Y`. One booster
    is trained per fold with early stopping on the held-out fold; the per-fold
    train/valid log loss is printed, the test-set probabilities are averaged
    over the folds, rounded to 4 decimals and written to
    "results/xgb_res_<k_cnt>.csv".

    Parameters
    ----------
    k_cnt : int
        Number of cross-validation folds.
    s_flag : bool
        True selects StratifiedKFold, False selects KFold (both shuffled, seed 42).

    Returns
    -------
    None. Results are printed and written to disk.
    """
    import os  # only needed to create the output directory

    splitter_cls = StratifiedKFold if s_flag else KFold
    kf = splitter_cls(n_splits=k_cnt, shuffle=True, random_state=42)

    params = {
            'colsample_bytree': 0.7,
            'subsample': 0.8,
            'eta': 0.1,
            'max_depth': 3,
            'eval_metric':'mlogloss',
            'objective':'multi:softprob',
            'num_class':3
            }
    # Explicit label set so log_loss still works if a fold happens to miss a class.
    class_ids = list(range(params['num_class']))

    # The splitters yield positional indices; index a plain array so the lookup
    # does not depend on the index labels of the train_Y Series.
    labels = np.asarray(train_Y)

    # These matrices do not change between folds, so build them once.
    d_test = xgb.DMatrix(f_test_X)
    d_full_train = xgb.DMatrix(f_train_X)

    test_pred = None
    org_train_pred = None
    avg_k_score = 0
    for train_index, test_index in kf.split(f_train_X, labels):
        X_train, X_test = f_train_X[train_index], f_train_X[test_index]
        y_train, y_test = labels[train_index], labels[test_index]

        # def mat
        d_train = xgb.DMatrix(X_train, y_train)
        d_valid = xgb.DMatrix(X_test, y_test)

        watchlist = [(d_train, 'train'), (d_valid, 'valid')]
        # train model (a fresh copy of params per fold, as in the per-fold dict before)
        m = xgb.train(dict(params), d_train, 1000, watchlist,
                        early_stopping_rounds=50,
                        verbose_eval=100)

        # get res
        # NOTE(review): whether predict() uses the best or the last boosting
        # round after early stopping depends on the installed xgboost version
        # -- confirm against the version in use.
        train_pred = m.predict(d_train)
        valid_pred = m.predict(d_valid)
        tmp_train_pred = m.predict(d_full_train)
        fold_test_pred = m.predict(d_test)

        # cal score
        train_score = log_loss(y_train, train_pred, labels=class_ids)
        valid_score = log_loss(y_test, valid_pred, labels=class_ids)
        print('train log loss',train_score,'valid log loss',valid_score)
        avg_k_score += valid_score

        # Accumulate fold predictions; they are averaged after the loop.
        if test_pred is None:
            test_pred = fold_test_pred
            org_train_pred = tmp_train_pred
        else:
            test_pred += fold_test_pred
            org_train_pred += tmp_train_pred

    # avg
    test_pred = test_pred / k_cnt
    test_pred = np.round(test_pred,4)
    org_train_pred = org_train_pred / k_cnt
    avg_k_score = avg_k_score/k_cnt

    submiss=pd.read_csv("./input/sample_submission.csv")
    submiss['EAP']=test_pred[:,0]
    submiss['HPL']=test_pred[:,1]
    submiss['MWS']=test_pred[:,2]
    # Create the output directory if needed so to_csv does not fail on a fresh checkout.
    os.makedirs("results", exist_ok=True)
    submiss.to_csv("results/xgb_res_{}.csv".format(k_cnt),index=False)

    # train log loss
    # The averaged full-train prediction includes rows each booster was fitted
    # on, so this figure is a training-set loss, not a validation estimate.
    print('local average valid loss',avg_k_score)
    print('train log loss', log_loss(labels, org_train_pred, labels=class_ids))
    
print('def done')

def done


In [16]:
# 3-fold run with stratified splits.
cv_test(k_cnt=3, s_flag=True)

[0]	train-mlogloss:0.993836	valid-mlogloss:0.996151
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[100]	train-mlogloss:0.238045	valid-mlogloss:0.303229
[200]	train-mlogloss:0.190093	valid-mlogloss:0.298679
Stopping. Best iteration:
[215]	train-mlogloss:0.18428	valid-mlogloss:0.298514

train log loss 0.166322863506 valid log loss 0.299551336058
[0]	train-mlogloss:0.996323	valid-mlogloss:0.995007
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[100]	train-mlogloss:0.253456	valid-mlogloss:0.275187
[200]	train-mlogloss:0.20631	valid-mlogloss:0.268957
[300]	train-mlogloss:0.171897	valid-mlogloss:0.266368
[400]	train-mlogloss:0.144901	valid-mlogloss:0.266681
Stopping. Best iteration:
[350]	train-mlogloss:0.157658	valid-mlogloss:0.26576

train log loss 0.144901359141 valid log loss 0.

In [17]:
# 3-fold run with plain (non-stratified) splits.
cv_test(k_cnt=3, s_flag=False)

[0]	train-mlogloss:0.994224	valid-mlogloss:0.996224
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[100]	train-mlogloss:0.238083	valid-mlogloss:0.30476
[200]	train-mlogloss:0.190803	valid-mlogloss:0.301099
Stopping. Best iteration:
[224]	train-mlogloss:0.181581	valid-mlogloss:0.300595

train log loss 0.164848668485 valid log loss 0.301990002797
[0]	train-mlogloss:0.996036	valid-mlogloss:0.995875
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[100]	train-mlogloss:0.248732	valid-mlogloss:0.284614
[200]	train-mlogloss:0.201517	valid-mlogloss:0.27969
[300]	train-mlogloss:0.166822	valid-mlogloss:0.278406
Stopping. Best iteration:
[310]	train-mlogloss:0.163727	valid-mlogloss:0.278243

train log loss 0.149479085233 valid log loss 0.278635743602
[0]	train-mlogloss:0.996063	valid-mloglo

In [19]:
# 5-fold run with stratified splits.
cv_test(k_cnt=5, s_flag=True)

[0]	train-mlogloss:0.994811	valid-mlogloss:0.997352
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[100]	train-mlogloss:0.247303	valid-mlogloss:0.306599
[200]	train-mlogloss:0.20387	valid-mlogloss:0.302533
Stopping. Best iteration:
[164]	train-mlogloss:0.217341	valid-mlogloss:0.301848

train log loss 0.198914973654 valid log loss 0.302496314734
[0]	train-mlogloss:0.995095	valid-mlogloss:0.996233
Multiple eval metrics have been passed: 'valid-mlogloss' will be used for early stopping.

Will train until valid-mlogloss hasn't improved in 50 rounds.
[100]	train-mlogloss:0.250021	valid-mlogloss:0.297402
[200]	train-mlogloss:0.206512	valid-mlogloss:0.29333
Stopping. Best iteration:
[188]	train-mlogloss:0.210546	valid-mlogloss:0.293192

train log loss 0.19374016229 valid log loss 0.293453461026
[0]	train-mlogloss:0.995937	valid-mlogloss:0.994332
Multiple eval metrics have been passed: 'va

In [None]:
#cv_test(10)