# EDA

In [1]:
import pandas as pd
import numpy as np
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Show up to 160 characters per cell when displaying text columns.
pd.options.display.max_colwidth = 160

## Load data

In [2]:
# Read the three CSV files (training set, test set, sample submission) with the same settings.
train, test, submission = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ("train.csv", "test_x.csv", "sample_submission.csv")
)

In [3]:
# (rows, columns) of the training and test frames.
train.shape, test.shape

((54879, 3), (19617, 2))

In [4]:
# Peek at the first rows of the training frame.
train.head()

Unnamed: 0,index,text,author
0,0,"He was almost choking. There was so much, so much he wanted to say, but strange exclamations were all that came from his lips. The Pole gazed fixedly at him...",3
1,1,"“Your sister asked for it, I suppose?”",2
2,2,"She was engaged one day as she walked, in perusing Jane’s last letter, and dwelling on some passages which proved that Jane had not written in spirits, whe...",1
3,3,"The captain was in the porch, keeping himself carefully out of the way of a treacherous shot, should any be intended. He turned and spoke to us, “Doctor's w...",4
4,4,"“Have mercy, gentlemen!” odin flung up his hands. “Don’t write that, anyway; have some shame. Here I’ve torn my heart asunder before you, and you seize the ...",3


# Preprocessing

In [5]:
import spacy
from spacy.lang.en import English
import nltk

In [6]:
# English pipeline built from the language class alone (no pretrained model is loaded).
# NOTE(review): a blank `English()` pipeline presumably has no NER component, so
# `doc.ents` would always be empty and the entity-based features/text below would be
# no-ops — confirm, and load a pretrained model if entities are actually needed.
nlp = English()

In [7]:
# Extend spaCy's English stop-word set with the possessive "'s", then flag every
# stop word on the vocabulary so that `token.is_stop` reflects the extended set.
spacy.lang.en.STOP_WORDS.add("'s")
for word in spacy.lang.en.STOP_WORDS:
    nlp.vocab[word].is_stop = True

In [8]:
def replace_ents(doc):
    """Return the document text with each named entity replaced by a label placeholder.

    Every entity span in ``doc.ents`` is substituted with ``'ent__' + ent.label_``
    (e.g. ``'ent__PERSON'``); all other text is kept unchanged.

    The text is rebuilt from the entities' character offsets instead of using
    ``str.replace``: a plain replace also rewrites every other occurrence of the
    entity string, including substrings of longer words ("Al" inside "Alice") and
    of placeholders inserted earlier.

    Args:
        doc: parsed document exposing ``.doc`` (convertible to the full text with
            ``str``) and ``.ents``, whose items provide ``start_char``,
            ``end_char`` and ``label_``.

    Returns:
        str: the text with entity spans replaced by their placeholders.
    """
    prefix = 'ent__'
    text = str(doc.doc)

    pieces = []
    cursor = 0
    # Walk the entities left to right, copying the text between them verbatim.
    for ent in sorted(doc.ents, key=lambda e: e.start_char):
        pieces.append(text[cursor:ent.start_char])
        pieces.append(prefix + ent.label_)
        cursor = ent.end_char
    pieces.append(text[cursor:])
    return ''.join(pieces)

In [9]:
def preprocess(df):
    """Parse ``df.text`` with the global ``nlp`` pipeline and add feature columns in place.

    Columns added to ``df``:
        n_char          -- number of characters in the raw text
        n_words         -- number of non-punctuation tokens
        n_punct         -- number of punctuation tokens
        n_ents          -- number of entities reported by the pipeline
        n_unique_words  -- number of distinct lower-cased non-punctuation tokens
        n_stop_words    -- number of tokens flagged as stop words
        char_by_word    -- mean length of non-punctuation tokens (0.0 if there are none)
        text_ent_repl   -- text with entities replaced via ``replace_ents``
        text_cleaned    -- space-joined lemmas of non-punctuation, non-stop-word tokens

    Progress messages are printed after each stage.

    Args:
        df: DataFrame with a ``text`` column; it is modified in place.

    Returns:
        None.
    """
    def mean_word_length(tokens):
        lengths = [len(t.orth_) for t in tokens if not t.is_punct]
        # np.mean([]) emits "RuntimeWarning: Mean of empty slice" and yields NaN
        # for texts that consist of punctuation only; report 0.0 instead.
        return np.mean(lengths) if lengths else 0.0

    print('Started parsing...')
    doc = df.text.apply(nlp)
    print('Text parsed')

    df['n_char']   = df.text.apply(len)
    df['n_words']  = doc.apply(lambda x: len([t for t in x if not t.is_punct]))
    df['n_punct']  = doc.apply(lambda x: len([t for t in x if t.is_punct]))
    df['n_ents']   = doc.apply(lambda x: len(x.ents))
    df['n_unique_words'] = doc.apply(lambda x: len(set([t.lower_ for t in x if not t.is_punct])))
    df['n_stop_words']   = doc.apply(lambda x: len([t for t in x if t.is_stop]))
    df['char_by_word']   = doc.apply(mean_word_length)
    print('Features created')

    df['text_ent_repl'] = doc.apply(replace_ents)
    print('Entities replaced')

    clean_and_lemmatize = lambda x: ' '.join([t.lemma_ for t in x if not t.is_punct and not t.is_stop])
    df['text_cleaned'] = doc.apply(clean_and_lemmatize)
    print('Text cleaned')

In [10]:
%%time
# Adds the engineered feature and text columns to `train` in place.
preprocess(train)

Started parsing...
Text parsed


  out=out, **kwargs)


Features created
Entities replaced
Text cleaned
Wall time: 19.6 s


In [11]:
%%time
# Adds the same engineered feature and text columns to `test` in place.
preprocess(test)

Started parsing...
Text parsed
Features created
Entities replaced
Text cleaned
Wall time: 13.4 s


# Models

In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline

In [13]:
# Target vector: the author label of every training row.
y = train['author']

### CountVect + untouched text

In [14]:
# Word uni- and bi-gram counts over the raw text, with English stop words removed.
vectorizer = CountVectorizer(token_pattern=r'\w{1,}', stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(train['text'])

In [15]:
%%time
scores = cross_val_score(LogisticRegression(), X, y, cv=10, n_jobs=-1, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 0.725 +- 0.036
Wall time: 11min 8s


In [16]:
%%time
scores = cross_val_score(MultinomialNB(), X, y, cv=10, n_jobs=-1, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 1.039 +- 0.077
Wall time: 2.09 s


### CountVect + cleaned text

In [17]:
# Word uni- and bi-gram counts over the cleaned (lemmatized, stop words removed) text.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(train['text_cleaned'])

In [18]:
%%time
scores = cross_val_score(LogisticRegression(), X, y, cv=10, n_jobs=-1, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 0.742 +- 0.040
Wall time: 6min 16s


In [19]:
%%time
scores = cross_val_score(MultinomialNB(), X, y, cv=10, n_jobs=-1, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 0.957 +- 0.066
Wall time: 1.11 s


### CountVect + replaced entities text

In [20]:
# Word uni- and bi-gram counts over the entity-replaced text, with English stop words removed.
vectorizer = CountVectorizer(
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    stop_words='english',
)
X = vectorizer.fit_transform(train['text_ent_repl'])

In [21]:
%%time
scores = cross_val_score(LogisticRegression(), X, y, cv=10, n_jobs=-1, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 0.725 +- 0.036
Wall time: 6min 8s


In [22]:
%%time
scores = cross_val_score(MultinomialNB(), X, y, cv=10, n_jobs=-1, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 1.039 +- 0.077
Wall time: 1.08 s


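Replacing named entities doesn't seem to help: the scores are identical to the untouched-text run (0.725 and 1.039). Identical scores may also mean that no entities were replaced at all — the blank `English()` pipeline presumably has no NER component — so this should be re-checked with a pretrained model before drawing a conclusion.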

### TF-IDF + cleaned text

In [23]:
# Unigram TF-IDF (smoothed IDF, sublinear term frequency) over the cleaned text.
vectorizer = TfidfVectorizer(
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
X = vectorizer.fit_transform(train['text_cleaned'])

In [24]:
%%time
scores = cross_val_score(LogisticRegression(), X, y, cv=10, n_jobs=-1, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 0.802 +- 0.029
Wall time: 27.9 s


In [25]:
%%time
scores = cross_val_score(MultinomialNB(), X, y, cv=10, n_jobs=-1, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 0.924 +- 0.017
Wall time: 311 ms


## Only meta-features

In [26]:
import xgboost as xgb
import lightgbm as lgb

In [27]:
# Identifier and text columns that are excluded when building model matrices.
drop = [
    'index',
    'text',
    'text_cleaned',
    'text_ent_repl',
]

In [28]:
# Feature matrix of the remaining columns once the text/id columns and the target are removed.
X_meta = train.drop(columns=drop + ['author'])

In [29]:
%%time
lgbc = lgb.LGBMClassifier(objective='multiclass', n_estimators=100)
scores = cross_val_score(lgbc, X_meta, y, cv=4, n_jobs=1, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 1.473 +- 0.003
Wall time: 7.2 s


In [30]:
%%time
xgbc = xgb.XGBClassifier(objective='multi:softprob', n_estimators=200)
scores = cross_val_score(xgbc, X_meta, y, cv=4, n_jobs=4, scoring='neg_log_loss')
print('LogLoss: %.3f +- %.3f' % (-np.mean(scores), 2*np.std(scores)))

LogLoss: 1.483 +- 0.003
Wall time: 23.7 s


In [31]:
# Fit on all training rows, then plot the fitted model's feature importances.
# The trailing semicolons suppress the cells' return-value repr.
xgbc.fit(X_meta, y);
xgb.plot_importance(xgbc);

## Stack 'em

In [32]:
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [33]:
def add_prob_features(vectorizer, col, model, prefix, cv=5):
    """Add class-probability columns from a text model to the global ``train`` and ``test``.

    The vectorizer is fitted on the ``col`` text of train and test combined. The
    model is then trained in a shuffled stratified ``cv``-fold loop (fixed seed):
    each training row receives the probabilities predicted by the fold in which
    it was held out, and each test row receives the average of the per-fold
    predictions. One column per class is written to both frames, named
    ``prefix + '0'``, ``prefix + '1'``, ... in ``predict_proba`` column order.

    Reads the module-level ``train``, ``test`` and ``y``; modifies ``train`` and
    ``test`` in place and prints progress plus the mean fold log loss.

    Args:
        vectorizer: unfitted text vectorizer exposing ``fit`` / ``transform``.
        col: name of the text column to vectorize.
        model: classifier exposing ``fit`` and ``predict_proba``.
        prefix: prefix for the generated column names.
        cv: number of stratified folds.

    Returns:
        None.
    """
    # Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
    vectorizer.fit(pd.concat([train[col], test[col]]))
    X = vectorizer.transform(train[col])
    X_test = vectorizer.transform(test[col])

    # Work on a plain array so that fold indices (positions) can never be
    # misinterpreted as index labels of a non-default pandas index.
    labels = np.asarray(y)
    n_classes = len(np.unique(labels))

    cv_scores = []
    pred_test = np.zeros([test.shape[0], n_classes])
    pred_train = np.zeros([train.shape[0], n_classes])
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=123)

    print('CV started')
    for train_index, dev_index in skf.split(X, labels):
        X_train, X_dev = X[train_index], X[dev_index]
        y_train, y_dev = labels[train_index], labels[dev_index]

        model.fit(X_train, y_train)
        pred_dev   = model.predict_proba(X_dev)
        pred_test += model.predict_proba(X_test)

        # Held-out predictions only: no row is scored by a model that saw its label.
        pred_train[dev_index, :] = pred_dev
        cv_scores.append(metrics.log_loss(y_dev, pred_dev))
        print('.', end='')

    print('')
    print("Mean CV LogLoss: %.3f" % (np.mean(cv_scores)))
    pred_test /= cv

    for class_pos in range(n_classes):
        column = prefix + str(class_pos)
        train[column] = pred_train[:, class_pos]
        test[column] = pred_test[:, class_pos]

In [34]:
# Naive Bayes probability features from word uni/bi-gram counts of the raw text (40 folds).
vectorizer = CountVectorizer(token_pattern=r'\w{1,}', stop_words='english', ngram_range=(1, 2))
add_prob_features(vectorizer, col='text', model=MultinomialNB(), prefix='nb_ctv_', cv=40)

CV started
........................................
Mean CV LogLoss: 1.149


In [35]:
# Naive Bayes probability features from unigram TF-IDF of the cleaned text (40 folds).
vectorizer = TfidfVectorizer(
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
add_prob_features(vectorizer, col='text_cleaned', model=MultinomialNB(), prefix='nb_tfv_', cv=40)

CV started
........................................
Mean CV LogLoss: 0.917


In [36]:
# Naive Bayes probability features from character 1- to 5-gram TF-IDF of the raw text (40 folds).
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5))
add_prob_features(vectorizer, col='text', model=MultinomialNB(), prefix='nb_char_', cv=40)

CV started
........................................
Mean CV LogLoss: 1.678


In [37]:
# Naive Bayes probability features from word uni/bi-gram counts of the entity-replaced text (40 folds).
vectorizer = CountVectorizer(token_pattern=r'\w{1,}', stop_words='english', ngram_range=(1, 2))
add_prob_features(vectorizer, col='text_ent_repl', model=MultinomialNB(), prefix='nb_ent_', cv=40)

CV started
........................................
Mean CV LogLoss: 1.149


In [43]:
from sklearn import model_selection

# Model matrices for the stacker: every column of train/test that is left after
# removing the text/id columns (and the target, for train).
X = train.drop(columns=drop + ['author'])
X_test = test.drop(columns=drop)

# Each hyper-parameter has a single candidate, so the grid holds exactly one
# configuration; GridSearchCV cross-validates it and refits it on the data passed to fit().
parameters = {
    'n_estimators': [150],
    'max_depth': [3],
    'subsample': [0.65],
    'colsample_bytree': [0.95],
    'min_child_weight': [1],
}

clf = model_selection.GridSearchCV(
    xgb.XGBClassifier(objective='multi:softprob', nthread=1),
    parameters,
    scoring='neg_log_loss',
    n_jobs=4,
    refit=True,
)

In [44]:
# Shuffled stratified 5-fold loop for the stacker: held-out predictions fill
# `pred_train`, test predictions are summed per fold and averaged afterwards.
cv = 5
cv_scores = []
pred_test = 0
pred_train = np.zeros([train.shape[0], 5])
skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=123)

# Output frames: an 'index' column followed by one probability column per class.
sub_train = pd.DataFrame(columns=[0, 1, 2, 3, 4])
sub_train.insert(0, 'index', X.index)
sub_test = pd.DataFrame(columns=[0, 1, 2, 3, 4])
sub_test.insert(0, 'index', test.index)

print('CV started')
for train_index, dev_index in skf.split(X, y):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_dev, y_dev = X.iloc[dev_index], y.iloc[dev_index]

    clf.fit(X_train, y_train)
    pred_dev = clf.predict_proba(X_dev)
    pred_test += clf.predict_proba(X_test)

    pred_train[dev_index, :] = pred_dev
    cv_scores.append(metrics.log_loss(y_dev, pred_dev))
    print('.', end='')

print('')
print("Mean CV LogLoss: %.3f" % (np.mean(cv_scores)))
pred_test /= cv

# Copy the probability matrices into the output frames, one class column at a time.
for class_id in range(5):
    sub_train[class_id] = pred_train[:, class_id]
    sub_test[class_id] = pred_test[:, class_id]

CV started
.....
Mean CV LogLoss: 0.496


In [45]:
# Write both prediction frames to CSV without the DataFrame row index.
sub_train.to_csv('submission1_train.csv', index=False)
sub_test.to_csv('submission1_test.csv', index=False)

In [None]:
# https://www.kaggle.com/sandpiturtle/eda-fe-nb-xgb