In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict

import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding, GRU, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import model_selection
from sklearn import metrics

np.random.seed(7)

In [16]:
df = pd.read_csv("train.csv")
a2c = {0:0, 1:1, 2:2, 3:3, 4:4}
y = np.array([a2c[a] for a in df.author])
# y = to_categorical(y)

In [3]:
counter = {name : defaultdict(int) for name in set(df.author)}
for (text, author) in zip(df.text, df.author):
    text = text.replace(' ', '')
    for c in text:
        counter[author][c] += 1

chars = set()
for v in counter.values():
    chars |= v.keys()
    
names = [author for author in counter.keys()]

print('c ', end='')
for n in names:
    print(n, end='   ')
print()
for c in chars:    
    print(c, end=' ')
    for n in names:
        print(counter[n][c], end=' ')
    print()


c 0   1   2   3   4   
é 2 1 20 90 45 
* 15 0 45 20 187 
Œ 0 0 0 1 0 
s 120860 107436 115014 152460 79389 
H 3079 2404 3269 4943 2114 
N 1573 833 1451 1437 774 
’ 9929 673 2829 9763 2946 
: 919 505 255 734 448 
{ 0 0 1 0 11 
/ 1 0 0 0 0 
ä 0 0 0 2 2 
p 32466 26918 28574 39069 20782 
7 3 4 52 5 18 
j 1445 1798 1454 2353 831 
1 5 26 143 27 37 
Ê 0 0 0 0 1 
D 2068 573 1074 1187 821 
i 136740 120325 122259 172110 83899 
9 2 5 28 4 4 
ñ 0 2 0 0 0 
L 896 1109 1293 936 573 
g 41074 33639 35580 50346 24317 
v 17667 19267 18986 26328 11072 
! 4532 2084 2153 6118 1829 
‐ 0 0 0 769 0 
A 2994 1997 3159 4508 2809 
x 2564 2753 2567 3839 1292 
) 518 154 86 455 279 
q 1987 2113 1690 2559 1051 
” 4950 5069 9669 10387 5222 
, 50046 32401 32897 51600 29055 
— 662 246 727 706 574 
ï 0 0 0 67 1 
' 9623 2454 1925 4138 3448 
u 57950 51391 57526 82101 39046 
n 146493 131051 129620 187149 93382 
" 1148 3153 2728 7927 3077 
5 0 5 42 5 12 
_ 648 1756 452 1800 314 
S 2393 2426 3073 3205 1748 
[ 9 16 4 12 0 
º 0 0

In [4]:
def preprocess(text):
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

In [5]:
def create_docs(df, n_gram_max=2):
    def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams
        
    docs = []
    for doc in df.text:
        doc = preprocess(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    
    return docs

In [6]:
min_count = 2

docs = create_docs(df)
tokenizer = Tokenizer(lower=False, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=False, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

In [7]:
input_dim = np.max(docs) + 1
embedding_dims = 20

In [8]:
def create_model(embedding_dims=20, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(5, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model

In [9]:
epochs = 25
es = EarlyStopping(patience=2, monitor='val_loss')
mc = ModelCheckpoint(filepath='fasttext.h5', monitor='val_loss', save_best_only=True)

In [10]:
docs = create_docs(df)
tokenizer = Tokenizer(lower=True, filters='')
tokenizer.fit_on_texts(docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=True, filters='')
tokenizer.fit_on_texts(docs)
docs = tokenizer.texts_to_sequences(docs)

maxlen = 256

docs = pad_sequences(sequences=docs, maxlen=maxlen)

input_dim = np.max(docs) + 1

In [11]:
test_df = pd.read_csv("test_x.csv")
docs_test = create_docs(test_df)
docs_test = tokenizer.texts_to_sequences(docs_test)
docs_test = pad_sequences(sequences=docs_test, maxlen=maxlen)

In [22]:
cv = 5
epochs = 20
cv_scores = []
pred_test = 0
pred_train = np.zeros([docs.shape[0], 5])
skf = model_selection.StratifiedKFold(n_splits=cv, shuffle=True, random_state=123)
sub_train = pd.DataFrame(columns=[0, 1, 2, 3, 4])
sub_train.insert(0, 'index', df.index)
sub_test = pd.DataFrame(columns=[0, 1, 2, 3, 4])
sub_test.insert(0, 'index', test_df.index)

print('CV started')
for train_index, dev_index in skf.split(docs, y):
    X_train, X_dev = docs[train_index], docs[dev_index]
    y_train, y_dev = to_categorical(y)[train_index], to_categorical(y)[dev_index]
    
    model = create_model()
    hist = model.fit(X_train, y_train,
                     batch_size=32,
                     validation_data=(X_dev, y_dev),
                     epochs=epochs,
                     callbacks=[es, mc])
    pred_dev   = model.predict_proba(X_dev)
    pred_test += model.predict_proba(docs_test)
    
    pred_train[dev_index, :] = pred_dev
    cv_scores.append(metrics.log_loss(y_dev, pred_dev))
    print('.', end='')

print('')
print("Mean CV LogLoss: %.3f" % (np.mean(cv_scores)))
pred_test /= cv

sub_train[0] = pred_train[:, 0]
sub_train[1] = pred_train[:, 1]
sub_train[2] = pred_train[:, 2]
sub_train[3] = pred_train[:, 3]
sub_train[4] = pred_train[:, 4]

sub_test[0] = pred_test[:, 0]
sub_test[1] = pred_test[:, 1]
sub_test[2] = pred_test[:, 2]
sub_test[3] = pred_test[:, 3]
sub_test[4] = pred_test[:, 4]

sub_train.to_csv('submission3_train.csv', index=False)
sub_test.to_csv('submission3_test.csv', index=False)

CV started
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Instructions for updating:
Please use `model.predict()` instead.
.Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
.Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
.Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15


Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
.Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
.
Mean CV LogLoss: 0.472


In [None]:
# https://www.kaggle.com/nzw0301/simple-keras-fasttext-val-loss-0-31