In [1]:
from tokenizers.implementations import ByteLevelBPETokenizer

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score,recall_score,precision_score, accuracy_score
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, GlobalAveragePooling1D, Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
import itertools
from collections import Counter
from keras.models import load_model


Using TensorFlow backend.
  data = yaml.load(f.read()) or {}


In [3]:
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/val.csv')
test = pd.read_csv('data/test.csv')

In [4]:
all_train_text = ' '.join(train.text.tolist())

In [5]:
with open('data/train_text.txt','w') as f:
    f.write(all_train_text)

In [6]:
tokenizer_ = ByteLevelBPETokenizer()

In [7]:
# Customize training
tokenizer_.train(files='train_text.txt', vocab_size=50000, min_frequency=2, special_tokens=[
    "[PAD]",
    "[UNK]",
    "[CLS]",
    "[SEP]",
    "[MASK]"
])

tokenizer_.save("results/tokenizer/", "spoilers")


['results/tokenizer/spoilers-vocab.json',
 'results/tokenizer/spoilers-merges.txt']

In [8]:
tokenizer = ByteLevelBPETokenizer(
    "results/tokenizer/spoilers-vocab.json",
    "results/tokenizer/spoilers-merges.txt",
)

In [9]:
clf_token = tokenizer.token_to_id('[CLS]')  # classifier token
pad_token = tokenizer.token_to_id('[PAD]')  # pad token
max_length = 200

def token2id(text):
    "Convert text (item[0]) to sequence of IDs and label (item[1]) to integer"
    inputs = tokenizer.encode(text).ids
    # Trim or pad dataset
    if len(inputs) >= max_length:
        inputs = inputs[:max_length - 1]
        ids = inputs + [clf_token]
    else:
        pad = [pad_token] * (max_length - len(inputs) - 1)
        ids = inputs + [clf_token] + pad
    return ids

def text2tensor(texts):
    out=[]
    for text in texts:
        out.append(token2id(text))
    out = np.array(out)#.transpose(1,0)
    return out

In [10]:
reviews_train = train.text.tolist()
reviews_val = val.text.tolist()
reviews_test = test.text.tolist()

In [11]:
x_train = text2tensor(reviews_train)
x_val = text2tensor(reviews_val)
x_test = text2tensor(reviews_test)

In [12]:
x_train.shape, x_val.shape, x_test.shape

((76096, 200), (19024, 200), (23781, 200))

In [13]:
y_train = train.spoiler.values
y_train = to_categorical(y_train)

y_val = val.spoiler.values
y_val = to_categorical(y_val)

y_test = test.spoiler.values
y_test = to_categorical(y_test)

In [14]:

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_test.sum(axis=0))

Number of positive and negative reviews in traing and validation set
[49706. 26390.]
[15386.  8395.]


In [15]:
model = Sequential()
model.add(Embedding(tokenizer.get_vocab_size(), 16))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='sigmoid'))

model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


W0419 20:49:40.235413 140519147849536 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:58: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0419 20:49:40.238693 140519147849536 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:442: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0419 20:49:40.242347 140519147849536 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3543: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0419 20:49:40.299147 140519147849536 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/optimize

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          800000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
Total params: 800,306
Trainable params: 800,306
Non-trainable params: 0
_________________________________________________________________


In [16]:
calls = [EarlyStopping(monitor='val_loss', patience=2),
        ModelCheckpoint('results/tokenizer/model_weights_best.h5', monitor='val_loss', save_best_only=True)]

In [17]:
model.fit(x_train, y_train, 
          validation_data=(x_val, y_val),
          callbacks=calls,
          epochs=100, 
          batch_size=50)

W0419 20:49:47.235413 140519147849536 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:899: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

W0419 20:49:47.252111 140519147849536 deprecation.py:506] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:625: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Train on 76096 samples, validate on 19024 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.callbacks.History at 0x7fccd4db7290>

In [18]:
model=load_model('results/tokenizer/model_weights_best.h5')

model.evaluate(x_test, y_test, verbose=0)

[0.3469832340926389, 0.8535595643606074]

In [19]:
preds_round = model.predict(x_test).round().argmax(axis=1)
y_test_round = y_test.argmax(axis=1)

In [20]:
precision = precision_score(y_test_round, preds_round)
recall = recall_score(y_test_round, preds_round)
f1 = f1_score(y_test_round, preds_round)
cnf_test = confusion_matrix(y_test_round, preds_round)
auc = roc_auc_score(y_test_round, preds_round)

In [21]:
precision

0.8518146607373404

In [22]:
recall

0.7073257891602144

In [23]:
f1

0.7728751789665494

In [24]:
cnf_test

array([[14353,  1033],
       [ 2457,  5938]])

In [25]:
auc

0.8200934158331944

In [26]:
import re
import unidecode
from pprint import pprint

def clean_str(s):
    
    s = unidecode.unidecode(s)
    s = re.sub(r'\t','',s)
    s = re.sub(r'\r','',s)
    s = s.lower()
    s = re.sub(r'[^a-z0-9]',' ',s)
    s = re.sub(r' +',' ',s)

    return s.strip()

In [27]:
def spoilme(s):
    s = re.sub(r'\n','.',s)
    s = s.split('.')
    s = [clean_str(i) for i in s]
    s = [f for f in s if f]
    s_ = text2tensor(s)
    predictions = model.predict(s_)
    predictions_ = predictions.argmax(axis=1)
    result = list(zip(s, predictions_))
    spoilers = [r[0] for r in result if r[1]==1]
    print(len(spoilers)/len(result))
    return spoilers , predictions

In [28]:
s='''

Tras haber pasado la mayor parte de su vida explorando la jungla junto a sus padres, nada podría haber preparado a Dora para la aventura más peligrosa a la que jamás se ha enfrentado: ¡el instituto!. Exploradora hasta el final, Dora no tarda en ponerse al frente de un equipo formado por Botas (su mejor amigo, un mono), Diego, un misterioso habitante de la jungla y un desorganizado grupo de adolescentes en una aventura en la que deberán salvar a sus padres y resolver el misterio oculto tras una ciudad perdida de oro.

'''

In [29]:
spoilme(s)

1.0


(['tras haber pasado la mayor parte de su vida explorando la jungla junto a sus padres nada podria haber preparado a dora para la aventura mas peligrosa a la que jamas se ha enfrentado el instituto',
  'exploradora hasta el final dora no tarda en ponerse al frente de un equipo formado por botas su mejor amigo un mono diego un misterioso habitante de la jungla y un desorganizado grupo de adolescentes en una aventura en la que deberan salvar a sus padres y resolver el misterio oculto tras una ciudad perdida de oro'],
 array([[0.12715074, 0.86992395],
        [0.14945368, 0.8437836 ]], dtype=float32))