In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score,recall_score,precision_score, accuracy_score
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, GlobalAveragePooling1D, Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
import itertools
from collections import Counter

Using TensorFlow backend.
  data = yaml.load(f.read()) or {}


In [2]:
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/val.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train

Unnamed: 0,len,spoiler,text
0,9,1,enserio casi eyaculo cuando comienza a arder e...
1,16,1,me faltan los miticos filibusteros y marineros...
2,79,0,el cine clasico es un invento de los estudios ...
3,4,0,si queremos concretar mas
4,2,1,darth plagueis
5,617,1,un ejemplo la escena en la que el joker hace e...
6,38,0,vengo de verla con un amigo y se nos ha hecho ...
7,33,0,sobre la escena que se ve tiene toda la pinta ...
8,12,1,de los viejos yendo a ayudar xdd estaba como m...
9,23,0,a veces es mejor no buscarle 5 pies al gato o ...


In [4]:
train['review'] = train['text'].apply(lambda s: s.split())
test['review'] = test['text'].apply(lambda s: s.split())
val['review'] = val['text'].apply(lambda s: s.split())


In [5]:
reviews_train = train.review.tolist()
reviews_val = val.review.tolist()
reviews_test = test.review.tolist()

In [6]:
# Creacion del diccionario (con los datos de train solamente para evitar leakage)

words= list(itertools.chain.from_iterable(reviews_train))
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)


vocab_to_int = {word: ii for ii, word in enumerate(vocab, 2)}
vocab_to_int['[PAD]']=0
vocab_to_int['[UNK]']=1

int2vocab={i:w for w,i in vocab_to_int.items()}


In [7]:
len(vocab_to_int)

89620

In [7]:
x_train = []
for review in reviews_train:
    x_train.append([vocab_to_int[word] for word in review])
    
x_val = []
for review in reviews_val:
    x_val.append([vocab_to_int.get(word,1) for word in review])
    
x_test = []
for review in reviews_test:
    x_test.append([vocab_to_int.get(word,1) for word in review])

In [8]:
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(reviews_ints):
        features[i, :len(row)] = np.array(row)[:seq_length]
    
    return features

In [9]:
np.percentile(train.len.values,98.5)

224.0

In [10]:
seq_length = 200

x_train = pad_features(x_train, seq_length)
y_train = train.spoiler.values
y_train = to_categorical(y_train)

x_val = pad_features(x_val, seq_length)
y_val = val.spoiler.values
y_val = to_categorical(y_val)

x_test = pad_features(x_test, seq_length)
y_test = test.spoiler.values
y_test = to_categorical(y_test)

In [11]:

print('Number of positive and negative reviews in traing and validation set')
print(y_train.sum(axis=0))
print(y_test.sum(axis=0))

Number of positive and negative reviews in traing and validation set
[49706. 26390.]
[15386.  8395.]


In [12]:
model = Sequential()
model.add(Embedding(len(vocab_to_int), 16))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(2, activation='sigmoid'))

model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


W0415 18:44:37.163565 139707012126528 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:58: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0415 18:44:37.166036 139707012126528 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:442: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0415 18:44:37.169320 139707012126528 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3543: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0415 18:44:37.225982 139707012126528 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/optimize

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          1433920   
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
Total params: 1,434,226
Trainable params: 1,434,226
Non-trainable params: 0
_________________________________________________________________


In [13]:
calls = [EarlyStopping(monitor='val_loss', patience=2)]

In [14]:
model.fit(x_train, y_train, 
          validation_data=(x_val, y_val),
          callbacks=calls,
          epochs=100, 
          batch_size=50)

W0415 18:44:38.651326 139707012126528 deprecation_wrapper.py:119] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:899: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

W0415 18:44:38.666726 139707012126528 deprecation.py:506] From /media/Datos/Documentos/Python/anacondas/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:625: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Train on 76096 samples, validate on 19024 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


<keras.callbacks.History at 0x7f100c277a50>

In [15]:
model.evaluate(x_test, y_test, verbose=0)


[0.3611341979428089, 0.8532652117186321]

In [16]:
preds_round = model.predict(x_test).round().argmax(axis=1)
y_test_round = y_test.argmax(axis=1)

In [17]:
precision = precision_score(y_test_round, preds_round)
recall = recall_score(y_test_round, preds_round)
f1 = f1_score(y_test_round, preds_round)
cnf_test = confusion_matrix(y_test_round, preds_round)
auc = roc_auc_score(y_test_round, preds_round)

In [18]:
precision

0.8251127022010077

In [19]:
recall

0.7412745681953544

In [20]:
f1

0.78094999058794

In [21]:
cnf_test

array([[14067,  1319],
       [ 2172,  6223]])

In [22]:
auc

0.8277736418254816

In [23]:
import re
import unidecode
from pprint import pprint

def clean_str(s):
    
    s = unidecode.unidecode(s)
    s = re.sub(r'\t','',s)
    s = re.sub(r'\r','',s)
    s = s.lower()
    s = re.sub(r'[^a-z0-9]',' ',s)
    s = re.sub(r' +',' ',s)

    return s.strip()

In [24]:
def spoilme(s):
    s = re.sub(r'\n','.',s)
    s = s.split('.')
    s = [clean_str(i) for i in s]
    s = [f for f in s if f]
    s_ = [[vocab_to_int.get(j,1) for j in k.split()] for k in s]
    s_ = pad_features(s_,seq_length)
    predictions = model.predict(s_)
    predictions_ = predictions.argmax(axis=1)
    result = list(zip(s, predictions_))
    spoilers = [r[0] for r in result if r[1]==1]
    print(len(spoilers)/len(result))
    return spoilers , predictions

In [25]:
s='''

Tras haber pasado la mayor parte de su vida explorando la jungla junto a sus padres, nada podría haber preparado a Dora para la aventura más peligrosa a la que jamás se ha enfrentado: ¡el instituto!. Exploradora hasta el final, Dora no tarda en ponerse al frente de un equipo formado por Botas (su mejor amigo, un mono), Diego, un misterioso habitante de la jungla y un desorganizado grupo de adolescentes en una aventura en la que deberán salvar a sus padres y resolver el misterio oculto tras una ciudad perdida de oro.

'''

In [26]:
spoilme(s)

1.0


(['tras haber pasado la mayor parte de su vida explorando la jungla junto a sus padres nada podria haber preparado a dora para la aventura mas peligrosa a la que jamas se ha enfrentado el instituto',
  'exploradora hasta el final dora no tarda en ponerse al frente de un equipo formado por botas su mejor amigo un mono diego un misterioso habitante de la jungla y un desorganizado grupo de adolescentes en una aventura en la que deberan salvar a sus padres y resolver el misterio oculto tras una ciudad perdida de oro'],
 array([[0.34572867, 0.6567872 ],
        [0.07382895, 0.9271571 ]], dtype=float32))

In [27]:
import json

config={}
config['vocab2int']=vocab_to_int
config['max_len']=seq_length


with open('results/config.json', 'w') as outfile:
    json.dump(config, outfile)

model.save('results/model_weights.h5',overwrite=True)