# Red neuronal para el análisis del contenido

Usando los datos procesados en [Cleaning_news_index](Cleaning_news_index.ipynb), vamos a construir una NN con Keras para clasificar los textos.

In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from matplotlib import style
#Nicer style
style.use('seaborn') 

from tensorflow import keras as k

from sklearn.model_selection import train_test_split

from gensim.models import KeyedVectors

Data es DataFrame con los contenidos y títulos representados como listas de enteros. Estos enteros se corresponden con los índices de las palabras dentro del vocabulario de Word2Vec. Durante el pre-procesado se han *padeado* las secuencias a una longitud fija para poder pasárselo a la red neuronal.

Por cuestiones de memoria, la conversión del índice a embeddings se hará en la primera capa de la red en vez de pasar los vectores en el DataFrame.

In [19]:
data = pd.read_pickle('../data/news_proc.pickle')

In [20]:
data.head()

Unnamed: 0,type,content,title,one_hot_label
0,fake,"[2458, 4, 27, 17625, 12, 328, 5, 25587, 416, 1...","[1732, 258, 27, 7196, 2154, 4192, 88, 43, 13, ...","[0, 1, 0]"
1,fake,"[3836, 22, 506, 3059, 67, 2, 2941, 2429, 33, 3...","[6117, 13034, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0]"
2,fake,"[7, 37481, 39852, 483, 423, 24, 42, 1837, 81, ...","[6117, 13034, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0]"
3,fake,"[7, 1790, 659, 24, 846, 6786, 5428, 17, 52, 44...","[13341, 1421, 1992, 1178, 8704, 11, 13034, 387...","[0, 1, 0]"
4,fake,"[59, 24, 216, 529, 104, 11, 34925, 133, 31, 24...","[13034, 43, 39049, 43, 76, 1588, 38, 23, 8158,...","[0, 1, 0]"


<div class="alert alert-info">
    Esta parte se corresponde al testeo previo a encontrar una arquitectura adecuada.
</div>

Eliminar sobrerrepresentación de los fake

In [21]:
dfake = data[data['type'] == 'fake']
dtrue = data[data['type'] == 'truth']
dclic = data[data['type'] == 'click']

In [22]:
data = pd.concat([dfake.head(10000), dtrue, dclic])

In [23]:
del dfake, dtrue, dclic

In [7]:
max_size_content = len(data['content'][0])

In [8]:
max_size_content

1866

In [24]:
#Title len
len(data['title'][0])

14

### Word2Vec

In [32]:
model = KeyedVectors.load_word2vec_format("../data/GoogleNews-vectors-negative300.bin.gz", binary=True,
                                          limit=50000)

In [33]:
embed_mat = np.zeros((50000,300))

In [34]:
for i, e in model.vocab.items():
    embed_mat[e.index] = model[i]


Train / test split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    data[['content', 'title']], data['one_hot_label'], test_size=0.4, random_state=1)

In [17]:
X_train_title = X_train['title']
X_train_content = X_train['content']

X_test_title = X_test['title']
X_test_content = X_test['content']

In [18]:
del data

### Modelo

* inputs: `[TITULO, CONTENT]`
* outputs: `label` (one_hot)

Arquitectura de la red

In [28]:
mode = 'functional'

In [64]:
if mode is not 'functional':
    model = k.models.Sequential()

    model.add(k.layers.Embedding(50000, 300, input_length=1866,
                        weights=[embed_mat], trainable=False))

    #model.add(k.layers.LSTM(300, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))

    model.add(k.layers.Dense(300, activation='relu'))
    model.add(k.layers.Dense(200, activation='relu'))
    model.add(k.layers.Dense(100, activation='relu'))

    model.add(k.layers.Flatten())
    model.add(k.layers.Dense(3, activation='softmax'))
    
else:
    
    #input_title
    title_input = k.layers.Input(shape=(14,), name='title_input')
    inp = k.layers.Embedding(output_dim=300, input_dim=50000, 
                             weights=[embed_mat], trainable=False)(title_input)
    x = k.layers.LSTM(100)(inp)
    
    
    #input_content
    content_input = k.layers.Input(shape=(1866,), name='content_input')
    inp2 = k.layers.Embedding(output_dim=300, input_dim=50000, 
                             weights=[embed_mat], trainable=False)(content_input)
    x2 = k.layers.LSTM(100, return_sequences=True)(inp2)
    x2 = k.layers.LSTM(100)(x2)
    
    
    #Merge
    x = k.layers.concatenate([x, x2])
    
    #Common part
    x = k.layers.Dense(64, activation='relu')(x)
    
    out = k.layers.Dense(3, activation='softmax')(x)
    
    model = k.models.Model(inputs=[title_input, content_input], outputs=[out])   

In [65]:
model.compile(k.optimizers.Adam(lr=0.1), loss='categorical_crossentropy', metrics=['acc'])

In [66]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
content_input (InputLayer)      (None, 1866)         0                                            
__________________________________________________________________________________________________
title_input (InputLayer)        (None, 14)           0                                            
__________________________________________________________________________________________________
embedding_17 (Embedding)        (None, 1866, 300)    15000000    content_input[0][0]              
__________________________________________________________________________________________________
embedding_16 (Embedding)        (None, 14, 300)      15000000    title_input[0][0]                
__________________________________________________________________________________________________
lstm_15 (L

In [62]:
train_fit = [np.asarray(X_train_title.tolist()), 
             np.asarray(X_train_content.tolist())]

In [67]:
hist = model.fit(x=train_fit, y=np.asarray(y_train.tolist()), batch_size=64, epochs=15,
          callbacks = [k.callbacks.EarlyStopping(monitor='val_acc', patience=2)], validation_split=0.3)

Train on 8721 samples, validate on 3738 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
 256/8721 [..............................] - ETA: 8:35 - loss: 7.7442 - acc: 0.5195

KeyboardInterrupt: 

In [None]:
plt.figure(figsize=(16,6))
plt.plot(hist.history['val_acc'], label='val_acc')
plt.plot(hist.history['acc'], label='train_acc')
plt.legend()

Predicción

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
test_pred = model.predict(np.asarray(X_test.tolist()))

In [None]:
accuracy_score(np.array(y_test.tolist()), test_pred.round())

In [None]:

# Evaluate the model

scores = model.evaluate(np.array(X_test.tolist()), np.array(y_test.tolist()),
                        batch_size=32)
print('Loss:', scores[0])
print('Accuracy:', scores[1])