In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from tensorflow import keras

from unidecode import unidecode

In [2]:
# Load the raw news dump (path is relative to the notebook's directory) and
# preview the first rows to check the columns that later cells rely on.
df = pd.read_csv(filepath_or_buffer="../data/ro_news.csv")
df.head(5)

Unnamed: 0,source,title,text,url
0,evz.ro,Ministrul spulberă informațiile despre pensii....,Ministrul Muncii Violeta Alexandru a declarat ...,https://evz.ro/bomba-despre-pensii-ministrul-s...
1,adevarul.ro,G4Media: Klaus Iohannis i-a cerut premierului ...,"Preşedintele Klaus Iohannis i-a cerut, vineri,...",https://adevarul.ro/news/politica/g4media-klau...
2,puterea.ro,ULTIMA ORĂ: Medicul Mircea Beuran a fost REȚIN...,Medicul Mircea Beuran a fost reținut pentru 24...,https://www.puterea.ro/eveniment/ultima-ora-me...
3,agerpres.ro,"Iohannis, în Israel: România - angajată să con...","\r\nTrimisul special al AGERPRES, Florentina P...",https://www.agerpres.ro/politica/2020/01/21/io...
4,news.ro,Avertizări cod galben şi cod portocaliu de plo...,Meteorologii au emis saâmbătă seară avertizări...,https://www.news.ro/social/avertizari-cod-galb...


In [3]:
# Binary target: 1 for rows whose `source` is puterea.ro or b1.ro, 0 otherwise.
df['y'] = df['source'].isin(['puterea.ro', 'b1.ro']).astype(int)

In [4]:
from nltk.stem.snowball import RomanianStemmer

In [5]:
def clean(txt):
    """Tokenise `txt` and return the stemmed, lower-cased alphabetic tokens.

    The text is split with NLTK's `word_tokenize`; every token that is not
    purely alphabetic (punctuation, numbers, mixed tokens) is discarded, and
    the remaining ones are lower-cased and stemmed with `RomanianStemmer`.

    Returns a list of stems in their original order.
    """
    raw_tokens = word_tokenize(txt)
    romanian_stemmer = RomanianStemmer()

    stems = []
    for token in raw_tokens:
        # keep alphabetic tokens only
        if not token.isalpha():
            continue
        stems.append(romanian_stemmer.stem(token.lower()))
    return stems

In [6]:
%%time
df['title_clean'] = df['title'].apply(clean)
df['text_clean'] = df['text'].apply(clean)

Wall time: 4min 4s


In [7]:
def get_vocabulary(X):
    """Build a character -> index lookup from tokenised sentences.

    `X` is an iterable of sentences, each sentence an iterable of words
    (strings). Every distinct character is collected, sorted, and numbered
    from 1 upwards, which leaves index 0 unassigned.

    Returns a dict mapping each character to its 1-based index.
    """
    alphabet = sorted({ch for sentence in X for token in sentence for ch in token})
    return {ch: position for position, ch in enumerate(alphabet, start=1)}

def get_char_features(X, char2idx, sent_size=10, char_feat_size=10):
    """Encode tokenised sentences as a fixed-size array of character indices.

    Parameters
    ----------
    X : iterable of sequences of str
        One entry per sentence; each sentence is a sequence of words.
    char2idx : dict
        Character -> index lookup. Characters missing from it are encoded
        as 0, the same value that is used for padding.
    sent_size : int
        Number of word slots per sentence. Longer sentences are cut after
        `sent_size` words, shorter ones are zero-padded at the end.
    char_feat_size : int
        Number of character slots per word. Longer words are cut after
        `char_feat_size` characters, shorter ones are zero-padded at the end.

    Returns
    -------
    numpy.ndarray
        int32 array of shape (number of sentences, sent_size, char_feat_size).

    Notes
    -----
    The earlier implementation padded the characters of each word to
    `sent_size` instead of `char_feat_size`, so it only produced a valid
    result when both sizes were equal. For equal sizes the output here is
    the same; for differing sizes the two axes are now sized independently.
    The result is always 3-D, even when every sentence is empty.
    """
    sentences = list(X)
    X_chars = np.zeros((len(sentences), sent_size, char_feat_size), dtype="int32")

    for row, sent in enumerate(sentences):
        # "post" truncation: keep the leading words / leading characters only.
        for col, word in enumerate(sent[:sent_size]):
            idxs = [char2idx.get(c, 0) for c in word[:char_feat_size]]
            if idxs:
                # "post" padding: the untouched tail of the slot stays 0.
                X_chars[row, col, :len(idxs)] = idxs
    return X_chars

In [8]:
%%time
title_char2idx = get_vocabulary(df['title_clean'])
text_char2idx = get_vocabulary(df['text_clean'])

Wall time: 1.86 s


In [9]:
%%time
X_title = get_char_features(df['title_clean'], title_char2idx)
X_text = get_char_features(df['text_clean'], text_char2idx)

Wall time: 25 s


In [20]:
# char encoding
# Model input shapes are (words per sentence, characters per word). They must
# agree with the sizes used when the features were built: X_title and X_text
# both come from get_char_features with its defaults (sent_size=10,
# char_feat_size=10).
TITLE_SENT_SIZE = 10
# Was 100, which cannot work with the rest of the notebook: X_text only has 10
# word slots, and title_and_text_model merges the title and text branches with
# concatenate's default axis, which needs both branches to have the same
# number of word slots. Raising this requires re-encoding X_text with a
# matching sent_size and changing how the two branches are merged.
TEXT_SENT_SIZE = 10
CHAR_FEAT_SIZE = 10

def title_only_model(char2idx):
    """Build and compile the binary classifier that only looks at titles.

    Parameters
    ----------
    char2idx : dict
        Character vocabulary of the titles; only its size is used, to
        dimension the embedding table (len(char2idx) + 1 rows).

    Returns
    -------
    A compiled keras Model taking one input of shape
    (TITLE_SENT_SIZE, CHAR_FEAT_SIZE) holding character indices and emitting
    a single sigmoid output. It is compiled with the adam optimizer, binary
    cross-entropy loss and the 'acc' metric.
    """
    # TITLE INPUT
    title_input = keras.layers.Input(shape=(TITLE_SENT_SIZE, CHAR_FEAT_SIZE))

    # Per-word character pipeline; TimeDistributed applies each layer to
    # every word slot of the title.
    h = keras.layers.TimeDistributed(
        keras.layers.Embedding(
            input_dim=len(char2idx) + 1,
            output_dim=30,
            input_length=CHAR_FEAT_SIZE,
        )
    )(title_input)
    h = keras.layers.Dropout(0.5)(h)
    h = keras.layers.TimeDistributed(
        keras.layers.Conv1D(
            filters=32,
            kernel_size=3,
            strides=1,
            padding='same',
            activation='tanh',
        )
    )(h)
    # Pool over all CHAR_FEAT_SIZE character positions of a word, then flatten.
    h = keras.layers.TimeDistributed(keras.layers.MaxPooling1D(CHAR_FEAT_SIZE))(h)
    h = keras.layers.TimeDistributed(keras.layers.Flatten())(h)

    # Sequence model over the word slots.
    h = keras.layers.SpatialDropout1D(0.3)(h)
    h = keras.layers.Bidirectional(
        keras.layers.LSTM(units=100, return_sequences=False)
    )(h)

    prediction = keras.layers.Dense(1, activation="sigmoid")(h)

    model = keras.models.Model([title_input], prediction)
    model.compile(
        optimizer='adam',
        loss=keras.losses.binary_crossentropy,
        metrics=['acc'],
    )
    return model

def title_and_text_model(title_char2idx, text_char2idx):
    """Build and compile the binary classifier that uses title and body text.

    Parameters
    ----------
    title_char2idx, text_char2idx : dict
        Character vocabularies of the titles and of the article texts; only
        their sizes are used, to dimension each branch's embedding table.

    Returns
    -------
    A compiled keras Model taking two inputs, in this order: title indices of
    shape (TITLE_SENT_SIZE, CHAR_FEAT_SIZE) and text indices of shape
    (TEXT_SENT_SIZE, CHAR_FEAT_SIZE). It emits a single sigmoid output and is
    compiled with the adam optimizer, binary cross-entropy loss and the 'acc'
    metric.

    NOTE(review): the two branches are merged with `concatenate` using its
    default axis. In Keras that is the last (feature) axis, which requires
    both branches to have the same number of word slots, i.e.
    TITLE_SENT_SIZE == TEXT_SENT_SIZE - confirm before changing either.
    """
    def char_cnn_branch(n_words, vocab):
        # One input branch: embed the characters of every word, convolve over
        # them, and max-pool the whole word down to one 32-filter vector.
        branch_input = keras.layers.Input(shape=(n_words, CHAR_FEAT_SIZE))
        embedded = keras.layers.TimeDistributed(
            keras.layers.Embedding(
                input_dim=len(vocab) + 1,
                output_dim=30,
                input_length=CHAR_FEAT_SIZE,
            )
        )(branch_input)
        regularised = keras.layers.Dropout(0.5)(embedded)
        convolved = keras.layers.TimeDistributed(
            keras.layers.Conv1D(
                filters=32,
                kernel_size=3,
                strides=1,
                padding='same',
                activation='tanh',
            )
        )(regularised)
        pooled = keras.layers.TimeDistributed(
            keras.layers.MaxPooling1D(CHAR_FEAT_SIZE)
        )(convolved)
        word_vectors = keras.layers.TimeDistributed(keras.layers.Flatten())(pooled)
        return branch_input, word_vectors

    # TITLE INPUT (built first so layer creation order matches the inputs)
    title_input, title_char_feats = char_cnn_branch(TITLE_SENT_SIZE, title_char2idx)

    # TEXT INPUT
    text_input, text_char_feats = char_cnn_branch(TEXT_SENT_SIZE, text_char2idx)

    merged = keras.layers.concatenate([title_char_feats, text_char_feats])
    merged = keras.layers.SpatialDropout1D(0.3)(merged)

    encoded = keras.layers.Bidirectional(
        keras.layers.LSTM(units=100, return_sequences=False)
    )(merged)

    prediction = keras.layers.Dense(1, activation="sigmoid")(encoded)

    model = keras.models.Model([title_input, text_input], prediction)
    model.compile(
        optimizer='adam',
        loss=keras.losses.binary_crossentropy,
        metrics=['acc'],
    )
    return model


In [21]:
# Target vector as a plain NumPy array, in the same row order as df.
y = df['y'].to_numpy()

In [22]:
# Title-only classifier, sized from the title character vocabulary.
title_model = title_only_model(char2idx=title_char2idx)

In [18]:
# Train on the title features; validation_split holds out 10% of the rows.
title_model.fit(X_title, y, epochs=10, batch_size=32, validation_split=0.1)

Train on 17663 samples, validate on 1963 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f984d8ec48>

In [23]:
# Two-input classifier (title + article text), one vocabulary per input.
text_model = title_and_text_model(
    title_char2idx=title_char2idx,
    text_char2idx=text_char2idx,
)

In [25]:
# Inputs are passed in the order the model declares them: title first, text second.
text_model.fit([X_title, X_text], y, epochs=10, batch_size=32, validation_split=0.1)

Train on 17663 samples, validate on 1963 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f9882e1c08>

In [26]:
import pickle

In [30]:
# Persist both character vocabularies in one pickle so the same
# character -> index encoding can be reapplied later.
with open('char2idxs.pkl', 'wb') as f:
    pickle.dump(
        {'text_char2idx': text_char2idx, 'title_char2idx': title_char2idx},
        f,
    )

In [32]:
# Store the weights of the title + text model one directory above the notebook.
text_model.save_weights(filepath='../model.h5')