# Deep learning tweeter

## Imports

In [7]:
import sys
import cufflinks
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import pickle

warnings.filterwarnings('ignore')

sys.path.append('./../')
cufflinks.go_offline()

In [8]:
from Corpus.Corpus import get_corpus, filter_binary_pn, filter_corpus_small
from auxiliar.VectorizerHelper import vectorizer, vectorizerIdf
from auxiliar import parameters
from sklearn.model_selection import KFold
from auxiliar.HtmlParser import HtmlParser

In [9]:
# Importaciones para DeepLearning
from keras.layers import Dense, Activation, LSTM, Dropout, Convolution1D, MaxPooling1D, Embedding, Conv2D, \
    MaxPooling2D, Reshape, Flatten, BatchNormalization, TimeDistributed
from keras.models import Sequential, load_model
from keras import callbacks
from keras import backend as K
from keras.layers import Bidirectional
from keras.models import Model
K.set_image_dim_ordering('th') 

Using Theano backend.


In [10]:
import logging
import math
import pickle
import pandas as pd
import Levenshtein as lv
from gensim.models import Doc2Vec
import gensim
from nltk.tokenize import word_tokenize
from Corpus.Corpus import get_corpus, filter_binary_pn, filter_corpus_small
from keras.preprocessing import sequence
from keras.utils import to_categorical
from time import time, strftime

## Config

In [11]:
polarity_dim = 2
clasificadores=['lstm', '2lstm', '2dcnn', '2dcnn+lstm', 'cnn+lstm', 'bidirectionalLstm']
idf = True
target_names=['Neg', 'Pos']
kfolds = 10
base_dir = '2-clases' if polarity_dim == 2 else ('3-clases' if polarity_dim == 3 else '5-clases')
name = 'deep_learning'

## Get data

In [12]:
cine = HtmlParser(200, "http://www.muchocine.net/criticas_ultimas.php", 1)
#data_corpus = get_corpus('general-corpus', 'general-corpus', 1, None)

if polarity_dim == 2:
    #data_corpus = filter_binary_pn(data_corpus)
    cine = filter_binary_pn(cine.get_corpus())
elif polarity_dim == 3:
    #data_corpus = filter_corpus_small(data_corpus)
    cine = filter_corpus_small(cine.get_corpus())
used_data = cine[:5000]
#used_data = data_corpus
split = round(len(used_data) * 0.8)
try:
    train_idx = pd.read_pickle('data/results/'+name+'/cine/'+base_dir+'/train_index.pickle').values.ravel()
    test_idx = pd.read_pickle('data/results/'+name+'/cine/'+base_dir+'/test_index.pickle').values.ravel()
except Exception as e:
    print(e)
    indices = np.random.permutation(len(used_data))
    train_idx, test_idx = indices[:split], indices[split:]
    pd.DataFrame(train_idx).to_pickle('data/results/'+name+'/cine/'+base_dir+'/train_index.pickle')
    pd.DataFrame(test_idx).to_pickle('data/results/'+name+'/cine/'+base_dir+'/test_index.pickle')
train_corpus = used_data.loc[train_idx]
train_corpus = train_corpus.reset_index()
test_corpus = used_data.loc[test_idx]
test_corpus = test_corpus.reset_index()
cine = None
#data_corpus = None

[Errno 2] File b'data/pelis.csv' does not exist: b'data/pelis.csv'


AttributeError: 'NoneType' object has no attribute 'find_all'

## Preprocess

### Initialize

In [7]:
w2vec_file = 'data/w2vec.bin'
prepro = vectorizerIdf.build_preprocessor()
analyzer = vectorizerIdf.build_analyzer()
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
handler = logging.FileHandler('entrenamiento-%s.log' % strftime("%d-%m-%Y-%H-%M"), 'w', 'utf-8')  # or whatever
handler.setFormatter = logging.Formatter('%(name)s %(message)s')  # or whatever
root_logger.addHandler(handler)

### Polarity distribution

In [8]:
train_corpus.groupby('polarity').agg({'index': 'count'}).iplot(kind='bar')

### Model initialization

In [9]:
model = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(w2vec_file, binary=True)

### Parse text

In [10]:
def apply_prepro(data):
    return prepro(data, 3, 0, 3)
train_corpus.content = train_corpus.content.apply(apply_prepro)

### Tokenize texts

In [11]:
def apply_tokenization(data):
    return analyzer(data)
tokens = train_corpus.content.apply(apply_tokenization)

### Construct df

In [12]:
token_df = pd.DataFrame([x for x in tokens]).transpose()
token_df.columns = pd.MultiIndex.from_arrays([train_corpus.polarity, token_df.columns])

In [13]:
print('palabras totales', token_df.count().sum())
print('media de palabras por texto', token_df.count().mean())

palabras totales 1004947
media de palabras por texto 267.2731382978723


In [14]:
pd.DataFrame([token_df[x].count().sum() for x in token_df.columns.levels[0]]).iplot(kind='bar')

### w2vec process

In [15]:
not_in_vocab = pd.concat([token_df[d][token_df[d].apply(lambda x: x not in model)] for d in token_df.columns], axis=1)

In [16]:
print('total de palabras no encontradas en el vocabulario', not_in_vocab.count().sum())
print('media de palabras no encontradas en el conjunto', not_in_vocab.count().mean())

total de palabras no encontradas en el vocabulario 38105
media de palabras no encontradas en el conjunto 10.134308510638299


In [17]:
pd.DataFrame([not_in_vocab[x].count().sum() for x in not_in_vocab.columns.levels[0]]).iplot(kind='bar')

In [18]:
not_in_vocab.columns = not_in_vocab.columns.droplevel()

In [19]:
not_in_vocab_words = pd.DataFrame(
    [x for sublist in [not_in_vocab[y].dropna().values for y in not_in_vocab.columns] for x in sublist]
).drop_duplicates()

In [20]:
def find_min_value(word, not_in_vocab_words):
    min_len = max(len(word) - 1, 1)
    max_len = len(word) + 1
    reduced_df = not_in_vocab_words.loc[(not_in_vocab_words[0].str.len() >= min_len) 
                                        & (not_in_vocab_words[0].str.len() <= max_len)]
    min_df = reduced_df[0].apply(lambda x: lv.distance(x, word))
    if len(min_df) > 0:
        min_idx = min_df.idxmin()
        min_val = min_df.min()
        if min_val < 3:
            old_word = not_in_vocab_words[0][min_idx]
            root_logger.info('current word %s - found_distance %s - idx to replace %s word replacement %s', old_word, min_val, min_idx, word)
            found_words[word] = { 'val': min_val, 'word': word, 'old_word': old_word }
    #         not_in_vocab[not_in_vocab == old_word] = word
            return not_in_vocab_words.drop(min_idx)
        else:
            return not_in_vocab_words
    else:
        return not_in_vocab_words

In [21]:
try:
    found_words_df = pd.read_pickle('data/results/'+name+'/cine/'+base_dir+'/found_words_vocab.pkl')
except Exception as e:
    found_words = dict()
    with tqdm(total=len(model.vocab)) as pbar:
        for x in model.vocab:
            if len(not_in_vocab_words) == 0:
                break
            pbar.update(1)
            not_in_vocab_words = find_min_value(x, not_in_vocab_words)
    found_words_df = pd.DataFrame(found_words)
    found_words_df.to_pickle('data/results/'+name+'/cine/'+base_dir+'/found_words_vocab.pkl')

In [8]:
try:
    wemb_df = pd.read_json('data/results/'+name+'/cine/'+base_dir+'/wemb_df.json')
except Exception as e:
    found_words_df = found_words_df.transpose()

    def replace_words(w, pbar):
        pbar.update(1)
        if not isinstance(w, str):
            return w
        try:
            return found_words_df[found_words_df.old_word == w].word[0]
        except:
            return w

    with tqdm(total=not_in_vocab.size) as pbar:
        not_in_vocab = not_in_vocab.applymap(lambda x: replace_words(x, pbar))

    not_in_vocab.columns = token_df.columns

    token_df.update(not_in_vocab)

    token_df.to_pickle('data/results/'+name+'/cine/'+base_dir+'/token_df.pkl')

    def get_wemb(w, pbar):
        pbar.update(1)
        if isinstance(w, str):
            return model[w] if w in model else np.array([0.0], dtype='float32')
        else:
            return w

    token_df = token_df.applymap(lambda x: np.array([0.0], dtype='float32') if not isinstance(x, str) else x)

    with tqdm(total=token_df.size) as pbar:
        wemb_df = token_df.transpose().applymap(lambda x: get_wemb(x, pbar))

wemb_values = wemb_df.values

wemb_df.to_json('data/results/'+name+'/cine/'+base_dir+'/wemb_df.json')

NameError: name 'found_words_df' is not defined

In [23]:
features=300
timesteps=len(wemb_df.columns)
input_dim = 200
neurons=256
input_shape=(timesteps, features)

In [24]:
X = np.array([sequence.pad_sequences(x, maxlen=features, dtype='float32') for x in wemb_values])

In [33]:
Y = to_categorical([x for x in train_corpus.polarity.values])

In [34]:
Y

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [27]:
polarity_dim

2

## Initialize DL

In [28]:
model_lstm = Sequential()
model_2lstm = Sequential()
model_2dcnn = Sequential()
model_2dcnnLstm = Sequential()
model_cnn = Sequential()
model_bidi = Sequential()

model_lstm.add(LSTM(neurons, activation='tanh', input_shape=input_shape, name='lstm'))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(polarity_dim, activation='softmax', name='dense'))
model_lstm.compile(loss="categorical_crossentropy", optimizer='adagrad', metrics=['accuracy'])

model_2lstm.add(LSTM(neurons, activation='tanh', input_shape=input_shape, return_sequences=True, name='lstm'))
model_2lstm.add(Dropout(0.5))
model_2lstm.add(LSTM(neurons, activation='tanh'))
model_2lstm.add(Dropout(0.5))
model_2lstm.add(Dense(polarity_dim, activation='softmax', name='dense'))
model_2lstm.compile(loss="categorical_crossentropy", optimizer='adagrad', metrics=['accuracy'])

model_2dcnn.add(Reshape((1, timesteps, features), input_shape=input_shape))
model_2dcnn.add(Conv2D(128, (4, 300), padding='same', name='conv_layer'))
model_2dcnn.add(Activation('relu'))
model_2dcnn.add(MaxPooling2D(pool_size=(2,2), strides=None))
model_2dcnn.add(Flatten())
model_2dcnn.add(Dropout(0.5))
model_2dcnn.add(Dense(polarity_dim, activation='softmax', name='dense'))
model_2dcnn.compile(loss="categorical_crossentropy", optimizer='adagrad', metrics=['accuracy'])

model_2dcnnLstm.add(Reshape((1, timesteps, features), input_shape=input_shape))
model_2dcnnLstm.add(Conv2D(128, (4, 300), padding='same', name='conv_layer'))
model_2dcnnLstm.add(Activation('relu'))
model_2dcnnLstm.add(MaxPooling2D(pool_size=(2,2), strides=None))
model_2dcnnLstm.add(TimeDistributed(Flatten()))
model_2dcnnLstm.add(LSTM(neurons))
model_2dcnnLstm.add(Dropout(0.5))
model_2dcnnLstm.add(Dense(polarity_dim, activation='softmax', name='dense'))
model_2dcnnLstm.compile(loss="categorical_crossentropy", optimizer='adagrad', metrics=['accuracy'])

model_cnn.add(Convolution1D(128, 3, padding='same', name='conv_layer', input_shape=input_shape))
model_cnn.add(Activation('tanh'))
model_cnn.add(MaxPooling1D(4))
model_cnn.add(LSTM(neurons, dropout=0.5))
model_cnn.add(Dense(polarity_dim, activation='softmax', name='dense'))
model_cnn.compile(loss="categorical_crossentropy", optimizer='adagrad', metrics=['accuracy'])

model_bidi.add(Bidirectional(LSTM(neurons), input_shape=input_shape))
model_bidi.add(Dense(polarity_dim, activation='softmax', name='dense'))
model_bidi.compile(loss="categorical_crossentropy", optimizer='adagrad', metrics=['accuracy'])


## Train

In [29]:
kf = KFold(n_splits=kfolds, shuffle=True, random_state=None) # realización de k-folds

In [30]:
monitor = 'val_loss'
patience = 5
cbks = [callbacks.EarlyStopping(monitor=monitor, patience=patience)]

In [31]:
pipeline = {
    'lstm': model_lstm,
    '2lstm': model_2lstm,
    '2dcnn': model_2dcnn,
    '2dcnn+lstm': model_2dcnnLstm,
    'cnn+lstm': model_cnn,
    'bidirectionalLstm': model_bidi
}

In [35]:
results = {}
with tqdm(total=len(clasificadores) * 10) as pbar:
    for c in clasificadores:
        results[c] = { 'real': {}, 'predicted': {} }
        i = 0
        for train_index, test_index in kf.split(train_corpus.content):
            train_x = X[train_index]
            train_y = Y[train_index]
            test_x = X[test_index]
            test_y = Y[test_index]
            print(train_y)
            if c == 'bidiLstm':
                train_x = [train_x, np.flipud(train_x)]
                
            pipeline[c].fit(train_x, train_y, batch_size=64, callbacks=cbks, epochs=1000, validation_split=0.25, shuffle=False, verbose=1)

            predicted = pipeline[c].predict(test_x)

            results[c]['real'][i] = test_y.tolist()
            results[c]['predicted'][i] = predicted.tolist()
            i = i + 1

            pbar.update(1)

    

  0%|          | 0/60 [00:00<?, ?it/s]

[[0. 1.]
 [1. 0.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]
Train on 2538 samples, validate on 846 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000


  2%|▏         | 1/60 [1:25:19<83:54:38, 5119.98s/it]

[[0. 1.]
 [1. 0.]
 [0. 1.]
 ...
 [0. 1.]
 [0. 1.]
 [1. 0.]]
Train on 2538 samples, validate on 846 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000


  3%|▎         | 2/60 [2:14:40<72:02:56, 4472.00s/it]


MemoryError: 

In [None]:
results

In [None]:
pd.DataFrame(results).to_pickle('data/results/'+name+'/cine/' + base_dir + '/' + name + '.pkl')