In [0]:
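# One-time environment setup. Uncomment and run these on a fresh machine only:
# they delete any local CSVs, download p.csv / n.csv and the w2v.zip archive,
# and install the extra Python packages.
# !rm *.csv
# !wget http://smartheatmap.ddns.net/dist/p.csv
# !wget http://smartheatmap.ddns.net/dist/n.csv
# !pip3 install wldhx.yadisk-direct hyperas hyperopt
# !curl -L $(yadisk-direct https://yadi.sk/d/NmzmzI1_v9tecQ) -o w2v.zip
# !unzip w2v.zip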

In [0]:
import pandas as pd
import numpy as np

# Both files are read the same way: ';'-separated, only the `ttext` column,
# malformed rows skipped instead of raising.
# NOTE(review): `error_bad_lines` is deprecated in newer pandas in favour of
# `on_bad_lines='skip'` -- switch once the target pandas version is confirmed.
CSV_OPTIONS = dict(sep=';', error_bad_lines=False, usecols=['ttext'])

data_positive = pd.read_csv('p.csv', **CSV_OPTIONS)
data_negative = pd.read_csv('n.csv', **CSV_OPTIONS)

# Balance the classes by truncating both sets to the size of the smaller one.
sample_size = min(len(data_positive), len(data_negative))

positive_texts = data_positive['ttext'].values[:sample_size]
negative_texts = data_negative['ttext'].values[:sample_size]
raw_data = np.concatenate((positive_texts, negative_texts), axis=0)

# Labels follow the concatenation order: positives (1) first, then negatives (0).
labels = [1] * sample_size + [0] * sample_size

In [0]:
import re

def clear_text(text):
    """Normalise one raw text before tokenisation.

    Steps, applied in order:
      1. lowercase the text and replace Cyrillic "io" (U+0451) with "ie" (U+0435);
      2. replace every ``www.`` / ``http(s)://`` link with the token ``URL``;
      3. replace every ``@mention`` with the token ``USER``;
      4. replace each run of characters that are not Latin letters, Cyrillic
         letters U+0430-U+044F / U+0410-U+042F or the digits 1-9 with one space;
      5. collapse repeated spaces and strip both ends.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string (may be empty).
    """
    # Cyrillic characters are written as \u escapes so the function keeps
    # working even if the notebook file is re-encoded incorrectly.
    text = text.lower().replace("\u0451", "\u0435")
    # Raw strings: `\.` and `\s` are regex escapes, not Python string escapes.
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub(r'@[^\s]+', 'USER', text)
    # NOTE(review): the digit range is 1-9, so '0' is treated as a separator.
    # Left unchanged because the pretrained embeddings were presumably built
    # with the same normalisation -- confirm before widening it to 0-9.
    text = re.sub(r'[^a-zA-Z\u0430-\u044f\u0410-\u042f1-9]+', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()


# Cleaned texts, in the same order as `raw_data` (and therefore as `labels`).
data = list(map(clear_text, raw_data))


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned texts for testing; the seed is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=1,
)

In [5]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle

# Length passed to `pad_sequences` as `maxlen` for every text.
SENTENCE_LENGTH = 26
# Vocabulary cap handed to the tokenizer (`num_words`); also used later as the
# number of rows of the embedding matrix.
NUM = 100000

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=NUM)
tokenizer.fit_on_texts(x_train)


def to_seq(x):
    """Convert texts into a padded matrix of token ids.

    Runs the module-level ``tokenizer`` over ``x`` and passes the result
    through ``pad_sequences`` with ``maxlen=SENTENCE_LENGTH``.
    """
    token_ids = tokenizer.texts_to_sequences(x)
    return pad_sequences(token_ids, maxlen=SENTENCE_LENGTH)


# Persist the fitted tokenizer so the same word -> id mapping can be reloaded.
with open('tokenizer.pickle', 'wb') as fh:
    pickle.dump(tokenizer, fh, protocol=pickle.HIGHEST_PROTOCOL)

# NOTE(review): rebinding x_train / x_test from texts to id matrices means this
# cell cannot be re-run without first re-running the train/test split cell.
x_train = to_seq(x_train)
x_test = to_seq(x_test)

Using TensorFlow backend.


In [6]:
from gensim.models import Word2Vec

wvmodel = Word2Vec.load("w2v/tweets_model.w2v")

DIM = wvmodel.vector_size

# Row i holds the pretrained vector of the word whose tokenizer index is i.
# Rows for words missing from the word2vec vocabulary stay all-zero.
embedding_matrix = np.zeros((NUM, DIM))

for word, i in tokenizer.word_index.items():
    # Indices >= NUM have no row in the matrix. `continue` (rather than
    # `break`) avoids depending on word_index being iterated in index order.
    if i >= NUM:
        continue
    # Membership is tested on the KeyedVectors object itself: this works on
    # gensim 3.x and 4.x, whereas `wv.vocab` no longer exists in gensim 4.
    if word in wvmodel.wv:
        embedding_matrix[i] = wvmodel.wv[word]

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:
from keras.layers import Input
from keras.layers.embeddings import Embedding

# Model input: SENTENCE_LENGTH int32 token ids per sample.
m_input = Input(shape=(SENTENCE_LENGTH,), dtype='int32')

# Embedding layer initialised from `embedding_matrix` and left trainable.
# Alternative kept from the original cell for reference: DIM=1000 with the
# same layer but no `weights` / `trainable` arguments.
m_embed = Embedding(
    input_dim=NUM,
    output_dim=DIM,
    input_length=SENTENCE_LENGTH,
    weights=[embedding_matrix],
    trainable=True,
)(m_input)

W0717 12:35:28.823692 139847317223296 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0717 12:35:28.850658 139847317223296 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0717 12:35:28.854964 139847317223296 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0717 12:35:28.869721 139847317223296 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0717 12:35:28.871300 1398473172

In [8]:
from keras import optimizers
from keras.layers import Dense, concatenate, Activation, Dropout
from keras.models import Model
from keras.layers.convolutional import Conv1D
from keras.layers.pooling import GlobalMaxPooling1D

# Regularise the embedded sequence before the convolutions.
x = Dropout(0.2)(m_embed)

# One convolutional branch per kernel size. Each branch is a single Conv1D with
# `filters_count` filters followed by global max pooling. This yields the same
# (batch, filters_count) features per kernel size as stacking `filters_count`
# separate single-filter layers, with 4 layers instead of 40.
# NOTE(review): initial weights and layer names differ from the per-filter
# layout, so checkpoints saved from that layout will not load into this model.
conv_branches = []
for kernel_size, filters_count in [(2, 10), (3, 10), (4, 10), (5, 10)]:
    branch = Conv1D(filters=filters_count,
                    kernel_size=kernel_size,
                    padding='valid',
                    activation='relu')(x)
    branch = GlobalMaxPooling1D()(branch)
    conv_branches.append(branch)

# Join the pooled features of all branches into one feature vector.
x = concatenate(conv_branches,
                axis=1)

x = Dropout(0.2)(x)
x = Dense(30, activation='relu')(x)
x = Dense(1)(x)

# Single sigmoid unit: the model outputs one value in (0, 1) per sample.
m_output = Activation('sigmoid')(x)

model = Model(inputs=[m_input],
              outputs=[m_output])

W0717 12:35:29.822127 139847317223296 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [0]:
from keras import backend as K


def precision(y_true, y_pred):
    """Batch-wise precision.

    The value is computed on the tensors of a single batch only. Predictions
    and products are clipped to [0, 1] and rounded before counting, so it is
    the count of rounded ``y_true * y_pred`` divided by the count of rounded
    ``y_pred``; ``K.epsilon()`` in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    selected = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(selected) + K.epsilon())


def recall(y_true, y_pred):
    """Batch-wise recall.

    The value is computed on the tensors of a single batch only. It is the
    count of rounded, clipped ``y_true * y_pred`` divided by the count of
    rounded, clipped ``y_true``; ``K.epsilon()`` in the denominator avoids
    division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    relevant = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(relevant) + K.epsilon())


def f1(y_true, y_pred):
    """Batch-wise F1 score.

    Harmonic mean of the module-level ``precision`` and ``recall`` metrics
    evaluated on the same batch. ``K.epsilon()`` in the denominator avoids
    division by zero when both are zero.
    """
    # Reuse the module-level metrics instead of nested copies, so the three
    # reported metrics can never drift apart.
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))

In [10]:
# Binary cross-entropy with Adam; the three custom metrics are reported next
# to accuracy under their function names (precision, recall, f1).
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', precision, recall, f1],
)

W0717 12:35:30.874514 139847317223296 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0717 12:35:30.916135 139847317223296 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [11]:
# Remove checkpoints left over from a previous run. `-f` keeps the cell from
# failing when there are none; the explicit `!` does not rely on automagic.
!rm -f cnn*

rm: cannot remove 'cnn*': No such file or directory


In [12]:
from keras.callbacks import ModelCheckpoint

# Save a checkpoint only when the validation F1 improves; the file name
# records the epoch number and the val_f1 value reached.
checkpoint = ModelCheckpoint("cnn-{epoch:02d}-{val_f1:.2f}.hdf5",
                             monitor='val_f1',
                             save_best_only=True,
                             mode='max',
                             period=1)

# `y_train` is a plain Python list; it is converted to an array so that inputs
# and targets are both numpy arrays (some Keras versions reject the mix).
history = model.fit(x_train,
                    np.asarray(y_train),
                    batch_size=32,
                    epochs=10,
                    validation_split=0.25,
                    callbacks=[checkpoint])

Train on 134307 samples, validate on 44769 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
  4352/134307 [..............................] - ETA: 4:41 - loss: 0.2486 - acc: 0.9014 - precision: 0.9056 - recall: 0.9021 - f1: 0.9005

E0717 13:06:31.570133 139847317223296 ultratb.py:147] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-12-3b7c340669eb>", line 14, in <module>
    callbacks = [checkpoint])
  File "/usr/local/lib/python3.6/dist-packages/keras/engine/training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "/usr/local/lib/python3.6/dist-packages/keras/engine/training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "/usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py", line 2675, in _call
    fetched = self._callable_fn(*array_vals)
  File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/client/session.py", line 1458, in __call__
    run_metadata_ptr)
KeyboardInterrupt

During handling of

KeyboardInterrupt: ignored

In [0]:
# Each sample goes through `clear_text` first, so that inference sees text
# normalised exactly like the training data (which was cleaned before the
# tokenizer was fitted).
model.predict(to_seq([clear_text(t) for t in [
                       "–ó–∞—á–µ–º –º–Ω–µ –ø–æ–¥—Ä—ã–≤–∞—Ç—å –∏ –±–µ–∑ —Ç–æ–≥–æ –Ω–µ —Å–∞–º—É—é –±–µ–∑—É–ø—Ä–µ—á–Ω—É—é —Ä–µ–ø—É—Ç–∞—Ü–∏—é –∫—É—Ä–∞—Ç–æ—Ä–æ–≤?",
                       "—Å–ª–æ–º–∞–Ω–Ω–∞—è –ª–∞–≤–æ—á–∫–∞",
                       "–ù—É —ç—Ç–æ –∂–µ –ª–æ–≥–∏—á–Ω–æ, —á—Ç–æ —è –¥–æ–∂–¥–∞–ª—Å—è, –ø–æ–∫–∞ —Ç—ã –ø—Ä–æ—á—Ç–µ—à—å –∏ —É–¥–∞–ª–∏–ª",
                       "—Ç—Ä–∞–Ω—Å–≥–µ–Ω–¥–µ—Ä –Ω–∞ —É–ª–∏—Ü–µ",
                       "–°–®–ê",
                       "–°–°–°–†",
                       "—Ñ–∞—à–∏—Å—Ç",
                       "–Ω–∞ —É–ª–∏—Ü–µ –ø–æ—Å—Ç–∞–≤–∏–ª–∏ —Ö—Ä–µ–Ω–æ–≤—É—é –∫—Ä–∞—Å–∏–≤—É—é —Å–∫–∞–º–µ–π–∫—É",
                       "–Ω–∞ —É–ª–∏—Ü–µ —Ö—Ä–µ–Ω–æ –ø–æ—Å—Ç–∞–≤–∏–ª–∏ –∫—Ä–∞—Å–∏–≤—É—é —Å–∫–∞–º–µ–π–∫—É",
                       "–Ω–∞ —É–ª–∏—Ü–µ –õ–µ–Ω–∏–Ω–∞ –ø–æ—Å—Ç–∞–≤–∏–ª–∏ —Ö—Ä–µ–Ω–æ–≤—É—é —Å–∫–∞–º–µ–π–∫—É",
                       "–∫—Ä–∞—Å–∏–≤—É—é —Å–∫–∞–º–µ–π–∫—É",
                       "–º–Ω–µ –Ω—Ä–∞–≤–∏—Ç—Å—è —ç—Ç–∞ —Ö—Ä–µ–Ω—å, –Ω–æ –±—É–¥–µ—Ç –Ω–µ—Ä–µ–∞–ª—å–Ω–æ –∫—Ä—É—Ç–æ",
                       "–ñ–∏—Ç—å –∫–æ–Ω–µ—á–Ω–æ —Ç—É—Ç –Ω–∞–≤–µ—Ä–Ω–æ–µ –Ω–µ—Ç",
                       "–û—á–µ–Ω—å-–æ—á–µ–Ω—å –∫—Ä–∞—Å–∏–≤–æ, —Å–∫–∞–∑–æ—á–Ω–æ, —Å–æ–ª–Ω–µ—á–Ω–æ, –∂–∞—Ä–∫–æ, –≤–æ–ª—à–µ–±–Ω–æ....–º–æ–∂–Ω–æ –º–Ω–æ–≥–æ —Å–ª–æ–≤ –Ω–∞–ø–∏—Å–∞—Ç—å, –∏ –≤—Å–µ –æ–Ω–∏ –±—É–¥—É—Ç –ø—Ä–æ —ç—Ç–æ –Ω–µ –∑–∞–±—ã–≤–∞–µ–º–æ–µ –º–µ—Å—Ç–æ. –Ø  –Ω–µ –∑–Ω–∞—é —É –∫–æ–≥–æ –∫–∞–∫. –ù–æ —è –æ—á–µ–Ω—å –ª—é–±–ª—é –°–æ—á–∏-–ê–¥–ª–µ—Ä –∏ –ª—é–±–ª—é —Ç—É—Ç –æ—Ç–¥—ã—Ö–∞—Ç—å —Å –¥–µ—Ç—å–º–∏ –∏ –æ–¥–Ω–∞ –∏ —Å –ø–æ–¥—Ä—É–≥–∞–º–∏ –∏ –≤–æ–æ–±—â–µ. –ñ–∏—Ç—å –∫–æ–Ω–µ—á–Ω–æ —Ç—É—Ç –Ω–∞–≤–µ—Ä–Ω–æ–µ –Ω–µ—Ç. –ù–æ –ª–µ—Ç–æ–º —Å—é–¥–∞-—ç—Ç–æ —Ç–æ—á–Ω–æ. –ù–∞ –≤—Å–µ –ª–µ—Ç–æ, —Å—é–¥–∞‚ò∫‚ò∫‚ò∫ü§©ü§©ü§©ü§©ü§©",
                       "—ç—Ç–∞ —Å–∫–∞–º–µ–π–∫–∞ –æ—á–µ–Ω—å –∫—Ä–∞—Å–∏–≤–∞—è",
                       "—Å–µ–≥–æ–¥–Ω—è –±—ã–ª–æ –≥—Ä—É—Å—Ç–Ω–æ",
                       "—ç—Ç–∞ —Å–∫–∞–º–µ–π–∫–∞ –ø–æ–ª–Ω–∞—è —á—É—à—å"]]))