In [3]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import tensorflow as tf
from keras.models import Sequential, Model, load_model
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [4]:
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# **Подготовка данных**

Чистим данные. Удаляем из текста эмодзи, знаки пунктуации, и часто встречающийся символ "\xa0". Представляем предложения в виде векторов.

In [5]:
import re
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F1F2-\U0001F1F4"  # Macau flag
        u"\U0001F1E6-\U0001F1FF"  # flags
        u"\U0001F600-\U0001F64F"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U0001F1F2"
        u"\U0001F1F4"
        u"\U0001F620"
        u"\u200d"
        u"\u2640-\u2642"
        "]+", flags=re.UNICODE)

def remove_emoji(text):
    return emoji_pattern.sub(r'', text)

In [6]:
with open("train.json", encoding='utf-8') as f:
    train_data = json.load(f)

with open("test.json", encoding='utf-8') as f:
    test_data = json.load(f)


In [7]:
X_train = np.array(list(train_data.keys()), dtype='str')
y_train = np.array(list(train_data.values()))

X_test = np.array(list(test_data.keys()), dtype='str')
y_test = np.array(list(test_data.values()))


In [8]:
data = pd.DataFrame({'text': np.concatenate([X_train, X_test], axis=0), 'funny': np.concatenate([y_train, y_test], axis=0)})

In [9]:
data.text = data.text.apply(lambda x: x.replace("\xa0", " "))
data.text = data.text.apply(lambda x: x.replace("quote", " "))
data.text = data.text.apply(remove_emoji)

In [10]:
X_train = data.text.values[:len(X_train)]
y_train = data.funny.values[:len(X_train)]

X_test = data.text.values[len(X_train):]
y_test = data.funny.values[len(X_train):]

In [11]:
filters = '!"#$%&()+,-./:;<=>?@[\]^*_`{|}~\t\n№…«»–”„☪●☼•—'
tokenizer = Tokenizer(num_words=50000, filters=filters)
tokenizer.fit_on_texts(X_train)

In [12]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [13]:
mylen = np.vectorize(len)
maxlen = mylen(X_train).max()
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
maxlen

70

In [14]:
vocab_size = len(tokenizer.word_index) + 1  
vocab_size

238642

# **Модель 1. Пара слоёв LSTM**

In [25]:
embedding_dim = 256

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Bidirectional(layers.LSTM(embedding_dim, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(128)))
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', f1_m])

history = model.fit(X_train, y_train,
                    shuffle=True,
                    epochs=2,
                    validation_split = 0.2,
                    batch_size=256)

Train on 201132 samples, validate on 50284 samples
Epoch 1/2
Epoch 2/2


**Результат f1 score на тестовых данных**

In [26]:
f1_score = model.evaluate(X_test, y_test)[2]
f1_score



0.7259800225996904

# **Модель 2. CNN с фильтрами разного размера**

In [32]:
embedding_dim = 256

filter_sizes = [2, 3, 5, 7]
conv_filters = []

input_tensor = layers.Input(shape=(maxlen,))
input_layer = layers.Embedding(vocab_size, embedding_dim, input_length=maxlen)(input_tensor)

for f_size in filter_sizes:
    conv_filter = layers.Conv1D(128, f_size, activation='relu')(input_layer)
    conv_filter = layers.GlobalMaxPooling1D()(conv_filter)
    conv_filters.append(conv_filter)

conc_layer=layers.Concatenate()(conv_filters)
graph = Model(inputs=input_tensor, outputs=conc_layer)

model = Sequential()
model.add(graph)
model.add(layers.Dropout(0.5))
model.add(layers.Dense(len(conv_filters), activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy', f1_m])

history = model.fit(X_train, y_train,
                    shuffle=True,
                    epochs=2,
                    validation_split = 0.2,
                    batch_size=128)

Train on 201132 samples, validate on 50284 samples
Epoch 1/2
Epoch 2/2


**Результат f1 score на тестовых данных**

In [33]:
f1_score = model.evaluate(X_test, y_test)[2]
f1_score



0.7204401534047448

# **Модель 3. Классический ML**

In [13]:
from sklearn.ensemble import RandomForestClassifier as RFC

clf = RFC(n_estimators = 100)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

**Результат f1 score на тестовых данных**

In [14]:
clf.score(X_test, y_test)

0.690649577628896