<h2>Урок 8 - RNN<h2/>

In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [2]:
import tensorflow_datasets as tfds
tf.config.experimental.set_visible_devices([], 'GPU')
tfds.disable_progress_bar()

In [3]:
tfds.question_answering.ai2_arc

<module 'tensorflow_datasets.question_answering.ai2_arc' from 'C:\\Users\\Arhio\\anaconda3\\lib\\site-packages\\tensorflow_datasets\\question_answering\\ai2_arc.py'>

In [4]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

In [5]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.layers import concatenate, Input, Dense, Dropout, BatchNormalization, Flatten, Conv1D, Conv2D, LSTM

In [6]:
%load_ext tensorboard

In [7]:
dataset, metadata = tfds.load('civil_comments',
                              with_info=True, as_supervised=True)

In [8]:
metadata

tfds.core.DatasetInfo(
    name='civil_comments',
    full_name='civil_comments/CivilComments/1.1.2',
    description="""
    This version of the CivilComments Dataset provides access to the primary
    seven labels that were annotated by crowd workers, the toxicity and other
    tags are a value between 0 and 1 indicating the fraction of annotators that
    assigned these attributes to the comment text.
    
    The other tags are only available for a fraction of the input examples. They
    are currently ignored for the main dataset; the CivilCommentsIdentities set
    includes those labels, but only consists of the subset of the data with them.
    The other attributes that were part of the original CivilComments release are
    included only in the raw data. See the Kaggle documentation for more details
    about the available features.
    
    The comments in this dataset come from an archive of the Civil Comments
    platform, a commenting plugin for independent news sites. Thes

In [9]:
BUFFER_SIZE = 1024
BATCH_SIZE = 32

In [10]:
train = dataset['train']
test = dataset['test']
validation = dataset['validation']

In [11]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [12]:
stop_words = set(stopwords.words('english'))

In [13]:
def preprocess_image_train(text, toxicity):
    lowercase = tf.strings.lower(text)
    text = tf.strings.regex_replace(lowercase, "<br />", " ")
    text = tf.strings.regex_replace(text,r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', ' ')
    text = tf.strings.regex_replace(text, r'@([A-Za-z0-9_]+)', ' ' )
    for i in stop_words:
        text = tf.strings.regex_replace(text, f' {i} ', " ")
    
    return text, toxicity

In [14]:
train = train.map(
    preprocess_image_train, num_parallel_calls=AUTOTUNE).cache().shuffle(
    BUFFER_SIZE).batch(BATCH_SIZE)

test = test.map(
    preprocess_image_train, num_parallel_calls=AUTOTUNE).cache().shuffle(
    BUFFER_SIZE).batch(BATCH_SIZE)

validation = validation.map(
    preprocess_image_train, num_parallel_calls=AUTOTUNE).cache().shuffle(
    BUFFER_SIZE).batch(BATCH_SIZE)

In [15]:
sample_train = next(iter(train))
sample_test = next(iter(test))

In [16]:
sample_train

(<tf.Tensor: shape=(32,), dtype=string, numpy=
 array([b'he lived house makes  % liable! father win.',
        b'why bother democracy middle east.\n\nbeen there, tried iraq.\n\nnot muslim gene make up.  like theocracy tyrants.',
        b'this way anyone live. living like glad longer deal world.\n\nthe world likes youth, good looks, good health, money fame. farther away away five things, less world wants bother you. \n\nto sleep, perchance dream. think pass away, better alive world.\n\n"for three last five years, government determined cost living increase needed protect social security recipients. now, learn projected increase   average $  per month \xe2\x80\x94 enough purchase gallon milk!"\n\n"in lane county, due budget cuts, applications assistance filed   processed!"\n\nsix year wait applications processed. could easily dead then. treat old poor. like said, dead peace do.',
        b'islam gives rise radicalization - nature.',
        b'it getting funny watching msm bury fake news.

In [17]:
sample_test

(<tf.Tensor: shape=(32,), dtype=string, numpy=
 array([b'you simply wrong.',
        b"walker must recalled.   walker like sick gambler keeps betting he's sick can't help himself. walker obsessed gas line unwilling consider alaska can't afford time . he's sick keeps spending more line may never built.    another reason walker must recalled lack ethics.   last week repaid campaign debt giving french job . bad enough hired law partner become ag could sell business together---now . guy resigns walker unethical hires .  walker went japan talk gas forced state pay half wife's airfare . blew state money want away days .   ripped every alaskan half pfd check money spend gas line . must recall guy even damage alaska people.",
        b'the president good taking three positions every issue. people still trust despite unreliability.',
        b'this consistent ruling november  ,  ,  supreme court canada upheld decisions bc supreme court bc court appeal dismiss claim lax kw\'alaams first nation (

In [18]:
VOCAB_SIZE = 1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train.map(lambda text, label: text))


In [19]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'would', 'people', 'one', 'like', 'the', 'i', 'trump',
       'it', 'get', 'us', 'time', 'think', 'many', 'you', 'know', 'good',
       'even', 'right'], dtype='<U14')

In [29]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    #tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [30]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['mae'])

In [32]:
history = model.fit(train, epochs=1,
                    validation_data=validation,
                    validation_steps=30)



In [33]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, None)              0         
_________________________________________________________________
embedding_2 (Embedding)      (None, None, 64)          64000     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 32)                2080      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                

In [72]:
test_loss, test_acc = model.evaluate(test)

print('Test Loss:', test_loss)
print('Test MAE:', test_acc)

Test Loss: 0.29595786333084106
Test MAE: 0.11881496012210846


In [36]:
pred = model.predict(test)

In [51]:
t = test.map(lambda t, l: l).as_numpy_iterator()
flat_list = [item for sublist in t for item in sublist]
len(flat_list)

97320

In [84]:
acc = [1 - abs(e - flat_list[p])  for p, e in enumerate(list(list([e if e > 0.31 else 0 for e in pred[:, 0] ])))]
print(f'Точность модели {sum(acc) / len(acc)}')

Точность 0.8880650090984845


Вывод: При простой модели возможно довести до 89 % точности классификации токсичных комментарий пользователя.