In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file("aclImdb_v1", url,
                                    untar=True, cache_dir='.',
                                    cache_subdir='')

dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
os.listdir(dataset_dir)

['test', 'imdbEr.txt', 'README', 'train', 'imdb.vocab']

In [94]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['.ipynb_checkpoints',
 'urls_neg.txt',
 'unsupBow.feat',
 'neg',
 'urls_unsup.txt',
 'urls_pos.txt',
 'pos',
 'labeledBow.feat']

In [96]:
sample_file = os.path.join(train_dir + "/neg", 'n1.txt')
with open(sample_file) as f:
  print(f.read())

ЗАКОНОДАТЕЛЬНОЕ СОБРАНИЕ
ЕВРЕЙСКОЙ АВТОНОМНОЙ ОБЛАСТИ
ЗАКОН
ЕВРЕЙСКОЙ АВТОНОМНОЙ ОБЛАСТИ
от 30.05.2008 №378-ОЗ
О ГАРАНТИЯХ ОСУЩЕСТВЛЕНИЯ ПОЛНОМОЧИЙ ДЕПУТАТА ПРЕДСТАВИТЕЛЬНОГО ОРГАНА МУНИЦИПАЛЬНОГО ОБРАЗОВАНИЯ ЕВРЕЙСКОЙ АВТОНОМНОЙ ОБЛАСТИ 
Настоящий закон в соответствии со статьей 40 Федерального закона от 06.10.2003 №131-ФЗ«Об общих принципах организации местного самоуправления в Российской Федерации» устанавливает гарантии осуществления полномочий депутата представительного органа муниципального образования в Еврейской автономной области (далее - область).
Статья 1. Определение гарантий осуществления полномочий депутата представительного органа муниципального образования области
Под гарантиями осуществления полномочий депутата представительного органа муниципального образования области понимается обеспечение органами местного самоуправления муниципальных образований области условий для беспрепятственного осуществления на территории муниципального образования области депутатом представ

In [6]:
remove_dir = os.path.join(train_dir, 'unsup')
shutil.rmtree(remove_dir)

In [97]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='training', 
    seed=seed)

Found 12 files belonging to 3 classes.
Using 10 files for training.


In [98]:
for text_batch, label_batch in raw_train_ds.take(2):
  for i in range(3):
    print("Review", text_batch.numpy()[i])
    print("Label", label_batch.numpy()[i])

Review b'\xd0\xa3\xd0\x9f\xd0\xa0\xd0\x90\xd0\x92\xd0\x9b\xd0\x95\xd0\x9d\xd0\x98\xd0\x95 \xd0\x90\xd0\x92\xd0\xa2\xd0\x9e\xd0\x9c\xd0\x9e\xd0\x91\xd0\x98\xd0\x9b\xd0\xac\xd0\x9d\xd0\xab\xd0\xa5 \xd0\x94\xd0\x9e\xd0\xa0\xd0\x9e\xd0\x93 \xd0\x98 \xd0\xa2\xd0\xa0\xd0\x90\xd0\x9d\xd0\xa1\xd0\x9f\xd0\x9e\xd0\xa0\xd0\xa2\xd0\x90 \xd0\x9f\xd0\xa0\xd0\x90\xd0\x92\xd0\x98\xd0\xa2\xd0\x95\xd0\x9b\xd0\xac\xd0\xa1\xd0\xa2\xd0\x92\xd0\x90 \xd0\x95\xd0\x92\xd0\xa0\xd0\x95\xd0\x99\xd0\xa1\xd0\x9a\xd0\x9e\xd0\x99 \xd0\x90\xd0\x92\xd0\xa2\xd0\x9e\xd0\x9d\xd0\x9e\xd0\x9c\xd0\x9d\xd0\x9e\xd0\x99 \xd0\x9e\xd0\x91\xd0\x9b\xd0\x90\xd0\xa1\xd0\xa2\xd0\x98\r\n\r\n\xd0\x9f\xd0\xa0\xd0\x98\xd0\x9a\xd0\x90\xd0\x97\r\n\xd0\xbe\xd1\x82 05 \xd0\xb4\xd0\xb5\xd0\xba\xd0\xb0\xd0\xb1\xd1\x80\xd1\x8f 2012 \xd0\xb3\xd0\xbe\xd0\xb4\xd0\xb0 \xe2\x84\x96319\r\n\r\n\xd0\x9e \xd0\x9a\xd0\x9e\xd0\x9c\xd0\x98\xd0\xa1\xd0\xa1\xd0\x98\xd0\x98 \xd0\x9f\xd0\x9e \xd0\xa1\xd0\x9e\xd0\x91\xd0\x9b\xd0\xae\xd0\x94\xd0\x95\xd0\x9d\xd0\x

In [99]:
print("Label 0 corresponds to", raw_train_ds.class_names[0])
print("Label 1 corresponds to", raw_train_ds.class_names[1])

Label 0 corresponds to .ipynb_checkpoints
Label 1 corresponds to neg


In [100]:
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/train', 
    batch_size=batch_size, 
    validation_split=0.2, 
    subset='validation', 
    seed=seed)

Found 12 files belonging to 3 classes.
Using 2 files for validation.


In [101]:
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'aclImdb/test', 
    batch_size=batch_size)

Found 25000 files belonging to 2 classes.


In [102]:
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [103]:
max_features = 40000
sequence_length = 250

vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

In [104]:
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [105]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [106]:
text_batch, label_batch = next(iter(raw_train_ds))
first_review, first_label = text_batch[0], label_batch[0]
print("Review", first_review)
print("Label", raw_train_ds.class_names[first_label])
print("Vectorized review", vectorize_text(first_review, first_label))

Review tf.Tensor(b'(\xd0\xa3\xd1\x82\xd1\x80\xd0\xb0\xd1\x82\xd0\xb8\xd0\xbb \xd1\x81\xd0\xb8\xd0\xbb\xd1\x83: \xd0\xbf\xd1\x80\xd0\xb8\xd0\xba\xd0\xb0\xd0\xb7 \xd1\x83\xd0\xbf\xd1\x80\xd0\xb0\xd0\xb2\xd0\xbb\xd0\xb5\xd0\xbd\xd0\xb8\xd1\x8f \xd0\xbf\xd0\xbe \xd1\x82\xd1\x80\xd1\x83\xd0\xb4\xd1\x83 \xd0\xbf\xd1\x80\xd0\xb0\xd0\xb2\xd0\xb8\xd1\x82\xd0\xb5\xd0\xbb\xd1\x8c\xd1\x81\xd1\x82\xd0\xb2\xd0\xb0 \xd0\x95\xd0\xb2\xd1\x80\xd0\xb5\xd0\xb9\xd1\x81\xd0\xba\xd0\xbe\xd0\xb9 \xd0\xb0\xd0\xb2\xd1\x82\xd0\xbe\xd0\xbd\xd0\xbe\xd0\xbc\xd0\xbd\xd0\xbe\xd0\xb9 \xd0\xbe\xd0\xb1\xd0\xbb\xd0\xb0\xd1\x81\xd1\x82\xd0\xb8 \xd0\xbe\xd1\x82 05.05.2017 \xe2\x84\x9635-\xd0\x9e\xd0\x94)\r\n\xd0\xa3\xd0\x9f\xd0\xa0\xd0\x90\xd0\x92\xd0\x9b\xd0\x95\xd0\x9d\xd0\x98\xd0\x95 \xd0\x9f\xd0\x9e \xd0\xa2\xd0\xa0\xd0\xa3\xd0\x94\xd0\xa3 \xd0\x9f\xd0\xa0\xd0\x90\xd0\x92\xd0\x98\xd0\xa2\xd0\x95\xd0\x9b\xd0\xac\xd0\xa1\xd0\xa2\xd0\x92\xd0\x90 \xd0\x95\xd0\x92\xd0\xa0\xd0\x95\xd0\x99\xd0\xa1\xd0\x9a\xd0\x9e\xd0\x99 \xd0

In [107]:
print("1287 ---> ",vectorize_layer.get_vocabulary()[2])
print(" 313 ---> ",vectorize_layer.get_vocabulary()[13])
print('Vocabulary size: {}'.format(len(vectorize_layer.get_vocabulary())))

1287 --->  в
 313 --->  или
Vocabulary size: 2939


In [108]:
train_ds = raw_train_ds.map(vectorize_text)
val_ds = raw_val_ds.map(vectorize_text)
test_ds = raw_test_ds.map(vectorize_text)

In [109]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [110]:
embedding_dim = 16

In [111]:
model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(1)])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 16)          640016    
_________________________________________________________________
dropout_6 (Dropout)          (None, None, 16)          0         
_________________________________________________________________
global_average_pooling1d_3 ( (None, 16)                0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 640,033
Trainable params: 640,033
Non-trainable params: 0
_________________________________________________________________


In [93]:
!rm -rf aclImdb/train/pos/*

In [53]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

In [112]:
model.compile(loss=losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=tf.metrics.BinaryAccuracy(threshold=0.0))

In [114]:
epochs = 12
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


In [115]:
loss, accuracy = model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

Loss:  0.6932265758514404
Accuracy:  0.5
