In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from tensorflow.keras.utils import to_categorical
from keras.datasets import imdb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Load the IMDB review dataset with num_words=10000; load_data returns a training and a testing split.
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=10000)
# Merge both splits into single arrays; later cells slice their own test/train partitions from these.
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

In [None]:
# Summarise the merged dataset: distinct label values, distinct word ids and review-length statistics.
print("Categories:", np.unique(targets))
# np.hstack joins all reviews into one flat array of word ids so the distinct ids can be counted.
print("Number of unique words:", len(np.unique(np.hstack(data))))
# Number of word ids in each review.
length = [len(i) for i in data]
print("Average Review length:", np.mean(length))
print("Standard Deviation:", round(np.std(length)))

Categories: [0 1]
Number of unique words: 9998
Average Review length: 234.75892
Standard Deviation: 173
Label: 1


In [None]:
# Show the first sample: its label and its review as a list of integer word ids.
print("Label:", targets[0])
print(data[0])

Label: 1
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [None]:
# Translate the first review from word ids back to text.
index = imdb.get_word_index()
# Invert the word -> id mapping so that ids can be looked up.
reverse_index = {word_id: word for word, word_id in index.items()}
# Each id in the review is shifted down by 3 before the lookup; ids with no entry are shown as "#".
decoded = " ".join(reverse_index.get(token - 3, "#") for token in data[0])
print(decoded)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
# this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert # is an amazing actor and now the same being director # father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for # and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also # to the two little boy's that played the # of norman and paul they were just brilliant children are often left out of the # list i think because the stars that play them all grown up are such a big profile for th

In [None]:
def vectorize(sequences, dimension = 10000):
  """Multi-hot encode a collection of integer index sequences.

  Args:
    sequences: sized, indexable collection; each item is a sequence of
      integer column indices, all of which must be smaller than `dimension`.
    dimension: number of columns in the encoded output.

  Returns:
    A float array of shape (len(sequences), dimension) in which entry
    [r, c] is 1 when index c occurs in sequences[r] and 0 otherwise.
    An index that occurs several times in a row still yields a single 1.
  """
  row_count = len(sequences)
  multi_hot = np.zeros((row_count, dimension))
  for row in range(row_count):
    # Fancy indexing switches on every listed column of this row at once.
    multi_hot[row, sequences[row]] = 1
  return multi_hot

In [None]:
# Replace the per-review id lists with fixed-width multi-hot rows (vectorize's default dimension, 10000).
# NOTE(review): this overwrites `data`, so the cell is not idempotent - re-running it would feed the
# already encoded float rows back into vectorize as indices. Re-run the loading cell first.
data = vectorize(data)
# Cast the labels to float32.
targets = np.array(targets).astype("float32")

In [None]:
# Hold out the first 10000 samples for testing and keep the remainder for training.
# NOTE(review): getModel() reloads the data and builds its own split, so these globals do not
# appear to be used by the cells visible here - confirm before relying on them.
test_x = data[:10000]
test_y = targets[:10000]
train_x = data[10000:]
train_y = targets[10000:]

In [None]:
def getModel(num = 10000, test_size = 10000):
  """Train a small dense sentiment classifier on IMDB using `num`-wide multi-hot input.

  The dataset is reloaded with `num_words=num`, both predefined splits are
  merged, every review is multi-hot encoded with `vectorize(..., num)`, and a
  three-hidden-layer dense network is trained for 2 epochs.

  Args:
    num: passed to `imdb.load_data` as `num_words` and used as the width of
      the encoded input vectors (and therefore the model's input shape).
    test_size: number of leading samples held out as validation data. It is
      independent of `num`, so runs with different `num` values are scored
      on the same hold-out set. The default of 10000 reproduces the previous
      behaviour of `getModel(10000)`.

  Returns:
    The History object returned by `model.fit`. The trained network itself
    is available through its `.model` attribute.
  """
  (training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=num)
  data = np.concatenate((training_data, testing_data), axis=0)
  targets = np.concatenate((training_targets, testing_targets), axis=0)

  data = vectorize(data, num)
  targets = np.array(targets).astype("float32")

  # The split must not depend on `num`: slicing with the vocabulary size made
  # every run validate on a differently sized test set, so the reported
  # accuracies were not comparable.
  test_x = data[:test_size]
  test_y = targets[:test_size]
  train_x = data[test_size:]
  train_y = targets[test_size:]

  model = keras.Sequential()
  # Input - Layer
  model.add(layers.Dense(50, activation = "relu", input_shape=(num, )))
  # Hidden - Layers
  model.add(layers.Dropout(0.3, noise_shape=None, seed=None))
  model.add(layers.Dense(50, activation = "relu"))
  model.add(layers.Dropout(0.2, noise_shape=None, seed=None))
  model.add(layers.Dense(50, activation = "relu"))
  # Output - Layer: a single sigmoid unit for the binary label
  model.add(layers.Dense(1, activation="sigmoid"))
  model.summary()

  model.compile(optimizer = "adam", loss = "binary_crossentropy", metrics = ["accuracy"])
  results = model.fit(train_x, train_y, epochs= 2, batch_size = 500, validation_data = (test_x, test_y))

  return results

---
**Модель 1**

> Длина: 10 000

> Точность: 0.9189

In [None]:
# getModel returns the object produced by model.fit, so despite its name `model` holds the
# training history; later cells reach the trained network through `model.model`.
model = getModel(10000)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                500050    
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_2 (Dense)             (None, 50)                2550      
                                                                 
 dense_3 (Dense)             (None, 1)                 51        
                                                                 
Total params: 505,201
Trainable params: 505,201
Non-trai

---
**Модель 2**

> Длина: 6 000

> Точность: 0.9081

In [None]:
# Repeat the training with num=6000 (used as num_words for loading and as the input vector width).
getModel(6000)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 50)                300050    
                                                                 
 dropout_2 (Dropout)         (None, 50)                0         
                                                                 
 dense_5 (Dense)             (None, 50)                2550      
                                                                 
 dropout_3 (Dropout)         (None, 50)                0         
                                                                 
 dense_6 (Dense)             (None, 50)                2550      
                                                                 
 dense_7 (Dense)             (None, 1)                 51        
                                                                 
Total params: 305,201
Trainable params: 305,201
Non-tr

<keras.callbacks.History at 0x7fed743b9090>

---
**Модель 3**

> Длина: 1 000

> Точность: 0.8567

In [None]:
# Repeat the training with num=1000 (used as num_words for loading and as the input vector width).
getModel(1000)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 50)                50050     
                                                                 
 dropout_6 (Dropout)         (None, 50)                0         
                                                                 
 dense_13 (Dense)            (None, 50)                2550      
                                                                 
 dropout_7 (Dropout)         (None, 50)                0         
                                                                 
 dense_14 (Dense)            (None, 50)                2550      
                                                                 
 dense_15 (Dense)            (None, 1)                 51        
                                                                 
Total params: 55,201
Trainable params: 55,201
Non-trai

<keras.callbacks.History at 0x7fed747d0450>

In [None]:
def eval(model, text, num):
  """Classify a free-text review as positive or negative with a trained model.

  NOTE: the name shadows Python's built-in `eval`; it is kept because other
  cells call the function by this name.

  The text is encoded the way `imdb.load_data(num_words=num)` encodes the
  training reviews (per the Keras defaults start_char=1, oov_char=2,
  index_from=3), so the model sees the same feature layout it was trained on:
    * words are lower-cased and stripped of surrounding punctuation before
      the lookup, because the word index holds lower-case, punctuation-free keys;
    * the sequence starts with the start id 1;
    * known words map to `rank + 3`;
    * unknown words, and words whose id would be >= `num`, map to the
      out-of-vocabulary id 2. Without this an id past the last column makes
      `vectorize` raise IndexError.
  No padding is applied: the multi-hot encoding is already fixed-width, and a
  pad id of 0 would switch on a feature that never occurs in training.

  Args:
    model: trained model exposing `predict`, expecting input of width `num`.
    text: the review as a plain string.
    num: width of the multi-hot vector; must match the `num` used in training.

  Returns:
    "Положительный обзор" if the predicted score is above 0.5,
    otherwise "Отрицательный обзор".
  """
  import string  # local import: only needed here, keeps the cell self-contained

  start_id, oov_id, index_offset = 1, 2, 3

  word_index = keras.datasets.imdb.get_word_index()

  encoded = [start_id]
  for raw_word in text.lower().split():
    # Strip only leading/trailing punctuation so contractions such as "wouldn't" stay intact.
    word = raw_word.strip(string.punctuation)
    if not word:
      continue
    rank = word_index.get(word)
    token_id = oov_id if rank is None else rank + index_offset
    # Ids outside the model's vocabulary are folded into the OOV id.
    encoded.append(token_id if token_id < num else oov_id)

  features = vectorize([encoded], num)
  prediction = model.predict(features)
  # predict returns an array for the single-row batch; take its one score as a plain float.
  score = float(np.asarray(prediction).reshape(-1)[0])
  return "Положительный обзор" if score > 0.5 else "Отрицательный обзор"





---

**Пользовательский ввод**

In [None]:
# Classify a hand-written review with the network trained in the num=10000 run.
text = "The movie sucks. Wouldn't recommend anybody to waste their time on it."
# `model` holds the value returned by getModel (the fit history), hence `model.model` for the network.
result = eval(model.model, text, 10000)
print(result)

Отрицательный обзор


In [None]:
# Classify a second hand-written review with the same network (num=10000).
text = "Got enjoyed by that movie. 5 stars out of 5"
# `model` holds the value returned by getModel (the fit history), hence `model.model` for the network.
result = eval(model.model, text, 10000)
print(result)

Положительный обзор


---
**Выводы**

> При уменьшении размера вектора представления текста точность модели снижается.