In [1]:
# Import and data read

import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer


with open('reviews.txt', 'r') as f:
  reviews = f.read()
with open('labels.txt', 'r') as f:
  labels = f.read()

ModuleNotFoundError: No module named 'tensorflow.tsl'

In [2]:
# Remove punctuation

reviews = "".join([char for char in reviews if char not in string.punctuation])

reviews = reviews.split('\n')
labels = labels.split('\n')

In [3]:
# Tokenization, Lemmatization, Stemming. Label numerical encoding

import itertools

reviews_tokenized = []
for review in reviews:
  splitted_review = nltk.word_tokenize(review)
  splitted_review = [WordNetLemmatizer().lemmatize(w) for w in splitted_review]
  splitted_review = [PorterStemmer().stem(w).strip() for w in splitted_review]
  reviews_tokenized.append(splitted_review)
  
reviews_unrolled = list(itertools.chain(*reviews_tokenized))
  
labels = [1 if label == "positive" else 0 for label in labels]

In [4]:
# Remove empty reviews and the corresponding labels

empty_idx = []
for i, review in enumerate(reviews_tokenized):
  if len(review) == 0:
    empty_idx.append(i)
    
for i in empty_idx:
  reviews_tokenized.pop(i)
  reviews.pop(i)
  labels.pop(i)

In [5]:
# Create vocabulary, word2index reference and convert the reviews into numerical form

vocab_size = 10000

word_counter = Counter(reviews_unrolled)
# word_counter = dict(word_counter.most_common(vocab_size))
word2index = {k:i for i,k in enumerate(word_counter.keys(), start = 3)}

reviews_int = []
for review in reviews_tokenized:
  cur_review = [1]
  for word in review:
    if word in word2index.keys():
      cur_review.append(word2index[word])
    else:
      cur_review.append(2)
  reviews_int.append(cur_review)

In [6]:
# Pad sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_reviews = pad_sequences(reviews_int, maxlen = 500, padding = 'pre', truncating = 'pre')

In [7]:
# Train test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

X_train = np.array(X_train)
X_test = np.array(X_test)

y_train = np.array(y_train).reshape(20000, 1)
y_test = np.array(y_test).reshape(5000, 1)

In [10]:
# Define the model

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalMaxPool1D, BatchNormalization, Dense, RNN, GRU, LSTM, TimeDistributed, Bidirectional, Activation, Embedding, Input, Conv1D, Dropout
import tensorflow as tf
import keras.backend as K
import tensorflow_hub as hub

sample_sentence = 'This is a bad movie'

# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# embed_samples = embed([sample_sentence, 'This is my second sentence'])


# sentence_encoder_layer = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder/4',
#                                          input_shape = [],
#                                          dtype=tf.string,
#                                          trainable = False)

sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1", 
                                        output_shape=[256],
                                        input_shape=[], 
                                        dtype=tf.string, trainable = False)

model = Sequential([
  sentence_encoder_layer,
  Dense(1024, activation = 'relu'),
  Dense(1024, activation = 'relu'),
  Dense(1, activation = 'sigmoid')
])


# inputs = Input(shape = (X_train.shape[1:]))
# mask = tf.keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
# x = Embedding(input_dim = vocab_size, output_dim = 128, input_length = 200)(inputs)
# x = Conv1D(filters = 200, kernel_size = 13, strides = 1, padding = 'same', activation = 'relu')(x)
# x = BatchNormalization()(x)
# x = Dropout(dropout_rate)(x)
# x = GRU(128, return_sequences = True)(x)
# x = BatchNormalization()(x)
# x = Dropout(dropout_rate)(x)
# x = GRU(128, return_sequences = False)(x)
# x = BatchNormalization()(x)
# x = Dropout(dropout_rate)(x)
# x = Dense(512, activation = 'relu')(x)
# x = Dropout(dropout_rate)(x)
# outputs = Dense(1, activation = 'sigmoid')(x)



# model = Model(inputs = inputs, outputs = outputs)

model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001), loss = 'binary_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_1 (KerasLayer)  (None, 128)               124642688 
                                                                 
 dense_3 (Dense)             (None, 1024)              132096    
                                                                 
 dense_4 (Dense)             (None, 1024)              1049600   
                                                                 
 dense_5 (Dense)             (None, 1)                 1025      
                                                                 
Total params: 125,825,409
Trainable params: 1,182,721
Non-trainable params: 124,642,688
_________________________________________________________________


In [11]:
# Train the model

history = model.fit(X_train, y_train, epochs = 100, batch_size = 128, validation_data = (X_test, y_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100

KeyboardInterrupt: 

In [25]:
!git add.
!git commit -m "add universal sentence encoder option"

git: 'add.' is not a git command. See 'git --help'.

The most similar command is
	add


On branch master
Your branch is up to date with 'origin/master'.

Changes not staged for commit:
  (use "git add <file>..." to update what will be committed)
  (use "git restore <file>..." to discard changes in working directory)
	modified:   Sentiment_Analysis_RNN_update.ipynb

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	.ipynb_checkpoints/Sentiment_Analysis_RNN_TransferLearning_USE-checkpoint.ipynb
	Sentiment_Analysis_RNN_TransferLearning_USE.ipynb

no changes added to commit (use "git add" and/or "git commit -a")


In [6]:
import tensorflow_hub as hub

embed = hub.load("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1", output_dim = 256)
embeddings = embed(["cat is on the mat", "dog is in the fog"])

TypeError: load() got an unexpected keyword argument 'output_dim'

In [5]:
embeddings

<tf.Tensor: shape=(2, 128), dtype=float32, numpy=
array([[ 0.27107972, -0.01055073, -0.05728397,  0.06853679, -0.08438271,
         0.22396211, -0.00247001, -0.09797598, -0.06092518,  0.01678422,
         0.0183306 , -0.02683547,  0.01987647,  0.02205245,  0.0380337 ,
         0.02345292, -0.0535214 , -0.02916854, -0.13816142,  0.255649  ,
         0.00548296,  0.08994407,  0.09702856, -0.01617393,  0.15273312,
         0.03449007,  0.05599031,  0.01964826, -0.01901525,  0.11601479,
         0.06575833, -0.03560898, -0.02412845, -0.00716866, -0.08950593,
        -0.01021391,  0.07431487, -0.10462939, -0.03951982,  0.00272066,
        -0.01468687, -0.01350653, -0.04825642,  0.03088917, -0.0448269 ,
        -0.01743765,  0.1034883 ,  0.04149228, -0.03979184,  0.03878277,
         0.15273733, -0.09228262, -0.01723959,  0.01830614, -0.02075483,
         0.0800882 , -0.08071491, -0.15573218,  0.13893387,  0.06140287,
        -0.05639812, -0.05526257, -0.02765993, -0.175832  ,  0.01034007,
 