## Handmade RNN

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow_text as text
import tensorflow_hub as hub

plt.style.use('ggplot')

In [2]:
# Read in data
df = pd.read_csv('Reviews.csv')
df = df.drop(['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis = 1)
df = df.head(50000)
df['Label'] = np.where(df['Score'] >= 3, 1, 0)
df.head()

Unnamed: 0,Id,Score,Text,Label
0,1,5,I have bought several of the Vitality canned d...,1
1,2,1,Product arrived labeled as Jumbo Salted Peanut...,0
2,3,4,This is a confection that has been around a fe...,1
3,4,2,If you are looking for the secret ingredient i...,0
4,5,5,Great taffy at a great price. There was a wid...,1


In [3]:
# Remove punctuation

reviews = list(df['Text'])
labels = list(df['Label'])

reviews_nopunct = []
for review in reviews:
  reviews_nopunct.append("".join([char for char in review if char not in string.punctuation]))

reviews = reviews_nopunct

# reviews = reviews.split('\n')
# labels = labels.split('\n')

In [4]:
# Tokenization, Lemmatization, Stemming. Label numerical encoding

import itertools

reviews_tokenized = []
for review in reviews:
  splitted_review = nltk.word_tokenize(review)
  splitted_review = [WordNetLemmatizer().lemmatize(w) for w in splitted_review]
  splitted_review = [PorterStemmer().stem(w).strip() for w in splitted_review]
  reviews_tokenized.append(splitted_review)
  
reviews_unrolled = list(itertools.chain(*reviews_tokenized))

In [5]:
# Remove empty reviews and the corresponding labels

empty_idx = []
for i, review in enumerate(reviews_tokenized):
  if len(review) == 0:
    empty_idx.append(i)
    
for i in empty_idx:
  reviews_tokenized.pop(i)
  reviews.pop(i)
  labels.pop(i)

In [6]:
# Create vocabulary, word2index reference and convert the reviews into numerical form

vocab_size = 10000

word_counter = Counter(reviews_unrolled)
word_counter = dict(word_counter.most_common(vocab_size))
word2index = {k:i for i,k in enumerate(word_counter.keys(), start = 3)}

reviews_int = []
for review in reviews_tokenized:
  cur_review = [1]
  for word in review:
    if word in word2index.keys():
      cur_review.append(word2index[word])
    else:
      cur_review.append(2)
  reviews_int.append(cur_review)

In [7]:
# Pad sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_reviews = pad_sequences(reviews_int, maxlen = 500, padding = 'pre', truncating = 'pre')

In [27]:
# Train test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_reviews, labels, test_size = 0.2, random_state = 1)

X_train = np.array(X_train).reshape(40000, 500)
X_test = np.array(X_test).reshape(10000, 500)

y_train = np.array(y_train).reshape(40000, 1)
y_test = np.array(y_test).reshape(10000, 1)

In [30]:
# Define the model

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalMaxPool1D, BatchNormalization, Dense, RNN, GRU, LSTM, TimeDistributed, Bidirectional, Activation, Embedding, Input, Conv1D, Dropout
import tensorflow as tf
import keras.backend as K

dropout_rate = 0.5

inputs = Input(shape = (X_train.shape[1:]))
x = Embedding(input_dim = vocab_size, output_dim = 128, input_length = 200)(inputs)
x = Conv1D(filters = 200, kernel_size = 13, strides = 1, padding = 'same', activation = 'relu')(x)
x = BatchNormalization()(x)
x = Dropout(dropout_rate)(x)
x = GRU(128, return_sequences = True)(x)
x = BatchNormalization()(x)
x = Dropout(dropout_rate)(x)
x = GRU(128, return_sequences = False)(x)
x = BatchNormalization()(x)
x = Dropout(dropout_rate)(x)
x = Dense(512, activation = 'relu')(x)
x = Dropout(dropout_rate)(x)
outputs = Dense(1, activation = 'sigmoid')(x)

model = Model(inputs = inputs, outputs = outputs)

model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001), loss = 'binary_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 500)]             0         
                                                                 
 embedding_5 (Embedding)     (None, 500, 128)          1280000   
                                                                 
 conv1d_5 (Conv1D)           (None, 500, 200)          333000    
                                                                 
 batch_normalization_7 (Batc  (None, 500, 200)         800       
 hNormalization)                                                 
                                                                 
 dropout_9 (Dropout)         (None, 500, 200)          0         
                                                                 
 gru_5 (GRU)                 (None, 500, 128)          126720    
                                                           

In [31]:
history = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Universal Sentence encoder

In [41]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow_text as text
import tensorflow_hub as hub

plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
# Read in data
df = pd.read_csv('Reviews.csv')
df = df.drop(['ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Time', 'Summary'], axis = 1)
df = df.head(50000)
df['Label'] = np.where(df['Score'] >= 3, 1, 0)
df.head()

Unnamed: 0,Id,Score,Text,Label
0,1,5,I have bought several of the Vitality canned d...,1
1,2,1,Product arrived labeled as Jumbo Salted Peanut...,0
2,3,4,This is a confection that has been around a fe...,1
3,4,2,If you are looking for the secret ingredient i...,0
4,5,5,Great taffy at a great price. There was a wid...,1


In [43]:
# Remove punctuation

reviews = list(df['Text'])
labels = list(df['Label'])

reviews_nopunct = []
for review in reviews:
  reviews_nopunct.append("".join([char for char in review if char not in string.punctuation]))

reviews = reviews_nopunct

# reviews = reviews.split('\n')
# labels = labels.split('\n')



In [44]:
# Train test split

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

X_train = np.array(X_train)
X_test = np.array(X_test)

y_train = np.array(y_train).reshape(40000, 1)
y_test = np.array(y_test).reshape(10000, 1)

In [45]:
# Define the model

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalMaxPool1D, BatchNormalization, Dense, RNN, GRU, LSTM, TimeDistributed, Bidirectional, Activation, Embedding, Input, Conv1D, Dropout
import tensorflow as tf
import keras.backend as K
import tensorflow_hub as hub

sample_sentence = 'This is a bad movie'

sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1", 
                                        output_shape=[256],
                                        input_shape=[], 
                                        dtype=tf.string, trainable = False)

model = Sequential([
  sentence_encoder_layer,
  Dense(1024, activation = 'relu'),
  Dense(1024, activation = 'relu'),
  Dense(1, activation = 'sigmoid')
])


model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001), loss = 'binary_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 keras_layer_1 (KerasLayer)  (None, 128)               124642688 
                                                                 
 dense_9 (Dense)             (None, 1024)              132096    
                                                                 
 dense_10 (Dense)            (None, 1024)              1049600   
                                                                 
 dense_11 (Dense)            (None, 1)                 1025      
                                                                 
Total params: 125,825,409
Trainable params: 1,182,721
Non-trainable params: 124,642,688
_________________________________________________________________


In [46]:
# Train the model

history = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
