In [1]:
import pandas as pd
import numpy as np
import re
import string

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import spacy
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/sarah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv("reviews.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [20]:
# Data Attributes
def averages(text):
    """
    This function calculates the average number of words per sentence and the average number of sentences per entry.
    @param text: pandas series; array of strings
    @rvalue:
    @rvalue:
    """
    total_sentences = 0
    total_words = 0
    # for each processed document of the text
    for doc in nlp.pipe(text, disable=["ner", "tagger"]): # nlp.pipe includes different components of the text
        sentences = list(doc.sents) # extract all sentences for each entry
        total_sentences += len(sentences) # total number of sentences in text
        total_words += sum([len(sentence) for sentence in sentences]) # total number of words in text
    
    avg_words_per_sentence = total_words / total_sentences if total_sentences else 0 # average number of words per sentence
    avg_num_sentences = total_sentences / len(text) if len(text) else 0 # average number of sentences across text
    
    return avg_words_per_sentence, avg_num_sentences

avg_words, avg_sent = averages(data['review'][:len(data['review'])//10])



In [11]:
avg_words, avg_sent

(20.16343979755327, 13.3566)

In [3]:
# Truncate text
def truncate_text(text, min_words, min_sent):
    """
    This function takes in a text and truncates it so that there are only two sentences and if the number of words in the 
    text is less than min_words, it adds another sentence to reach at least min_words.
    
    """
    doc = nlp(text) # create a document that stores different components of text
    sentences = list(doc.sents) # list of sentences of text
    
    new_text = []
    word_count = 0 # records number of words in text
    sent_count = 0 # records number of sentences in text
    
    # for each sentence in text
    for sentence in sentences:
        sent_word_count = len(sentence.text.split()) # count number of words in each sentence
        # add a sentence to meet requirements
        if word_count + sent_word_count <= min_words or sent_count < min_sent:
            new_text.append(sentence.text) # add sentence
            word_count += sent_word_count # keep track of words added
            sent_count += 1 # keep track of sentences added
        # once requirements are met break
        if word_count >= min_words and sent_count >= min_sent:
            break
    
    # if there are less than min_words in text, and no remaining sentences to add, keep text as is
    return ' '.join(new_text)

In [4]:
new_data = data.iloc[:len(data)//10].copy()
new_data["new_data"] = new_data["review"].apply(lambda x: truncate_text(x, min_words = 50, min_sent = 2))
new_data[:10]

Unnamed: 0,review,sentiment,new_data
0,One of the other reviewers has mentioned that ...,positive,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,negative,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love in the Time of Money"" is..."
5,"Probably my all-time favorite movie, a story o...",positive,"Probably my all-time favorite movie, a story o..."
6,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...
7,"This show was an amazing, fresh & innovative i...",negative,"This show was an amazing, fresh & innovative i..."
8,Encouraged by the positive comments about this...,negative,Encouraged by the positive comments about this...
9,If you like original gut wrenching laughter yo...,positive,If you like original gut wrenching laughter yo...


In [5]:
# Preprocessing Data
def clean(data, column):
    """
    This function takes in a data frame and column name of the text data. It converts all letters to lowercase, removes HTML tags,
    removes punctuation, removes unnecessary spaces, and removes duplicates.
    @param data: data frame with string column
    @param column: column name of string column
    """
    clean_data = (data[column] # Reduce the data to a specific column
                .str.lower() # Convert to lowercase
                .apply(lambda x: re.sub('<.*?>', ' ', x)) # Replace HTML tags with a space
                .apply(lambda x: re.sub(r'[^\w\s]', '', x)) # Remove punctuation
                .apply(lambda x: re.sub(r'\s{2,}', ' ', x)) # Replace 2+ consecutive spaces with a single space
                .drop_duplicates()) # Remove duplicates
    return clean_data

In [6]:
text = clean(new_data, 'new_data')
print(text)

0       one of the other reviewers has mentioned that ...
1       a wonderful little production the filming tech...
2       i thought this was a wonderful way to spend ti...
3       basically theres a family where a little boy j...
4       petter matteis love in the time of money is a ...
                              ...                        
4995    an interesting slasher film with multiple susp...
4996    i watched this series when it first came out i...
4997    once again jet li brings his charismatic prese...
4998    i rented this movie after hearing chris gore s...
4999    this was a big disappointment for me i think t...
Name: new_data, Length: 4996, dtype: object


In [14]:
len(new_data), type(new_data)

(5000, pandas.core.frame.DataFrame)

In [8]:
# Tokenize Data
# represent each word as a numerical value
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(text) # fit on series of text
total_words = len(tokenizer.word_index) + 1 # length of word index
total_words

21125

In [23]:
# Create input_sequences
input_sequences = []
for line in text: # for each review
    token_list = tokenizer.texts_to_sequences([line])[0] # map each unique word to an integer with tokenizer
    # Creating n gram for each review
    for i in range(1, len(token_list)): 
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence) # input_sequences is a list of sequences from tokenized reviews
# padding sequences so each sequence in input_sequences has the same length
max_sequence_len = max([len(x) for x in input_sequences]) # identify length of largest sequence
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre') # pad sequences with zeros

# Generate predictor and target data
x = input_sequences[:,:-1] # the tokenized sequences minus the last token
y = input_sequences[:,-1] # the last token for each tokenized sequence

In [24]:
print("max_sequence_len:", max_sequence_len) 
print("Shape of x: ", x.shape, " Type of x: ", type(x))
print("Shape of y: ", y.shape, " Type of y: ", type(y))

max_sequence_len: 198
Shape of x:  (248366, 197)  Type of x:  <class 'numpy.ndarray'>
Shape of y:  (248366,)  Type of y:  <class 'numpy.ndarray'>


In [25]:
# One-hot encoding
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words)) # convert to a binary class matrix

In [27]:
x.shape, type(x), y.shape, type(y)

((248366, 197), numpy.ndarray, (248366, 21125), numpy.ndarray)

Even though 'x' and 'y' are numpy arrays, 'np.array()' has to be used in order to avoid errors when fitting the model.

In [28]:
# Prepare data for model 
# convert data to numpy arrays
x = np.array(x)
y = np.array(y)

In [29]:
x.shape, type(x), y.shape, type(y)

((248366, 197), numpy.ndarray, (248366, 21125), numpy.ndarray)

In [31]:
# Model 1
model = tf.keras.models.Sequential([
        layers.Embedding(total_words, 100, input_length = max_sequence_len-1),
        layers.LSTM(100),
        layers.Dense(total_words, activation='softmax'),
    ])
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 197, 100)          2112500   
                                                                 
 lstm_4 (LSTM)               (None, 100)               80400     
                                                                 
 dense_4 (Dense)             (None, 21125)             2133625   
                                                                 
Total params: 4326525 (16.50 MB)
Trainable params: 4326525 (16.50 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
callback = EarlyStopping(patience=10, monitor= 'loss') # stop training when accuracy doesn't improve
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x, y, epochs=20, verbose=1, callbacks=[callback])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20

KeyboardInterrupt: 