In [None]:
# I. Import All Packages for Review Generator

import pandas as pd
import numpy as np
import re
import string

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import spacy
nlp = spacy.load("en_core_web_sm")

In [None]:
# II. Data Upload
#     A. If on Colab, navigate to the left sidebar and select these icons: 'Files' > 'Upload' and import `reviews.csv`
#     B. If on standalone, ensure `reviews.csv` is in your environment before running the following block:
#     C. Possible Error:
#            You may receive the following error: `ParserError: Error tokenizing data. C error: EOF inside string starting at row 25310`.
#            Simply wait and run this code block again. The row number should continually increase until eventually it successfully
#            tokenizes the entire file.
#            NOTE(review): presumably this happens while `reviews.csv` is still uploading and only partly on disk -- confirm.
# Read the reviews file from the current working directory into a DataFrame.
data = pd.read_csv("reviews.csv")
# Preview the first rows to confirm the `review` and `sentiment` columns loaded.
data.head()

# Expected Output:
#                                               review	sentiment
# 0	One of the other reviewers has mentioned that ...	  positive
# 1	A wonderful little production. <br /><br />The...	  positive
# 2	I thought this was a wonderful way to spend ti...	  positive
# 3	Basically there's a family where a little boy ...	  negative
# 4	Petter Mattei's "Love in the Time of Money" is...	  positive

In [None]:
# III. Data Attributes
#     A. Averages
def averages(text):
    """
    Compute two summary statistics for a collection of text entries: the mean sentence length and the mean number
    of sentences per entry. Sentences come from spaCy's sentence segmentation (`doc.sents`) using the module-level
    `nlp` pipeline. A sentence's length is `len()` of its spaCy span, i.e. presumably its token count
    (punctuation included) rather than a strict word count.
    @param text: pandas series; array of strings (must support len())
    @return: tuple (average words per sentence, average sentences per entry); a value is 0 when its
             denominator is 0
    """
    sentence_total = 0
    token_total = 0

    # stream the entries through spaCy with the components not needed here switched off
    parsed_entries = nlp.pipe(text, disable=["ner", "tagger"])
    for parsed in parsed_entries:
        for span in parsed.sents:
            sentence_total += 1
            token_total += len(span)

    entry_total = len(text)

    # guard both divisions so empty input yields 0 instead of raising ZeroDivisionError
    if sentence_total:
        words_per_sentence = token_total / sentence_total
    else:
        words_per_sentence = 0

    if entry_total:
        sentences_per_entry = sentence_total / entry_total
    else:
        sentences_per_entry = 0

    return words_per_sentence, sentences_per_entry

# Compute the averages on the first tenth of the reviews only (presumably to keep spaCy parsing time manageable).
avg_words, avg_sent = averages(data['review'][:len(data['review'])//10])

In [None]:
#     B. Average Computation
# Display the tuple returned by averages(): (average words per sentence, average sentences per entry).
avg_words, avg_sent
# Expected Output:
# (20.16343979755327, 13.3566)

In [None]:
# IV. Data Truncation 
#     C. Truncate text
def truncate_text(text, min_words, min_sent):
    """
    This function shortens a text to its leading sentences. Sentences are kept, in their original order, until the
    kept text holds at least min_sent sentences and at least min_words words. The sentence that crosses the word
    threshold is kept whole, so the result may contain more than min_words words.

    If the text runs out of sentences before both minimums are met, the whole text (all of its sentences) is kept.

    @param text: string; the text to truncate
    @param min_words: int; minimum number of whitespace-separated words to keep
    @param min_sent: int; minimum number of sentences to keep
    @return: string; the kept sentences joined by single spaces
    """
    doc = nlp(text) # create a document that stores different components of text

    kept_sentences = []
    word_count = 0 # records number of words kept so far

    # for each sentence in text
    for sentence in doc.sents:
        # Stop at the first sentence boundary where both minimums are satisfied.
        # The check happens BEFORE appending, and no sentence is ever skipped. The result is therefore always a
        # contiguous prefix of the text; a sentence that would overshoot min_words is not dropped in favour of a
        # later, shorter sentence.
        if word_count >= min_words and len(kept_sentences) >= min_sent:
            break
        kept_sentences.append(sentence.text) # add sentence
        word_count += len(sentence.text.split()) # keep track of words added

    # if there are less than min_words in text, and no remaining sentences to add, text is kept as is
    return ' '.join(kept_sentences)

In [None]:
#     D. Truncation
# Work on a copy of the first tenth of the dataset so the original `data` frame is left untouched.
new_data = data.iloc[:len(data)//10].copy()
# Add a `new_data` column holding each review as truncated by truncate_text (min_words=50, min_sent=2).
new_data["new_data"] = new_data["review"].apply(lambda x: truncate_text(x, min_words = 50, min_sent = 2))
# Show the first ten rows to compare the original and truncated columns.
new_data[:10]
# Expected Output:
#                                             review	sentiment	new_data
# 0	One of the other reviewers has mentioned that ...	positive	One of the other reviewers has mentioned that ...
# 1	A wonderful little production. <br /><br />The...	positive	A wonderful little production. <br /><br />The...
# 2	I thought this was a wonderful way to spend ti...	positive	I thought this was a wonderful way to spend ti...
# 3	Basically there's a family where a little boy ...	negative	Basically there's a family where a little boy ...
# 4	Petter Mattei's "Love in the Time of Money" is...	positive	Petter Mattei's "Love in the Time of Money" is...
# 5	Probably my all-time favorite movie, a story o...	positive	Probably my all-time favorite movie, a story o...
# 6	I sure would like to see a resurrection of a u...	positive	I sure would like to see a resurrection of a u...
# 7	This show was an amazing, fresh & innovative i...	negative	This show was an amazing, fresh & innovative i...
# 8	Encouraged by the positive comments about this...	negative	Encouraged by the positive comments about this...
# 9	If you like original gut wrenching laughter yo...	positive	If you like original gut wrenching laughter yo...

In [None]:
# V. Preprocessing Data
def clean(data, column):
    """
    This function takes in a data frame and column name of the text data. It drops missing entries, converts all
    letters to lowercase, replaces HTML tags with a space, removes punctuation, collapses every run of whitespace
    (spaces, tabs, newlines) to a single space, trims leading/trailing whitespace, and removes duplicates.
    @param data: data frame with string column
    @param column: column name of string column
    @return: pandas series of cleaned, de-duplicated strings; surviving rows keep their original index labels
    """
    def _normalize(entry):
        """Strip tags and punctuation from one lowercase string and normalise its whitespace."""
        entry = re.sub('<.*?>', ' ', entry) # Replace HTML tags with a space
        entry = re.sub(r'[^\w\s]', '', entry) # Remove punctuation (underscores count as word characters and stay)
        # Collapse ANY whitespace run and trim the ends, so entries that differ only in spacing
        # (e.g. a leading "<br />") become identical and are caught by drop_duplicates below.
        return re.sub(r'\s+', ' ', entry).strip()

    clean_data = (data[column] # Reduce the data to a specific column
                .dropna() # Missing values cannot be cleaned; re.sub would raise TypeError on them
                .str.lower() # Convert to lowercase
                .apply(_normalize) # Remove tags/punctuation and normalise whitespace
                .drop_duplicates()) # Remove duplicates
    return clean_data

In [None]:
#     A. Preprocessing Computation
# Clean the truncated reviews; `text` is the Series returned by clean() (duplicates removed).
text = clean(new_data, 'new_data')
# Print the Series (pandas abbreviates long output and reports its length).
print(text)
# Expected Output:
# 0       one of the other reviewers has mentioned that ...
# 1       a wonderful little production the filming tech...
# 2       i thought this was a wonderful way to spend ti...
# 3       basically theres a family where a little boy j...
# 4       petter matteis love in the time of money is a ...
#                               ...                        
# 4995    an interesting slasher film with multiple susp...
# 4996    i watched this series when it first came out i...
# 4997    once again jet li brings his charismatic prese...
# 4998    i rented this movie after hearing chris gore s...
# 4999    this was a big disappointment for me i think t...
# Name: new_data, Length: 4996, dtype: object

In [None]:
#     B. Preprocessed Metadata
# Show the row count and type of `new_data`.
# NOTE(review): this inspects the `new_data` frame, not the cleaned `text` Series produced by clean();
# if the preprocessed data was meant, this should probably be `len(text), type(text)` -- confirm.
len(new_data), type(new_data)
# Expected Output:
# (5000, pandas.core.frame.DataFrame)

In [None]:
# VI. Tokenize Data
# represent each word as a numerical value
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(text) # build the word -> integer index from the cleaned reviews
# vocabulary size: number of indexed words plus one
# (presumably the extra slot accounts for index 0, which the zero padding used later occupies -- confirm)
total_words = len(tokenizer.word_index) + 1
print(total_words)
# Expected Output:
# 21125

In [None]:
# VII. Create input_sequences
input_sequences = []
for line in text: # one cleaned review at a time
    # texts_to_sequences works on a batch, so wrap the review in a list and unwrap the single result
    token_list = tokenizer.texts_to_sequences([line])[0]
    # every prefix holding at least two tokens becomes one training sample (context tokens + next word)
    input_sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

# left-pad every prefix with zeros up to the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)

# split each padded row into predictor and target
x = input_sequences[:, :-1] # everything except the final token
y = input_sequences[:, -1] # the final token of each row

In [None]:
#     A. Sequence Metadata
# Report the padded sequence length and the shapes/types of the predictor (x) and target (y) arrays.
print("max_sequence_len:", max_sequence_len) 
print("Shape of x: ", x.shape, " Type of x: ", type(x))
print("Shape of y: ", y.shape, " Type of y: ", type(y))
# Expected Output:
# max_sequence_len: 198
# Shape of x:  (248366, 197)  Type of x:  <class 'numpy.ndarray'>
# Shape of y:  (248366,)  Type of y:  <class 'numpy.ndarray'>

In [None]:
# VIII. One-hot Encoding
# Convert the integer target tokens to a binary class matrix with one column per vocabulary index.
# The labels are read from the last column of `input_sequences` instead of from `y`, so this cell is idempotent:
# re-running it no longer feeds an already one-hot-encoded `y` back into to_categorical.
# to_categorical already returns a NumPy array, so no extra np.array() copy of this very large matrix is made.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [None]:
#     A. One-hot Metadata
# Display shapes and types after encoding; y should now have one column per vocabulary index (total_words).
x.shape, type(x), y.shape, type(y)
# Expected Output:
# ((248366, 197), numpy.ndarray, (248366, 21125), numpy.ndarray)

Even though `x` and `y` are already NumPy arrays, the next cell explicitly converts them once more; without this step we ran into errors when fitting the model.

In [None]:
# IX. Prepare/Resize Data for Model
# Hand the model plain, C-contiguous NumPy arrays.
# np.ascontiguousarray gives the same result np.array() would here, but it only copies when it has to:
#   - `x` is a column-sliced view of `input_sequences`, so it is copied into its own contiguous buffer;
#   - `y` is reused as-is when it is already contiguous, which avoids duplicating the one-hot matrix
#     (samples x vocabulary size, i.e. many gigabytes at the shapes shown in this notebook).
x = np.ascontiguousarray(x)
y = np.ascontiguousarray(y)

In [None]:
#     A. Resized Metadata
# Confirm that the conversion left the shapes and types of x and y unchanged.
x.shape, type(x), y.shape, type(y)
# Expected Output:
# ((248366, 197), numpy.ndarray, (248366, 21125), numpy.ndarray)

In [None]:
# X. Model 1: Train with LSTM (100), Softmax Dense Layer
#     We picked LSTM to train review generation because the content of each
#     generated review should be cohesive, and the semantics and meaning of
#     each word will depend on the ones before it in the current sentence
#     and beyond.
#     We picked a Softmax Dense Layer because our task is next-word
#     selection, so the output must be a probability distribution over
#     every possible next word. This is achieved with a Softmax function.
#     Other activations like Sigmoid or ReLU do not apply well to our output
#     layer: Sigmoid is fundamentally meant for binary (per-class yes/no)
#     decisions and ReLU is unbounded, so neither yields a single
#     distribution to pick the next word from during iterative generation.
# Assemble the network layer by layer: embedding -> LSTM -> softmax over the vocabulary.
model = tf.keras.models.Sequential()
# map each token index to a 100-dimensional vector; inputs are the padded sequences minus the target token
model.add(layers.Embedding(input_dim=total_words, output_dim=100, input_length=max_sequence_len - 1))
# summarise the whole sequence in a single 100-unit LSTM state
model.add(layers.LSTM(units=100))
# one probability per vocabulary index for the next word
model.add(layers.Dense(units=total_words, activation='softmax'))
model.summary()
# Expected Output:
# Model: "sequential_4"
# _________________________________________________________________
#  Layer (type)                Output Shape              Param #   
# =================================================================
#  embedding_4 (Embedding)     (None, 197, 100)          2112500   
                                                                 
#  lstm_4 (LSTM)               (None, 100)               80400     
                                                                 
#  dense_4 (Dense)             (None, 21125)             2133625   
                                                                 
# =================================================================
# Total params: 4326525 (16.50 MB)
# Trainable params: 4326525 (16.50 MB)
# Non-trainable params: 0 (0.00 Byte)
# _________________________________________________________________

In [None]:
# XI. Model 1 Fitting
# Stop training early once the monitored value has gone `patience` (10) epochs without improving.
callback = EarlyStopping(patience=10, monitor= 'loss') # monitors the training loss ('loss'), not accuracy
# categorical_crossentropy matches the one-hot encoded targets in y; accuracy is only reported as a metric
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# Train for at most 20 epochs on the full (x, y) set; no validation data is passed here.
model.fit(x, y, epochs=20, verbose=1, callbacks=[callback])
# Expected Output: [To Be Completed]
# Epoch 1/20
# 7762/7762 [==============================] - 780s 100ms/step - loss: 6.7600 - accuracy: 0.0910
# Epoch 2/20
# 7762/7762 [==============================] - 784s 101ms/step - loss: 6.0031 - accuracy: 0.1341
# Epoch 3/20
# 7762/7762 [==============================] - 785s 101ms/step - loss: 5.6365 - accuracy: 0.1538
# Epoch 4/20
# 7762/7762 [==============================] - 800s 103ms/step - loss: 5.3247 - accuracy: 0.1701
# Epoch 5/20
# 7762/7762 [==============================] - 785s 101ms/step - loss: 5.0337 - accuracy: 0.1877
# Epoch 6/20
# 7762/7762 [==============================] - 797s 103ms/step - loss: 4.7652 - accuracy: 0.2057
# Epoch 7/20
# 6407/7762 [=======================>......] - ETA: 2:18 - loss: 4.4733 - accuracy: 0.2295