In [32]:
import pandas as pd
import numpy as np
import re
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [22]:
data = pd.read_csv("reviews.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [28]:
# Preprocessing Data
def clean_and_combine(df, col_name):
    """
    This function takes in a data frame and a col_name of text data. It applies the clean() function to the dataframe
    which makes all characters lowercase and removes punctuation. It then combines all the text across the rows into a
    single object.
    
    @param df: data frame with string column
    @param col_name: column name of string column
    @rvalue: text object containing all combines strings of df[col_name]
    """
    
    def clean(text):
        lower_text = text.lower()
        trans_table = str.maketrans('', '', string.punctuation)
        no_punctuation = lower_text.translate(trans_table)
        return no_punctuation
    
    cleaned = df[col_name].apply(clean)
    text = ' '.join(cleaned)
    
    return text

In [30]:
text = clean_and_combine(data, 'review')
print(text[:100])

one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they 


In [33]:
# Tokenize Data
# represent each word as a numerical value
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1 # length of word index


{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'in': 7,
 'it': 8,
 'i': 9,
 'this': 10,
 'that': 11,
 'br': 12,
 'was': 13,
 'as': 14,
 'with': 15,
 'for': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'be': 26,
 'he': 27,
 'one': 28,
 'its': 29,
 'at': 30,
 'all': 31,
 'by': 32,
 'an': 33,
 'they': 34,
 'from': 35,
 'who': 36,
 'so': 37,
 'like': 38,
 'or': 39,
 'just': 40,
 'her': 41,
 'about': 42,
 'if': 43,
 'has': 44,
 'out': 45,
 'some': 46,
 'there': 47,
 'what': 48,
 'good': 49,
 'when': 50,
 'more': 51,
 'very': 52,
 'my': 53,
 'even': 54,
 'no': 55,
 'up': 56,
 'would': 57,
 'she': 58,
 'time': 59,
 'only': 60,
 'which': 61,
 'really': 62,
 'their': 63,
 'see': 64,
 'were': 65,
 'story': 66,
 'had': 67,
 'can': 68,
 'me': 69,
 'than': 70,
 'we': 71,
 'much': 72,
 'well': 73,
 'been': 74,
 'get': 75,
 'will': 76,
 'other': 77,
 'do': 78,
 'great': 79,
 'also': 80,
 'into': 81,
 'bad': 82,
 'be