In [35]:
import pandas as pd
import numpy as np
import re
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [48]:
data = pd.read_csv("reviews.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [47]:
# Preprocessing Data
def clean(text):
    lower_text = text.lower() # convert all text to lowercase
    trans_table = str.maketrans('', '', string.punctuation) # replace punctuation with empty string
    no_punctuation = lower_text.translate(trans_table) # remove punctuation
    return no_punctuation

def clean_and_combine(df, col_name):
    """
    This function takes in a data frame and a col_name of text data. It applies the clean() function to the dataframe
    which makes all characters lowercase and removes punctuation. It then combines all the text across the rows into a
    single object.
    
    @param df: data frame with string column
    @param col_name: column name of string column
    @rvalue: text object containing all combines strings of df[col_name]
    """
    data = df[col_name] # subset text column
    
    data = data.apply(lambda x: re.sub('<.*?>', ' ', x)) # replace anything between <> with a space

    data = data.drop_duplicates() # drop duplicates
    
    cleaned = data.apply(clean) # drop punctuation and convert text to lowercase
    
    text = ' '.join(cleaned) # convert to a single text object
    
    return text

In [49]:
text = clean_and_combine(data, 'review')
print(text[:100000])

one of the other reviewers has mentioned that after watching just 1 oz episode youll be hooked they are right as this is exactly what happened with me  the first thing that struck me about oz was its brutality and unflinching scenes of violence which set in right from the word go trust me this is not a show for the faint hearted or timid this show pulls no punches with regards to drugs sex or violence its is hardcore in the classic use of the word  it is called oz as that is the nickname given to the oswald maximum security state penitentary it focuses mainly on emerald city an experimental section of the prison where all the cells have glass fronts and face inwards so privacy is not high on the agenda em city is home to manyaryans muslims gangstas latinos christians italians irish and moreso scuffles death stares dodgy dealings and shady agreements are never far away  i would say the main appeal of the show is due to the fact that it goes where other shows wouldnt dare forget pretty p

In [45]:
print(text.find("Quite what the producers of this appalling adaptation were trying to do is impossibl"))

-1


In [39]:
# Tokenize Data
# represent each word as a numerical value
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1 # length of word index