In [77]:
import pandas as pd
import numpy as np
import re
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [88]:
# Load the review dataset from the working directory and preview its first rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a row index that was
# saved into the CSV -- if it is not needed, reading with index_col=0 would avoid it; confirm
# against the file before changing.
data = pd.read_csv("reviews.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [89]:
# Preprocessing Data
def clean(text):
    """
    Normalize one raw review string.

    Every HTML-style tag (the shortest run of characters enclosed in <>) becomes a single space,
    the text is lowercased, and all characters listed in string.punctuation are deleted.
    @param text: string
    @rvalue: string with tags replaced, letters lowercased, and punctuation removed
    """
    # Translation table whose only job is deleting punctuation characters
    punctuation_deleter = str.maketrans('', '', string.punctuation)
    # Tags turn into a space (not an empty string) so the words on either side do not fuse together
    tag_free = re.sub('<.*?>', ' ', text)
    return tag_free.lower().translate(punctuation_deleter)

def clean_and_combine(df, col_name):
    """
    Build the cleaned text corpus from one column of a data frame.

    Exact duplicate entries of the column are dropped first, then clean() is applied to every
    remaining value (tags replaced by spaces, lowercased, punctuation removed). The surviving
    entries keep their original index labels, so the index of the result may have gaps.

    @param df: data frame with string column
    @param col_name: column name of string column
    @rvalue cleaned: pandas series of strings
    """
    # De-duplicate on the raw text before cleaning, then normalize each remaining entry
    unique_texts = df[col_name].drop_duplicates()
    return unique_texts.apply(clean)

In [90]:
# Build the cleaned corpus from the 'review' column (duplicates dropped, each review passed
# through clean()) and print it for a quick look at the result.
text = clean_and_combine(data, 'review')
print(text)

0        one of the other reviewers has mentioned that ...
1        a wonderful little production   the filming te...
2        i thought this was a wonderful way to spend ti...
3        basically theres a family where a little boy j...
4        petter matteis love in the time of money is a ...
                               ...                        
49995    i thought this movie did a down right good job...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    i am a catholic taught in parochial elementary...
49998    im going to have to disagree with the previous...
49999    no one expects the star trek movies to be high...
Name: review, Length: 49582, dtype: object


In [92]:
# Tokenize Data
# Fit a Keras Tokenizer on the cleaned reviews so that each distinct word is mapped to an
# integer id (stored in tokenizer.word_index).
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(text) # build the word -> id mapping from the cleaned Series
# Number of distinct words plus one; the +1 presumably accounts for id 0, which Keras reserves
# (e.g. for padding) and never assigns to a word -- confirm against the Tokenizer docs.
total_words = len(tokenizer.word_index) + 1
total_words

168860

In [None]:
# Build padded prefix sequences from the cleaned reviews.
# Each review is converted to its list of token ids and expanded into every prefix of
# length >= 2 ([w1, w2], [w1, w2, w3], ...) -- presumably so that the last id of each prefix
# can serve as the prediction target; confirm against the model cells.
# NOTE(review): the total number of stored token ids grows quadratically with review length,
# and every prefix is then padded to the longest one, so memory use may be very large on the
# full corpus -- consider truncating reviews or sampling before running this cell.
input_sequences = []
# Iterate over the cleaned `text` Series (the same data the tokenizer was fitted on);
# `data` has no 'reviews_clean' column, so indexing it would raise a KeyError.
for line in text:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# Pad at the front ('pre') so all sequences share the length of the longest one and the
# final token of every sequence stays in the last position.
max_sequence_len = max(len(x) for x in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')