# Keras LSTM Text prediction

## Imports

In [1]:
import numpy
import json
import re
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from gensim.parsing.preprocessing import preprocess_string, strip_numeric, strip_punctuation, remove_stopwords
from gensim.corpora import Dictionary as d

In [2]:
#widen the notebook container to the full browser width
#display/HTML are imported from the public IPython.display module; IPython.core.display is a deprecated location for them
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Functions

In [3]:
#loading dataframe function
def dataframe_from_json(filename):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line of the file; an empty frame if the file
        contains no records.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
    with open(filename, encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at the end of the file) are skipped;
        # json.loads would raise on an empty string.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

## Loading the file

In [4]:
#location of the Luxury_Beauty.json review dump; point this at your own copy of the file
luxury_reviews_path = '/Users/niklastodenhoefer/Library/Mobile Documents/com~apple~CloudDocs/Data Science CBS/3. Semester/Data Science for Business Applications/Project/Amazon Reviews/Luxury_Beauty.json'
#parse the file (one JSON record per line) into a dataframe
in_luxury = dataframe_from_json(luxury_reviews_path)

In [5]:
#summary of the loaded frame: column names, non-null counts and dtypes
in_luxury.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 574628 entries, 0 to 574627
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         574628 non-null  float64
 1   vote            103689 non-null  object 
 2   verified        574628 non-null  bool   
 3   reviewTime      574628 non-null  object 
 4   reviewerID      574628 non-null  object 
 5   asin            574628 non-null  object 
 6   reviewerName    574597 non-null  object 
 7   reviewText      574228 non-null  object 
 8   summary         574445 non-null  object 
 9   unixReviewTime  574628 non-null  int64  
 10  style           251013 non-null  object 
 11  image           7418 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 48.8+ MB


In [6]:
#preview the first rows of the raw review data
in_luxury.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,image
0,2.0,3.0,True,"06 15, 2010",A1Q6MUU0B2ZDQG,B00004U9V2,D. Poston,"I bought two of these 8.5 fl oz hand cream, an...",dispensers don't work,1276560000,,
1,5.0,14.0,True,"01 7, 2010",A3HO2SQDCZIE9S,B00004U9V2,chandra,"Believe me, over the years I have tried many, ...",Best hand cream ever.,1262822400,,
2,5.0,,True,"04 18, 2018",A2EM03F99X3RJZ,B00004U9V2,Maureen G,Great hand lotion,Five Stars,1524009600,{'Size:': ' 3.5 oz.'},
3,5.0,,True,"04 18, 2018",A3Z74TDRGD0HU,B00004U9V2,Terry K,This is the best for the severely dry skin on ...,Five Stars,1524009600,{'Size:': ' 3.5 oz.'},
4,5.0,,True,"04 17, 2018",A2UXFNW9RTL4VM,B00004U9V2,Patricia Wood,The best non- oily hand cream ever. It heals o...,I always have a backup ready.,1523923200,{'Size:': ' 3.5 oz.'},


## Text preprocessing

In [7]:
#replace non alphabetical (and non-space) chars with empty chars
#missing reviews (NaN) are turned into empty strings first; str(NaN) would otherwise
#put the literal text "nan" into the cleaned column and, later, into the training corpus
in_luxury['preprocessed_text'] = (
    in_luxury['reviewText']
    .fillna('')
    .astype(str)
    .str.replace(r'[^A-Za-z ]+', '', regex=True)
)

In [8]:
#inspect the first rows again to check the new preprocessed_text column against reviewText
in_luxury.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,image,preprocessed_text
0,2.0,3.0,True,"06 15, 2010",A1Q6MUU0B2ZDQG,B00004U9V2,D. Poston,"I bought two of these 8.5 fl oz hand cream, an...",dispensers don't work,1276560000,,,I bought two of these fl oz hand cream and ne...
1,5.0,14.0,True,"01 7, 2010",A3HO2SQDCZIE9S,B00004U9V2,chandra,"Believe me, over the years I have tried many, ...",Best hand cream ever.,1262822400,,,Believe me over the years I have tried many ma...
2,5.0,,True,"04 18, 2018",A2EM03F99X3RJZ,B00004U9V2,Maureen G,Great hand lotion,Five Stars,1524009600,{'Size:': ' 3.5 oz.'},,Great hand lotion
3,5.0,,True,"04 18, 2018",A3Z74TDRGD0HU,B00004U9V2,Terry K,This is the best for the severely dry skin on ...,Five Stars,1524009600,{'Size:': ' 3.5 oz.'},,This is the best for the severely dry skin on ...
4,5.0,,True,"04 17, 2018",A2UXFNW9RTL4VM,B00004U9V2,Patricia Wood,The best non- oily hand cream ever. It heals o...,I always have a backup ready.,1523923200,{'Size:': ' 3.5 oz.'},,The best non oily hand cream ever It heals ove...


In [9]:
#initialise the corpus string that will hold the concatenated review text
text = ''

In [17]:
#build one large string from the first n_reviews preprocessed reviews, from which the vocab is built
#`text` is assigned rather than appended to, so re-running this cell does not duplicate the corpus
#the column is selected by name instead of by position so added/reordered columns cannot break it
#NOTE: reviews are joined without a separator, so the last word of one review runs into
#the first word of the next
n_reviews = 5000
text = ''.join(in_luxury['preprocessed_text'].iloc[:n_reviews].astype(str))

In [18]:
#len(text) is the number of characters in the corpus, not the number of words
print('amount of characters we have in the entire text is: {}'.format(len(text)))

amount of words we have in the entire text is: 772787


In [19]:
#lower-case the corpus so upper- and lower-case letters share one vocabulary entry
text = text.lower()

In [20]:
#peek at the first 10 characters of the corpus
print(text[0:10])

i bought t


In [21]:
#vocabulary: every distinct character of the corpus, in sorted order
chars = sorted(set(text))
#lookup tables in both directions: character -> index and index -> character
char_to_int = {char: index for index, char in enumerate(chars)}
int_to_char = dict(enumerate(chars))

In [22]:
#corpus length in characters and number of distinct characters
n_chars, n_vocab = len(text), len(chars)
print("Total Characters: {}".format(n_chars))
print("Total Vocab: {}".format(n_vocab))

Total Characters: 772787
Total Vocab: 27


In [24]:
#list the vocabulary to spot preprocessing errors: anything other than
#lower-case letters and the space character would mean the cleaning regex missed something
print(chars)

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


The vocabulary looks fine: the 26 lowercase letters plus the space character. We want to keep the space, so the first element of this list stays in the vocabulary.

## This part takes a while

In [25]:
# prepare the dataset of input to output pairs encoded as integers:
# each window of seq_length consecutive characters is one input pattern and
# the character immediately after the window is its target
seq_length = 100
# encode every character once up front instead of repeating the dictionary lookup
# for each of the (up to seq_length) windows the character belongs to
encoded_text = [char_to_int[char] for char in text]
# dataX stays a list of integer lists and dataY a list of integers
dataX = [encoded_text[i:i + seq_length] for i in range(n_chars - seq_length)]
dataY = encoded_text[seq_length:n_chars]
n_patterns = len(dataX)
print("Total Patterns: {}".format(n_patterns))

Total Patterns: 772687


In [26]:
#shape the integer windows as [samples, time steps, features] and scale the
#character indices by the vocabulary size
X = numpy.reshape(dataX, (n_patterns, seq_length, 1)) / float(n_vocab)
#targets: one one-hot row per pattern
y = np_utils.to_categorical(dataY)

## LSTM model

In [27]:
#one LSTM layer over a window of scaled character indices, dropout, then a softmax
#layer with one output unit per column of the one-hot targets
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [28]:
# checkpoint callback: writes a file whenever the monitored training loss reaches a new minimum;
# the epoch number and the loss value are embedded in the file name
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
callbacks_list = [checkpoint]

In [None]:
#train for 20 epochs in batches of 128 patterns, with the checkpoint callback attached
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20
 105/6037 [..............................] - ETA: 23:58 - loss: 2.9042