# Load libraries

In [5]:
import pickle, logging
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,SimpleRNN, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load data

In [6]:
reviews=pd.read_csv('../data/amazon_reviews_us_Apparel_v1_00.tsv.gz.tsv',sep='\t')

In [7]:
reviews['label']=(reviews['verified_purchase']=='y').astype(int)

In [19]:
selected_reviews=selected_reviews.dropna()

In [33]:
reviews['label'].mean()

0.8995058355158776

In [34]:
samples=reviews[:5000]

In [35]:
samples['label'].mean()

0.6492

# Samples

In [36]:
undersample=samples[samples['label']==1].sample(1754)

In [37]:
undersample=pd.concat([undersample,samples[samples['label']==0]])
undersample = undersample.sample(frac=1,replace=False).reset_index(drop=True)

In [38]:
undersample['label'].mean()

0.5

In [39]:
selected_features=[ 'review_body','label']

In [16]:
selected_reviews=undersample[selected_features]

In [17]:
selected_reviews['review_body']=selected_reviews['review_body'].str.replace(r'([^a-z\s]+)','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
selected_reviews['review_body'].head()

0    comfortable and wears well for water aerobic c...
1    these tiny little workout shorts are super cut...
2                                    i loved the style
3    bought this dress for my wife to wear to a par...
4    this dress is so cute and very flattering  i p...
Name: review_body, dtype: object

In [40]:
selected_reviews['label'].mean()

0.4998574280011406

## Save/Load the clean dataframe

In [23]:
saved_dataframe='df_verified_purchase.p'
# pickle.dump( selected_reviews, open( saved_dataframe, "wb" ) )
selected_reviews= pickle.load( open( saved_dataframe, "rb" ) )

# Split data

In [24]:
def split_pad(selected_reviews):
    X_train, X_test, Y_train, Y_test = train_test_split(selected_reviews['review_body'].values,\
                                                    selected_reviews['label'].values,\
                                                    test_size=0.30,\
                                                    shuffle=True)
    for label in [Y_train, Y_test]:
          logging.debug("label mean: {0}".format(label.mean()))
    tokenizer = Tokenizer(num_words= 50000)
    tokenizer.fit_on_texts(X_train)
    X_train_tok = tokenizer.texts_to_sequences(X_train)
    X_test_tok = tokenizer.texts_to_sequences(X_test)
    max_review_length = 600
    X_train_pad = sequence.pad_sequences(X_train_tok, maxlen=max_review_length)
    X_test_pad = sequence.pad_sequences(X_test_tok, maxlen=max_review_length)
    return X_train_pad, X_test_pad, Y_train, Y_test

In [31]:
 X_train_pad, X_test_pad, Y_train, Y_test=split_pad(selected_reviews)

# Compile models

In [26]:
models=[]

max_review_length = 600
embedding_vector_length = 32
top_words = 50000
models.append(Sequential())
model=models[-1]
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
# The output of SimpleRNN will be a 2D tensor of shape (batch_size, 128)
model.add(SimpleRNN(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[tf.keras.metrics.Recall(),tf.keras.metrics.Precision()])

In [31]:
embedding_vector_length = 40
models.append(Sequential())
model=models[-1]
model.add(Embedding(top_words+1, embedding_vector_length, input_length=max_review_length))
model.add(LSTM(32,return_sequences=True))
model.add(LSTM(16))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
models.append(Sequential())
model=models[-1]
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))

# The output of GRU will be a 3D tensor of shape (batch_size, timesteps, 256)
model.add(GRU(256, return_sequences=True))
model.add(SimpleRNN(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [32]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
for model in models:
    model.fit(X_train_pad, Y_train, epochs=100, callbacks=[callback], batch_size=64,validation_data=(X_test_pad,Y_test))
    # Final evaluation of the model on test data
    scores = model.evaluate(X_test_pad, Y_test, verbose=0)
    print("loss: {0:2.2f}, accuracy {1:2.2f}".format(scores[0],scores[1]))

Train on 2454 samples, validate on 1053 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
loss: 0.83, accuracy 0.81
Train on 2454 samples, validate on 1053 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100






KeyboardInterrupt: 