In [53]:
import numpy as np
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from yhelper import neural_modeling, overall_cleaner
import pickle
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import pandas as pd
from keras.preprocessing import sequence, text
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, GlobalMaxPool1D, Dropout, Bidirectional, Conv1D, Activation

# Binned (Good vs Bad)

In [55]:
df = pickle.load(open('post_eda/eda_az.p', 'rb'))

In [56]:
text_df = overall_cleaner(df, ['text', 'review_rating'])

In [57]:
text_df.review_rating.replace(1.0, False, inplace=True)
text_df.review_rating.replace(2.0, False, inplace=True)
text_df.review_rating.replace(3.0, False, inplace=True)
text_df.review_rating.replace(4.0, True, inplace=True)
text_df.review_rating.replace(5.0, True, inplace=True)

In [58]:
text_df

Unnamed: 0,text,review_rating
5201838,i find this to be one of the better walmarts i...,True
678826,best place ever very authentic staff super nic...,True
2683530,the bean and cheese burrito with green sauce i...,True
5802832,i guess to ups oclock means pm i tried to ma...,False
2197113,i am the vice president of product marketing f...,True
...,...,...
3750456,always looking for a great cup with knowledgea...,False
2781437,stopped in after mtb south mountain my boyfrie...,True
302891,the food here is some of the best ive had in a...,True
3312023,loved the arizona roll but the spicy tuna roll...,False


In [59]:
text_srs = text_df.text
total_vocab = set(word for sentence in text_srs for word in sentence.split(' '))

In [60]:
tokenizer = text.Tokenizer(num_words=len(total_vocab))
tokenizer.fit_on_texts(text_srs)

In [61]:
tokenized_list = tokenizer.texts_to_sequences(text_srs)

In [62]:
padded_seq = sequence.pad_sequences(tokenized_list, maxlen=150)

In [63]:
X_train, X_test, y_train, y_test = train_test_split(padded_seq, pd.get_dummies(text_df.review_rating), test_size=0.2)

In [64]:
padded_seq

array([[    0,     0,     0, ...,  1366,     4,    28],
       [    0,     0,     0, ...,  8565,    16,    30],
       [    0,     0,     0, ...,   344,     2,   510],
       ...,
       [    0,     0,     0, ...,     5,    89,    30],
       [    0,     0,     0, ...,   133,    41,   495],
       [    0,     0,     0, ...,    35, 35867,   372]], dtype=int32)

In [65]:
pd.get_dummies(text_df.review_rating).shape

(94946, 2)

In [66]:
model = Sequential()
embedding_size=150

model.add(Embedding(len(total_vocab), embedding_size))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(2, activation='softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 150)         18442500  
_________________________________________________________________
bidirectional_5 (Bidirection (None, None, 128)         110080    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 258       
Total params: 18,552,838
Trainable params: 18,552,838
Non-trainable params: 0
_________________________________________________________________


In [67]:
model.fit(X_train, y_train, epochs=5, batch_size = 1500, validation_split=0.1)

Train on 68360 samples, validate on 7596 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa094914710>

In [68]:
model.evaluate(X_test, y_test)



[0.26795434442208027, 0.9058978409500986]

In [69]:
model.metrics_names

['loss', 'acc']

# Glove

In [39]:
def create_corpus_new(df):
    corpus=[]
    for review in tqdm(df['text']):
        words=[word.lower() for word in word_tokenize(review)]
        corpus.append(words)
    return corpus   

In [40]:
corpus = create_corpus_new(df)

100%|██████████| 94946/94946 [00:32<00:00, 2903.78it/s]


In [41]:
embedding_dict={}
with open('glove.6B.300d.txt','r') as f:
    for line in f:
        values=line.split()
        word = values[0]
        vectors=np.asarray(values[1:],'float64')
        embedding_dict[word]=vectors
f.close()

In [42]:
MAX_LEN=150
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

glove_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [43]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

Number of unique words: 122780


In [44]:
num_words=len(word_index)+1
embedding_matrix=np.zeros((num_words,300))

for word,i in tqdm(word_index.items()):
    if i < num_words:
        emb_vec=embedding_dict.get(word)
        if emb_vec is not None:
            embedding_matrix[i]=emb_vec           

100%|██████████| 122780/122780 [00:00<00:00, 671788.11it/s]


In [45]:
glove_pad.shape

(94946, 150)

In [46]:
model = Sequential()
embedding_size=150

model.add(Embedding(150000, embedding_size))
model.add(Dropout(0.2))
model.add(Conv1D(128,
                 kernel_size=5,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy', 'mse'])

model.summary()

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 150)         22500000  
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 150)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         96128     
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 256)         263168    
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 256)               0         
___________________________________________________________

In [48]:
X_train,X_test,y_train,y_test=train_test_split(glove_pad, pd.get_dummies(df.review_rating),test_size=0.2)

In [50]:
model.fit(X_train, y_train, epochs=5, batch_size=1500, validation_split=0.1)

Train on 68360 samples, validate on 7596 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fa0949358d0>

In [51]:
model.evaluate(X_test, y_test)



[0.8217614086820553, 0.6801474460179457, 0.08627992903969676]

In [52]:
model.metrics_names

['loss', 'acc', 'mean_squared_error']