Importing libraries

In [4]:
import spacy
import pandas as pd
import string
import re
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
nlp = spacy.load("en_core_web_sm")

Due to the size of the data, opening csv file as an iterator and extracting 10,000 rows.

In [2]:
raw_data = pd.read_csv('data.csv', iterator=True)

In [3]:
text_gen_data = raw_data.get_chunk(100000)

Using the category funny to train model on funny reviews.

In [63]:
text_gen_data = text_gen_data.loc[text_gen_data.label == 'funny']
text_gen_data

Unnamed: 0,text,label
66,So my roommate borrowed my clock radio and app...,funny
69,Was not that great. We sat for like 15 mins be...,funny
305,We received terrible service when we ordered a...,funny
309,I usually go into places like chilis with pret...,funny
375,"""I overheard you are out of the roast beef, ri...",funny
...,...,...
99742,After shopping on Newbury Street for a good wh...,funny
99765,You should never bring a girl here on a date.\...,funny
99825,DON'T GET THE BRAZILIAN!!!\nI went there last ...,funny
99826,I had an appointment and arrived on time for i...,funny


Cleaning text by removing punctuation and stopwords.

In [5]:
clean_punct_text = []
for review in text_gen_data.text:
    review = review.lower()
    
    review = review.translate(str.maketrans('', '', string.punctuation))
    
    clean_punct_text.append(review)

In [20]:
stopwords = nlp.Defaults.stop_words

clean_no_stopwords = []
for review in clean_punct_text:
    review = set(review.split())
    clean_review = review.difference(stopwords)
    clean_no_stopwords.append(' '.join(clean_review))

Extracting unique words to assing them a number and create a dictionary.

In [67]:
text_gen_data_all = []

for review in clean_no_stopwords:
    text_gen_data_all.extend(review.split())

unique_words = list(set(text_gen_data_all))

In [68]:
num_to_word = {k:v for k,v in enumerate(unique_words)}
word_to_num = {v:k for k,v in enumerate(unique_words)}

Creating dataset

In [9]:
X_text_gen = []
y_text_gen = []

for review in clean_no_stopwords:
    
    split_review = review.split()
    
    for i in range(len(split_review)):
        try:
            x = split_review[i:i+3]
            y = split_review[i+4]
            
            X_text_gen.append(x)
            y_text_gen.append(y)
        except:
            break

Turning words to assigned number

In [10]:
X_text_gen_processed = []

for i in X_text_gen:
    temp_list = []
    
    for word in i:
        temp_list.append(word_to_num.get(word))
    
    X_text_gen_processed.append(temp_list)

y_text_gen_processed = []

for word in y_text_gen:
    y_text_gen_processed.append(word_to_num.get(word))

Processing input arrays and encoding target variable

In [11]:
y = np_utils.to_categorical(y_text_gen_processed)
X = np.array(X_text_gen_processed)

In [12]:
X = np.reshape(X,(X.shape[0],X.shape[1],1))

Building and fitting model

In [57]:
model = Sequential()
model.add(LSTM(400, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(400, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(400))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [58]:
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
model.fit(X, y, epochs=100, batch_size=50)

Due to model performance requirements google colab was used to train the model. The following is importing the trained model and testing the output.

In [13]:
import keras

In [15]:
model = keras.models.load_model('word_model.h5')

In [17]:
review = []
start = np.random.randint(0,14685,3)
review.extend(start)
for i in range(100):
    last_three = review[i:i+3]
    last_three = np.array(last_three).reshape((1,3,1))
    preds = np.argmax(model.predict(last_three))
    review.append(preds)
    
review_word = []
for i in review:
    review_word.append(num_to_word.get(i))
' '.join(review_word)

'traveling mexican nuance coladas cutest appraised appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised coladas steps appraised'

# Iteration 2

In this iteration, the same steps as before were taken. The difference is that this model deals with character level predictions rather than word label.

In [83]:
char_raw_data =raw_data.get_chunk(10000)

In [84]:
clean_punct_text = []
for review in char_raw_data.text:
    if review is not np.nan:
        lower_review = review.lower()
        no_punct = lower_review.translate(str.maketrans('', '', string.punctuation))
        s = re.sub(r'[^a-z]+ ', '', no_punct)
        clean_punct_text.append(s)
    

In [85]:
stopwords = nlp.Defaults.stop_words

clean_no_stopwords = []
for review in clean_punct_text:
    review = set(review.split())
    clean_review = review.difference(stopwords)
    clean_no_stopwords.append(' '.join(clean_review))

In [86]:
chars = [char for review in clean_no_stopwords for char in review if char.isascii()]
chars_unique = set(chars)

In [87]:
lower_case_letters = set(list(string.ascii_lowercase))
lower_case_letters.add(' ')
split_reviews = []
for review in clean_no_stopwords:
    review_split = list(review)
    temp_list = []
    for letter in review_split:
        if letter in lower_case_letters:
            temp_list.append(letter)
    split_reviews.append(''.join(temp_list))

In [88]:
x_char_gen = []
y_char_gen = []

for review in clean_no_stopwords:
    
    split_review = list(review)
    for i in range(len(split_review)):
        
        try:
            x = split_review[i:i+3]
            y = split_review[i+4]
            
            x_char_gen.append(x)
            y_char_gen.append(y)
        except:
            break

In [2]:
char_to_num = {v:k for k,v in enumerate(list(string.ascii_lowercase + ' '))}
num_to_char = {k:v for k,v in enumerate(list(string.ascii_lowercase + ' '))}

In [102]:
X_char_gen_processed = []

for i in x_char_gen:
    temp_list = []
    
    for char in i:
        char_num = char_to_num.get(char,0)
        temp_list.append(char_num)
    X_char_gen_processed.append(temp_list)

y_char_gen_processed = []

for word in y_char_gen:
    char_num = char_to_num.get(char,0)
    y_char_gen_processed.append(char_num)

In [98]:
np.save('x.npy', X_char_gen_processed)
np.save('y.npy', y_char_gen_processed)

In [55]:
y = np_utils.to_categorical(y_char_gen_processed)
X = np.array(X_char_gen_processed)

In [56]:
X = np.reshape(X,(X.shape[0],X.shape[1],1))

In [None]:
model.fit(X, y, epochs=2, batch_size=500)

In [5]:
model = keras.models.load_model('/Users/alejandro/Documents/Coding/ML/yelp_reviews/data/char_model.h5')

In [10]:
print(char_to_num.get('l'))

11


In [16]:
review = []
start = np.random.randint(0,26,3)
review.extend(start)
for i in range(100):
    last_three = review[i:i+3]
    last_three = np.array(last_three).reshape((1,3,1))
    preds = np.argmax(model.predict(last_three))
    review.append(preds)
    
review_char = []
for i in review:
    review_char.append(num_to_char.get(i))
' '.join(review_char)

'e h g i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i i'