## LSTM Mexican Review Grades Prediction

In [114]:
import pandas as pd
import numpy as np
import os
import json
import torch
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
import time
import random
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Mounted at /content/drive


### Initialize

In [0]:
df_mexican_review = pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_review.csv')
df_mexican_tip =pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_tip.csv')
df_mexican_info = pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_info.csv')

In [0]:
vocabulary_size = 20000
learning_rate = 1e-4
batch_size = 128
num_epoch = 15
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
embedding_dim = 256
hidden_dim = 256
output_dim = 1

In [109]:
df_review = df_mexican_review[['text', 'stars']].dropna()
df_review.shape

(401692, 2)

In [99]:
df_review.tail()

Unnamed: 0,text,stars
401688,My first time here tonight and we got seated i...,5.0
401689,I have tried a lot of Mexican restaurants in t...,5.0
401690,The Cabeza is amazing! I had the vampiros (tos...,5.0
401691,"Off the grid Mexican in Vegas. Very tasty, qua...",5.0
401692,We hired Taco Naco to cater our family party a...,5.0


### Pre Processing

In [0]:
def text_processing(text):
    
    text = text.lower()
    text = re.sub(r'[^a-zA-z0-9]', ' ', text)
    text = word_tokenize(text)
    text = [w for w in text if w not in stopwords.words('english')]
    text = [WordNetLemmatizer().lemmatize(w, pos = 'v') for w in text]
    text = ' '.join(text)
    
    return text

In [67]:
print(text_processing(df_review.iloc[0, 0]))

love chinese food love mexican food go wrong couple things first things first place rice bowl kind place think go diverse far menu go mainly rice bowl get different kinds meats order little confuse first one employees help us get 2 item bowl get jade chicken hengrenade chicken rice jerk also order jade chicken quesadilla side gon na admit place look kinda dirty think arizona use health department letter grade system like california judge look inside give c grade lol wait 15 minutes finally get food take go eat hotel room mmmm food alright jade chicken nothing special taste like generic chinese fast food orange chicken sesame chicken variant hengrenade chicken although less spicier version jerk chicken still pretty spicy warn jerk chicken super spicy sure ask sample restaurant order way spicy jade chicken quesadilla decent nothing special imagine orange chicken tortilla cheese friend mine order jade chicken burrito confuse pull bag literally size mcdonald apple pie order burrito warn bu

In [0]:
dict_review_p = {'text':[], 
                 'stars':[]}

for i in range(df_review.shape[0]):
    dict_review_p['text'].append(text_processing(df_review.loc[i, 'text']))
    dict_review_p['stars'].append(df_review.loc[i, 'stars'])

In [105]:
df_review_p = pd.DataFrame(dict_review_p)
#del dict_review_p
#del df_review
df_review_p.tail()

Unnamed: 0,text,stars
401687,first time tonight get seat immediately booth ...,5.0
401688,try lot mexican restaurants area find one one ...,5.0
401689,cabeza amaze vampiros tostadas good crisp carn...,5.0
401690,grid mexican vegas tasty quality food fantasti...,5.0
401691,hire taco naco cater family party daughter bap...,5.0


In [0]:
df_review_p.to_csv('/content/drive/My Drive/STAT628/data/mexican_review_p.csv', index = False)
del df_review_p

In [110]:
df_review_p = pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_review_p.csv')
df_review_p.head()

Unnamed: 0,text,stars
0,love chinese food love mexican food go wrong c...,3.0
1,pick meat planet chef make mexican style dish ...,5.0
2,party 3 order fish tacos pork belly banh mi co...,5.0
3,employees busy chat one employee head drive th...,2.0
4,come loco mill since open love come especially...,5.0


In [111]:
df_review_p.tail()

Unnamed: 0,text,stars
401687,first time tonight get seat immediately booth ...,5.0
401688,try lot mexican restaurants area find one one ...,5.0
401689,cabeza amaze vampiros tostadas good crisp carn...,5.0
401690,grid mexican vegas tasty quality food fantasti...,5.0
401691,hire taco naco cater family party daughter bap...,5.0


In [136]:
df_review_p[df_review_p['text'] != ' '].shape

(401692, 2)

### Embedding

In [118]:
random_seed = 1
text = data.Field(sequential=True, tokenize='spacy', include_lengths=True)
label = data.LabelField(dtype=torch.float)

fields = [('text', text), ('stars', label)]

dataset = data.TabularDataset(path='/content/drive/My Drive/STAT628/data/mexican_review_p.csv', 
                              format='csv', skip_header=True, fields=fields)

train_data, valid_data, test_data = dataset.split(split_ratio=[0.75, 0.05, 0.2],
                                                  random_state=random.seed(random_seed))

print(f'Num Train: {len(train_data)}')
print(f'Num Valid: {len(valid_data)}')
print(f'Num Test: {len(test_data)}')

Num Train: 301269
Num Valid: 80338
Num Test: 20085


In [119]:
text.build_vocab(train_data, max_size=vocabulary_size)
label.build_vocab(train_data)

print(f'Vocabulary size: {len(text.vocab)}')
print(f'Number of classes: {len(label.vocab)}')

Vocabulary size: 20002
Number of classes: 5


In [120]:
label.vocab.freqs

Counter({'1.0': 39502,
         '2.0': 29335,
         '3.0': 37326,
         '4.0': 72330,
         '5.0': 122776})

In [124]:
train_loader, valid_loader, test_loader = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=batch_size,
    sort_within_batch=True, # necessary for packed_padded_sequence
    sort_key=lambda x: len(x.text),
    device=device)

print('Train')
for batch in train_loader:
    print(f'Text matrix size: {batch.text[0].size()}')
    print(f'Target vector size: {batch.stars.size()}')
    break
    
print('\nValid:')
for batch in valid_loader:
    print(f'Text matrix size: {batch.text[0].size()}')
    print(f'Target vector size: {batch.stars.size()}')
    break
    
print('\nTest:')
for batch in test_loader:
    print(f'Text matrix size: {batch.text[0].size()}')
    print(f'Target vector size: {batch.stars.size()}')
    break

Train
Text matrix size: torch.Size([475, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([6, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([7, 128])
Target vector size: torch.Size([128])


### LSTM

In [0]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text, text_length):

        #[sentence len, batch size] => [sentence len, batch size, embedding size]
        embedded = self.embedding(text)
        
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, text_length)
        
        #[sentence len, batch size, embedding size] => 
        #  output: [sentence len, batch size, hidden size]
        #  hidden: [1, batch size, hidden size]
        packed_output, (hidden, cell) = self.rnn(packed)
        
        return self.fc(hidden.squeeze(0)).view(-1)

In [0]:
input_dim = len(text.vocab)

torch.manual_seed(random_seed)
model = RNN(input_dim, embedding_dim, hidden_dim, output_dim)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [0]:
def compute_binary_accuracy(model, data_loader, device):
    model.eval()
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for batch_idx, batch_data in enumerate(data_loader):
            text, text_lengths = batch_data.text
            logits = model(text, text_lengths)
            predicted_labels = (torch.sigmoid(logits) > 0.5).long()
            num_examples += batch_data.stars.size(0)
            correct_pred += (predicted_labels.long() == batch_data.stars.long()).sum()
        return correct_pred.float()/num_examples * 100

In [145]:
start_time = time.time()

for epoch in range(num_epoch):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):
        
        text, text_lengths = batch_data.text
        text_lengths[text_lengths==0] = 1
        
        ### FORWARD AND BACK PROP
        logits = model(text, text_lengths)
        cost = F.binary_cross_entropy_with_logits(logits, batch_data.stars)
        optimizer.zero_grad()
        
        cost.backward()
        
        ### UPDATE MODEL PARAMETERS
        optimizer.step()
        
        ### LOGGING
        if not batch_idx % 200:
            print (f'Epoch: {epoch+1:03d}/{num_epoch:03d} | '
                   f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                   f'Cost: {cost:.4f}')

    with torch.set_grad_enabled(False):
        print(f'training accuracy: '
              f'{compute_binary_accuracy(model, train_loader, device):.2f}%'
              f'\nvalid accuracy: '
              f'{compute_binary_accuracy(model, valid_loader, device):.2f}%')
        
    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    
print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_binary_accuracy(model, test_loader, device):.2f}%')

Epoch: 001/015 | Batch 000/2354 | Cost: -70.6906
Epoch: 001/015 | Batch 200/2354 | Cost: -122.9291
Epoch: 001/015 | Batch 400/2354 | Cost: -32.2435
Epoch: 001/015 | Batch 600/2354 | Cost: -66.9548
Epoch: 001/015 | Batch 800/2354 | Cost: -45.4285
Epoch: 001/015 | Batch 1000/2354 | Cost: -78.6135
Epoch: 001/015 | Batch 1200/2354 | Cost: -119.3737
Epoch: 001/015 | Batch 1400/2354 | Cost: -111.9911
Epoch: 001/015 | Batch 1600/2354 | Cost: -82.6179
Epoch: 001/015 | Batch 1800/2354 | Cost: -69.1725
Epoch: 001/015 | Batch 2000/2354 | Cost: -82.5294
Epoch: 001/015 | Batch 2200/2354 | Cost: -119.9547


RuntimeError: ignored

In [130]:
batch.text

(tensor([[  317,    19,   258,   250,   313,     4,    52,    45,    41,   136,
           6495,    24,     4,     6,   176,    39,    14,    33,   743,  1858,
              8,    52,    56,    21,     8,     5,   336,  1702,     8,     8,
             45,    13,   105,   304,     7,  7721,    15,  1467,     2,    47,
              5,   141,   126,   186,   693,    28,   126,   163,     6,    83,
           1163,    15,    26,     9,     5,     5,  1283,  5398,    94, 17407,
            795,     3,    23,   809,   104,     5,  1360,   191,    24,     3,
            609,    25,   609,   281,     4,  1272,    52,  1041,     8,   570,
            144,     8,     8,    24, 14178,   113,     8,    35,    33,    52,
            774,   609,  9066,   288,   126,   155,     4,     4,    33,     8,
              8,   267,    15,    12,    25,    99,     8,    33,    25,     0,
             24,   596,    42,   112,    33,    33,   242,    24,     8,    55,
            417,   317,     0,  1139,   

In [140]:
text_lengths[text_lengths==0] = 1
text_lengths

tensor([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4,
        4, 4, 3, 3, 2, 2, 1, 1], device='cuda:0')