# Step0: Import Packages

In [31]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn import metrics
from string import punctuation
import torch.nn.functional as F
from collections import Counter
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader, TensorDataset

# Step1: Get dataset

In [32]:
reviews = pd.read_csv("../data/english_yep_reviews.csv")

# Step2: Count the words

In [33]:
all_reviews = list(reviews['cleaned'])
all_text = " ".join(all_reviews)
all_words = all_text.split()
print(all_words[0:10])

# Count all the words using Counter Method
count_words = Counter(all_words)
total_words = len(all_words)
sorted_words=count_words.most_common(total_words)
print("Top ten occuring words : ", sorted_words[:10])

['someon', 'ha', 'work', 'mani', 'museum', 'wa', 'eager', 'visit', 'thi', 'galleri']
Top ten occuring words :  [('wa', 199857), ('thi', 86639), ('place', 55772), ('food', 53489), ('good', 50852), ('great', 44401), ('veri', 44062), ('time', 42695), ('get', 38251), ('would', 38160)]


# Step3: Create a dictionary

In [34]:
# Create a dictionary
# We will start createing dictionary with index 1 because 0 
    # is reserved for padding

vocab_to_int = {w: i+1 for i, (w, c) in enumerate(sorted_words)}

# Step4: Encode the review data

In [35]:
# Encode review
encoded_reviews = list()
for review in all_reviews:
    encoded_review = list()
    for word in review.split():
        if word not in vocab_to_int.keys():
            # if word is not available in vocab_to_int put 0 in that place
            encoded_review.append(0)
        else:
            encoded_review.append(vocab_to_int[word])
    encoded_reviews.append(encoded_review)

# Step5: Make the encode_review of the same length

In [36]:
# make all the encoded_review of the same length
# this step will return features of review_ints,
# where each review is padded with 0's or truncated to the input seq_length.
# the longest review has 564 words
# sequence_length is 100, but also could be 150, 200, 250 (here just for save energy)

sequence_length = 100
features = np.zeros((len(encoded_reviews), sequence_length), dtype=int)
for i, review in enumerate(encoded_reviews):
    review_len = len(review)
    if review_len <= sequence_length:
        zeros = list(np.zeros(sequence_length-review_len))
        new = zeros+review
    else:
        new = review[:sequence_length]
    features[i, :] = np.array(new)

# Step6: Create labels

In [37]:
# set labels, 0 negative, 1 neutral, 2 positive
labels = list(reviews['Review_Labels'])

# Step7: Split this feature data into Traning, Testing and Validation set

In [38]:
# split this feature data into training and validation set
# 80% training, 10% test and 10% validation dataset

# However, for cpu running, set 10% of them
features = features[:int(0.5*len(features))]
labels = labels[:int(0.5*len(labels))]
train_x = features[:int(0.8*len(features))]
train_y = labels[:int(0.8*len(features))]
valid_x = features[int(0.8*len(features)):int(0.9*len(features))]
valid_y = labels[int(0.8*len(features)):int(0.9*len(features))]
test_x = features[int(0.9*len(features)):]
test_y = labels[int(0.9*len(features)):]
print(len(train_y), len(valid_y), len(test_y))

39999 5000 5000


# Step8: Create DataLoader objects for Pytorch model

In [39]:
import torch
from torch.utils.data import DataLoader, TensorDataset

#create Tensor Dataset
train_data=TensorDataset(torch.FloatTensor(train_x), torch.FloatTensor(train_y))
valid_data=TensorDataset(torch.FloatTensor(valid_x), torch.FloatTensor(valid_y))
test_data=TensorDataset(torch.FloatTensor(test_x), torch.FloatTensor(test_y))

#dataloader
# remember to add drop_last=True, which will delete the last batch of the data if it's size is not equal to batch_size
batch_size=100
train_loader=DataLoader(train_data, batch_size=batch_size, shuffle=True, drop_last=True)
valid_loader=DataLoader(valid_data, batch_size=batch_size, shuffle=True, drop_last=True)
test_loader=DataLoader(test_data, batch_size=batch_size, shuffle=True, drop_last=True)

# Step9: Analyze the dataloader data

In [40]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([100, 100])
Sample input: 
 tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 1.1740e+03, 2.0000e+00,
         7.0000e+01],
        [1.0000e+00, 6.8200e+02, 3.0900e+03,  ..., 1.0000e+00, 3.0200e+02,
         1.5700e+02],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 3.3710e+03, 1.1308e+04,
         3.2500e+02],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 9.0000e+02, 8.3000e+01,
         7.9900e+02],
        [2.3700e+02, 2.8000e+01, 4.0000e+01,  ..., 1.0000e+00, 1.7400e+02,
         4.0700e+02],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 6.0000e+00, 1.2900e+02,
         1.9600e+02]])
Sample label size:  torch.Size([100])
Sample label: 
 tensor([0., 2., 2., 2., 2., 2., 2., 0., 0., 2., 2., 2., 2., 0., 2., 0., 2., 2.,
        0., 2., 2., 2., 2., 0., 2., 2., 2., 2., 2., 0., 0., 2., 2., 0., 2., 0.,
        1., 1., 2., 2., 2., 2., 2., 2., 1., 0., 0., 2., 2., 0., 2., 2., 0., 2.,
        2., 0., 2., 2., 2., 2., 2., 0., 2., 1., 1.,

# Step10: Create an R-CNN model

In [78]:
class RCNN_model(nn.Module):
    def __init__(self, vocab_size, embed, hidden_size, num_layers, dropout, num_classes, pad_size):
        super(RCNN_model, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed)
        self.bidirectinoal=True
        self.lstm = nn.LSTM(embed, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.maxpool = nn.MaxPool1d(pad_size)
        self.fc = nn.Linear(hidden_size * 2 + embed, num_classes)
    
    def forward(self, x):
        
        embed = self.embedding(x)
        print(embed.shape)
        out, _ = self.lstm(embed)
        out = torch.cat((embed, out), 2)
        out = F.relu(out)
        out = out.permute(0, 2, 1)
        out = self.maxpool(out).squeeze()
        print(out.shape)
        out = self.fc(out)
        
        return out
        

# Step11: Initialize the R-CNN models

In [79]:
vocab_size = len(vocab_to_int) + 1
embed = 300
hidden_size = 256
num_layers = 2
num_classes = 3
dropout = 0.5
pad_size = 32

model_rcnn = RCNN_model(vocab_size, embed, hidden_size, num_layers,
                        dropout, num_classes, pad_size)
print(model_rcnn)

RCNN_model(
  (embedding): Embedding(77398, 300)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True, dropout=0.5)
  (maxpool): MaxPool1d(kernel_size=32, stride=32, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=812, out_features=3, bias=True)
)


# Step12: Train the model

In [80]:
def train(epoch, model):
    lr = 0.001  # learning rate
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    total_batch = 0
    train_loss = 0
    result_dict = {}
    epoch_list = []
    batch_list = []
    loss_list = []
    acc_list = []
    for batch_idx, (trains, labels) in enumerate(train_loader):
        
        outputs = model(trains.long())
        model.zero_grad()
        loss = F.cross_entropy(outputs, labels.long())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        
        
        true = labels.data.cpu()
        predic = torch.max(outputs.data, 1)[1].cpu()
        train_acc = metrics.accuracy_score(true, predic)
        loss_value = train_loss/(batch_idx+1)
        epoch_list.append(epoch)
        batch_list.append(batch_idx)
        loss_list.append(loss_value)
        acc_list.append(train_acc)

        if total_batch % 20 == 0 :
            print('epoch: {}'.format(epoch), 'batch: {}'.format(batch_idx), 
                  'total train loader: {}'.format(len(train_loader)),
                  'Loss: %.3f | Acc: %.3f'
            % (loss_value, train_acc))
        total_batch += 1
        
    result_dict['epoch'] = epoch_list
    result_dict['batch'] = batch_list
    result_dict['loss'] = loss_list
    result_dict['acc'] = acc_list

    return pd.DataFrame(result_dict)

## Step12-1. CNN model

In [81]:
epochs = 10
rcnn_train_result = pd.DataFrame()
for epoch in range(epochs):
    train_result = train(epoch, model_rcnn)
    rcnn_train_result = rcnn_train_result.append(train_result, ignore_index=True)
    

torch.Size([100, 100, 300])
torch.Size([100, 556, 3])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (55600x3 and 812x3)

In [None]:
rcnn_train_result.to_csv('../result/RCNN/rcnn_train_result.csv')

# Step13: Test the model

In [44]:
def test(epoch, model):
    lr = 0.001  # learning rate
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    total_batch = 0
    train_loss = 0
    result_dict = {}
    epoch_list = []
    batch_list = []
    loss_list = []
    acc_list = []
    for batch_idx, (trains, labels) in enumerate(test_loader):

        outputs = model(trains.long()[None, ...])
        model.zero_grad()
        loss = F.cross_entropy(outputs, labels.long())
        loss.backward()
        optimizer.step()
        
        
        true = labels.data.cpu()
        predic = torch.max(outputs.data, 1)[1].cpu()
        train_acc = metrics.accuracy_score(true, predic)
        train_loss += loss.item()
        loss_value = train_loss/(batch_idx+1)
        epoch_list.append(epoch)
        batch_list.append(batch_idx)
        loss_list.append(loss_value)
        acc_list.append(train_acc)
            
        if total_batch % 50 == 0 :
            print('epoch: {}'.format(epoch), 'batch: {}'.format(batch_idx), 
                  'total train loader: {}'.format(len(test_loader)),
                  'Loss: %.3f | Acc: %.3f'
            % (loss_value, train_acc))
        
        total_batch += 1
        
    result_dict['epoch'] = epoch_list
    result_dict['batch'] = batch_list
    result_dict['loss'] = loss_list
    result_dict['acc'] = acc_list
    
    return pd.DataFrame(result_dict)

In [None]:
bilstm_test_result = pd.DataFrame()
bilstm_test_result.append(train(0, model_bilstm), ignore_index=True)

epoch: 0 batch: 0 total train loader: 50 Loss: 1.044 | Acc: 0.760
