In [None]:
import numpy as np
import torch
import time
import nltk
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Fetch the NLTK resources used further down: the stopword corpus (read via
# `stopwords.words`) and the WordNet data used by `WordNetLemmatizer`.
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from torch.nn.functional import pad
# True when PyTorch can see a CUDA device.
# NOTE(review): this flag is not referenced by any of the cells visible here.
cuda = torch.cuda.is_available()

In [15]:
class Network(nn.Module):
    """1-D convolutional classifier over a sequence of word vectors.

    One ``Conv1d`` branch is created per entry of ``kernel_sizes``. Each branch
    is followed by ReLU and a max over the whole time axis; the pooled branch
    outputs are concatenated, passed through dropout and mapped to
    ``output_dim`` scores by a linear layer.
    """

    def __init__(self, input_channel, out_channel, kernel_sizes, output_dim, dropout=0.5):
        """Build the convolution branches, the dropout and the output layer.

        Args:
            input_channel: ``in_channels`` of every convolution, i.e. the size
                of the last axis of the tensor given to ``forward``.
            out_channel: Number of filters in every convolution branch.
            kernel_sizes: Sized iterable of convolution window widths.
            output_dim: Number of values produced per example.
            dropout: Dropout probability applied to the concatenated pooled
                features. Defaults to 0.5, the value that used to be
                hard-coded.
        """
        super().__init__()
        # Attribute names (`convs`, `linear`) are part of the checkpoint format:
        # they become the state_dict key prefixes, so they must not change.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=input_channel,
                      out_channels=out_channel,
                      kernel_size=ks)
            for ks in kernel_sizes
        ])

        self.linear = nn.Linear(len(kernel_sizes) * out_channel, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embedded):
        """Compute the output scores for a batch of embedded sequences.

        Args:
            embedded: 3-D tensor whose axes are permuted with ``(0, 2, 1)``
                before the convolutions, so its last axis must have size
                ``input_channel`` and its middle axis (the sequence) must be at
                least as long as the largest kernel size.

        Returns:
            Tensor of shape ``(batch, output_dim)``; no softmax is applied.
        """
        # Conv1d convolves over the last axis, so move the sequence axis there.
        embedded = embedded.permute(0, 2, 1)
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        # A pooling window as wide as the feature map leaves one value per
        # filter; squeeze drops the resulting length-1 axis.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.linear(cat)
# Hyper-parameters passed to Network(...) when predict() rebuilds the model
# before loading the saved weights.
input_channel = 300        # Conv1d in_channels: size of the last axis of the embedded input
out_channel = 100          # number of filters per convolution branch
kernel_sizes = [3, 4, 5]   # one convolution branch per window width
output_dim = 5             # number of output scores; predict() reports argmax + 1

In [None]:
#predict
def tokenize_sentence(sentence, word2vec_dict):
    """Split a sentence into lower-cased, lemmatized, in-vocabulary tokens.

    The sentence is tokenized on runs of word characters. A token is kept only
    if its lower-cased form is a key of ``word2vec_dict`` and is not an English
    stopword. Kept tokens are lemmatized with WordNet.

    Every returned string is guaranteed to be a key of ``word2vec_dict``: when
    the lemma itself is not in the vocabulary, the lower-cased surface form
    (which is known to be present) is returned instead. Previously the lemma
    was returned unconditionally, which made the caller's vector lookup fail
    with ``KeyError`` for such words.

    Args:
        sentence: Raw input text.
        word2vec_dict: Vocabulary container; only ``in`` membership is used.

    Returns:
        List of tokens in sentence order, each one a key of ``word2vec_dict``.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    # A set gives O(1) membership tests instead of scanning the list per token.
    english_stopwords = frozenset(stopwords.words('english'))

    tokenized_sentence = []
    for raw_token in tokenizer.tokenize(sentence.strip()):
        word = raw_token.lower()
        if word not in word2vec_dict or word in english_stopwords:
            continue
        lemma = lemmatizer.lemmatize(word)
        # Membership was checked on `word`, but the lemma can be a different
        # string; only return it when it can actually be looked up.
        tokenized_sentence.append(lemma if lemma in word2vec_dict else word)
    return tokenized_sentence

def load_word2vec_dict(word2vec_dict_paths):
    """Load every word-vector shard and merge them into one dictionary.

    Each path is read with ``torch.load`` and the ``.items()`` of the loaded
    object are merged in the order the paths are given, so when a key occurs in
    several shards the value from the last shard wins.

    Args:
        word2vec_dict_paths: Iterable of file paths readable by ``torch.load``.

    Returns:
        A single ``dict`` holding the entries of all shards.
    """
    merged = {}
    for shard_path in word2vec_dict_paths:
        shard = torch.load(shard_path)
        merged.update(shard.items())
    return merged

def predict(sentence, model_path = "./models/xentropy_adam_lr0.0001_wd0.0005_bs128", word2vec_dict_paths = ["./word2vec/word2vec_dict_{}".format(i+1) for i in range(10)],max_seq_length = 29):
    """Predict a 1-based rating for ``sentence`` with the saved CNN classifier.

    The sentence is tokenized, each token is replaced by its word vector, and
    the sequence is brought to exactly ``max_seq_length`` rows (truncated if
    longer, zero-padded at the end if shorter). A ``Network`` is then rebuilt
    from the module-level hyper-parameters, restored from ``model_path`` and
    run in eval mode on the CPU.

    Args:
        sentence: Raw input text.
        model_path: Checkpoint file containing a ``'model_state_dict'`` entry.
        word2vec_dict_paths: Word-vector shard files, merged by
            ``load_word2vec_dict``.
        max_seq_length: Number of token positions fed to the network.

    Returns:
        Tuple ``(outputs, rating, embedding)``: the raw network outputs for the
        single-sentence batch, the index of the largest output plus one, and
        the padded input tensor that was given to the network.

    Raises:
        ValueError: If no token of ``sentence`` survives tokenization, i.e.
            there is nothing to embed.
    """
    word2vec_dict = load_word2vec_dict(word2vec_dict_paths)
    tokenized_sentence = tokenize_sentence(sentence, word2vec_dict)
    if not tokenized_sentence:
        # An empty token list used to surface as an obscure shape error deep
        # inside the model; fail early with an actionable message instead.
        raise ValueError(
            "sentence contains no in-vocabulary, non-stopword tokens: {!r}".format(sentence)
        )

    # Keep at most max_seq_length tokens so the padding amount below is never
    # negative and the model always sees the same sequence length.
    tokenized_sentence = tokenized_sentence[:max_seq_length]
    vectors = np.array([word2vec_dict[word] for word in tokenized_sentence])
    # Add the batch axis: (tokens, dim) -> (1, tokens, dim).
    embedding = torch.as_tensor(vectors, dtype=torch.float32).unsqueeze(0)

    # The pad amount must come from the sequence axis. The previous code used
    # len() after the batch axis had been added, which is always 1, so every
    # input received max_seq_length - 1 rows of padding regardless of length.
    seq_length = embedding.shape[1]
    # Pad spec is given last axis first: (0, 0) leaves the vector axis alone,
    # (0, k) appends k zero rows at the end of the sequence axis.
    embedding = pad(embedding, (0, 0, 0, max_seq_length - seq_length))

    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    checkpoint = torch.load(model_path, map_location=torch.device('cpu'))
    model = Network(input_channel, out_channel, kernel_sizes, output_dim)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(embedding)

    _, predicted = torch.max(outputs, 1)
    return outputs, predicted.item() + 1, embedding

# Example usage: predict(sentence) returns (model_outputs, predicted_rating, sentence_embedding).
# predict("what a wonderful movie! I love it!")
# predict("what a terrible movie! I hate it!")