In [3]:
import numpy as np
import torch
import time
import nltk
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from torch.nn.functional import pad
import zipfile
import os
from os import listdir
from zipfile import ZipFile
from os.path import isfile, join
from urllib.request import urlopen
import pickle
cuda = torch.cuda.is_available()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
class Network(nn.Module):
    """1-D convolutional classifier over a sequence of word vectors.

    One ``Conv1d`` branch is built per entry of ``kernel_sizes``. Each branch
    is ReLU-activated and max-pooled over the whole sequence, the pooled
    features of all branches are concatenated, passed through dropout and
    mapped to ``output_dim`` outputs by a linear layer.

    Args:
        input_channel: size of each word vector (``in_channels`` of every conv).
        out_channel: number of feature maps produced by every conv branch.
        kernel_sizes: iterable of convolution widths, one branch per entry.
        output_dim: number of values produced by the final linear layer.
        dropout: dropout probability applied to the concatenated features.
            Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, input_channel, out_channel, kernel_sizes, output_dim, dropout=0.5):
        super().__init__()
        # Attribute names (`convs`, `linear`, `dropout`) are part of the
        # state_dict layout, so they must stay stable for saved checkpoints.
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=input_channel,
                      out_channels=out_channel,
                      kernel_size=ks)
            for ks in kernel_sizes
        ])

        self.linear = nn.Linear(len(kernel_sizes) * out_channel, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, embedded):
        """Return the raw outputs for a batch of embedded sentences.

        Args:
            embedded: tensor laid out as (batch, sequence, input_channel); it
                is permuted to the (batch, channel, sequence) layout that
                ``Conv1d`` consumes. The sequence must be at least as long as
                the largest kernel size.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        embedded = embedded.permute(0, 2, 1)
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        # Max over the full sequence axis == max_pool1d with a window that
        # spans the whole feature map, followed by squeezing that axis.
        pooled = [feature_map.max(dim=2).values for feature_map in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.linear(cat)
# Hyper-parameters passed to `Network(...)` when the classifier is built.
input_channel = 300       # size of each word vector (conv `in_channels`)
out_channel = 100         # feature maps produced per kernel size
kernel_sizes = [3, 4, 5]  # one convolution branch per window width
output_dim = 5            # number of outputs of the final linear layer

In [None]:
# !wget http://nlp.stanford.edu/data/glove.42B.300d.zip
# !unzip *.zip
# from google.colab import drive
# drive.mount("/content/drive")

In [None]:
# read GloVe
# citation: https://stackoverflow.com/questions/37793118/load-pretrained-glove-vectors-in-python
def loadGloveModel(File):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as floats into a 1-D numpy array.

    Args:
        File: path of the UTF-8 encoded embeddings text file.

    Returns:
        dict mapping each word to its ``np.ndarray`` embedding. If a word
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    gloveModel = {}
    with open(File, 'r', encoding='utf-8') as f:
        for line in f:
            splitLines = line.split()
            # Blank / whitespace-only lines (e.g. a trailing newline at the
            # end of the file) hold no entry; indexing them would raise.
            if not splitLines:
                continue
            word = splitLines[0]
            gloveModel[word] = np.array([float(value) for value in splitLines[1:]])
    print(len(gloveModel)," words loaded!")
    return gloveModel
# word2vec_dict = loadGloveModel("./glove.42B.300d.txt")

In [4]:
#predict
def tokenize_sentence(sentence, word2vec_dict):
    """Turn a raw sentence into the list of lemmas that can be embedded.

    The sentence is stripped, split on runs of word characters and
    lower-cased. A token is kept only if its lower-cased form is in
    ``word2vec_dict``, it is not an English stop word, and its lemma is also
    in ``word2vec_dict``.

    Args:
        sentence: raw input text.
        word2vec_dict: container supporting ``in`` whose keys are the known
            vocabulary words.

    Returns:
        List of lemmatized tokens, in sentence order; every element is
        guaranteed to be a key of ``word2vec_dict``.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()
    # A set gives O(1) membership tests instead of scanning the list for
    # every token.
    english_stopwords = set(stopwords.words('english'))

    tokenized_sentence = []
    for token in tokenizer.tokenize(sentence.strip()):
        lowered = token.lower()
        if lowered not in word2vec_dict or lowered in english_stopwords:
            continue
        lemma = lemmatizer.lemmatize(lowered)
        # Callers look up the *lemma*, not the surface form, so a lemma that
        # is missing from the vocabulary must be dropped here; otherwise
        # `word2vec_dict[lemma]` raises KeyError downstream.
        if lemma in word2vec_dict:
            tokenized_sentence.append(lemma)
    return tokenized_sentence

def load_word2vec_dict(word2vec_urls,word2vec_dir):
  """Download every pickled word-vector shard and merge them into one dict.

  Each URL is saved as its own file inside ``word2vec_dir`` (named after the
  last path component of the URL, so shard file names must be unique) and the
  file that was just downloaded is the one that gets unpickled. A shard that
  is already present in ``word2vec_dir`` is reused instead of downloaded again.

  Args:
    word2vec_urls: iterable of URLs, each serving the raw bytes of a pickled
      ``{word: vector}`` dict.
    word2vec_dir: directory used to store the downloaded shards; created if
      it does not exist.

  Returns:
    A single dict containing the entries of every shard. If a word appears in
    several shards, the value from the later URL wins.
  """
  os.makedirs(word2vec_dir, exist_ok=True)
  word2vec_dict = {}
  for index, url in enumerate(word2vec_urls):
    # Derive a per-shard file name so downloads do not overwrite each other.
    file_name = os.path.basename(url.split("?", 1)[0].rstrip("/"))
    if not file_name:
      file_name = "word2vec_dict{}.pt".format(index + 1)
    shard_path = os.path.join(word2vec_dir, file_name)
    if not os.path.isfile(shard_path):
      torch.hub.download_url_to_file(url, shard_path)
    # NOTE(security): pickle.load can execute arbitrary code contained in the
    # file; only point this function at URLs you trust.
    with open(shard_path, "rb") as shard_file:
      word2vec_dict.update(pickle.load(shard_file))

  return word2vec_dict

def predict(sentence, model_url = 'https://github.com/CMU-IDS-2020/fp-good_or_bad/raw/main/models/xentropy_adam_lr0.0001_wd0.0005_bs128.pt', word2vec_urls = ['https://github.com/CMU-IDS-2020/fp-good_or_bad/raw/main/word2vec/word2vec_dict{}.pt'.format(i+1) for i in range(5)],word2vec_dir = "./word2vec",max_seq_length = 29):
  """Predict a rating for ``sentence`` with the pretrained ``Network``.

  Args:
    sentence: raw text to classify.
    model_url: URL of the saved ``state_dict`` for ``Network``.
    word2vec_urls: URLs of the pickled word-vector shards. They must serve the
      raw file, which is why the defaults use GitHub ``/raw/`` links (a
      ``/blob/`` link returns an HTML page instead).
    word2vec_dir: directory where the shards are stored.
    max_seq_length: the embedded sentence is zero-padded along the sequence
      axis up to this many tokens; longer sentences are left untruncated.

  Returns:
    Tuple ``(outputs, rating, embedding)``: the raw model outputs of shape
    ``(1, output_dim)``, the index of the largest output plus one, and the
    padded input tensor of shape ``(1, sequence, input_channel)``. When no
    token of the sentence can be embedded, the model is run on an all-zero
    (padding-only) input.
  """
  word2vec_dict = load_word2vec_dict(word2vec_urls,word2vec_dir)
  tokenized_sentence = tokenize_sentence(sentence,word2vec_dict)
  # Guard the lookup: a lemma returned by the tokenizer is not necessarily a
  # vocabulary key.
  vectors = [word2vec_dict[word] for word in tokenized_sentence if word in word2vec_dict]
  if vectors:
    embedding = np.array(vectors)
  else:
    # Keep a well-formed (0, input_channel) array so padding/model still work.
    embedding = np.zeros((0, input_channel))

  model = Network(input_channel, out_channel, kernel_sizes, output_dim)
  # The model stays on the CPU, so map the checkpoint there even if it was
  # saved from a GPU.
  state_dict = torch.hub.load_state_dict_from_url(model_url, map_location='cpu', progress=False)
  model.load_state_dict(state_dict)
  model.eval()

  # Measure the sequence length *before* adding the batch axis; afterwards
  # len(...) would always be 1 and the padding amount would be wrong.
  seq_length = embedding.shape[0]
  embedding = torch.as_tensor(embedding, dtype=torch.float32).unsqueeze(0)
  embedding = pad(embedding, (0, 0, 0, max(0, max_seq_length - seq_length)))
  with torch.no_grad():
    outputs = model(embedding)

  _, predicted = torch.max(outputs, 1)
  return outputs, predicted.item() + 1, embedding

# Example usage -- input: a sentence string; output: the tuple
# (model_outputs, predicted_rating, sentence_embedding) returned by predict().
predict("what a wonderful movie! I love it!")
# predict("what a terrible movie! I hate it!")

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Downloading: "https://github.com/CMU-IDS-2020/fp-good_or_bad/raw/main/models/xentropy_adam_lr0.0001_wd0.0005_bs128.pt" to /root/.cache/torch/hub/checkpoints/xentropy_adam_lr0.0001_wd0.0005_bs128.pt


(tensor([[-5.6343, -4.9686, -2.3953,  3.4261,  5.1620]]),
 5,
 tensor([[[-0.1407,  0.0105, -0.3944,  ...,  0.1503,  0.0072,  0.0554],
          [-0.4207, -0.1447,  0.1019,  ...,  0.1122, -0.1660,  0.7195],
          [-0.1493, -0.0068, -0.2495,  ..., -0.0461,  0.1480,  0.4215],
          ...,
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
          [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]]))

In [None]:
# word2vec_dict = torch.load("/content/drive/MyDrive/good_or_bad/word2vec/word2vec_dict_1")
# word2vec_list = list(word2vec_dict.items())
# num = 4
# per_cnt = len(word2vec_list) // num
# os.chdir('/content/drive/MyDrive/good_or_bad/word2vec')
# for i in range(num):
#   if i == num-1:
#     vec = dict(word2vec_list[i*per_cnt:])
#   else:
#     vec = dict(word2vec_list[i*per_cnt:(i+1)*per_cnt])
#   file_name = "word2vec_dict{}".format(i+1)
#   zip_file_name = "word2vec_dict{}.zip".format(i+1)
#   torch.save(vec,file_name)
#   ZipFile(zip_file_name, 'w').write(file_name)

# os.chdir('/content')


# word2vec = torch.load("./drive/MyDrive/good_or_bad/word2vec_dict5")
# pickle.dump(word2vec,open( "./drive/MyDrive/good_or_bad/word2vec/word2vec_dict5.pt", "wb" ))