In [2]:
import os
import json
import gzip
import pickle
import numpy as np
import glob
import random 

import pandas as pd
from urllib.request import urlopen
import matplotlib.pyplot as plt
from datetime import datetime
from collections import defaultdict
from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.metrics.pairwise import cosine_similarity

#import more_itertools as mit
#from keras.preprocessing.sequence import pad_sequences


import torch
import torch.nn as nn
#from pytorch_pretrained_bert import BertTokenizer, BertModel
from transformers import BertTokenizer, BertModel, BertConfig
from torch.utils.data import DataLoader, Dataset

In [2]:
from prep_amazon_books import tokenize_text_to_ids

In [3]:
bert_loaded = False

In [4]:
%%time

if not bert_loaded:
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_tokenizer.add_special_tokens({"unk_token": '[UNK]', 'cls_token': '[CLS]', 
                                       'pad_token':'[PAD]', 'sep_token':'[SEP]'})
    print(len(bert_tokenizer))
    assert bert_tokenizer.cls_token == '[CLS]'
    bert_tokenizer.sep_token_id
    
    bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True,
                                            output_attentions=True)
    bert_model.resize_token_embeddings(len(bert_tokenizer))
    
    bert_loaded = True

30522
CPU times: user 2 s, sys: 526 ms, total: 2.53 s
Wall time: 9.41 s


In [5]:
os.listdir('../datasets')

['meta_Computers.json.gz',
 'Electronics_5.json.gz',
 'out-pickle',
 'Books_5.json.gz',
 'one_week.tar.gz',
 'contentdata.tar.gz',
 'books-pickle']

In [6]:
(dirpath, dirnames, filenames) = next(os.walk('../datasets/books-pickle/'), (None, None, []))
filenames

['df.json',
 'item_stats.pkl',
 'pytorch_model.bin',
 'df.csv',
 'user_stats.pkl',
 'df.pkl',
 'config.json',
 'encoded_text.pkl']

In [7]:
books_path = '../datasets/Books_5.json.gz'
pkl_path = '../datasets/books-pickle/'

with open(pkl_path + "df.pkl", 'rb') as fin:
    df = pickle.load(fin)

with open(pkl_path + "user_stats.pkl", 'rb') as fin:
    user_dict = pickle.load(fin)
    
with open(pkl_path + "item_stats.pkl", 'rb') as fin:
    item_dict = pickle.load(fin)

In [8]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,itemID,reviewText,reviewWords
0,5,False,2005-03-30,A1REUF3A1YCPHM,1713353,A story children will love and learn from The ...,170
1,5,True,2016-06-20,AVP0HXC9FG790,1713353,Five Stars The kids loved it!,6
2,5,True,2016-01-24,A324TTUBKTN73A,1713353,Five Stars My students (3 & 4 year olds) loved...,15
3,5,False,2015-07-09,A2RE7WG349NV5D,1713353,Five Stars LOVE IT,4
4,5,True,2015-01-18,A32B7QIUDQCD0E,1713353,Five Stars Great!,3


In [9]:
user_df = pd.DataFrame.from_dict(user_dict, orient='index')

In [10]:
user_df.head()

Unnamed: 0,n_reviews,m_rating,m_words
A0334855HN6E38CXWXZR,1,5.0,12.0
A0790722OCX87RKL2J3T,1,5.0,71.0
A100JBBLCC0NUC,1,5.0,27.0
A101OKMJFCIWYH,2,5.0,26.0
A1022R52JDJVMA,1,5.0,58.0


In [11]:
item_df = pd.DataFrame.from_dict(item_dict, orient='index')

In [12]:
word_embeddings = bert_model.get_input_embeddings()

Options for Encoding Sentences: 
1. Bert Embeddings (Raw): tokenise text and apply Bert Embeddings, no further processing [batch_size x seq_len x dim_e] => subsequent (sophisticated) Encoder should produce sentence representation [batch_size x dim_s]
2. Last Hidden States: sequence encoded by Bert, powerful processing [batch_size x seq_len x dim_e] => subsequent (simple) Encoder produce sentence representation [batch_size x dim_s]
3. Average over Last Hidden States: seq. encoded by Bert and average in the end to yield sentence representation [batch_size x dim_s]
    

In [13]:
%%time
max_elems=2
encoded_in = [bert_tokenizer.encode(sent, max_length=30, add_special_tokens=True, pad_to_max_length=True) for sent in list(df['reviewText'])[:3]]

CPU times: user 10.5 ms, sys: 2.89 ms, total: 13.3 ms
Wall time: 13 ms


In [14]:
x_in = torch.tensor(encoded_in, requires_grad=False).long()
#outputs = bert_model(x_in)[:2]

BERT for Sentence Encoding

1. Use last hidden state of CLS 
2. Use average of last hidden states of all tokens 
3. Use average over multiple hidden layers (A) for CLS or (B) for all tokens

In [15]:
"""
def encode_sequence(df, tokenizer, sent_tokenize, bert_model, max_len=50, batch_size=2, method='cls_last', n=None):

    encoded_text = {}
    start_idx = 0
    stop_idx = batch_size

    while(start_idx < len(df)):

        if stop_idx > len(df):
            stop_idx = len(df)

        tokens = [tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len) for text
                  in list(df['reviewText'])[start_idx:stop_idx]]

        x_in = torch.tensor(tokens, requires_grad=False, device=device).long()

        with torch.no_grad():
            last_hidden, pooled_out, hidden_outs, attention_weights = bert_model(x_in)

        assert last_hidden.shape[0] == batch_size

"""


"\ndef encode_sequence(df, tokenizer, sent_tokenize, bert_model, max_len=50, batch_size=2, method='cls_last', n=None):\n\n    encoded_text = {}\n    start_idx = 0\n    stop_idx = batch_size\n\n    while(start_idx < len(df)):\n\n        if stop_idx > len(df):\n            stop_idx = len(df)\n\n        tokens = [tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len) for text\n                  in list(df['reviewText'])[start_idx:stop_idx]]\n\n        x_in = torch.tensor(tokens, requires_grad=False, device=device).long()\n\n        with torch.no_grad():\n            last_hidden, pooled_out, hidden_outs, attention_weights = bert_model(x_in)\n\n        assert last_hidden.shape[0] == batch_size\n\n"

In [40]:
batch_size=2
max_len=50
methods={'last_cls':0, 'sum_last_n':4}
#n=4

encoded_text = {}
start_idx = 0
stop_idx = batch_size


##BERT for Feature-Extraction

#tokenize
tokens, _ = zip(*[tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len, lower_case=True) for text
          in list(df['reviewText'])[start_idx:stop_idx]])

x_in = torch.tensor(tokens, requires_grad=False).long() #batch_size x max_len

tokens_raw, _ = zip(*[tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len, 
                                            add_special_tokens=False, lower_case=True) 
                                            for text in list(df['reviewText'])[start_idx:stop_idx]])

#print(x_in.shape)


In [48]:
#model forward pass
with torch.no_grad():
    bert_outputs = bert_model(x_in)

In [51]:
last_hidden, pooled_out, hidden_outs, attention_weights = bert_outputs
print(len(hidden_outs))
print(hidden_outs[0].shape)
print(len(attention_weights))

13
torch.Size([2, 50, 768])
12


In [46]:
def get_naive_seq_emb(tokens, word_embeddings, method='naive_mean'):
    tokens_emb = word_embeddings(torch.tensor(tokens))
    if method == 'naive_mean':
        emb_out = torch.mean(tokens_emb, dim=1)
    elif method == 'naive_sum':
        emb_out = torch.sum(tokens_emb, dim=1)
    else:
        raise NotImplementedError()

    return emb_out

In [47]:
m = 'naive_mean'
naive_emb = get_naive_seq_emb(tokens_raw, word_embeddings, method=m)
naive_emb.shape

torch.Size([2, 768])

In [17]:
from collections import defaultdict

In [23]:

    
#save tensor of encoded text into separate dictionary
keys = range(start_idx, stop_idx)

encoded_text = defaultdict(dict)
encodings = {}


item_inds = range(start_idx, stop_idx) #absolute index of items of size batch_size 
slice_idx = list(range(0, len(item_inds)))

In [187]:
def extract_bert_features(bert_output, method, n_layers):
    """
    
    Argument: 
        rel_slice: index that indicates which relative slice of the output should be used
    
    """
    
    _keys = ['pooled_out', 'last_cls', 'pool_all_last', 'pool_cls_n', 'pool_last_n', 'sum_last_n', 'sum_all']
    
    assert len(bert_output) == 4
    
    last_hidden, pooled_out, hidden_outs, attention_weights = bert_output
    
    if 'pooled_out' == method:
        x_out = pooled_out # batch_size x dim_emb
    elif 'last_cls' == method:
        #take the embedding of CLS token of last hidden layer
        x_out = last_hidden[:,0,:] # batch_size x dim_emb
    elif 'pool_all_last' == method:
        #average embeddings of last hidden layer of all tokens
        x_out = torch.mean(last_hidden, dim=1)
    elif 'pool_cls_n'==method:
        # and n != None
        x_out = torch.mean(torch.cat([hidden[:,0,:].unsqueeze(1) for hidden in hidden_outs[-n_layers:]], dim=1), dim=1)
    elif 'pool_last_n'==method:
        #average embeddings of last N hidden layers of all tokens
        x_out = torch.mean(torch.cat(hidden_outs[-n_layers:], dim=1), dim=1)
    elif 'sum_last_n' == method:
        #sum embeddings of last N hidden layers of all tokens
        x_out = torch.sum(torch.cat(hidden_outs[-n_layers:], dim=1), dim=1)
        #sum last four hidden => 95.9 F1 on dev set for NER
    elif 'sum_all'==method and n:
        x_out = torch.sum(torch.cat(hidden_outs, dim=1), dim=1)
    else:
        raise KeyError("'{}' is not a valid method!".format(method))
        
    #print(method)
    #print(x_out.shape)
    
    return x_out


def get_dummy_bert_output(batch_size, dim_bert=64, seq_len=20, n_layers=12, hidden_outs=True, attn_weights=True):
    bert_outs = []
    #last_hidden, pooled_out, hidden_outs, attention_weights = bert_output
    # 1. last_hidden
    last_hidden = torch.randn([batch_size, seq_len, dim_bert])
    bert_outs.append(last_hidden) #batch_size x seq_len x dim_bert
    
    # 2. pooled_out
    bert_outs.append(torch.randn([batch_size, dim_bert]))
    
    # 3. hidden_outs
    if hidden_outs:
        h_outs = [torch.randn([batch_size, seq_len, dim_bert]) for n in range(n_layers)]
        h_outs.append(last_hidden)
        bert_outs.append(h_outs)
    
    # 4. attn_weights
    if attn_weights:
        a_weights = [torch.randn([batch_size, seq_len, dim_bert]) for n in range(n_layers)]
        bert_outs.append(a_weights)

    return bert_outs

In [180]:
bert_dummy_outs = get_dummy_bert_output(batch_size=4)
#len(bert_dummy_outs)
l_hidden, p_out, h_outs, a_weights = bert_dummy_outs
#print(len(h_outs))
#print(h_outs[0].shape)

In [181]:
%%time
# items_inds = [0, 1, 2]
# methods = [A, B]
# value = [tensor1, tensor2]

#naive approach with slices relative to keys
n_iterations = 0 #loops: n_methods * n_keys

for (method, n) in methods.items():
    encodings[method] = extract_bert_features(bert_outputs, method, n) #batch_size x emb_dim
    assert encodings[method].shape[0] == len(slice_idx)
    
    for idx in slice_idx:
        encoded_text[item_inds[idx]][method] = encodings[method][idx, :]
        n_iterations+=1
        
print("Iterations: {}" .format(n_iterations))
#encoded_text = {**encoded_text, **dict(zip(item_inds, ))}
    
#encodings.items()
#encodings['last_cls'].shape
#encoded_text.keys()
encoded_text[1].keys()
#encoded_text.keys()

KeyError: 0

In [176]:
list(range(0,5))

[0, 1, 2, 3, 4]

In [189]:
%%time

n_items=50
methods={'last_cls':0, 'sum_last_n':4}
batch_size=6
emb_dim = 64
max_len=20

enc_text = test_feature_extraction(n_items, methods, batch_size, emb_dim, max_len)

48
50
2
Iterations: 100
CPU times: user 26.8 ms, sys: 0 ns, total: 26.8 ms
Wall time: 26.8 ms


In [178]:
len(enc_text.keys())

50

In [22]:
class BertFeatureExtractor():

    def __init__(self, device, **kwargs):
        super().__init__(**kwargs)

        self.device = device

        self.sent_tokenizer = sent_tokenize
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.bert_tokenizer.add_special_tokens({"unk_token": '[UNK]', 'cls_token': '[CLS]',
                                           'pad_token': '[PAD]', 'sep_token': '[SEP]'})

        self.bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True,
                                               output_attentions=True)
        self.bert_model.resize_token_embeddings(len(self.bert_tokenizer))
        self.bert_model.to(device)

        self._feat_methods = ['pooled_out', 'last_cls', 'pool_all_last', 'pool_cls_n', 'pool_last_n', 'sum_last_n', 'sum_all']
        self._naive_feat_methods = ['naive_mean', 'naive_sum']

        #self.word_embeddings = 


    @property
    def _get_feat_methods(self):
        return self._feat_methods

    @property
    def _get_naive_feat_methods(self):
        return self._naive_feat_methods

    def __call__(self, *args, **kwargs):
        return self.bert_model(*args, **kwargs)

    def get_naive_seq_emb(self, tokens, method='naive_mean'):

        if method not in self._get_naive_feat_methods():
            raise KeyError("'{}' is not a valid method!".format(method))
        
        word_embeddings = self.bert_model.get_input_embeddings()
        word_embeddings.to(self.device)
        tokens_emb = word_embeddings(torch.tensor(tokens, device=self.device)).to(self.device)
        if method == 'naive_mean':
            emb_out = torch.mean(tokens_emb, dim=1)
        elif method == 'naive_sum':
            emb_out = torch.sum(tokens_emb, dim=1)
        else:
            raise NotImplementedError()

        return emb_out
    
    def get_bert_word_embeddings(self):
        return self.bert_model.get_input_embeddings().to(self.device)

    def extract_bert_features(self, bert_output, method, n_layers):
        """

        Argument:
            rel_slice: index that indicates which relative slice of the output should be used

        """

        assert len(bert_output) == 4

        if method not in self._get_feat_methods():
            raise KeyError("'{}' is not a valid method!".format(method))

        last_hidden, pooled_out, hidden_outs, attention_weights = bert_output

        if 'pooled_out' == method:
            x_out = pooled_out  # batch_size x dim_emb
        elif 'last_cls' == method:
            # take the embedding of CLS token of last hidden layer
            x_out = last_hidden[:, 0, :]  # batch_size x dim_emb
        elif 'pool_all_last' == method:
            # average embeddings of last hidden layer of all tokens
            x_out = torch.mean(last_hidden, dim=1)
        elif 'pool_cls_n' == method and n_layers:
            x_out = torch.mean(torch.cat([hidden[:, 0, :].unsqueeze(1) for hidden in hidden_outs[-n_layers:]], dim=1),
                               dim=1)
        elif 'pool_last_n' == method and n_layers:
            # average embeddings of last N hidden layers of all tokens
            x_out = torch.mean(torch.cat(hidden_outs[-n_layers:], dim=1), dim=1)
        elif 'sum_last_n' == method and n_layers:
            # sum embeddings of last N hidden layers of all tokens
            x_out = torch.sum(torch.cat(hidden_outs[-n_layers:], dim=1), dim=1)
            # sum last four hidden => 95.9 F1 on dev set for NER
        elif 'sum_all' == method:
            x_out = torch.sum(torch.cat(hidden_outs, dim=1), dim=1)
        else:
            raise NotImplementedError()

        # print(method)
        # print(x_out.shape)
        return x_out

    def test_feature_extraction(self, n_items, methods, batch_size, emb_dim, seq_len, **kwargs):

        encoded_text = defaultdict(dict)
        encodings = defaultdict(dict)
        # n_items = len(df)

        all_item_inds = range(n_items)
        start_idx = 0
        stop_idx = batch_size
        n_iterations = 0  # loops: n_methods * n_keys

        slice_idx = list(range(0, batch_size))

        while (start_idx < n_items):

            # handling edge cases
            if stop_idx > n_items:
                # indices and slice range
                slice_idx = list(range(0, (n_items - start_idx)))
                stop_idx = n_items

            # divide item_inds into batches
            rel_item_inds = range(start_idx, stop_idx)

            # assert len(rel_item_inds) == batch_size
            if len(rel_item_inds) != batch_size:
                print(start_idx)
                print(stop_idx)
                print(len(slice_idx))

            # create naive sequence features
            naive_method = 'naive_mean'
            # naive_emb = get_naive_seq_emb(tokens_raw, word_embeddings, method=m) #batch_size x emb_dim
            naive_emb = torch.randn([batch_size, emb_dim])

            # generate dummy BERT output
            bert_dummy_out = get_dummy_bert_output(len(slice_idx), emb_dim, seq_len)

            # extract BERT features
            for (method, n) in methods.items():
                encodings[method] = self.extract_bert_features(bert_dummy_out, method, n)  # batch_size x emb_dim

                assert encodings[method].shape[0] == len(slice_idx)
                # print(encodings[method].shape[0])
                # print(len(slice_idx))

                # add features to dictionary
                for idx in slice_idx:
                    if naive_method not in encoded_text[rel_item_inds[idx]]:
                        encoded_text[rel_item_inds[idx]][naive_method] = naive_emb[idx, :]  # add naive features

                    encoded_text[rel_item_inds[idx]][method] = encodings[method][idx, :]
                    n_iterations += 1

            start_idx += (batch_size)
            stop_idx += (batch_size)

        print("Iterations: {}".format(n_iterations))

        return encoded_text

    def truncate_seq(self, tokens, max_len=512):
        if len(tokens) < max_len:
            tokens = tokens[:-1]
            n = max_len - len(tokens) - 1
            tokens += n * [self.bert_tokenizer.pad_token]
        elif len(tokens) > max_len:
            tokens = tokens[:max_len - 1]
        else:
            return tokens

        tokens.append(self.bert_tokenizer.sep_token)

        return tokens

    def tokenize_text_to_ids(self, text, max_len=512, add_special_tokens=True, lower_case=False):
        """
        With tokenizer, separate text first into tokens
        and then convert these to corresponding IDs of the vocabulary

        Return:
            tokens: list of token IDs
            n_words: number of words in full sequence (before truncating)
        """
        sents = self.sent_tokenizer(text)
        tokens = []
        n_words = 0
        added_tokens = 0

        if add_special_tokens:
            tokens.append(self.bert_tokenizer.cls_token)
            added_tokens += 1

        # split each sentence of the text into tokens
        for s in sents:
            if lower_case:
                tokens.extend([word.lower() for word in self.bert_tokenizer.tokenize(s) if word.isalpha()])
            else:
                tokens.extend([word for word in self.bert_tokenizer.tokenize(s) if word.isalpha()])

            if add_special_tokens:
                tokens.append(self.bert_tokenizer.sep_token)
                added_tokens += 1

        n_words = len(tokens) - added_tokens

        tokens = self.truncate_seq(tokens, self.bert_tokenizer, max_len)

        assert len(tokens) == max_len

        return self.bert_tokenizer.convert_tokens_to_ids(tokens), n_words

    def encode_input_ids(self, x_in):
        return self.bert_model(x_in)

In [4]:
%%time
bert_extractor = BertFeatureExtractor(device='cpu')

CPU times: user 1.94 s, sys: 639 ms, total: 2.58 s
Wall time: 11.2 s


In [6]:
bert_extractor.bert_tokenizer.vocab_size

30522

In [9]:
seq_len=20

tokens = [random.randint(1, 30522)] * seq_len
tokens

[20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631,
 20631]

In [25]:
del(bert_extractor)

In [14]:
x_in = torch.randint(30522, [2, seq_len]).long()

In [23]:
outputs = bert_extractor.encode_input_ids(x_in)

AttributeError: 'BertFeatureExtractor' object has no attribute 'encode_input_ids'

In [None]:
z1 = ['A', 'A', 'B', 'B', 'C', 'C']
z2 = ['k1', 'k2', 'k1', 'k2', 'k1', 'k2'] #
z3 = ['v1', 'v2', 'v3', 'v4', 'v5', 'v6']

d = defaultdict(dict)
for x, y, z in zip(z1, z2, z3):
    d[x][y] = z

print(dict(d))

In [None]:
encoded_text = {**encoded_text, **dict(zip(keys, encodings))}

In [25]:
encoded_text.items()

dict_items([(0, 'last_cls'), (1, 'sum_last_n')])

In [21]:
"""
dict1: 
    keys:= [0 ... N] index of review
    value := dict2, containing encodings of that review
        keys: [method_1, ... , method_N]
        values: [enc_1, ... , enc_N]
"""

'\ndict1: \n    keys:= [0 ... N] index of review\n    value := dict2, containing encodings of that review\n        keys: [method_1, ... , method_N]\n        values: [enc_1, ... , enc_N]\n'

print("Encoding Text..")

bert_model.to(device)

encoded_text = {}
start_idx = 0
stop_idx = batch_size

while(start_idx < len(df)):

    if stop_idx > len(df):
        stop_idx = len(df)

    encoded_in = [tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len) for text
              in list(df['reviewText'])[start_idx:stop_idx]]

    encoded_in = torch.tensor(encoded_in, requires_grad=False, device=device).long()

    with torch.no_grad():
        last_hidden, pooled_out = bert_model(encoded_in)

    assert last_hidden.shape[0] == batch_size

    #save tensor of encoded text into separate dictionary
    keys = range(start_idx, stop_idx+1)
    encoded_text = {**encoded_text, **dict(zip(keys, last_hidden))}

    start_idx += batch_size
    stop_idx += batch_size

In [24]:
def create_user_representations(user_reviews, item_reviews, method='avg_items'):
    user_repr = {}
    for u in user_reviews.keys():
        
        if 'avg_items' == method:
            stacked_reviews = torch.stack([item_reviews[i] for i in user_reviews[u].keys()])
            #print(stacked_user_reviews.shape)
            user_repr[u] = torch.mean(stacked_reviews, dim=0)
        elif 'avg_items_user' == method:
            stacked_reviews = torch.stack([item_reviews[i] for i in user_reviews[u].keys()])
            stacked_reviews_user = torch.stack([user_reviews[u][r] for r in user_reviews[u].keys()])
            user_repr[u] = torch.mean(stacked_reviews+stacked_reviews_user, dim=0)
    
    return user_repr


def create_toy_representations(seq_emb=64, item_ids=range(100), user_ids = range(20), method='avg_items'):
        
    item_reviews = {}

    for i in item_ids:
        item_reviews[i] = torch.stack([torch.randn(seq_emb)] * random.randint(5,10), dim=0)
        item_reviews[i] = torch.mean(item_reviews[i], dim=0)
    
    
    user_reviews = {}
    
    for i in user_ids:
        user_reviews[i] = {}
        for r in random.sample(list(item_reviews.keys()), random.randint(5,10)):
        #user_reviews[i] = dict(zip(random.sample(list(item_reviews.keys()), random.randint(5,10)), torch.randn(seq_emb)))
            user_reviews[i][r] = torch.randn(seq_emb)
            
    return item_reviews, user_reviews, create_user_representations(user_reviews, item_reviews, method=method)

In [26]:
item_reviews, user_reviews, user_repr_avg = create_toy_representations()

In [27]:
user_repr_ui = create_user_representations(user_reviews, item_reviews, method='avg_items_user')

In [28]:
cos = []
for k in list(user_reviews[0].keys()):
    cos.append(cosine_similarity(user_repr_avg[0].unsqueeze(0), item_reviews[k].unsqueeze(0)).item())
    
print(np.mean(cos))
print(np.std(cos))

0.34152778796851635
0.06405663349608343


In [29]:
cos = []
for k in list(user_reviews[0].keys()):
    cos.append(cosine_similarity(user_repr_ui[0].unsqueeze(0), item_reviews[k].unsqueeze(0)).item())
    
print(np.mean(cos))
print(np.std(cos))

0.21049505844712257
0.08154537465453288


In [30]:
cosine_similarity(user_repr_ui[0].unsqueeze(0), user_repr_ui[0].unsqueeze(0)).item()

1.0

In [56]:
class GRUNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, device, drop_prob=0.2):
        super(GRUNet, self).__init__()
        self.device = device
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        
    def forward(self, x, h):
        out, h = self.gru(x, h)
        out = self.fc(self.relu(out[:,-1]))
        return out, h
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device)
        return hidden

class LSTMNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, device, drop_prob=0.2, bi=False):
        super(LSTMNet, self).__init__()
        self.device = device
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob, bidirectional=bi)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        
    def forward(self, x, h):
        out, h = self.lstm(x, h)
        out = self.fc(self.relu(out[:,-1]))
        return out, h
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device))
        return hidden


In [67]:
#encode user with GRU
seq_emb = 64
input_dim = seq_emb
hidden_dim = seq_emb
output_dim = seq_emb

stacked_reviews = torch.stack([item_reviews[i] for i in user_reviews[0].keys()])
n_layers = stacked_reviews.shape[0]
batch_size = 1
device = 'cpu'


model = GRUNet(input_dim, hidden_dim, output_dim, n_layers, device)
h = model.init_hidden(batch_size)
#if model_type == "GRU":
h = h.data
        
out, h = model(stacked_reviews.unsqueeze(0).float(), h)
print(len(h))
user_repr_gru = h[-1].detach()
user_repr_gru.shape

8


torch.Size([1, 64])

### Alternative Sequence Encoding - LSTM

In [57]:
#prep input
word_embeddings = bert_model.get_input_embeddings()
max_seq_len = 30

text = df['reviewText'].iloc[0]
token_ids = tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len=max_seq_len)
x_in = word_embeddings(torch.tensor(token_ids, requires_grad=False).long())
x_in.shape

torch.Size([30, 768])

In [58]:
## encode sequence with LSTM
bert_dim = 768
input_dim = bert_dim
hidden_dim = seq_emb
output_dim = seq_emb
n_layers = max_seq_len
batch_size = 1

device = 'cpu'


model = LSTMNet(input_dim, hidden_dim, output_dim, n_layers=max_seq_len, device=device)
h = model.init_hidden(batch_size)
#if model_type == "LSTM":
h = tuple([e.data for e in h])
len(h)

2

In [59]:
out, h = model(x_in.unsqueeze(0).float(), h)

In [62]:
print(len(h))
h[-1].shape

2


torch.Size([30, 1, 64])