In [1]:
import os
import json
import gzip
import pickle
import numpy as np
import glob
import random 

import pandas as pd
from urllib.request import urlopen
import matplotlib.pyplot as plt
from datetime import datetime
from collections import defaultdict
from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.metrics.pairwise import cosine_similarity

#nltk.download()
#nltk.download('stopwords')
#nltk.download('punkt')


#import more_itertools as mit
#from keras.preprocessing.sequence import pad_sequences


import torch
import torch.nn as nn
#from pytorch_pretrained_bert import BertTokenizer, BertModel
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset

In [2]:
bert_loaded = False

In [3]:
%%time

if not bert_loaded:
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_tokenizer.add_special_tokens({"unk_token": '[UNK]', 'cls_token': '[CLS]', 
                                       'pad_token':'[PAD]', 'sep_token':'[SEP]'})
    print(len(bert_tokenizer))
    assert bert_tokenizer.cls_token == '[CLS]'
    bert_tokenizer.sep_token_id
    
    bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True,
                                            output_attentions=True)
    bert_model.resize_token_embeddings(len(bert_tokenizer))
    
    bert_loaded = True

30522
CPU times: user 2.07 s, sys: 488 ms, total: 2.56 s
Wall time: 9.32 s


In [4]:
os.listdir('../datasets')

['meta_Computers.json.gz',
 'Electronics_5.json.gz',
 'Books_5.json.gz',
 'one_week.tar.gz',
 'contentdata.tar.gz',
 'books-pickle']

In [5]:
(dirpath, dirnames, filenames) = next(os.walk('../datasets/books-pickle/'), (None, None, []))
filenames

['df4.pkl',
 'df.json',
 'df2.pkl',
 'df5.pkl',
 'item_stats.pkl',
 'df3.pkl',
 'df0.pkl',
 'df1.pkl',
 'pytorch_model.bin',
 'user_stats.pkl',
 'df.pkl',
 'config.json',
 'encoded_text.pkl']

In [6]:
books_path = '../datasets/Books_5.json.gz'
pkl_path = '../datasets/books-pickle/'

with open(pkl_path + "df.pkl", 'rb') as fin:
    df = pickle.load(fin)

with open(pkl_path + "user_stats.pkl", 'rb') as fin:
    user_dict = pickle.load(fin)
    
with open(pkl_path + "item_stats.pkl", 'rb') as fin:
    item_dict = pickle.load(fin)

In [7]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,itemID,reviewText,reviewWords
0,5,False,2005-03-30,A1REUF3A1YCPHM,1713353,A story children will love and learn from The ...,170
1,5,True,2016-06-20,AVP0HXC9FG790,1713353,Five Stars The kids loved it!,6
2,5,True,2016-01-24,A324TTUBKTN73A,1713353,Five Stars My students (3 & 4 year olds) loved...,15
3,5,False,2015-07-09,A2RE7WG349NV5D,1713353,Five Stars LOVE IT,4
4,5,True,2015-01-18,A32B7QIUDQCD0E,1713353,Five Stars Great!,3


In [8]:
user_df = pd.DataFrame.from_dict(user_dict, orient='index')

In [9]:
user_df.head()

Unnamed: 0,n_reviews,m_rating,m_words
A0334855HN6E38CXWXZR,1,5.0,12.0
A0790722OCX87RKL2J3T,1,5.0,71.0
A100JBBLCC0NUC,1,5.0,27.0
A101OKMJFCIWYH,2,5.0,26.0
A1022R52JDJVMA,1,5.0,58.0


In [10]:
item_df = pd.DataFrame.from_dict(item_dict, orient='index')

In [11]:
def truncate_seq(tokens, tokenizer, max_len=512):
    if len(tokens) < max_len:
        tokens=tokens[:-1]
        n = max_len - len(tokens) - 1
        tokens += n * [tokenizer.pad_token]
    elif len(tokens) > max_len:
        tokens=tokens[:max_len-1]
    else:
        return tokens
    
    tokens.append(tokenizer.sep_token)
    
    return tokens

def tokenize_text_to_ids(text, tokenizer, sent_tokenize, max_len=512):
    sents = sent_tokenize(text)
    tokens = []
    tokens.append(bert_tokenizer.cls_token)
    for s in sents:
        tokens.extend(bert_tokenizer.tokenize(s))
        tokens.append(bert_tokenizer.sep_token)
    
    tokens = truncate_seq(tokens, tokenizer, max_len)
    
    assert len(tokens) == max_len
    
    return bert_tokenizer.convert_tokens_to_ids(tokens)

In [12]:
word_embeddings = bert_model.get_input_embeddings()

Options for Encoding Sentences: 
1. Bert Embeddings (Raw): tokenise text and apply Bert Embeddings, no further processing [batch_size x seq_len x dim_e] => subsequent (sophisticated) Encoder should produce sentence representation [batch_size x dim_s]
2. Last Hidden States: sequence encoded by Bert, powerful processing [batch_size x seq_len x dim_e] => subsequent (simple) Encoder produce sentence representation [batch_size x dim_s]
3. Average over Last Hidden States: seq. encoded by Bert and average in the end to yield sentence representation [batch_size x dim_s]
    

In [16]:
%%time
max_elems=2
encoded_in = [bert_tokenizer.encode(sent, max_length=30, add_special_tokens=True, pad_to_max_length=True) for sent in list(df['reviewText'])[:3]]

CPU times: user 9.9 ms, sys: 240 µs, total: 10.1 ms
Wall time: 10.1 ms


In [17]:
x_in = torch.tensor(encoded_in, requires_grad=False).long()
#outputs = bert_model(x_in)[:2]

BERT for Sentence Encoding

1. Use last hidden state of CLS 
2. Use average of last hidden states of all tokens 
3. Use average over multiple hidden layers (A) for CLS or (B) for all tokens

In [15]:
def encode_sequence(df, tokenizer, sent_tokenize, bert_model, max_len=50, batch_size=2, method='cls_last', n=None):

    encoded_text = {}
    start_idx = 0
    stop_idx = batch_size

    while(start_idx < len(df)):

        if stop_idx > len(df):
            stop_idx = len(df)

        tokens = [tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len) for text
                  in list(df['reviewText'])[start_idx:stop_idx]]

        x_in = torch.tensor(tokens, requires_grad=False, device=device).long()

        with torch.no_grad():
            last_hidden, pooled_out, hidden_outs, attention_weights = bert_model(x_in)

        assert last_hidden.shape[0] == batch_size


In [26]:
batch_size=2
max_len=50
method='sum_last_n'
n = 4

encoded_text = {}
start_idx = 0
stop_idx = batch_size


##BERT for Feature-Extraction

#tokenize
tokens = [tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len) for text
          in list(df['reviewText'])[start_idx:stop_idx]]
x_in = torch.tensor(tokens, requires_grad=False).long()

#model forward pass
with torch.no_grad():
    last_hidden, pooled_out, hidden_outs, attention_weights = bert_model(x_in)
    
#save tensor of encoded text into separate dictionary
keys = range(start_idx, stop_idx+1)

    
if 'pooled_out' == method:
    x_out = pooled_out # batch_size x dim_emb
elif 'cls_last' == method:
    #take the embedding of CLS token of last hidden layer
    x_out = last_hidden[:,0,:] # batch_size x dim_emb
elif 'pool_all_last' == method:
    #average embeddings of last hidden layer of all tokens
    x_out = torch.mean(last_hidden, dim=1)
elif 'pool_cls_n'==method:
    # and n != None
    x_out = torch.mean(torch.cat([hidden[:,0,:].unsqueeze(1) for hidden in hidden_outs[-n:]], dim=1), dim=1)
elif 'pool_last_n'==method:
    #average embeddings of last N hidden layers of all tokens
    x_out = torch.mean(torch.cat(hidden_outs[-n:], dim=1), dim=1)
elif 'sum_last_n'==method and n:
    #sum embeddings of last N hidden layers of all tokens
    x_out = torch.sum(torch.cat(hidden_outs[-n:], dim=1), dim=1)
    #sum last four hidden => 95.9 F1 on dev set for NER
elif 'sum_all'==method and n:
    x_out = torch.sum(torch.cat(hidden_outs, dim=1), dim=1)

print(method)
print(x_out.shape)
encoded_text = {**encoded_text, **dict(zip(keys, x_out))}

sum_last_n
torch.Size([2, 768])


torch.Size([6, 768])

print("Encoding Text..")

bert_model.to(device)

encoded_text = {}
start_idx = 0
stop_idx = batch_size

while(start_idx < len(df)):

    if stop_idx > len(df):
        stop_idx = len(df)

    encoded_in = [tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len) for text
              in list(df['reviewText'])[start_idx:stop_idx]]

    encoded_in = torch.tensor(encoded_in, requires_grad=False, device=device).long()

    with torch.no_grad():
        last_hidden, pooled_out = bert_model(encoded_in)

    assert last_hidden.shape[0] == batch_size

    #save tensor of encoded text into separate dictionary
    keys = range(start_idx, stop_idx+1)
    encoded_text = {**encoded_text, **dict(zip(keys, last_hidden))}

    start_idx += batch_size
    stop_idx += batch_size

In [24]:
def create_user_representations(user_reviews, item_reviews, method='avg_items'):
    user_repr = {}
    for u in user_reviews.keys():
        
        if 'avg_items' == method:
            stacked_reviews = torch.stack([item_reviews[i] for i in user_reviews[u].keys()])
            #print(stacked_user_reviews.shape)
            user_repr[u] = torch.mean(stacked_reviews, dim=0)
        elif 'avg_items_user' == method:
            stacked_reviews = torch.stack([item_reviews[i] for i in user_reviews[u].keys()])
            stacked_reviews_user = torch.stack([user_reviews[u][r] for r in user_reviews[u].keys()])
            user_repr[u] = torch.mean(stacked_reviews+stacked_reviews_user, dim=0)
    
    return user_repr


def create_toy_representations(seq_emb=64, item_ids=range(100), user_ids = range(20), method='avg_items'):
        
    item_reviews = {}

    for i in item_ids:
        item_reviews[i] = torch.stack([torch.randn(seq_emb)] * random.randint(5,10), dim=0)
        item_reviews[i] = torch.mean(item_reviews[i], dim=0)
    
    
    user_reviews = {}
    
    for i in user_ids:
        user_reviews[i] = {}
        for r in random.sample(list(item_reviews.keys()), random.randint(5,10)):
        #user_reviews[i] = dict(zip(random.sample(list(item_reviews.keys()), random.randint(5,10)), torch.randn(seq_emb)))
            user_reviews[i][r] = torch.randn(seq_emb)
            
    return item_reviews, user_reviews, create_user_representations(user_reviews, item_reviews, method=method)

In [26]:
item_reviews, user_reviews, user_repr_avg = create_toy_representations()

In [27]:
user_repr_ui = create_user_representations(user_reviews, item_reviews, method='avg_items_user')

In [28]:
cos = []
for k in list(user_reviews[0].keys()):
    cos.append(cosine_similarity(user_repr_avg[0].unsqueeze(0), item_reviews[k].unsqueeze(0)).item())
    
print(np.mean(cos))
print(np.std(cos))

0.34152778796851635
0.06405663349608343


In [29]:
cos = []
for k in list(user_reviews[0].keys()):
    cos.append(cosine_similarity(user_repr_ui[0].unsqueeze(0), item_reviews[k].unsqueeze(0)).item())
    
print(np.mean(cos))
print(np.std(cos))

0.21049505844712257
0.08154537465453288


In [30]:
cosine_similarity(user_repr_ui[0].unsqueeze(0), user_repr_ui[0].unsqueeze(0)).item()

1.0

In [56]:
class GRUNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, device, drop_prob=0.2):
        super(GRUNet, self).__init__()
        self.device = device
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.gru = nn.GRU(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        
    def forward(self, x, h):
        out, h = self.gru(x, h)
        out = self.fc(self.relu(out[:,-1]))
        return out, h
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device)
        return hidden

class LSTMNet(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, device, drop_prob=0.2, bi=False):
        super(LSTMNet, self).__init__()
        self.device = device
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        self.lstm = nn.LSTM(input_dim, hidden_dim, n_layers, batch_first=True, dropout=drop_prob, bidirectional=bi)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        
    def forward(self, x, h):
        out, h = self.lstm(x, h)
        out = self.fc(self.relu(out[:,-1]))
        return out, h
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device))
        return hidden


In [67]:
#encode user with GRU
seq_emb = 64
input_dim = seq_emb
hidden_dim = seq_emb
output_dim = seq_emb

stacked_reviews = torch.stack([item_reviews[i] for i in user_reviews[0].keys()])
n_layers = stacked_reviews.shape[0]
batch_size = 1
device = 'cpu'


model = GRUNet(input_dim, hidden_dim, output_dim, n_layers, device)
h = model.init_hidden(batch_size)
#if model_type == "GRU":
h = h.data
        
out, h = model(stacked_reviews.unsqueeze(0).float(), h)
print(len(h))
user_repr_gru = h[-1].detach()
user_repr_gru.shape

8


torch.Size([1, 64])

### Alternative Sequence Encoding - LSTM

In [57]:
#prep input
word_embeddings = bert_model.get_input_embeddings()
max_seq_len = 30

text = df['reviewText'].iloc[0]
token_ids = tokenize_text_to_ids(text, bert_tokenizer, sent_tokenize, max_len=max_seq_len)
x_in = word_embeddings(torch.tensor(token_ids, requires_grad=False).long())
x_in.shape

torch.Size([30, 768])

In [58]:
## encode sequence with LSTM
bert_dim = 768
input_dim = bert_dim
hidden_dim = seq_emb
output_dim = seq_emb
n_layers = max_seq_len
batch_size = 1

device = 'cpu'


model = LSTMNet(input_dim, hidden_dim, output_dim, n_layers=max_seq_len, device=device)
h = model.init_hidden(batch_size)
#if model_type == "LSTM":
h = tuple([e.data for e in h])
len(h)

2

In [59]:
out, h = model(x_in.unsqueeze(0).float(), h)

In [62]:
print(len(h))
h[-1].shape

2


torch.Size([30, 1, 64])