## Load Recommender

In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import MinMaxScaler

# Amazon Customer Reviews dataset (TSV). Index of available files:
#   https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
# NOTE(review): the example link originally given here
#   (https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Books_v1_02.tsv.gz)
# names the Books file, while this cell reads the Digital_Software file -- confirm which is intended.
df = pd.read_csv(
    'amazon_reviews_us_Digital_Software_v1_00.tsv',
    sep='\t',
    dtype={'star_rating': float},
)

df.columns = [
    'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent',
    'product_title', 'product_category', 'star_rating', 'helpful_votes',
    'total_votes', 'vine', 'verified_purchase', 'review_headline',
    'review_body', 'review_date',
]

# Drop the columns that are not used later; what remains is
# customer_id, review_id, product_id, star_rating, review_body, review_date.
df = df.drop(
    columns=[
        'marketplace', 'product_parent', 'product_title', 'product_category',
        'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
        'review_headline',
    ]
)

# Keep only customers that wrote more than 3 reviews.
customer_count = df['customer_id'].value_counts()
customers_with_multiple_reviews = customer_count.loc[customer_count.gt(3)].index
df = df.loc[df['customer_id'].isin(customers_with_multiple_reviews)]

# Of the remaining rows, keep only products that still have more than 3 reviews.
product_count = df['product_id'].value_counts()
products_with_multiple_reviews = product_count.loc[product_count.gt(3)].index
df = df.loc[df['product_id'].isin(products_with_multiple_reviews)]

In [4]:
# Load the pickled recommender output (used later as a frame of predicted
# ratings that is indexed by user id via `.loc[userid]`).
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
with open('./user_final_rating.pkl', 'rb') as rating_file:
    user_final_rating = pickle.load(rating_file)

In [None]:
def apply_pivot(df, fillby=None):
    """Build a customer x product table of star ratings.

    Rows are `customer_id`, columns are `product_id`, cells hold
    `star_rating` aggregated with pivot_table's default aggregation.

    Args:
        df: frame with `customer_id`, `product_id` and `star_rating` columns.
        fillby: value used to replace missing cells; when None the missing
            cells are left as NaN.

    Returns:
        The pivoted DataFrame.
    """
    ratings = df.pivot_table(values='star_rating', index='customer_id', columns='product_id')
    return ratings if fillby is None else ratings.fillna(fillby)

# Chronological train/test split: the oldest 70% of the reviews are used for
# training and the most recent 30% are held out for testing.
df = df.sort_values(by='review_date')

split_index = int(len(df) * 0.7)

train, test = df.iloc[:split_index], df.iloc[split_index:]

# Restrict the test set to customers, then products, that also occur in train.
seen_customer = test['customer_id'].isin(train['customer_id'])
test = test[seen_customer]
seen_product = test['product_id'].isin(train['product_id'])
test = test[seen_product]

# Customer x product rating matrices, with unrated cells filled with 0.
df_train_pivot = apply_pivot(train, 0)
df_test_pivot = apply_pivot(test, 0)

## Load LSTM

In [5]:
import pickle
# Restore the vocabulary mapping used by tokenize_review (word -> integer id).
# NOTE: pickle.load can execute arbitrary code; only load trusted artifacts.
with open('parrot.pkl', 'rb') as vocab_file:
    vocab_to_int = pickle.load(vocab_file)

In [6]:
import torch
import torch.nn as nn
class SentimentRNN(nn.Module):
    """LSTM sentiment model: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.

    Args:
        no_layers: number of stacked LSTM layers.
        vocab_size: number of rows in the embedding table.
        hidden_dim: size of the LSTM hidden state.
        embedding_dim: size of each embedding vector.
        drop_prob: accepted for call compatibility but not used; the dropout
            layer is fixed at p=0.3.
        output_dim: width of the final linear layer. Defaults to 1. This used
            to be read from a module-level global, which raised NameError when
            the class was instantiated before that global was defined.
    """

    def __init__(self, no_layers, vocab_size, hidden_dim, embedding_dim, drop_prob=0.5, output_dim=1):
        super().__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim

        self.no_layers = no_layers
        self.vocab_size = vocab_size

        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim,
                            num_layers=no_layers, batch_first=True)

        # dropout layer (probability fixed at 0.3; `drop_prob` is not consulted)
        self.dropout = nn.Dropout(0.3)

        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """Score a batch of token-id sequences.

        Args:
            x: integer tensor indexed as (batch, sequence); the LSTM is
                built with batch_first=True.
            hidden: (h, c) tuple as produced by `init_hidden`.

        Returns:
            (sig_out, hidden): `sig_out` holds, per batch row, the last
            sigmoid value after flattening all time steps and output units;
            `hidden` is the updated LSTM state.
        """
        batch_size = x.size(0)

        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # flatten (batch, seq, hidden) to (batch * seq, hidden) for the linear layer
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)

        out = self.dropout(lstm_out)
        out = self.fc(out)
        sig_out = self.sig(out)

        # back to one row per batch element, then keep only the final value
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zeroed (h0, c0), each of shape (no_layers, batch_size, hidden_dim).

        The tensors are created on the same device as the embedding weights so
        the state matches wherever the model currently lives.
        """
        device = self.embedding.weight.device
        h0 = torch.zeros((self.no_layers, batch_size, self.hidden_dim), device=device)
        c0 = torch.zeros((self.no_layers, batch_size, self.hidden_dim), device=device)
        return (h0, c0)


ModuleNotFoundError: No module named 'torch'

In [None]:
# Architecture settings for the LSTM sentiment model. They agree with the
# module summary printed when the checkpoint is loaded further down
# (Embedding(.., 64), LSTM(64, 256, num_layers=2), Linear(.., out_features=1)).
embedding_dim = 64
hidden_dim = 256
no_layers = 2
output_dim = 1
vocab_size = len(vocab_to_int) + 1  # extra 1 for padding

In [None]:
import os
# function for loading weights of a trained model
def load_weights(model, weights_dir):
    """Load the newest checkpoint found in `weights_dir` into `model`.

    "Newest" means the regular file with the largest `os.path.getctime`.
    Sub-directories (for example notebook checkpoint folders) are ignored.

    The weights are loaded into the `model` that is passed in; earlier the
    argument was discarded and a new model was rebuilt from globals.

    Args:
        model: a module whose architecture matches the checkpoint.
        weights_dir: directory that contains the saved state-dict files.

    Returns:
        The same `model` instance, with the checkpoint weights loaded.

    Raises:
        FileNotFoundError: if `weights_dir` contains no files.
    """
    candidate_paths = [os.path.join(weights_dir, name) for name in os.listdir(weights_dir)]
    weight_paths = [path for path in candidate_paths if os.path.isfile(path)]
    if not weight_paths:
        raise FileNotFoundError('No weight files found in ' + repr(weights_dir))

    # get the latest file in the directory
    latest_path = max(weight_paths, key=os.path.getctime)
    final_weight_file = os.path.basename(latest_path)

    # map_location='cpu' lets a checkpoint saved on another device be read here;
    # load_state_dict then copies the values into the model's own parameters.
    state_dict = torch.load(latest_path, map_location='cpu')

    # some checkpoints carry a "model." prefix on every key; strip it
    prefix = "model."
    for key in list(state_dict.keys()):
        if key.startswith(prefix):
            state_dict[key[len(prefix):]] = state_dict.pop(key)
    model.load_state_dict(state_dict)

    print('Loaded weights: ' + final_weight_file)

    return model

In [None]:
# Build the LSTM architecture, load the newest checkpoint from 'weights2',
# and switch to evaluation mode (the cell's last expression shows the module summary).
lstm_model = load_weights(
    SentimentRNN(no_layers, vocab_size, hidden_dim, embedding_dim, drop_prob=0.5),
    'weights2',
)
lstm_model.eval()

Loaded weights: epoch-0_accuracy-0.84.pth


SentimentRNN(
  (embedding): Embedding(28008, 64)
  (lstm): LSTM(64, 256, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)

In [None]:
import numpy as np

def pad_features(reviews_int, seq_length):
    """Turn token-id lists into a fixed-width integer matrix.

    Each review becomes one row of length `seq_length`: shorter reviews are
    left-padded with 0, longer ones keep only their first `seq_length` ids.

    Args:
        reviews_int: sequence of token-id lists.
        seq_length: number of columns in the result.

    Returns:
        Integer array of shape (len(reviews_int), seq_length).
    """
    features = np.zeros((len(reviews_int), seq_length), dtype=int)

    for row, review in enumerate(reviews_int):
        kept = review[:seq_length]
        if kept:
            # right-align the ids; the untouched leading cells stay 0
            features[row, seq_length - len(kept):] = kept

    return features

from string import punctuation

def tokenize_review(test_review):
    """Convert raw review text into a batch of one token-id list.

    The text is lower-cased, stripped of every character in
    `string.punctuation`, split on whitespace, and each word that exists in
    `vocab_to_int` is replaced by its id. Unknown words are dropped.

    Args:
        test_review: review text.

    Returns:
        A list containing a single list of integer ids.
    """
    strip_punctuation = str.maketrans('', '', punctuation)
    words = test_review.lower().translate(strip_punctuation).split()

    known_ids = [vocab_to_int[word] for word in words if word in vocab_to_int]

    # wrap in an outer list so the result looks like a batch with one review
    return [known_ids]


In [None]:
import torch

def lstm_predict(test_review, seq_length=250):
    """Predict the sentiment class of one review with the LSTM model.

    Args:
        test_review: review text.
        seq_length: length the token sequence is padded or truncated to
            (default 250, the value that was previously hard-coded).

    Returns:
        The model's sigmoid output rounded to 0.0 or 1.0.
        NOTE(review): which value means "positive" is not visible in this
        notebook; confirm against the training labels.
    """
    lstm_model.eval()

    # tokenize review and pad the tokenized sequence
    test_ints = tokenize_review(test_review)
    features = pad_features(test_ints, seq_length)

    # convert to tensor to pass into the model
    feature_tensor = torch.from_numpy(features)
    batch_size = feature_tensor.size(0)

    # inference only: skip autograd bookkeeping
    with torch.no_grad():
        h = lstm_model.init_hidden(batch_size)
        output, h = lstm_model(feature_tensor, h)

    # convert output probability to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    return pred.item()

In [None]:
# Smoke test of the LSTM sentiment model on a single short review.
# NOTE(review): the recorded output for this input is 1.0; confirm whether 1
# is meant to be the negative class, otherwise this prediction looks wrong.
review_text = "Terrible"
print(f'Review text: {review_text}')
print(f'Sentiment  : {lstm_predict(review_text)}')

Review text: Terrible
Sentiment  : 1.0


## Load BERT

In [None]:
from transformers import BertModel

# Name of the pretrained checkpoint shared by the BERT encoder and the tokenizer.
MODEL_NAME = 'bert-base-cased'

# NOTE(review): `bert_model` is rebound later in this notebook to a
# SentimentClassifier, which loads its own BertModel; this bare encoder does
# not appear to be used afterwards -- confirm whether this load is still needed.
bert_model = BertModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from torch import nn

class SentimentClassifier(nn.Module):
    """Pretrained BERT encoder with dropout and one linear classification layer.

    Args:
        num_classes: number of output units of the linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        self.dropout = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        # registered but not applied in forward(); forward returns the linear output
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return the linear-layer output for a batch of encoded texts.

        Element 1 of the encoder's output (the pooled output) is passed
        through dropout and then the linear layer.
        """
        encoder_outputs = self.bert(input_ids, attention_mask)
        pooled = encoder_outputs[1]
        return self.linear(self.dropout(pooled))

In [None]:
import os
import torch

# number of output classes of the BERT sentiment classifier
num_classes = 2

# function for loading weights of a trained model
def load_weights(model, weights_dir):
    """Load the newest checkpoint found in `weights_dir` into `model`.

    "Newest" means the regular file with the largest `os.path.getctime`.
    Sub-directories (for example notebook checkpoint folders) are ignored.

    The weights are loaded into the `model` that is passed in; earlier the
    argument was discarded and a second classifier (including a second
    pretrained BERT load) was built inside this function.

    Args:
        model: a module whose architecture matches the checkpoint.
        weights_dir: directory that contains the saved state-dict files.

    Returns:
        The same `model` instance, with the checkpoint weights loaded.

    Raises:
        FileNotFoundError: if `weights_dir` contains no files.
    """
    candidate_paths = [os.path.join(weights_dir, name) for name in os.listdir(weights_dir)]
    weight_paths = [path for path in candidate_paths if os.path.isfile(path)]
    if not weight_paths:
        raise FileNotFoundError('No weight files found in ' + repr(weights_dir))

    # get the latest file in the directory
    latest_path = max(weight_paths, key=os.path.getctime)
    final_weight_file = os.path.basename(latest_path)

    # map_location='cpu' lets a checkpoint saved on another device be read here;
    # load_state_dict then copies the values into the model's own parameters.
    state_dict = torch.load(latest_path, map_location='cpu')

    # some checkpoints carry a "model." prefix on every key; strip it
    prefix = "model."
    for key in list(state_dict.keys()):
        if key.startswith(prefix):
            state_dict[key[len(prefix):]] = state_dict.pop(key)
    model.load_state_dict(state_dict)

    print('Loaded weights: ' + final_weight_file)

    return model

In [None]:
# Build the two-class BERT classifier, load the newest checkpoint from
# 'weights3', and switch to evaluation mode (the last expression shows the summary).
bert_model = load_weights(SentimentClassifier(2), 'weights3')
bert_model.eval()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform

Loaded weights: epoch-0_accuracy-3.510.pth


SentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [None]:
import transformers
# Tokenizer that matches the pretrained checkpoint named by MODEL_NAME.
tokenizer = transformers.BertTokenizer.from_pretrained(MODEL_NAME)

def bert_predict(text, max_length=128):
    """Predict the sentiment class of one text with the BERT classifier.

    Args:
        text: review text.
        max_length: number of tokens the input is padded or truncated to
            (default 128, the value that was previously hard-coded).

    Returns:
        Index of the largest classifier output as a Python int.
        NOTE(review): the meaning of each index is not visible in this
        notebook; confirm against the training labels.
    """
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        return_token_type_ids=False,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        return_attention_mask=True,
        return_tensors='pt',
    )
    # inference only: skip autograd bookkeeping
    with torch.no_grad():
        output = bert_model(encoding['input_ids'], encoding['attention_mask'])
    _, prediction = torch.max(output, dim=1)
    return prediction.item()

In [None]:
# Smoke test of the BERT sentiment classifier on a single short review
# (the recorded output for this input is class 0).
review_text = "Terrible, i hated it"
print(f'Review text: {review_text}')
print(f'Sentiment  : {bert_predict(review_text)}')

Review text: Terrible, i hated it
Sentiment  : 0




## Connect Recommender system with NLP models

In [None]:
def bert_get_sentiment_product(product_id):
    """Average BERT sentiment prediction over all reviews of one product.

    Reads the module-level `df` and calls `bert_predict` on every non-missing
    `review_body` whose `product_id` matches.

    Args:
        product_id: product identifier to look up in `df['product_id']`.

    Returns:
        Mean of the per-review predictions, or NaN when the product has no
        usable review text (previously this divided by zero).
    """
    # missing review bodies are skipped instead of being passed to the model
    reviews = df.loc[df['product_id'] == product_id, 'review_body'].dropna()
    if len(reviews) == 0:
        return float('nan')

    total = 0
    for review in reviews:
        total += bert_predict(review)
    return total / len(reviews)

def bert_get_sentiment_products(products):
    """Return the average BERT sentiment for each product id in `products`.

    Args:
        products: iterable of product ids (for example a pandas Series), or a
            single product id given as a string.

    Returns:
        List with one score per product, in input order. A collection with
        exactly one element is handled like any other; previously the whole
        collection was passed on as if it were a product id.
    """
    if isinstance(products, str):
        products = [products]
    return [bert_get_sentiment_product(product) for product in products]
    

In [None]:
def lstm_get_sentiment_product(product_id):
    """Average LSTM sentiment prediction over all reviews of one product.

    Reads the module-level `df` and calls `lstm_predict` on every non-missing
    `review_body` whose `product_id` matches.

    Args:
        product_id: product identifier to look up in `df['product_id']`.

    Returns:
        Mean of the per-review predictions, or NaN when the product has no
        usable review text (previously this divided by zero).
    """
    # missing review bodies are skipped instead of being passed to the model
    reviews = df.loc[df['product_id'] == product_id, 'review_body'].dropna()
    if len(reviews) == 0:
        return float('nan')

    total = 0
    for review in reviews:
        total += lstm_predict(review)
    return total / len(reviews)

def lstm_get_sentiment_products(products):
    """Return the average LSTM sentiment for each product id in `products`.

    Args:
        products: iterable of product ids (for example a pandas Series), or a
            single product id given as a string.

    Returns:
        List with one score per product, in input order. A collection with
        exactly one element is handled like any other; previously the whole
        collection was passed on as if it were a product id.
    """
    if isinstance(products, str):
        products = [products]
    return [lstm_get_sentiment_product(product) for product in products]

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def find_top_recommendations(pred_rating_df, userid, topn):
    """Return the `topn` highest predicted ratings for one user.

    Args:
        pred_rating_df: frame of predicted ratings with one row per user.
        userid: row label of the user in `pred_rating_df`.
        topn: number of leading entries to keep after sorting descending.

    Returns:
        DataFrame with the former column labels as its first column and the
        scores in a column named 'predicted_ratings'.
    """
    ranked = pred_rating_df.loc[userid].sort_values(ascending=False)
    top_scores = ranked.iloc[:topn].to_frame()
    return top_scores.reset_index().rename(columns={userid: 'predicted_ratings'})


def bert_find_top_pos_recommendation(user_final_rating, user_id, no_recommendation, candidate_pool=10):
    """Show a user's recommendations before and after BERT sentiment re-ranking.

    Steps: take the `candidate_pool` highest predicted ratings for the user,
    drop entries whose predicted rating is 0, score each remaining product
    with `bert_get_sentiment_products`, rescale those scores to the range
    1-5, and rank by `1 * predicted_ratings + 2 * sentiment_score`.

    Args:
        user_final_rating: frame of predicted ratings, one row per user.
        user_id: row label of the user.
        no_recommendation: number of re-ranked rows to display.
        candidate_pool: number of top-rated candidates considered before
            re-ranking (default 10, the value that was previously hard-coded).

    Returns:
        None. Both tables are shown with `display`.
    """
    ## Generate top recommendations using the rating-based recommender, without sentiment
    recommendation_user_user = find_top_recommendations(user_final_rating, user_id, candidate_pool)
    recommendation_user_user['userId'] = user_id
    ## Filter out recommendations where predicted rating is zero.
    ## .copy() makes the result an independent frame so the column
    ## assignments below do not write to a slice of another frame.
    recommendation_user_user = recommendation_user_user[recommendation_user_user['predicted_ratings'] != 0].copy()
    print("Recommended products for user id:{} without using sentiment".format(user_id))
    display(recommendation_user_user)

    ## Nothing to re-rank: stop before the scaler, which cannot be fit on zero rows
    if recommendation_user_user.empty:
        print("No products with a non-zero predicted rating for user id:{}".format(user_id))
        return

    ## Get overall sentiment score for each recommended product
    sentiments = bert_get_sentiment_products(recommendation_user_user['product_id'])
    recommendation_user_user['sentiment_score'] = sentiments
    ## Transform scale of sentiment so that it can be combined with the predicted rating score
    scaler = MinMaxScaler(feature_range=(1, 5))
    scaled = scaler.fit_transform(recommendation_user_user[['sentiment_score']])
    recommendation_user_user['sentiment_score'] = scaled.ravel()
    ## Final ranking score = 1 * predicted rating + 2 * sentiment score rescaled to 1-5
    recommendation_user_user['product_ranking_score'] = (
        1 * recommendation_user_user['predicted_ratings']
        + 2 * recommendation_user_user['sentiment_score']
    )
    print("Recommended products for user id:{} after using sentiment".format(user_id))
    ## Sort product ranking score in descending order and show only top `no_recommendation`
    display(recommendation_user_user.sort_values(by=['product_ranking_score'], ascending=False).head(no_recommendation))



def lstm_find_top_pos_recommendation(user_final_rating, user_id, no_recommendation, candidate_pool=10):
    """Show a user's recommendations before and after LSTM sentiment re-ranking.

    Steps: take the `candidate_pool` highest predicted ratings for the user,
    drop entries whose predicted rating is 0, score each remaining product
    with `lstm_get_sentiment_products`, rescale those scores to the range
    1-5, and rank by `1 * predicted_ratings + 2 * sentiment_score`.

    Args:
        user_final_rating: frame of predicted ratings, one row per user.
        user_id: row label of the user.
        no_recommendation: number of re-ranked rows to display.
        candidate_pool: number of top-rated candidates considered before
            re-ranking (default 10, the value that was previously hard-coded).

    Returns:
        None. Both tables are shown with `display`.
    """
    ## Generate top recommendations using the rating-based recommender, without sentiment
    recommendation_user_user = find_top_recommendations(user_final_rating, user_id, candidate_pool)
    recommendation_user_user['userId'] = user_id
    ## Filter out recommendations where predicted rating is zero.
    ## .copy() makes the result an independent frame so the column
    ## assignments below do not write to a slice of another frame.
    recommendation_user_user = recommendation_user_user[recommendation_user_user['predicted_ratings'] != 0].copy()
    print("Recommended products for user id:{} without using sentiment".format(user_id))
    display(recommendation_user_user)

    ## Nothing to re-rank: stop before the scaler, which cannot be fit on zero rows
    if recommendation_user_user.empty:
        print("No products with a non-zero predicted rating for user id:{}".format(user_id))
        return

    ## Get overall sentiment score for each recommended product
    sentiments = lstm_get_sentiment_products(recommendation_user_user['product_id'])
    recommendation_user_user['sentiment_score'] = sentiments
    ## Transform scale of sentiment so that it can be combined with the predicted rating score
    scaler = MinMaxScaler(feature_range=(1, 5))
    scaled = scaler.fit_transform(recommendation_user_user[['sentiment_score']])
    recommendation_user_user['sentiment_score'] = scaled.ravel()
    ## Final ranking score = 1 * predicted rating + 2 * sentiment score rescaled to 1-5
    recommendation_user_user['product_ranking_score'] = (
        1 * recommendation_user_user['predicted_ratings']
        + 2 * recommendation_user_user['sentiment_score']
    )
    print("Recommended products for user id:{} after using sentiment".format(user_id))
    ## Sort product ranking score in descending order and show only top `no_recommendation`
    display(recommendation_user_user.sort_values(by=['product_ranking_score'], ascending=False).head(no_recommendation))
    

In [7]:
# Enter user id
# Shows the rating-based and sentiment re-ranked recommendations for one user,
# first with the BERT scorer and then with the LSTM scorer (top 10 each).
# NOTE(review): the recorded NameError below indicates this cell was executed
# before the cells that define these functions; run the notebook top to bottom.
user_id = 12975480
bert_find_top_pos_recommendation(user_final_rating, user_id, 10)
lstm_find_top_pos_recommendation(user_final_rating, user_id, 10)

NameError: name 'bert_find_top_pos_recommendation' is not defined

In [None]:
# Calculate precision at k

def precision_at_k(test, predicted, k):
    """Mean precision@k over the users in `test`.

    For each user, the k highest-scored items in that user's row of
    `predicted` are compared with the items the user rated (> 0) in `test`;
    precision is the number of hits divided by k.

    Previously `set(predicted)` was used, which for a DataFrame is the set of
    all column labels, so every known item counted as "recommended" and the
    result did not depend on the per-user ranking (and could exceed 1).

    Args:
        test: user x item frame of held-out ratings; 0 means "not rated".
        predicted: user x item frame of predicted scores. Any other iterable
            is treated as one ranked item list shared by all users, of which
            the first k entries are used.
        k: number of recommendations evaluated per user; must be positive.

    Returns:
        Average precision across evaluated users. Users that have no row in
        `predicted` are skipped; NaN is returned when no user was evaluated.

    Raises:
        ValueError: if `k` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    per_user_scores = hasattr(predicted, 'loc') and hasattr(predicted, 'index')
    shared_top_k = None if per_user_scores else set(list(predicted)[:k])

    precision_values = []
    for user in test.index:
        if per_user_scores:
            if user not in predicted.index:
                continue  # no predictions available for this user
            top_k = set(predicted.loc[user].nlargest(k).index)
        else:
            top_k = shared_top_k

        actual_items = test.loc[user]  # actual user preferences
        rated_items = actual_items[actual_items > 0].index  # items rated by the user
        num_relevant = len(set(rated_items) & top_k)
        precision_values.append(num_relevant / k)

    if not precision_values:
        return float('nan')
    return np.mean(precision_values)

# Number of recommendations evaluated per user.
k = 10

# Evaluate the predicted ratings against the held-out test pivot table.
precision = precision_at_k(df_test_pivot, user_final_rating, k)
print("Precision at "+ str(k) + ": " + str(precision))