In [1]:
# !pip install tensorflow-recommenders
# !pip install tensorflow==2.7.1

In [2]:
#Software reviews.
#!wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Software.json.gz

#Software metadata.
#!wget http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_Software.json.gz

In [3]:
import os
import gzip
import json
import numpy as np
import pandas as pd
from typing import Dict, Text

import tensorflow as tf
import tensorflow_recommenders as tfrs

In [4]:
software_data_path = '/content/Software.json.gz'
software_metadata_path = '/content/meta_Software.json.gz'


software_cols = ['asin', 'reviewerID', 'overall']
software_metadata_cols = ['title', 'asin', 'rank']

In [5]:
software_data = pd.read_json(software_data_path, lines = True, compression = 'gzip')
software_data = software_data.loc[:3000, software_cols]
software_data.head(3)

Unnamed: 0,asin,reviewerID,overall
0,77613252,A240ORQ2LF9LUI,4
1,77613252,A1YCCU0YRLS0FE,4
2,77613252,A1BJHRQDYVAY2J,1


In [6]:
software_metadata = pd.read_json(software_metadata_path, lines = True, compression = 'gzip')
software_metadata = software_metadata.loc[:3000, software_metadata_cols]
software_metadata.head(3)

Unnamed: 0,title,asin,rank
0,HOLT PHYSICS LESSON PRESENTATION CD-ROM QUICK ...,0030672120,"25,550 in Software ("
1,"Sing, Watch, &amp; Learn Spanish (DVD + Guide)...",0071480935,"15,792 in Software ("
2,Connect with LearnSmart Access Card for Microb...,007329506X,"16,900 in Software ("


In [7]:
merged_df = software_data.merge(software_metadata, how = 'right')
merged_df = merged_df.dropna()
merged_df.head(3)

Unnamed: 0,asin,reviewerID,overall,title,rank
35,77613252,A240ORQ2LF9LUI,4.0,Connect Personal Health with LearnSmart 1 Seme...,"15,675 in Software ("
36,77613252,A1YCCU0YRLS0FE,4.0,Connect Personal Health with LearnSmart 1 Seme...,"15,675 in Software ("
37,77613252,A1BJHRQDYVAY2J,1.0,Connect Personal Health with LearnSmart 1 Seme...,"15,675 in Software ("


In [8]:
combined = merged_df.drop_duplicates(['asin', 'reviewerID'])
user_software_matrix = combined.pivot(index = 'reviewerID', columns = 'asin', values = 'overall')
user_software_matrix.fillna(0, inplace = True)

user_software_matrix.head(3)

asin,0077613252,0077775473,0132147556,0321700945,0321719816,0321719824,0321898354,0615179088,0669524115,0669524425,...,B00000JLQY,B00000JLRD,B00000K4CS,B00000K4CY,B00000K4KA,B00000K4KT,B00001KPYS,B00001QGVR,B00001QGW2,B00001SHL1
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0716644Q5B66THVCUQ3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A100LLXMXDZHJZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1039YQ2N7GEP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
user_ids_vocabulary = tf.keras.layers.StringLookup(mask_token = None)
user_ids_vocabulary.adapt(merged_df['reviewerID'])

software_ids_vocabulary = tf.keras.layers.StringLookup(mask_token = None)
software_ids_vocabulary.adapt(merged_df['title'])

In [10]:
#Define a tfrs model.
class SoftwareModel(tfrs.Model):
  def __init__(self, user_model: tf.keras.Model, software_model: tf.keras.Model, task: tfrs.tasks.Retrieval):
    super().__init__()

    #Set up user and software representations.
    self.user_model = user_model
    self.software_model = software_model

    #Set up retrieval task.
    self.task = task

  #Loss function.
  def compute_loss(self, features: Dict[Text, tf.Tensor], training = False):
    user_embeddings = self.user_model(merged_df['reviewerID'])
    software_embeddings = self.software_model(merged_df['title'])

    return self.task(user_embeddings, software_embeddings)

In [11]:
#Convert 'reviewer_ID' and 'software_ID/asin' to a tf.data object.
reviewer_ID = tf.data.Dataset.from_tensor_slices(merged_df['reviewerID'])
software_ID = tf.data.Dataset.from_tensor_slices(merged_df['asin'])


In [12]:
#Define user and software models.
user_model = tf.keras.Sequential([
                                  user_ids_vocabulary,
                                  tf.keras.layers.Embedding(user_ids_vocabulary.vocab_size(), 64)
])

software_model = tf.keras.Sequential([
                                      software_ids_vocabulary,
                                      tf.keras.layers.Embedding(software_ids_vocabulary.vocab_size(), 64)
                                      
])

#Metrics objectives.
task = tfrs.tasks.Retrieval(metrics = tfrs.metrics.FactorizedTopK(
    software_ID.batch(128).map(software_model)
))



In [32]:
#Retrieval model.
model = SoftwareModel(user_model, software_model, task)
model.compile(optimizer = tf.keras.optimizers.Adagrad(learning_rate = 0.5))

In [33]:
#Train for 5 epochs.
model.fit(software_ID.batch(100), epochs = 1)



<keras.callbacks.History at 0x7ffa18f6a750>

In [35]:
index = tfrs.layers.factorized_top_k.BruteForce(
    model.user_model
)
index.index_from_dataset(
    software_ID.batch(100).map(lambda title: (title, model.software_model(title)))
)

#Get some recommendations.
_, titles = index(np.array(['46']))

print(f'Top 5 recommendations for user 46 is: {titles[0, :5]}')

Top 5 recommendations for user 46 is: [b'0077613252' b'0077613252' b'0077613252' b'0077613252' b'0077775473']


In [34]:
merged_df.loc[merged_df['asin'] == '0077613252']

Unnamed: 0,asin,reviewerID,overall,title,rank
35,77613252,A240ORQ2LF9LUI,4.0,Connect Personal Health with LearnSmart 1 Seme...,"15,675 in Software ("
36,77613252,A1YCCU0YRLS0FE,4.0,Connect Personal Health with LearnSmart 1 Seme...,"15,675 in Software ("
37,77613252,A1BJHRQDYVAY2J,1.0,Connect Personal Health with LearnSmart 1 Seme...,"15,675 in Software ("
38,77613252,APRDVZ6QBIQXT,3.0,Connect Personal Health with LearnSmart 1 Seme...,"15,675 in Software ("


In [36]:
merged_df.loc[merged_df['asin'] == '0077775473']

Unnamed: 0,asin,reviewerID,overall,title,rank
50,77775473,A2JZTTBSLS1QXV,5.0,LearnSmart Access Card for Experience Psychology,"9,130 in Software ("
