In [1]:
import gzip
import json
import random
from collections import defaultdict
import tensorflow as tf

In [2]:
f = open('data/renttherunway_final_data.json')
data = []

for l in f:
    d = json.loads(l)
    data.append(d)

f.close()
print(data[0])

{'fit': 'fit', 'user_id': '420272', 'bust size': '34d', 'item_id': '2260466', 'weight': '137lbs', 'rating': '10', 'rented for': 'vacation', 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.", 'body type': 'hourglass', 'review_summary': 'So many compliments!', 'category': 'romper', 'height': '5\' 8"', 'size': 14, 'age': '28', 'review_date': 'April 20, 2016'}


In [3]:
len(data)

192544

In [4]:
userIDs = {}
itemIDs = {}
interactions = []

for d in data:
    u = d['user_id']
    i = d['item_id']
    # transform textual target to numerical target
    if d['fit'] == 'small':
        r = 0
    elif d['fit'] == 'fit':
        r = 1
    elif d['fit'] == 'large':
        r = 2
    if not u in userIDs: userIDs[u] = len(userIDs)
    if not i in itemIDs: itemIDs[i] = len(itemIDs)
    interactions.append((u,i,r))


In [6]:
from sklearn.model_selection import train_test_split

nTrain = int(len(interactions) * .9)
nTest = len(interactions) - nTrain
interactionsTrain, interactionsTest = train_test_split(interactions, test_size=nTest, random_state=32)

In [7]:
itemsPerUser = defaultdict(list)
usersPerItem = defaultdict(list)
for u,i,r in interactionsTrain:
    itemsPerUser[u].append(i)
    usersPerItem[i].append(u)
# using only train data

In [8]:
# training mean
mu = sum([r for _,_,r in interactionsTrain])/len(interactionsTrain)
print("mean:", mu)

mean: 0.9947082619208375


In [9]:
optimizer = tf.keras.optimizers.Adam(0.1)

In [10]:
class LatentFactorModel(tf.keras.Model):
    def __init__(self, mu, K, lamb):
        super(LatentFactorModel, self).__init__()
        # Initialize to average
        self.alpha = tf.Variable(mu)
        # Initialize to small random values
        self.betaU = tf.Variable(tf.random.normal([len(userIDs)],stddev=0.001)) # user feature weights
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001)) # item feature weights
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001)) # user latent factor 
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001)) # item latent factor
        self.lamb = lamb # regularizer 

    def predict(self, u, i):
        p = self.alpha + self.betaU[u] + self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    def reg(self):
        return self.lamb * (tf.reduce_sum(self.betaU**2) +\
                            tf.reduce_sum(self.betaI**2) +\
                            tf.reduce_sum(self.gammaU**2) +\
                            tf.reduce_sum(self.gammaI**2))

    def predictSample(self, sampleU, sampleI):
        # convert features into tensor 
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        # convert feature tensor into embeddings 
        beta_u = tf.nn.embedding_lookup(self.betaU, u)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        # put the transformed embedding into the formula to make prediction
        pred = self.alpha + beta_u + beta_i +\
            tf.reduce_sum(tf.multiply(gamma_u, gamma_i),1)
        return pred

    def call(self, sampleU, sampleI, sampleR):
        # returns the mean squared error of the predictions
        pred = self.predictSample(sampleU, sampleI) 
        r = tf.convert_to_tensor(sampleR, dtype=tf.float32)
        return tf.nn.l2_loss(pred - r)/len(sampleR)

In [11]:
modelLFM = LatentFactorModel(mu, 5, .00001)

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2021-11-28 12:33:04.067667: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-28 12:33:04.068559: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
def trainingStep(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleR = [], [], []
        for _ in range(Nsamples):
            u,i,r = random.choice(interactions)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleR.append(r)

        loss = model(sampleU, sampleI, sampleR)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for (grad, var) in zip(gradients, model.trainable_variables) if grad is not None)
    return loss.numpy()

In [13]:
for i in range(100):
    obj = trainingStep(modelLFM, interactionsTrain)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.10072959
iteration 20, objective = 0.09420042
iteration 30, objective = 0.09750782
iteration 40, objective = 0.09742604
iteration 50, objective = 0.09742217
iteration 60, objective = 0.095765315
iteration 70, objective = 0.096167274
iteration 80, objective = 0.095448315
iteration 90, objective = 0.095959686
iteration 100, objective = 0.09565516


In [14]:
u,i,r = interactionsTest[0]

print("if r != 1")
for j in range(10):
    u,i,r = interactionsTest[j]
    if r != 1:
        print("data number", j, modelLFM.predict(userIDs[u], itemIDs[i]).numpy(), r)

print("if r == 1")
for j in range(10):
    u,i,r = interactionsTest[j]
    if r == 1:
        print("data number", j, modelLFM.predict(userIDs[u], itemIDs[i]).numpy(), r)

if r != 1
if r == 1
data number 0 0.8677275 1
data number 1 0.7542016 1
data number 2 1.3238016 1
data number 3 0.8864313 1
data number 4 0.82434624 1
data number 5 0.7608871 1
data number 6 1.0434667 1
data number 7 0.9699783 1
data number 8 1.2444841 1
data number 9 1.1766328 1


In [15]:
predictions = [modelLFM.predict(userIDs[u], itemIDs[i]).numpy() for u,i,_ in interactionsTest]
labels = [r for u,i,r in interactionsTest]

In [16]:
print(len(predictions))
print(len(labels))


19255
19255


In [17]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [18]:
MSE(predictions, labels)

0.21393086082730717

In [19]:
opredictions = []
olabels = []
numCorrect = 0
for i in range(100):
    for k in range(100):
        plusdecimal = i/100
        minusdecimal = j/100
        numCorrect = 0
        for j,d in zip(range(len(data[nTrain:])), data[nTrain:]):
            if predictions[j] >= 1+plusdecimal and d['fit'] == 'large':
                numCorrect+=1
            elif predictions[j] <= 1-minusdecimal and d['fit'] == 'small':
                numCorrect+=1
            elif predictions[j] <= 1+plusdecimal and predictions[j] >= 1- minusdecimal and d['fit'] == 'fit':
                numCorrect+=1
        print(numCorrect/len(data[nTrain:]))

0.23775642690210336
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
0.44835107764217086
