In [39]:
import scipy
import gzip
import numpy as np
import pandas as pd
import xlearn as xl
import statistics

import gzip
import math
import nltk
import numpy
import scipy.optimize
import string
import random
import tensorflow as tf
import pandas as pd

from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn import svm, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from surprise import SVD, Dataset, Reader, KNNBasic
from surprise.model_selection import train_test_split
from skopt import gp_minimize

In [None]:
def parse(path):

  skip_count = 0

  g = gzip.open(path, 'r')
  for l in g:
    try:
        d = eval(l)
        u = d['user_id']
        i = d['item_id']

        if "age" not in d or "size" not in d or "height" not in d or "weight" not in d or "body type" not in d or "category" not in d:
          skip_count += 1
          continue

        yield u,i,d
          
    except:
      skip_count += 1
      continue

  print("Skipped %d items" % skip_count)

In [None]:
dataset = []
for l in parse("renttherunway_final_data.json.gz"):
    dataset.append(l)

Skipped 39434 items


In [None]:
allReviews = []
allItems = set()
itemCount = defaultdict(int)
itemsPerUser = defaultdict(set)
usersPerItem = defaultdict(set)
allUI = {}
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)

totalReviews = 0

for u, i, d in dataset:
  allReviews.append((u, i, d))
  allItems.add(i)
  itemCount[i] += 1
  totalReviews += 1
  itemsPerUser[u].add(i)
  usersPerItem[i].add(u)
  allUI[(u, i)] = int(d['rating'])
  ratingsPerUser[u].append(int(d['rating']))
  ratingsPerItem[i].append(int(d['rating']))

mostPopular = [(itemCount[x], x) for x in itemCount]
mostPopular.sort()
mostPopular.reverse()

In [None]:
def getMostPopular(threshold):
    return1 = set()
    count = 0

    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > threshold: break

    return return1

def getLeastPopular(threshold):
    return1 = set()
    count = 0

    for ic, i in reversed(mostPopular):
        count += ic
        return1.add(i)
        if count > threshold: break

    return return1

def getAccuracy(preds):
    correctCount = 0

    for p in preds:
        if (p[2] == 1 and p[1] in itemsPerUser[p[0]]) or (p[2] == 0 and p[1] not in itemsPerUser[p[0]]):
            # print("Correct: ", p)
            correctCount += 1

    baselineAccuracy = correctCount / len(preds)

    return baselineAccuracy

In [None]:
split_threshold = int(len(allReviews)*.7)

ratingsTrain = allReviews[:split_threshold]
ratingsValid = allReviews[split_threshold:]

In [None]:
ratingsValidNew = []
for _, _, d in ratingsValid:
    user = d['user_id']
    item = d['item_id']

    # randomly select an item that the user has not reviewed
    notReviewed = allItems - itemsPerUser[user]
    notReviewed = list(notReviewed)
    random.shuffle(notReviewed)
    newItem = notReviewed[0]

    ratingsValidNew.append((user, newItem, 0))

ratingsValidNew = ratingsValidNew + ratingsValid

In [None]:
userIDs, itemIDs = {}, {}
interactions = []

for user, item, details in allReviews:
    user = details['user_id']
    item = details['item_id']
    rating = int(details['rating'])

    if not user in userIDs: userIDs[user] = len(userIDs)
    if not item in itemIDs: itemIDs[item] = len(itemIDs)
    interactions.append((user, item, rating))

nUsers, nItems = len(userIDs), len(itemIDs)

In [54]:
items = list(itemIDs.keys())

class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb

    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))
    
    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))

In [55]:
optimizer = tf.keras.optimizers.Adam(0.1)
# 5 latent factors default
modelBPR = BPRbatch(5, 0.00001)

In [56]:
def trainingStepBPR(model, interactions):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(Nsamples):
            u,i,_ = random.choice(interactions) # positive sample
            j = random.choice(items) # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])

        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [57]:
for i in range(50):
    obj = trainingStepBPR(modelBPR, interactions)
    if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))

iteration 10, objective = 0.47060552
iteration 20, objective = 0.42704472
iteration 30, objective = 0.4229465
iteration 40, objective = 0.4249736
iteration 50, objective = 0.41257024


In [58]:
############### SANDBOX ###############

In [107]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense

In [121]:
# create negative samples
negative_samples = []
for u in userIDs:
    for _ in range(2):
        i = random.choice(items)
        while i in itemsPerUser[u]:
            i = random.choice(items)
        negative_samples.append((u, i, 0))

interactions_net = []

for u, i, _ in negative_samples:
    interactions_net.append((u, i, 0))

for u, i, _ in interactions:
    interactions_net.append((u, i, 1))

# convert interactions to dataframe
interactions_df = pd.DataFrame(interactions_net, columns=['user', 'item', 'rating'])
interactions_df['user_index'] = interactions_df['user'].apply(lambda x: userIDs[x])
interactions_df['item_index'] = interactions_df['item'].apply(lambda x: itemIDs[x])
interactions_df['positive_interaction'] = interactions_df['rating']

# split into train and test
train_df, test_df = train_test_split(interactions_df, test_size=0.2, random_state=42)

In [122]:
# Define the NCF model using TensorFlow's Keras API
def create_ncf_model(num_users, num_items, embedding_size=64):
    user_input = Input(shape=(1,))
    item_input = Input(shape=(1,))

    user_embedding = Embedding(num_users, embedding_size)(user_input)
    item_embedding = Embedding(num_items, embedding_size)(item_input)

    concat = Concatenate()([user_embedding, item_embedding])
    flatten = Flatten()(concat)
    dense_layer = Dense(64, activation='relu')(flatten)
    output_layer = Dense(1, activation='sigmoid')(dense_layer)

    model = Model(inputs=[user_input, item_input], outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [123]:
num_users = interactions_df['user_index'].nunique()
num_items = interactions_df['item_index'].nunique()

In [126]:
ncf_model = create_ncf_model(num_users, num_items)

history = ncf_model.fit(
    [train_df['user_index'], train_df['item_index']],
    train_df['positive_interaction'],
    epochs=3,
    batch_size=32,
    validation_data=([test_df['user_index'], test_df['item_index']], test_df['positive_interaction'])
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [75]:
############### SANDBOX ###############

In [145]:
leastPop = getLeastPopular(len(allItems) / 4)

def bprPredict(user, item):
    if item in leastPop:
        return 0
    else:
        ind_bpr = modelBPR.predict(userIDs[user], itemIDs[item]).numpy()
        if ind_bpr > 0.4:
            return 1
        else:
            return 0

In [147]:
validPreds = []

for u, i, _ in ratingsValidNew:
    validPreds.append((u, i, bprPredict(u, i)))

getAccuracy(validPreds)



KeyboardInterrupt: 

In [None]:
### allItems/3: /4
### ind: 0.4
### features: 5
### 0.8135

In [None]:
############### SANDBOX ###############

In [None]:
userIDsTrain, itemIDsTrain = {}, {}
userIDsValid, itemIDsValid = {}, {}

for u, i, d in ratingsTrain:
    if not u in userIDsTrain:
        userIDsTrain[u] = len(userIDsTrain)
    if not i in itemIDsTrain:
        itemIDsTrain[i] = len(itemIDsTrain)

for u, i, d in ratingsValid:
    if not u in userIDsValid:
        userIDsValid[u] = len(userIDsValid)
    if not i in itemIDsValid:
        itemIDsValid[i] = len(itemIDsValid)

nUsersTrain, nItemsTrain = len(userIDsTrain), len(itemIDsTrain)
nUsersValid, nItemsValid = len(userIDsValid), len(itemIDsValid)

In [None]:
dataAll = []
dataTrain = []
dataValid = []
allRatings = []

for u, i, d in allReviews:
    dataAll.append(d)
    allRatings.append(int(d['rating']))

global_median = int(statistics.median(allRatings))

In [None]:
df = pd.DataFrame(dataAll)

In [None]:
def convert_height_to_inches(height_str):
    feet = int(height_str.split("'")[0])
    inches = int(height_str.split("'")[1][1:-1])
    height_inches = feet * 12 + inches

    return height_inches

def convert_weight_to_lbs(weight_str):
    weight_lbs = int(weight_str[:-3])

    return weight_lbs

In [None]:
df['height'] = df['height'].apply(convert_height_to_inches)
df['weight'] = df['weight'].apply(convert_weight_to_lbs)

In [None]:
one_hot_rented = pd.get_dummies(df['rented for'])
one_hot_btype = pd.get_dummies(df['body type'])

In [None]:
df = df.drop(['user_id', 'item_id', 'review_date', 'review_summary', 'review_text', 'rented for', 'body type', 'category'], axis=1)
df = pd.concat([df, one_hot_rented, one_hot_btype], axis=1)

In [None]:
libsvm_data = df.apply(
    lambda row: f"{row['rating']} 1:{row['age']} 2:{row['size']} 3:{row['height']} 4:{row['weight']} "
    + " ".join([f"{i + 5}:{value}" for i, value in enumerate(row.iloc[7:])]) + "\n", axis=1)

In [None]:
with open('train.txt', 'w') as f:
    f.writelines(libsvm_data)

In [None]:
ffm_model = xl.create_fm() # Use field-aware factorization machine (ffm)
ffm_model.setTrain("./train.txt")    # Set the path of training dataset

param = {'task':'reg', 'lr':0.2, 'lambda':0.02, 'epoch':3, 'fold':5, 'k':5}

In [None]:
ffm_model.cv(param)

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 4 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (./train.txt_0.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (./train.txt_1.bin) found. Skip converting text to binary.
[32m[------------] [0mFirst check if the text file has been already converted to

In [None]:
y_baseline = []
y_true = []

for u, i, d in ratingsValid:
    y_true.append(int(d['rating']))

for i in range(len(y_true)):
    y_baseline.append(global_median)

In [None]:
print(mean_squared_error(y_true, y_baseline))

2.9140704939803626


In [None]:
############### SANDBOX ###############

In [None]:
# Update alpha, beta_u, and beta_i until convergence

def iterate(lamb, alpha_old, betaU_old, betaI_old, reviewsTrain, trainRatings):

    betaU_new = {}
    betaI_new = {}

    alpha_num = 0
    for u, i, _ in reviewsTrain:
        alpha_num += allUI[(u, i)] - (betaU_old[u] + betaI_old[i])
    alpha_new = alpha_num / len(reviewsTrain)

    for u in itemsPerUser:
        beta_u_num = 0
        for i in itemsPerUser[u]:
            beta_u_num += allUI[(u, i)] - (alpha_new + betaI_old[i])
        betaU_new[u] = beta_u_num / (lamb + len(itemsPerUser[u]))

    for i in usersPerItem:
        beta_i_num = 0
        for u in usersPerItem[i]:
            beta_i_num += allUI[(u, i)] - (alpha_new + betaU_old[u])
        betaI_new[i] = beta_i_num / (lamb + len(usersPerItem[i]))

    y_pred = []

    for u, i, d in reviewsTrain:
        y_pred.append(alpha_new + betaU_new[u] + betaI_new[i])

    mse = mean_squared_error(trainRatings, y_pred)

    # print("=====================================")
    # print("MSE:", mse)
    # print("Change in alpha:", abs(alpha_new - alpha_old))
    
    if abs(alpha_new - alpha_old) > 0.005:
        return iterate(lamb, alpha_new, betaU_new, betaI_new, reviewsTrain, trainRatings)
    else:
        return alpha_new, betaU_new, betaI_new

In [None]:
def cross_validate_gd(k):

    mses = []

    # shuffle allReviews
    random.shuffle(allReviews)

    # split into k folds
    folds = []
    fold_size = int(len(allReviews) / k)

    for fold in range(k):
        folds.append(allReviews[fold * fold_size : (fold + 1) * fold_size])

    for fold in range(k):

        betaU = {}
        betaI = {}

        for u in ratingsPerUser:
            betaU[u] = 0

        for g in ratingsPerItem:
            betaI[g] = 0

        currValidation = folds[fold]

        # use every other fold as training
        currTraining = []

        for j in range(k):
            if j != fold:
                currTraining += folds[j]
        
        trainRatings = []
        validRatings = []

        for user, item, details in currTraining:
            trainRatings.append(int(details['rating']))

        for user, item, details in currValidation:
            validRatings.append(int(details['rating']))

        alpha = global_median

        final_alpha, betaU_new, betaI_new = iterate(4.3, alpha, betaU, betaI, currTraining, trainRatings)

        y_pred = []

        for user, item, details in currValidation:
            y_pred.append(final_alpha + betaU_new[user] + betaI_new[item])

        validMSE = mean_squared_error(validRatings, y_pred)

        print("=====================================")
        print(f"Fold K = {int(fold)+1}, Validation MSE: {validMSE}")
        print("=====================================")

        mses.append(validMSE)

    avg_mse = sum(mses) / len(mses)

    print("=====================================\n")
    print(f"Average {k}-Fold Gradient Descent CV MSE: {avg_mse}")
    print("\n=====================================")

In [None]:
cross_validate_gd(5)

Fold K = 1, Validation MSE: 1.3601847766252928
Fold K = 2, Validation MSE: 1.3828149511704142
Fold K = 3, Validation MSE: 1.3732938324618142
Fold K = 4, Validation MSE: 1.3734536419119359
Fold K = 5, Validation MSE: 1.3563587234919026

Average 5-Fold Gradient Descent CV MSE: 1.3692211851322718

