In [28]:
import gzip
import math
import nltk
import numpy as np
import pandas as pd
import random
import scipy.optimize
import statistics
import tensorflow as tf
import xlearn as xl

from collections import defaultdict
from scipy.sparse import csr_matrix
from sklearn import svm, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from surprise import SVD, Dataset, Reader, KNNBasic
from sklearn.model_selection import train_test_split
from skopt import gp_minimize
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Concatenate, Dense

In [2]:
def parse(path):

  skip_count = 0

  g = gzip.open(path, 'r')
  for l in g:
    try:
        d = eval(l)
        u = d['user_id']
        i = d['item_id']

        if "age" not in d or "size" not in d or "height" not in d or "weight" not in d or "body type" not in d or "category" not in d:
          skip_count += 1
          continue

        yield u,i,d
          
    except:
      skip_count += 1
      continue

  print("Skipped %d items" % skip_count)

In [3]:
dataset = []
for l in parse("renttherunway_final_data.json.gz"):
    dataset.append(l)

Skipped 39434 items


In [4]:
allReviews = []
allItems = set()
itemCount = defaultdict(int)
itemsPerUser = defaultdict(set)
usersPerItem = defaultdict(set)
allUI = {}
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)

totalReviews = 0

for u, i, d in dataset:
  allReviews.append((u, i, d))
  allItems.add(i)
  itemCount[i] += 1
  totalReviews += 1
  itemsPerUser[u].add(i)
  usersPerItem[i].add(u)
  allUI[(u, i)] = int(d['rating'])
  ratingsPerUser[u].append(int(d['rating']))
  ratingsPerItem[i].append(int(d['rating']))

mostPopular = [(itemCount[x], x) for x in itemCount]
mostPopular.sort()
mostPopular.reverse()

In [5]:
def getMostPopular(threshold):
    return1 = set()
    count = 0

    for ic, i in mostPopular:
        count += ic
        return1.add(i)
        if count > threshold: break

    return return1

def getLeastPopular(threshold):
    return1 = set()
    count = 0

    for ic, i in reversed(mostPopular):
        count += ic
        return1.add(i)
        if count > threshold: break

    return return1

def getAccuracy(preds):
    correctCount = 0

    for p in preds:
        if (p[2] == 1 and p[1] in itemsPerUser[p[0]]) or (p[2] == 0 and p[1] not in itemsPerUser[p[0]]):
            # print("Correct: ", p)
            correctCount += 1

    baselineAccuracy = correctCount / len(preds)

    return baselineAccuracy

In [6]:
split_threshold = int(len(allReviews)*.7)

ratingsTrain = allReviews[:split_threshold]
ratingsValid = allReviews[split_threshold:]

In [7]:
userIDs, itemIDs = {}, {}
interactions = []

for user, item, details in allReviews:
    user = details['user_id']
    item = details['item_id']
    rating = int(details['rating'])

    if not user in userIDs: userIDs[user] = len(userIDs)
    if not item in itemIDs: itemIDs[item] = len(itemIDs)
    interactions.append((user, item, rating))

nUsers, nItems = len(userIDs), len(itemIDs)

In [8]:
userIDsTrain, itemIDsTrain = {}, {}
userIDsValid, itemIDsValid = {}, {}

for u, i, d in ratingsTrain:
    if not u in userIDsTrain:
        userIDsTrain[u] = len(userIDsTrain)
    if not i in itemIDsTrain:
        itemIDsTrain[i] = len(itemIDsTrain)

for u, i, d in ratingsValid:
    if not u in userIDsValid:
        userIDsValid[u] = len(userIDsValid)
    if not i in itemIDsValid:
        itemIDsValid[i] = len(itemIDsValid)

nUsersTrain, nItemsTrain = len(userIDsTrain), len(itemIDsTrain)
nUsersValid, nItemsValid = len(userIDsValid), len(itemIDsValid)

In [9]:
dataAll = []
dataTrain = []
dataValid = []
allRatings = []

for u, i, d in allReviews:
    dataAll.append(d)
    allRatings.append(int(d['rating']))

global_median = int(statistics.median(allRatings))

In [10]:
############### PREDICTION TASK 1.1: BPR PREDICTION ###############

In [11]:
items = list(itemIDs.keys())

class BPRbatch(tf.keras.Model):
    def __init__(self, K, lamb):
        super(BPRbatch, self).__init__()
        # Initialize variables
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
        self.lamb = lamb

    def predict(self, u, i):
        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
        return p

    # Regularizer
    def reg(self):
        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
                            tf.nn.l2_loss(self.gammaU) +\
                            tf.nn.l2_loss(self.gammaI))
    
    def score(self, sampleU, sampleI):
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
        return x_ui

    def call(self, sampleU, sampleI, sampleJ):
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))

In [12]:
def trainingStepBPR(model, interactions, optimizer):
    Nsamples = 50000
    with tf.GradientTape() as tape:
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(Nsamples):
            u,i,_ = random.choice(interactions) # positive sample
            j = random.choice(items) # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(userIDs[u])
            sampleI.append(itemIDs[i])
            sampleJ.append(itemIDs[j])

        loss = model(sampleU,sampleI,sampleJ)
        loss += model.reg()
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients((grad, var) for
                              (grad, var) in zip(gradients, model.trainable_variables)
                              if grad is not None)
    return loss.numpy()

In [13]:
leastPop = getLeastPopular(len(allItems) / 4)

In [14]:
def bprPredict(user, item, modelBPR):
    if item in leastPop:
        return 0
    else:
        ind_bpr = modelBPR.predict(userIDs[user], itemIDs[item]).numpy()
        if ind_bpr > 0.4:
            return 1
        else:
            return 0

In [15]:
def getNegativeSamples(dataset):
    negativeSamples = []
    
    for d in dataset:
        user = d[0]

        # randomly select an item that the user has not reviewed
        notReviewed = allItems - itemsPerUser[user]
        notReviewed = list(notReviewed)
        # pick a random index in notReviewed
        rand_ind = random.randint(0, len(notReviewed) - 1)
        newItem = notReviewed[rand_ind]

        negativeSamples.append((user, newItem, 0))

    return negativeSamples + dataset

In [19]:
def bprCV(k):

    folds = []
    accuracies = []

    fold_size = int(len(interactions) / k)

    for fold in range(k):
        folds.append(interactions[fold * fold_size : (fold + 1) * fold_size])

    for fold in range(k):

        currValid = folds[fold]
        currTraining = []

        optimizer = tf.keras.optimizers.Adam(0.1)
        # 5 latent factors default
        modelBPR = BPRbatch(5, 0.00001)

        for j in range(k):
            if j != fold:
                currTraining += folds[j]

        print("===========================\n")

        for i in range(30):
            obj = trainingStepBPR(modelBPR, currTraining, optimizer)
            if (i % 10 == 9): print(f"Fold {fold} iteration " + str(i+1) + ", objective = " + str(obj))

        validPreds = []

        negativeValid = getNegativeSamples(currValid)

        for u, i, _ in negativeValid:
            validPreds.append((u, i, bprPredict(u, i, modelBPR)))

        curr_acc = getAccuracy(validPreds)

        print("\n")
        print(f"Fold {fold} Validation Accuracy: {curr_acc}")
        print("\n===========================")

        accuracies.append(curr_acc)

    print(f"Average Accuracy Across {k} Folds: {sum(accuracies) / len(accuracies)}")
    print("\n")

In [20]:
bprCV(5)

Fold 0 iteration 10, objective = 0.46806264
Fold 0 iteration 20, objective = 0.42419165
Fold 0 iteration 30, objective = 0.42145234
Fold 0 iteration 40, objective = 0.41671234
Fold 0 iteration 50, objective = 0.40817404


Fold 0 Validation Accuracy: 0.7481712494285154

Fold 1 iteration 10, objective = 0.4696231
Fold 1 iteration 20, objective = 0.42380306
Fold 1 iteration 30, objective = 0.41693705
Fold 1 iteration 40, objective = 0.41588974
Fold 1 iteration 50, objective = 0.41044435


Fold 1 Validation Accuracy: 0.7490692965841552

Fold 2 iteration 10, objective = 0.46994984
Fold 2 iteration 20, objective = 0.41997802
Fold 2 iteration 30, objective = 0.4174055
Fold 2 iteration 40, objective = 0.41531417
Fold 2 iteration 50, objective = 0.409226


Fold 2 Validation Accuracy: 0.7500979687806153

Fold 3 iteration 10, objective = 0.46958163
Fold 3 iteration 20, objective = 0.42584887
Fold 3 iteration 30, objective = 0.42156363
Fold 3 iteration 40, objective = 0.4158792
Fold 3 iteration 50

In [24]:
############### PREDICTION TASK 1.2: NEURAL COLLABORATIVE FILTERING PREDICTION ###############

In [29]:
# create negative samples

temp = getNegativeSamples(interactions)
interactions_net = []

for i in temp:
    if i[2] != 0:
        interactions_net.append((i[0], i[1], 1))
    else:
        interactions_net.append((i[0], i[1], 0))

# convert interactions to dataframe
interactions_df = pd.DataFrame(interactions_net, columns=['user', 'item', 'rating'])
interactions_df['user_index'] = interactions_df['user'].apply(lambda x: userIDs[x])
interactions_df['item_index'] = interactions_df['item'].apply(lambda x: itemIDs[x])
interactions_df['positive_interaction'] = interactions_df['rating']

# split into train and test
train_df, test_df = train_test_split(interactions_df, test_size=0.3, random_state=42)

In [30]:
# Define the NCF model using TensorFlow's Keras API
def create_ncf_model(num_users, num_items, embedding_size=64):
    user_input = Input(shape=(1,))
    item_input = Input(shape=(1,))

    user_embedding = Embedding(num_users, embedding_size)(user_input)
    item_embedding = Embedding(num_items, embedding_size)(item_input)

    concat = Concatenate()([user_embedding, item_embedding])
    flatten = Flatten()(concat)
    dense_layer = Dense(64, activation='relu')(flatten)
    output_layer = Dense(1, activation='sigmoid')(dense_layer)

    model = Model(inputs=[user_input, item_input], outputs=output_layer)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [31]:
num_users = interactions_df['user_index'].nunique()
num_items = interactions_df['item_index'].nunique()

In [32]:
ncf_model = create_ncf_model(num_users, num_items)

history = ncf_model.fit(
    [train_df['user_index'], train_df['item_index']],
    train_df['positive_interaction'],
    epochs=3,
    batch_size=32,
    validation_data=([test_df['user_index'], test_df['item_index']], test_df['positive_interaction'])
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [33]:
############### TASK 2.0: BASELINE REGRESSION ###############

In [34]:
y_baseline = []
y_true = []

for u, i, d in ratingsValid:
    y_true.append(int(d['rating']))

for i in range(len(y_true)):
    y_baseline.append(global_median)
    
print(mean_squared_error(y_true, y_baseline))

2.9140704939803626


In [35]:
############### TASK 2.1: LATENT FACTOR REGRESSION ###############

In [36]:
# Update alpha, beta_u, and beta_i until convergence

def iterate(lamb, alpha_old, betaU_old, betaI_old, reviewsTrain, trainRatings):

    betaU_new = {}
    betaI_new = {}

    alpha_num = 0
    for u, i, _ in reviewsTrain:
        alpha_num += allUI[(u, i)] - (betaU_old[u] + betaI_old[i])
    alpha_new = alpha_num / len(reviewsTrain)

    for u in itemsPerUser:
        beta_u_num = 0
        for i in itemsPerUser[u]:
            beta_u_num += allUI[(u, i)] - (alpha_new + betaI_old[i])
        betaU_new[u] = beta_u_num / (lamb + len(itemsPerUser[u]))

    for i in usersPerItem:
        beta_i_num = 0
        for u in usersPerItem[i]:
            beta_i_num += allUI[(u, i)] - (alpha_new + betaU_old[u])
        betaI_new[i] = beta_i_num / (lamb + len(usersPerItem[i]))
    
    if abs(alpha_new - alpha_old) > 0.005:
        return iterate(lamb, alpha_new, betaU_new, betaI_new, reviewsTrain, trainRatings)
    else:
        return alpha_new, betaU_new, betaI_new

In [37]:
def LatentFactorCV(k):

    mses = []

    # shuffle allReviews
    random.shuffle(allReviews)

    # split into k folds
    folds = []
    fold_size = int(len(allReviews) / k)

    for fold in range(k):
        folds.append(allReviews[fold * fold_size : (fold + 1) * fold_size])

    for fold in range(k):

        betaU = {}
        betaI = {}

        for u in ratingsPerUser:
            betaU[u] = 0

        for g in ratingsPerItem:
            betaI[g] = 0

        currValidation = folds[fold]

        # use every other fold as training
        currTraining = []

        for j in range(k):
            if j != fold:
                currTraining += folds[j]
        
        trainRatings = []
        validRatings = []

        for user, item, details in currTraining:
            trainRatings.append(int(details['rating']))

        for user, item, details in currValidation:
            validRatings.append(int(details['rating']))

        alpha = global_median

        final_alpha, betaU_new, betaI_new = iterate(4.3, alpha, betaU, betaI, currTraining, trainRatings)

        y_pred = []

        for user, item, details in currValidation:
            y_pred.append(final_alpha + betaU_new[user] + betaI_new[item])

        validMSE = mean_squared_error(validRatings, y_pred)

        print("=====================================")
        print(f"Fold K = {int(fold)+1}, Validation MSE: {validMSE}")
        print("=====================================")

        mses.append(validMSE)

    avg_mse = sum(mses) / len(mses)

    print("=====================================\n")
    print(f"Average {k}-Fold Gradient Descent CV MSE: {avg_mse}")
    print("\n=====================================")

In [38]:
LatentFactorCV(5)

Fold K = 1, Validation MSE: 1.3836023007282647
Fold K = 2, Validation MSE: 1.3811487286133814
Fold K = 3, Validation MSE: 1.3430821002561888
Fold K = 4, Validation MSE: 1.3711317674868169
Fold K = 5, Validation MSE: 1.3680693041942387

Average 5-Fold Gradient Descent CV MSE: 1.369406840255778



In [39]:
############### TASK 2.2: FACTORIZATION MACHINE REGRESSION ###############

In [40]:
df = pd.DataFrame(dataAll)

In [41]:
def convert_height_to_inches(height_str):
    feet = int(height_str.split("'")[0])
    inches = int(height_str.split("'")[1][1:-1])
    height_inches = feet * 12 + inches

    return height_inches

def convert_weight_to_lbs(weight_str):
    weight_lbs = int(weight_str[:-3])

    return weight_lbs

In [42]:
df['height'] = df['height'].apply(convert_height_to_inches)
df['weight'] = df['weight'].apply(convert_weight_to_lbs)

In [43]:
one_hot_rented = pd.get_dummies(df['rented for'])
one_hot_btype = pd.get_dummies(df['body type'])

In [44]:
df = df.drop(['user_id', 'item_id', 'review_date', 'review_summary', 'review_text', 'rented for', 'body type', 'category'], axis=1)
df = pd.concat([df, one_hot_rented, one_hot_btype], axis=1)

In [45]:
libsvm_data = df.apply(
    lambda row: f"{row['rating']} 1:{row['age']} 2:{row['size']} 3:{row['height']} 4:{row['weight']} "
    + " ".join([f"{i + 5}:{value}" for i, value in enumerate(row.iloc[7:])]) + "\n", axis=1)

In [46]:
with open('train.txt', 'w') as f:
    f.writelines(libsvm_data)

In [47]:
ffm_model = xl.create_fm() # Use field-aware factorization machine (ffm)
ffm_model.setTrain("./train.txt")    # Set the path of training dataset

param = {'task':'reg', 'lr':0.2, 'lambda':0.02, 'epoch':3, 'fold':5, 'k':5}

In [48]:
ffm_model.cv(param)

[32m[1m----------------------------------------------------------------------------------------------
           _
          | |
     __  _| |     ___  __ _ _ __ _ __
     \ \/ / |    / _ \/ _` | '__| '_ \ 
      >  <| |___|  __/ (_| | |  | | | |
     /_/\_\_____/\___|\__,_|_|  |_| |_|

        xLearn   -- 0.40 Version --
----------------------------------------------------------------------------------------------

[32m[------------] [0mxLearn uses 4 threads for training task.
[32m[1m[ ACTION     ] Read Problem ...[0m
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (./train.txt_0.bin) NOT found. Convert text file to binary file.
[32m[------------] [0mFirst check if the text file has been already converted to binary format.
[32m[------------] [0mBinary file (./train.txt_1.bin) NOT found. Convert text file to binary file.
[32m[------------] [0mFirst check if the text file has been already 