In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split

In [2]:
dir_path = './ml-100k/'
df_user = pd.read_csv(dir_path+'u.user', sep='|', names=['user_id' ,'age' , 'gender' , 'occupation', 'zip_code'])
df_item = pd.read_csv(dir_path+'u.item', sep='|', names=['movie id','movie title', 'release date', 'video release date',
              'IMDb URL', 'unknown', 'Action', 'Adventure', 'Animation',
              'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama','Fantasy',
              'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War','Western'], encoding='latin-1')
header = ['user_id', 'item_id', 'rating', 'timestamp']
df_data = pd.read_csv(dir_path+'u.data', sep='\t', names=header)
df_train, df_test = train_test_split(df_data, test_size=0.25, random_state=2018)

In [3]:
df_train_data = df_train.set_index(['user_id','item_id'])['rating'].unstack().fillna(0)
df_test_data = df_test.set_index(['user_id','item_id'])['rating'].unstack().fillna(0)

In [4]:
df_score = df_data.set_index(['user_id','item_id'])['rating'].unstack().fillna(0)

In [5]:
df_score.loc[1, 1]

5.0

In [40]:
import random
def get_batch(token, batch_size):
    index = random.sample(range(len(token)),batch_size)
    batch = []
    for i in index:
        batch.append(token[i])
    return batch
    

In [42]:
# test get_batch
tokens = tuple(zip(df_data['user_id'], df_data['item_id']))
batch = get_batch(tokens, 24)

487

In [55]:
def svdCostAndGradient(userVector,itemVector, score):
    predicted = userVector.dot(itemVector.T)
    cost = 0.5 * np.sqrt(np.square(score - predicted))
    delta = score - predicted
    gradUser = delta * (itemVector.flatten()) # (K,1)
    gradItem = delta * (userVector.flatten())
    return cost, gradUser, gradItem

In [56]:
def svdModel(userMatrix, itemMatrix, scoreMatrix, tokens, batch_size=1):
    U, V = scoreMatrix.shape
    # initialize
    gradUser = np.zeros(userMatrix.shape)
    gradItem = np.zeros(itemMatrix.shape)
    cost = 0.0
    
    batch = get_batch(tokens, batch_size)
    for sample in batch:
        i, j = sample[0], sample[1]
        uvec = userMatrix[i, :]
        ivec = itemMatrix[j, :]
        score = scoreMatrix[i, j]
        cc,gu,gi = svdCostAndGradient(uvec, ivec, score)
        gradUser[i] += gu
        gradItem[i] += gi
        cost += cc
        
    return cost, gradUser, gradItem

In [57]:
# sgd
# initialize 
U,V = df_score.shape
K = 15
userMatrix = np.ones(shape=(U, K))
itemMatrix = np.ones(shape=(V, K))
scoreMatrix = df_score.values
tokens = tuple(zip(df_data['user_id'], df_data['item_id']))
maxepoch = 200
for epoch in range(maxepoch):
    cost, gradUser, gradItem = svdModel(userMatrix, itemMatrix, scoreMatrix, tokens)
    userMatrix -= gradUser
    itemMatrix -= gradItem
    print(cost)

7.5
7.5
5.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
120.0
7.5
7.5
6.5
7.5
7.5
7.5
7.5
1920.0
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
120.0
7.5
7.5
7.5
7.5
7.5
7.5
120.0
7.5
7.5
5.0
7.5
120.0
7.5
7.5
7.5
7.5
7.5
7.5
7.5
5.0
5.0
7.5
7.5
7.5
7.5
7.5
7.5
7.5
5.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
90.0
7.5
7.5
118.0
7.5
120.0
7.5
7.5
7.5
7.5
7.5
7.5
7.5
6.0
120.0
5.5
7.5
7.5
7.5
7.5
120.0
7.5
21720.0
7.5
7.5
7.5
5.0
7.5
7.0
7.5
7.5
7.5
7.5
120.0
7.5
6.0
7.5
7.5
120.0
7.5
7.5
460920.0
120.0
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
120.0
7.5
7.5
7.5
460920.0
7.5
118.0
1890.0
7.5
120.0
5.5
7.5
460920.0
5.5
120.0
120.0
7.5
1920.0
7.5
120.0
7.5
7.5
5.5
120.0
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
7.5
120.0
7.5
120.0
120.0
7.5
7403520.0
120.0
120.0
7.5
7.5
7.5
5.5
7.5
120.0
7.5
7.5
7.5
7.5
5.0
120.0
120.0
1920.0
7.5
7.5
7.5
120.0
7.5
7.5
7.5
7.5
7.5
7.5
119.5
