In [1]:
# !wget https://files.grouplens.org/datasets/movielens/ml-25m.zip
# !unzip ml-25m.zip

--2024-04-15 18:02:54--  https://files.grouplens.org/datasets/movielens/ml-25m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 261978986 (250M) [application/zip]
Saving to: ‘ml-25m.zip’


2024-04-15 18:03:16 (12.4 MB/s) - ‘ml-25m.zip’ saved [261978986/261978986]

Archive:  ml-25m.zip
   creating: ml-25m/
  inflating: ml-25m/tags.csv         
  inflating: ml-25m/links.csv        
  inflating: ml-25m/README.txt       
  inflating: ml-25m/ratings.csv      
  inflating: ml-25m/genome-tags.csv  
  inflating: ml-25m/genome-scores.csv  
  inflating: ml-25m/movies.csv       


In [2]:
# !head ml-25m/ratings.csv

userId,movieId,rating,timestamp
1,296,5.0,1147880044
1,306,3.5,1147868817
1,307,5.0,1147868828
1,665,5.0,1147878820
1,899,3.5,1147868510
1,1088,4.0,1147868495
1,1175,3.5,1147868826
1,1217,3.5,1147878326
1,1237,5.0,1147868839


In [4]:
import numpy as np

with open("ml-25m/ratings.csv", "r") as f:
    print(f.readline())

    users = []
    movies = []
    ratings = []

    for line in f:
        uid, mid, rating, _ = line.split(",")
        users.append(int(uid))
        movies.append(int(mid))
        ratings.append(float(rating))
    
    users = np.array(users)
    movies = np.array(movies)
    ratings = np.array(ratings)

userId,movieId,rating,timestamp



In [7]:
# 예측 값
alpha = ratings.mean()

# 배열에서 상수 값을 빼게 되면, 각 배열 값마다 상수 값을 빼주게 된다.
rmse = ((ratings - alpha) ** 2).mean() ** 0.5
rmse

1.0607439399275531

In [8]:
user_bias = np.zeros(users.max() + 1)
movie_bias = np.zeros(movies.max() + 1)

In [13]:

lr = 1
lmd = 0.001

n_ratings = len(ratings)
n_users = len(users)
n_movies = len(movies)

for epoch in range(100):
    h = alpha + user_bias[users] + movie_bias[movies]
    diff = h - ratings

    # evaluation
    rmes = (diff ** 2).mean() ** 0.5
    if epoch%10 == 0:
        print(f"epoch: {epoch}, rmes: {rmes}")

    # Gradient Descent
    grd_alpha = diff.mean()
    grd_user_bias = np.bincount(users, weights=diff)/n_users + lmd * user_bias/n_users
    grd_movie_bias = np.bincount(movies, weights=diff)/n_movies + lmd * movie_bias/n_movies

    # update
    alpha = alpha - lr * grd_alpha
    user_bias = user_bias - lr * grd_user_bias
    movie_bias = movie_bias - lr * grd_movie_bias

h = alpha + user_bias[users] + movie_bias[movies]
diff = h - ratings

rmes = (diff ** 2).mean() ** 0.5
print(f"epoch: {epoch}, rmes: {rmes}")

epoch: 0, rmes: 1.0500697217574777
epoch: 10, rmes: 1.0491422633554863
epoch: 20, rmes: 1.0482365844497037
epoch: 30, rmes: 1.0473518959003612
epoch: 40, rmes: 1.0464874427776174
epoch: 50, rmes: 1.045642502744145
epoch: 60, rmes: 1.044816384516842
epoch: 70, rmes: 1.0440084264038854
epoch: 80, rmes: 1.0432179949134248
epoch: 90, rmes: 1.0424444834304643
epoch: 99, rmes: 1.0416873109585632


In [19]:
import torch

ratings_tensor = torch.from_numpy(ratings)

alpha = torch.tensor(ratings.mean())
alpha.requires_grad_(True)
user_bias = torch.zeros(users.max() + 1, requires_grad=True)
movie_bias = torch.zeros(movies.max() + 1, requires_grad=True)

optim = torch.optim.SGD([alpha, user_bias, movie_bias], lr=1)

lmd = 0.001

for epoch in range(100):
    h = alpha + user_bias[users] + movie_bias[movies]
    mse = ((h - ratings_tensor) ** 2).mean()
    reg = lmd * ((movie_bias ** 2).mean() + (user_bias ** 2).mean())
    cost = mse + reg

    optim.zero_grad()
    cost.backward()
    optim.step()

    with torch.no_grad():
        rmse = ((h - ratings_tensor) ** 2).mean() ** 0.5
        if epoch%10 == 0:
            print(f"epoch: {epoch}, rmse: {rmse}")

rmse = ((h - ratings_tensor) ** 2).mean() ** 0.5
if epoch%10 == 0:
    print(f"epoch: {epoch}, rmse: {rmse}")

epoch: 0, rmse: 1.0607439399275533
epoch: 10, rmse: 1.0583792494820634
epoch: 20, rmse: 1.056137807615803
epoch: 30, rmse: 1.0540102929190227
epoch: 40, rmse: 1.0519882175370794
epoch: 50, rmse: 1.0500638241837847
epoch: 60, rmse: 1.048230026362517
epoch: 70, rmse: 1.0464803395411784
epoch: 80, rmse: 1.0448088417849521
epoch: 90, rmse: 1.0432100976233305


In [26]:
lmd = 0.001

alpha = ratings.mean()
user_bias = np.zeros(users.max() + 1)
movie_bias = np.zeros(movies.max() + 1)

for epoch in range(10):
    h = alpha + user_bias[users] + movie_bias[movies]
    rmse = ((h - ratings) ** 2).mean() ** 0.5
    print(f"epoch: {epoch}, rmse: {rmse}")

    alpha = (ratings - (user_bias[users] + movie_bias[movies])).mean()
    user_bias = np.bincount(users, weights=(ratings - (alpha + movie_bias[movies]))) / (np.bincount(users) + lmd)
    movie_bias = np.bincount(movies, weights=(ratings - (alpha + user_bias[users]))) / (np.bincount(movies) + lmd)

h = alpha + user_bias[users] + movie_bias[movies]
rmse = ((h - ratings) ** 2).mean() ** 0.5
print(f"final rmse: {rmse}")

epoch: 0, rmse: 1.0607439399275531
epoch: 1, rmse: 0.8663159834756426
epoch: 2, rmse: 0.8510867185839471
epoch: 3, rmse: 0.8503568292676987
epoch: 4, rmse: 0.8503078899843288
epoch: 5, rmse: 0.8503025377563744
epoch: 6, rmse: 0.8503016450633218
epoch: 7, rmse: 0.8503014646716495
epoch: 8, rmse: 0.8503014258804191
epoch: 9, rmse: 0.8503014173907169
final rmse: 0.8503014155289849
