In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [2]:
ratings = pd.read_csv('./ml-25m/ratings.csv')
movies = pd.read_csv('./ml-25m/movies.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
def filter_data(data, min_user_count, min_movie_count):
    user_counts = data['userId'].value_counts()
    movie_counts = data['movieId'].value_counts()

    filtered_users = user_counts[user_counts >= min_user_count].index
    filtered_movies = movie_counts[movie_counts >= min_movie_count].index

    filtered_data = data[(data['userId'].isin(filtered_users)) & (data['movieId'].isin(filtered_movies))]

    return filtered_data

In [6]:
min_user_count = 11
min_movie_count = 11
filtered_data = filter_data(ratings, min_user_count, min_movie_count)

In [7]:
ratings = filtered_data

movie_to_index = {movie_id: i for i, movie_id in enumerate(ratings['movieId'].unique())}
user_to_index = {user_id: i for i, user_id in enumerate(ratings['userId'].unique())}
ratings['movieId'] = ratings['movieId'].map(movie_to_index)
ratings['userId'] = ratings['userId'].map(user_to_index)

ratings = ratings.drop(columns='timestamp')
test_ratings = ratings.groupby('userId', group_keys=False).sample(5)
train_ratings = ratings.drop(test_ratings.index)

In [8]:
train_ratings = train_ratings.reset_index(drop=True)
test_ratings = test_ratings.reset_index(drop=True)

In [9]:
print(train_ratings.shape)
print(test_ratings.shape)
print(train_ratings['userId'].nunique())
print(test_ratings['userId'].nunique())
print(train_ratings['movieId'].nunique())
print(test_ratings['movieId'].nunique())

(24068148, 3)
(812705, 3)
162541
162541
23357
13869


In [10]:
class MFDataset(Dataset):
    def __init__(self, ratings):
        self.X = ratings[['userId', 'movieId']].to_numpy()
        self.y = ratings['rating'].to_numpy()
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        X = torch.IntTensor(self.X[index])
        y = torch.FloatTensor([self.y[index]])
        return X, y

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 64

train_dataset = MFDataset(train_ratings)
test_dataset = MFDataset(test_ratings)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"Shape of X: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X: torch.Size([64, 2])
Shape of y: torch.Size([64, 1]) torch.float32


In [12]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X[:, 0], X[:, 1])
        loss = loss_fn(pred, y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if batch % 100 == 0:
            print(f'Batch {batch}, Loss: {loss.item()}')

In [16]:
class MF(nn.Module):
    def __init__(self, num_user, num_movie, k):
        super().__init__()
        self.num_user = num_user
        self.num_movie = num_movie
        self.user_embed = nn.Embedding(num_user, k)
        self.movie_embed = nn.Embedding(num_movie, k)
        # self.b_u = nn.Embedding(self.num_user, 1)
        # self.b_i = nn.Embedding(self.num_movie, 1)
        self.b_u = np.zeros((self.num_user, 1))
        self.b_i = np.zeros((self.num_movie, 1))
    
    def forward(self, user, movie):
        # user와 movie 모두 id가 1부터 시작하므로 1을 빼주어야 올바르게 인덱싱을 할 수 있다.
        p = self.user_embed(user)
        q = self.movie_embed(movie)
        b_u = self.b_u(user)
        b_i = self.b_i(movie)
            
        # pred = p @ q.T# + self.b_u + self.b_i + self.b
        pred = (p * q).sum(axis=1) + np.squeeze(b_u) + np.squeeze(b_i)
        return pred.view(-1)

In [19]:
num_user = train_ratings['userId'].nunique()
num_movie = train_ratings['movieId'].nunique()
model = MF(num_user, num_movie, 2)

for param in model.parameters():
    print(param)

Parameter containing:
tensor([[-0.0549, -1.0322],
        [ 0.5214,  0.6533],
        [-2.4192, -0.5731],
        ...,
        [ 0.7316, -0.5677],
        [ 1.1468, -0.4026],
        [-1.2227, -1.2621]], requires_grad=True)
Parameter containing:
tensor([[ 2.0507, -0.9570],
        [ 0.9378, -0.5764],
        [-1.5844, -0.8464],
        ...,
        [-1.0091,  1.1134],
        [ 0.3471,  0.3550],
        [-0.3609,  0.7056]], requires_grad=True)


In [None]:
epochs = 20

num_user = train_ratings['userId'].nunique()
num_movie = train_ratings['movieId'].nunique()
model = MF(num_user, num_movie, 2)

loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train(train_dataloader, model, loss_fn, optimizer)