In [1]:
import pandas as pd
import numpy as np

import os
import glob
from datetime import date
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, TensorDataset, DataLoader
from sklearn import preprocessing

In [2]:
if glob.glob('logs') != ['logs']:
    os.mkdir('logs')
else:
    pass

if glob.glob('weights') != ['weights']:
    os.mkdir('weights')
else:
    pass

In [3]:
log_model = 'ncf'
today = date.today()
log_formatter = logging.Formatter("%(asctime)s %(message)s")
logger = logging.getLogger()

log_file_name = "./logs/{}_{}".format(today, log_model)

file_handler = logging.FileHandler("{}.log".format(log_file_name))
file_handler.setFormatter(log_formatter)
logger.addHandler(file_handler)
logger.setLevel(logging.DEBUG)

In [4]:
ratings_cols = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_csv('./ml-1m/ml-1m/ratings.dat', sep='::', engine='python', names=ratings_cols)

num_users = ratings.UserID.unique().shape[0]
num_items = ratings.MovieID.unique().shape[0]
print('no. users: %d, no. items: %d' %(num_users, num_items))

class CustomDataset(Dataset):
    def __init__(self, users, items, y):
        self.x = torch.cat([
            torch.LongTensor(users).unsqueeze(0).transpose(0, 1),
            torch.LongTensor(items).unsqueeze(0).transpose(0, 1)
        ], axis=1)
        self.y = torch.FloatTensor(y)

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.y)

no. users: 6040, no. items: 3706


In [7]:
le1 = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()

In [10]:
ratings.MovieID[5:25]

5     1197
6     1287
7     2804
8      594
9      919
10     595
11     938
12    2398
13    2918
14    1035
15    2791
16    2687
17    2018
18    3105
19    2797
20    2321
21     720
22    1270
23     527
24    2340
Name: MovieID, dtype: int64

In [15]:
batch_size = 256

train_dataset = CustomDataset(
    le1.fit_transform(ratings.UserID),
    le2.fit_transform(ratings.MovieID),
    ratings.Rating.values
)
train_loader = DataLoader(train_dataset, batch_size=batch_size)

In [20]:
train_dataset[7]

(tensor([   0, 2599]), tensor(5.))

In [21]:
class NCF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=128, hidden_size=256):
        super(NCF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.mlp = nn.Sequential(
            nn.Linear(emb_size * 2, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, data):
        x, y = data[:, :1], data[:, 1:]
        u, v = self.user_emb(x), self.item_emb(y)
        uv = torch.cat((u, v), dim=1)
        return self.mlp(uv.view(uv.size(0), -1)).squeeze()


In [28]:
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print('DEVICE : %s' % device)

DEVICE : cuda


In [29]:
model = NCF(num_users, num_items, emb_size=256, hidden_size=256).to(device)
print('MODEL SUMMARY :', model)

MODEL SUMMARY : NCF(
  (user_emb): Embedding(6040, 256)
  (item_emb): Embedding(3706, 256)
  (mlp): Sequential(
    (0): Linear(in_features=512, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=1, bias=True)
  )
)


In [10]:
model.train()

EPOCHS = 100
batch_size = 128
learning_rate = 0.0005

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()

for e in range(EPOCHS):
    print('Start Epoch:', e+1)
    total_loss = 0.0
    total_batches = 0
    
    for batch_idx, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        x, y = x.to(device), y.to(device)
        
        # Forward pass
        y_hat = model(x)
        
        # Compute loss
        loss = loss_fn(y_hat, y)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_batches += 1
    
    avg_loss = total_loss / total_batches
    print("Epoch {}: Average Loss: {:.4f}".format(e+1, avg_loss))

print('Training Complete')


Start Epoch: 1
Epoch 1: Average Loss: 1.0560
Start Epoch: 2
Epoch 2: Average Loss: 0.9490
Start Epoch: 3
Epoch 3: Average Loss: 0.9002
Start Epoch: 4
Epoch 4: Average Loss: 0.8535
Start Epoch: 5
Epoch 5: Average Loss: 0.8324
Start Epoch: 6
Epoch 6: Average Loss: 0.8114
Start Epoch: 7
Epoch 7: Average Loss: 0.7975
Start Epoch: 8
Epoch 8: Average Loss: 0.7755
Start Epoch: 9
Epoch 9: Average Loss: 0.7653
Start Epoch: 10
Epoch 10: Average Loss: 0.7514
Start Epoch: 11
Epoch 11: Average Loss: 0.7310
Start Epoch: 12
Epoch 12: Average Loss: 0.7208
Start Epoch: 13
Epoch 13: Average Loss: 0.7038
Start Epoch: 14
Epoch 14: Average Loss: 0.6906
Start Epoch: 15
Epoch 15: Average Loss: 0.6715
Start Epoch: 16
Epoch 16: Average Loss: 0.6535
Start Epoch: 17
Epoch 17: Average Loss: 0.6409
Start Epoch: 18
Epoch 18: Average Loss: 0.6229
Start Epoch: 19
Epoch 19: Average Loss: 0.6055
Start Epoch: 20
Epoch 20: Average Loss: 0.5907
Start Epoch: 21
Epoch 21: Average Loss: 0.5728
Start Epoch: 22
Epoch 22: Avera

In [11]:
# Save the model weights
torch.save(model.state_dict(), "model_weights.pth")


In [51]:
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.MSELoss()

In [40]:
model.eval()

ratings['le_UserID'] = le1.transform(ratings.UserID)
ratings['le_MovieID'] = le1.transform(ratings.MovieID)

In [41]:
class TopNCF:
    def __init__(self, model, user_encoder, movie_encoder, movie_dataset):
        self.model = model
        self.user_encoder = user_encoder
        self.movie_encoder = movie_encoder
        self.movie_dataset = movie_dataset

    def recommend_movies(self, user_id, top_k=10):
        user_tensor = torch.LongTensor([self.user_encoder.transform([user_id])[0]]).to(device)
        movie_tensor = torch.LongTensor(range(self.movie_encoder.classes_.shape[0])).to(device)

        user_tensor = user_tensor.repeat(len(movie_tensor), 1)

        data = torch.cat((user_tensor, movie_tensor.unsqueeze(1)), dim=1)
        predictions = self.model(data).detach().cpu().numpy()

        top_indices = predictions.argsort(axis=0)[-top_k:][::-1]
        top_movie_ids = self.movie_encoder.classes_[top_indices]

        return top_movie_ids

    def print_top_movies(self, user_id, top_k=10):
        top_movie_ids = self.recommend_movies(user_id, top_k)
        print("Top", top_k, "movies for user", user_id)
        for movie_id in top_movie_ids:
            movie_title = self.movie_dataset.loc[self.movie_dataset['MovieID'] == movie_id, 'Title'].values[0]
            print(f"MovieID: {movie_id}, Title: {movie_title}")





In [45]:
# Load the movies dataset from a .dat file into a pandas DataFrame
movies_cols = ['MovieID', 'Title', 'Genres']
movies_df = pd.read_csv('./ml-1m/ml-1m/movies.dat', sep='::', engine='python', names=movies_cols, encoding='latin-1')

# Assuming the movies dataset has columns ['MovieID', 'Title']
movie_dataset = movies_df[['MovieID', 'Title']]

# Usage example
top_ncf = TopNCF(model, le1, le2, movie_dataset)
user_id = 10
top_ncf.print_top_movies(user_id)


Top 10 movies for user 10
MovieID: 2503, Title: Apple, The (Sib) (1998)
MovieID: 53, Title: Lamerica (1994)
MovieID: 2019, Title: Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
MovieID: 670, Title: World of Apu, The (Apur Sansar) (1959)
MovieID: 318, Title: Shawshank Redemption, The (1994)
MovieID: 3245, Title: I Am Cuba (Soy Cuba/Ya Kuba) (1964)
MovieID: 3232, Title: Seven Chances (1925)
MovieID: 527, Title: Schindler's List (1993)
MovieID: 2905, Title: Sanjuro (1962)
MovieID: 2197, Title: Firelight (1997)


In [46]:
# Load the movies dataset from a .dat file into a pandas DataFrame
movies_cols = ['MovieID', 'Title', 'Genres']
movies_df = pd.read_csv('./ml-1m/ml-1m/movies.dat', sep='::', engine='python', names=movies_cols, encoding='latin-1')

# Assuming the movies dataset has columns ['MovieID', 'Title']
movie_dataset = movies_df[['MovieID', 'Title']]

# Usage example
top_ncf = TopNCF(model, le1, le2, movie_dataset)
user_id = 22
top_ncf.print_top_movies(user_id)


Top 10 movies for user 22
MovieID: 501, Title: Naked (1993)
MovieID: 2503, Title: Apple, The (Sib) (1998)
MovieID: 2019, Title: Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
MovieID: 670, Title: World of Apu, The (Apur Sansar) (1959)
MovieID: 318, Title: Shawshank Redemption, The (1994)
MovieID: 527, Title: Schindler's List (1993)
MovieID: 2905, Title: Sanjuro (1962)
MovieID: 2197, Title: Firelight (1997)
MovieID: 53, Title: Lamerica (1994)
MovieID: 1223, Title: Grand Day Out, A (1992)
