In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader

In [2]:
import pandas as pd
import numpy as np
import sklearn.metrics
from collections import defaultdict
import random

In [3]:
from sklearn.metrics import roc_auc_score

In [4]:
RATING_THRESHOLD = 4
PREFIX_LEN = 8

In [5]:
df = pd.read_csv('big_data/rating.csv')
df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [6]:
movies_df = pd.read_csv('big_data/movie.csv')
movies_df.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [7]:
movie_by_id = {movie_id: name for movie_id, name in zip(movies_df['movieId'], movies_df['title'])}

In [8]:
n_users = max(df['userId']) + 1

In [9]:
n_movies = max(df['movieId']) + 1

In [10]:
df = df.sort_values(by=['timestamp'])

In [11]:
items_by_user = defaultdict(list)
requests_by_user = defaultdict(list)

In [12]:
for userId, movieId, rating in zip(df['userId'], df['movieId'], df['rating']):
    if rating > RATING_THRESHOLD:
        items_by_user[userId].append(movieId)

In [13]:
for userId, movies in items_by_user.items():
    for i in range(PREFIX_LEN, len(movies)):
        requests_by_user[userId].append((movies[i - PREFIX_LEN:i], movies[i]))

In [22]:
def requests_to_dataset(requests, n_movies, n_negative):
    users, movies, targets = [], [], []
    for user, session in requests.items():
        for prev, movie in session:
            users.append(user)
            movies.append(movie)
            targets.append(1)
            
            for i in range(n_negative):
                users.append(user)
                movies.append(random.randint(0, n_movies - 1))
                targets.append(0)
                
    return TensorDataset(torch.LongTensor(users), torch.LongTensor(movies), torch.FloatTensor(targets).view(-1, 1))

In [19]:
def split_requests(requests):
    train_requests, val_requests, test_requests = {}, {}, {}
    for user, datas in requests.items():
        user_cnt = len(datas)
        train_size = (8 * user_cnt - 1) // 10 + 1
        val_size = (user_cnt - train_size) // 2
        
        train_requests[user] = datas[:train_size]
        val_requests[user] = datas[train_size:(train_size + val_size)]
        test_requests[user] = datas[(train_size + val_size):]
    return train_requests, val_requests, test_requests

In [20]:
train_requests, val_requests, test_requests = split_requests(requests_by_user)

In [23]:
train_dataset = requests_to_dataset(train_requests, n_movies, 10)
print(len(train_dataset))

30850479


In [24]:
val_dataset = requests_to_dataset(val_requests, n_movies, 10)
print(len(val_dataset))

3326873


In [26]:
test_dataset = requests_to_dataset(test_requests, n_movies, 10)
print(len(test_dataset))

3836074


In [28]:
class NMF(nn.Module):
    def __init__(self, n_users, n_items, embeddings_dim=64, hidden_size=64):
        super().__init__()
        self.user_embedding = nn.Embedding(n_users, embeddings_dim)
        self.item_embedding = nn.Embedding(n_items, embeddings_dim)
        self.encoder = nn.Sequential(
            nn.Linear(2 * embeddings_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size))
        self.predictor = nn.Sequential(
            nn.Linear(embeddings_dim + hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1))

    def forward(self, user, item):
        u = self.user_embedding(user)
        i = self.item_embedding(item)
        
        cross = u * i
        hidden = self.encoder(torch.cat([u, i], dim=-1))
        return self.predictor(torch.cat([cross, hidden], dim=-1))

In [47]:
def train(model, loss, opt, train, test, n_epochs, batch_size, device='cuda'):
    train_loader = DataLoader(train, batch_size, shuffle=True)
    test_loader = DataLoader(test, batch_size, shuffle=False)
    
    model.to(device)
    
    for i in range(n_epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            users, movies, targets = batch
            predictions = model(users.to(device), movies.to(device))
            loss_value = loss(predictions, targets.to(device))
            loss_value.backward()
            optimizer.step()
        
        model.eval()
        with torch.no_grad():
            test_predictions = np.zeros(len(test))
            test_targets = np.zeros(len(test))
            
            ptr = 0
            for batch in test_loader:
                users, movies, targets = batch
                predictions = model(users.to(device), movies.to(device))
                predictions = predictions.detach().cpu().numpy().flatten()
                test_predictions[ptr:ptr + len(predictions)] = predictions
                test_targets[ptr:ptr + len(targets)] = targets.numpy().flatten()
            
            print(f'After {i + 1} epochs AUC = {roc_auc_score(test_targets, test_predictions)}') 

In [35]:
model = NMF(n_users, n_movies)

In [36]:
loss = nn.BCEWithLogitsLoss()

In [37]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [38]:
train(model, loss, optimizer, train_dataset, val_dataset, 5, 50000)

After 1 epochs AUC = 0.6591797963655154
After 2 epochs AUC = 0.6754230200423118
After 3 epochs AUC = 0.6785140648417621
After 4 epochs AUC = 0.7113097227880175
After 5 epochs AUC = 0.7178320666615207


## Similar movies

In [40]:
from numpy.linalg import norm

In [41]:
embeddings = model.item_embedding.weight.detach().cpu().numpy()

In [42]:
def cosine_similarity(x, y):
    return x.dot(y) / (norm(x) * norm(y))

In [43]:
def get_similar(embeddings, vector, n_samples=10):
    results = []
    for movie_id, movie_name in movie_by_id.items():
        sim = cosine_similarity(embeddings[movie_id], vector)
        results.append((sim, movie_id, movie_name))
    results.sort(reverse=True)
    return results[:n_samples]

In [44]:
def show_similars_for(movie_id):
    for sim, movie_id, movie_name in get_similar(embeddings, embeddings[movie_id]):
        print(f'sim={sim:.2f}| id:{movie_id}\t{movie_name}')

In [45]:
show_similars_for(76093)

sim=1.00| id:76093	How to Train Your Dragon (2010)
sim=0.48| id:54771	Invasion, The (2007)
sim=0.44| id:827	Convent, The (O Convento) (1995)
sim=0.44| id:30810	Life Aquatic with Steve Zissou, The (2004)
sim=0.43| id:7920	Desperate Living (1977)
sim=0.43| id:1649	Fast, Cheap & Out of Control (1997)
sim=0.42| id:7151	Girl with a Pearl Earring (2003)
sim=0.42| id:2113	Graveyard Shift (Stephen King's Graveyard Shift) (1990)
sim=0.42| id:128616	As We Were Dreaming (2015)
sim=0.42| id:47904	Notti bianche, Le (White Nights) (1957)


In [46]:
show_similars_for(480)

sim=1.00| id:480	Jurassic Park (1993)
sim=0.47| id:69406	Proposal, The (2009)
sim=0.46| id:61075	Elegy (2008)
sim=0.44| id:8492	Christmas Carol, A (Scrooge) (1951)
sim=0.44| id:46976	Stranger than Fiction (2006)
sim=0.42| id:62235	Red (2008)
sim=0.42| id:37976	Flowers of St. Francis (Francesco, giullare di Dio) (1950)
sim=0.41| id:4933	Earthling, The (1980)
sim=0.41| id:899	Singin' in the Rain (1952)
sim=0.40| id:98491	Paperman (2012)


## Recommendations

In [48]:
def get_train_movies(user_id):
    requests = train_requests[user_id]
    return requests[0][0] + [request[1] for request in requests]

In [51]:
def show_user_recommendations(user_id):
    train_movies = get_train_movies(user_id)
    print('User movies:')
    for movie_id in train_movies:
        print(f'\t{movie_by_id[movie_id]}')
    recommendations = []
    user = torch.LongTensor([user_id]).to('cuda')
    model.eval()
    with torch.no_grad():
        for i in range(n_movies):
            score = model(user, torch.LongTensor([i]).to('cuda')).item()
            recommendations.append((score, i))
    recommendations.sort(reverse=True)
    print('\nRecomendations:')
    for score, i in recommendations[:10]:
        print(f'\t{movie_by_id[i]}')

In [52]:
show_user_recommendations(46380)

User movies:
	American President, The (1995)
	Clueless (1995)
	Waiting to Exhale (1995)
	Sense and Sensibility (1995)
	Persuasion (1995)
	Apollo 13 (1995)
	Before Sunrise (1995)
	Bullets Over Broadway (1994)
	Clerks (1994)
	Hoop Dreams (1994)
	Miami Rhapsody (1995)
	Remains of the Day, The (1993)

Recomendations:
	Shawshank Redemption, The (1994)
	Pulp Fiction (1994)
	Star Wars: Episode IV - A New Hope (1977)
	Silence of the Lambs, The (1991)
	Schindler's List (1993)
	Blade Runner (1982)
	Forrest Gump (1994)
	Star Wars: Episode V - The Empire Strikes Back (1980)
	Fight Club (1999)
	Lord of the Rings: The Return of the King, The (2003)


In [53]:
show_user_recommendations(2)

User movies:
	Mr. Holland's Opus (1995)
	Friday the 13th (1980)
	Star Wars: Episode VI - Return of the Jedi (1983)
	Star Trek: First Contact (1996)
	Terminator 2: Judgment Day (1991)
	Stand by Me (1986)
	Rules of Engagement (2000)
	From Russia with Love (1963)
	Time Machine, The (1960)
	From Dusk Till Dawn (1996)
	Femme Nikita, La (Nikita) (1990)
	Jurassic Park (1993)
	Fantastic Voyage (1966)
	Legends of the Fall (1994)
	Abbott and Costello Meet Frankenstein (1948)
	Grumpy Old Men (1993)
	Amityville Horror, The (1979)
	Creature from the Black Lagoon, The (1954)
	Lost World: Jurassic Park, The (1997)
	2001: A Space Odyssey (1968)
	Star Wars: Episode V - The Empire Strikes Back (1980)
	Back to the Future (1985)
	Alien (1979)

Recomendations:
	Shawshank Redemption, The (1994)
	Pulp Fiction (1994)
	Star Wars: Episode IV - A New Hope (1977)
	Matrix, The (1999)
	Forrest Gump (1994)
	Silence of the Lambs, The (1991)
	Lord of the Rings: The Return of the King, The (2003)
	Usual Suspects, The (

## Metrics

In [54]:
def calculate_metrics(pred, targets, users):
    user_pred = {}
    user_target = {}
    for p, t, user in zip(pred, targets, users):
        if user not in user_pred:
            user_pred[user] = []
            user_target[user] = []
        user_pred[user].append(p)
        user_target[user].append(t)
    auc_per_user = []
    for user in user_pred:
        try:
            auc_per_user.append(roc_auc_score(user_target[user], user_pred[user]))
        except ValueError:
            auc_per_user.append(0.5)
    metric_values = {
        'per_user_auc': np.mean(auc_per_user), 
        'auc': roc_auc_score(targets, pred)
    }
    return metric_values

In [55]:
test_loader = DataLoader(test_dataset, 50000, shuffle=False)

In [57]:
model.eval()
test_users = np.zeros(len(test_dataset))
test_predictions = np.zeros(len(test_dataset))
test_targets = np.zeros(len(test_dataset))
with torch.no_grad():         
    ptr = 0
    for batch in test_loader:
        users, movies, targets = batch
        predictions = model(users.to('cuda'), movies.to('cuda'))
        predictions = predictions.detach().cpu().numpy().flatten()
        test_users[ptr:ptr + len(predictions)] = users.numpy().flatten()
        test_predictions[ptr:ptr + len(predictions)] = predictions
        test_targets[ptr:ptr + len(targets)] = targets.numpy().flatten()

In [58]:
calculate_metrics(test_predictions, test_targets, test_users)

{'per_user_auc': 0.9796062694292754, 'auc': 0.6788107461266588}