In [1]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import TensorDataset, DataLoader

In [2]:
import pandas as pd
import numpy as np
import sklearn.metrics
from collections import defaultdict
import random

In [3]:
from sklearn.metrics import roc_auc_score

In [4]:
RATING_THRESHOLD = 4
PREFIX_LEN = 8

In [36]:
df = pd.read_csv('big_data/rating.csv')
df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [37]:
movies_df = pd.read_csv('big_data/movie.csv')
movies_df.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [38]:
movie_by_id = {movie_id: name for movie_id, name in zip(movies_df['movieId'], movies_df['title'])}

In [6]:
n_users = max(df['userId']) + 1

In [7]:
n_movies = max(df['movieId']) + 1

In [8]:
df = df.sort_values(by=['timestamp'])

In [9]:
items_by_user = defaultdict(list)
requests_by_user = defaultdict(list)

In [10]:
for userId, movieId, rating in zip(df['userId'], df['movieId'], df['rating']):
    if rating > RATING_THRESHOLD:
        items_by_user[userId].append(movieId)

In [11]:
for userId, movies in items_by_user.items():
    for i in range(PREFIX_LEN, len(movies)):
        requests_by_user[userId].append((movies[i - PREFIX_LEN:i], movies[i]))

In [12]:
print(f'Unique users with at least {PREFIX_LEN + 1} ratings: {len(requests_by_user)}')

Unique users with at least 9 ratings: 100243


In [13]:
def requests_to_dataset(requests, n_movies, n_negative):
    users, prev_movies, movies, targets = [], [], [], []
    for user, session in requests.items():
        for prev, movie in session:
            users.append(user)
            prev_movies.append(prev)
            movies.append(movie)
            targets.append(1)
            
            for i in range(n_negative):
                users.append(user)
                prev_movies.append(prev)
                movies.append(random.randint(0, n_movies - 1))
                targets.append(0)
                
    return TensorDataset(torch.LongTensor(users), torch.LongTensor(prev_movies), 
                         torch.LongTensor(movies), torch.FloatTensor(targets).view(-1, 1))

In [14]:
def split_requests(requests):
    train_requests, val_requests, test_requests = {}, {}, {}
    for user, datas in requests.items():
        user_cnt = len(datas)
        train_size = (8 * user_cnt - 1) // 10 + 1
        val_size = (user_cnt - train_size) // 2
        
        train_requests[user] = datas[:train_size]
        val_requests[user] = datas[train_size:(train_size + val_size)]
        test_requests[user] = datas[(train_size + val_size):]
    return train_requests, val_requests, test_requests

In [15]:
train_requests, val_requests, test_requests = split_requests(requests_by_user)

In [16]:
train_dataset = requests_to_dataset(train_requests, n_movies, 10)
print(len(train_dataset))

30850479


In [17]:
val_dataset = requests_to_dataset(val_requests, n_movies, 10)
print(len(val_dataset))

3326873


In [18]:
test_dataset = requests_to_dataset(test_requests, n_movies, 10)
print(len(val_dataset))

3326873


In [19]:
class SimpleAttention(nn.Module):
    def __init__(self, n_items, embeddings_dim=64, hidden_size=64):
        super().__init__()
        self.item_embedding = nn.Embedding(n_items, embeddings_dim)
        self.attention = nn.Sequential(
            nn.Linear(2 * embeddings_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )
        self.predictor = nn.Sequential(
            nn.Linear(2 * embeddings_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1))

    def forward(self, user, movies, item):
        m = self.item_embedding(movies)
        i = self.item_embedding(item)
        
        query = torch.cat([m, i.unsqueeze(-2).expand_as(m)], dim=-1)
        weights = F.softmax(self.attention(query), dim=-2)
        hidden = torch.cat([i, torch.sum(m * weights, dim=-2)], dim=-1)
        
        return self.predictor(hidden)

In [20]:
def train(model, loss, opt, train, test, n_epochs, batch_size, device='cuda'):
    train_loader = DataLoader(train, batch_size, shuffle=True)
    test_loader = DataLoader(test, batch_size, shuffle=False)
    
    model.to(device)
    
    for i in range(n_epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            users, prev_movies, movies, targets = batch
            predictions = model(users.to(device), prev_movies.to(device), movies.to(device))
            loss_value = loss(predictions, targets.to(device))
            loss_value.backward()
            optimizer.step()
        
        model.eval()
        with torch.no_grad():
            test_predictions = np.zeros(len(test))
            test_targets = np.zeros(len(test))
            
            ptr = 0
            for batch in test_loader:
                users, prev_movies, movies, targets = batch
                predictions = model(users.to(device), prev_movies.to(device), movies.to(device))
                predictions = predictions.detach().cpu().numpy().flatten()
                test_predictions[ptr:ptr + len(predictions)] = predictions
                test_targets[ptr:ptr + len(targets)] = targets.numpy().flatten()
            
            print(f'After {i + 1} epochs AUC = {roc_auc_score(test_targets, test_predictions)}') 

In [31]:
model = SimpleAttention(n_movies)

In [32]:
loss = nn.BCEWithLogitsLoss()

In [33]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [34]:
train(model, loss, optimizer, train_dataset, val_dataset, 5, 25000)

After 1 epochs AUC = 0.7480036130889769
After 2 epochs AUC = 0.7563702234837912
After 3 epochs AUC = 0.774307173783972
After 4 epochs AUC = 0.7896151573585418
After 5 epochs AUC = 0.7835022350172225


## Similar movies

In [49]:
from numpy.linalg import norm

In [50]:
embeddings = model.item_embedding.weight.detach().cpu().numpy()

In [66]:
def cosine_similarity(x, y):
    return x.dot(y) / (norm(x) * norm(y))

In [67]:
def get_similar(embeddings, vector, n_samples=10):
    results = []
    for movie_id, movie_name in movie_by_id.items():
        sim = cosine_similarity(embeddings[movie_id], vector)
        results.append((sim, movie_id, movie_name))
    results.sort(reverse=True)
    return results[:n_samples]

In [68]:
def show_similars_for(movie_id):
    for sim, movie_id, movie_name in get_similar(embeddings, embeddings[movie_id]):
        print(f'sim={sim:.2f}| id:{movie_id}\t{movie_name}')

In [70]:
show_similars_for(76093)

sim=1.00| id:76093	How to Train Your Dragon (2010)
sim=0.43| id:4903	In the Bedroom (2001)
sim=0.43| id:118930	Bill Burr: I'm Sorry You Feel That Way (2014)
sim=0.43| id:7198	Pick-up Artist, The (1987)
sim=0.42| id:1305	Paris, Texas (1984)
sim=0.42| id:99446	Ultrasuede: In Search of Halston (2010)
sim=0.42| id:96588	Pitch Perfect (2012)
sim=0.42| id:27036	Merlin (1998)
sim=0.41| id:3980	Men of Honor (2000)
sim=0.41| id:82589	Mother and Child (2009)


In [87]:
show_similars_for(480)

sim=1.00| id:480	Jurassic Park (1993)
sim=0.47| id:410	Addams Family Values (1993)
sim=0.47| id:1014	Pollyanna (1960)
sim=0.43| id:454	Firm, The (1993)
sim=0.42| id:4361	Tootsie (1982)
sim=0.42| id:755	Kim (1950)
sim=0.42| id:223	Clerks (1994)
sim=0.42| id:2706	American Pie (1999)
sim=0.41| id:2105	Tron (1982)
sim=0.41| id:356	Forrest Gump (1994)


## Recommendations

In [88]:
def get_train_movies(user_id):
    requests = train_requests[user_id]
    return requests[0][0] + [request[1] for request in requests]

In [104]:
def show_user_recommendations(user_id):
    train_movies = get_train_movies(user_id)
    print('User movies:')
    for movie_id in train_movies:
        print(f'\t{movie_by_id[movie_id]}')
    recommendations = []
    user = torch.LongTensor([user_id]).to('cuda')
    movies = torch.LongTensor([train_movies]).to('cuda')
    model.eval()
    with torch.no_grad():
        for i in range(n_movies):
            score = model(user, movies, torch.LongTensor([i]).to('cuda')).item()
            recommendations.append((score, i))
    recommendations.sort(reverse=True)
    print('\nRecomendations:')
    for score, i in recommendations[:10]:
        print(f'\t{movie_by_id[i]}')

In [105]:
show_user_recommendations(46380)

User movies:
	American President, The (1995)
	Clueless (1995)
	Waiting to Exhale (1995)
	Sense and Sensibility (1995)
	Persuasion (1995)
	Apollo 13 (1995)
	Before Sunrise (1995)
	Bullets Over Broadway (1994)
	Clerks (1994)
	Hoop Dreams (1994)
	Miami Rhapsody (1995)
	Remains of the Day, The (1993)

Recomendations:
	Usual Suspects, The (1995)
	Fargo (1996)
	Schindler's List (1993)
	L.A. Confidential (1997)
	Silence of the Lambs, The (1991)
	Apollo 13 (1995)
	Like Water for Chocolate (Como agua para chocolate) (1992)
	Quiz Show (1994)
	Little Women (1994)
	Firm, The (1993)


In [106]:
show_user_recommendations(2)

User movies:
	Mr. Holland's Opus (1995)
	Friday the 13th (1980)
	Star Wars: Episode VI - Return of the Jedi (1983)
	Star Trek: First Contact (1996)
	Terminator 2: Judgment Day (1991)
	Stand by Me (1986)
	Rules of Engagement (2000)
	From Russia with Love (1963)
	Time Machine, The (1960)
	From Dusk Till Dawn (1996)
	Femme Nikita, La (Nikita) (1990)
	Jurassic Park (1993)
	Fantastic Voyage (1966)
	Legends of the Fall (1994)
	Abbott and Costello Meet Frankenstein (1948)
	Grumpy Old Men (1993)
	Amityville Horror, The (1979)
	Creature from the Black Lagoon, The (1954)
	Lost World: Jurassic Park, The (1997)
	2001: A Space Odyssey (1968)
	Star Wars: Episode V - The Empire Strikes Back (1980)
	Back to the Future (1985)
	Alien (1979)

Recomendations:
	Shawshank Redemption, The (1994)
	Pulp Fiction (1994)
	Godfather, The (1972)
	Usual Suspects, The (1995)
	Schindler's List (1993)
	Braveheart (1995)
	Silence of the Lambs, The (1991)
	Raiders of the Lost Ark (Indiana Jones and the Raiders of the Los

## Metrics

In [119]:
def calculate_metrics(pred, targets, users):
    user_pred = {}
    user_target = {}
    for p, t, user in zip(pred, targets, users):
        if user not in user_pred:
            user_pred[user] = []
            user_target[user] = []
        user_pred[user].append(p)
        user_target[user].append(t)
    auc_per_user = []
    for user in user_pred:
        try:
            auc_per_user.append(roc_auc_score(user_target[user], user_pred[user]))
        except ValueError:
            auc_per_user.append(0.5)
    metric_values = {
        'per_user_auc': np.mean(auc_per_user), 
        'auc': roc_auc_score(targets, pred)
    }
    return metric_values

In [111]:
test_loader = DataLoader(test_dataset, 50000, shuffle=False)

In [113]:
model.eval()
test_users = np.zeros(len(test_dataset))
test_predictions = np.zeros(len(test_dataset))
test_targets = np.zeros(len(test_dataset))
with torch.no_grad():         
    ptr = 0
    for batch in test_loader:
        users, prev_movies, movies, targets = batch
        predictions = model(users.to('cuda'), prev_movies.to('cuda'), movies.to('cuda'))
        predictions = predictions.detach().cpu().numpy().flatten()
        test_users[ptr:ptr + len(predictions)] = users.numpy().flatten()
        test_predictions[ptr:ptr + len(predictions)] = predictions
        test_targets[ptr:ptr + len(targets)] = targets.numpy().flatten()

In [120]:
calculate_metrics(test_predictions, test_targets, test_users)

{'per_user_auc': 0.9806393391258358, 'auc': 0.7688662538466688}