In [1]:
import datetime as dt
import random
import time
import warnings

import numpy as np
import torch
from tqdm import tqdm

from src import pipeline, models, training, evaluation
from src.stopper import EarlyStopper

# --- Configuration: every value a reader might want to tune ----------------
SEED = 42          # seeds all RNGs below so a Restart & Run All is repeatable
BATCH_SIZE = 128   # rows handed to each train/test/evaluate step
TOP_N_NUM = 500    # forwarded to evaluation.evaluate_batch as n_recommendations
NOISING = True     # forwarded to training.train_step as `noising`

# NOTE(review): this silences *every* warning, including deprecation and
# numerical warnings from torch/pandas. Consider narrowing the filter.
warnings.filterwarnings('ignore')

# Seed each RNG the run can touch: torch for weight initialisation, NumPy's
# global state for the `df.sample(frac=1)` shuffle in apply_func_over_df, and
# Python's `random` in case the project code draws from it.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Data ------------------------------------------------------------------
stopper = EarlyStopper(patience=5)
data, movies = pipeline.clean_data()
# The datetime is the split point handed to split_data; presumably rows before
# it become `train` and the rest `test` — TODO confirm against src.pipeline.
train, test = pipeline.split_data(data, dt.datetime(2005, 9, 1))

# --- Model, optimiser and loss ---------------------------------------------
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = models.Autoencoder(len(movies)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

def apply_func_over_df(df, func, batch_size, shuffle, **kwargs):
    """Run ``func`` over ``df`` in consecutive row batches and sum the results.

    Args:
        df: DataFrame to iterate over; batches are taken positionally with
            ``iloc``.
        func: Callable invoked once per batch as ``func(batch, **kwargs)``.
            Its return values are accumulated with ``+``.
        batch_size: Maximum number of rows per batch. Must be positive; the
            final batch may contain fewer rows.
        shuffle: When truthy, the rows are visited in a random order. The
            shuffle works on a copy, so the caller's frame is not reordered.
        **kwargs: Extra keyword arguments forwarded unchanged to every call
            of ``func``.

    Returns:
        The sum of all per-batch results, or ``0`` when ``df`` has no rows.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # A non-positive step would never advance past the end of the frame, so
    # fail loudly instead of looping forever.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    if shuffle:
        df = df.sample(frac=1)

    total = 0
    for start in range(0, len(df), batch_size):
        total += func(df.iloc[start:start + batch_size], **kwargs)
    return total

# Epochs are numbered from 1; the loop runs at most MAX_EPOCHS of them unless
# the early stopper ends training sooner.
MAX_EPOCHS = 99

# Top-N scoring is only run on the first 1/EVAL_SUBSET_DIVISOR of the test rows.
EVAL_SUBSET_DIVISOR = 20
# Clamp to at least one row: with fewer than EVAL_SUBSET_DIVISOR test rows the
# floor division yields 0 and the normalisation below would divide by zero.
eval_size = max(1, len(test) // EVAL_SUBSET_DIVISOR)
# The slice does not change between epochs, so build it once.
eval_subset = test.iloc[:eval_size]

for epoch in range(1, MAX_EPOCHS + 1):
    epoch_start = time.time()

    # --- Train -------------------------------------------------------------
    model.train()
    train_loss = apply_func_over_df(
        df=train,
        func=training.train_step,
        batch_size=BATCH_SIZE,
        shuffle=True,
        model=model,
        optim=optimizer,
        n_movies=len(movies),
        device=device,
        loss_fn=loss_fn,
        noising=NOISING,
    )

    # --- Evaluate ----------------------------------------------------------
    # model.eval() only switches layer behaviour (dropout, batch norm); it
    # does not stop autograd from recording. no_grad() does, so the
    # evaluation passes do not build graphs that are never back-propagated.
    model.eval()
    with torch.no_grad():
        test_loss = apply_func_over_df(
            df=test,
            func=training.test_step,
            batch_size=BATCH_SIZE,
            shuffle=False,
            model=model,
            n_movies=len(movies),
            device=device,
            loss_fn=loss_fn,
        )

        embeddings = evaluation.get_embeddings(len(movies), model, device)
        top_n_score = apply_func_over_df(
            df=eval_subset,
            func=evaluation.evaluate_batch,
            batch_size=BATCH_SIZE,
            shuffle=False,
            model=model,
            n_recommendations=TOP_N_NUM,
            device=device,
            n_movies=len(movies),
            embeddings=embeddings,
        )
    # Normalise the summed batch results by the number of evaluated rows.
    top_n_score /= eval_size

    # --- Checkpoint --------------------------------------------------------
    if stopper.is_better(top_n_score):
        # The value being tracked is the top-N score, not a loss.
        print(f"Saving model with top-{TOP_N_NUM} score {top_n_score:.2f}")
        # NOTE(review): this pickles the whole module object. Saving
        # model.state_dict() is more portable, but would change what readers
        # of 'model.pt' must do, so the format is left as is.
        torch.save(model, 'model.pt')

    # Log before the stop check so the final epoch's metrics are reported too.
    print(f'[Epoch {epoch}] Test Loss: {test_loss:.2f} | Train Loss: {train_loss:.2f} | Time: {time.time()-epoch_start:.2f}s | Top-{TOP_N_NUM} Score: {top_n_score}')

    # --- Early stopping ----------------------------------------------------
    if stopper.should_stop(top_n_score):
        print(f"Stopping early at epoch {epoch} with best score {stopper.best_score:.2f}")
        break

Cleaning data...
Saving model with loss 0.10
[Epoch 1] Test Loss: 2545.84 | Train Loss: 43.81 | Time: 340.56s | Top-500 Score: 0.09758955517150694
Saving model with loss 0.11
[Epoch 2] Test Loss: 948.36 | Train Loss: 34.70 | Time: 349.62s | Top-500 Score: 0.10775985574681766
Saving model with loss 0.12
[Epoch 3] Test Loss: 250.31 | Train Loss: 33.67 | Time: 354.08s | Top-500 Score: 0.1202464185670912
Saving model with loss 0.12
[Epoch 4] Test Loss: 412.24 | Train Loss: 33.16 | Time: 347.79s | Top-500 Score: 0.12446063778909824
Saving model with loss 0.13
[Epoch 5] Test Loss: 434.54 | Train Loss: 32.84 | Time: 350.85s | Top-500 Score: 0.13129307533801693
[Epoch 6] Test Loss: 171.91 | Train Loss: 32.55 | Time: 337.88s | Top-500 Score: 0.12698164639012155


In [2]:
# Display the best score tracked by the early stopper.
# NOTE(review): the output saved with this cell (0.1141) is lower than the
# highest top-500 score logged by the training loop (0.1313). Confirm how
# EarlyStopper updates best_score, or whether this cell was executed against
# a different training run.
stopper.best_score

0.1140655057553661

In [None]:
''