This pipeline is designed to train the model in an automated way.
First, we will define a config with all the specifications. This dictionary is vital to keep track of the models we have used.
Then run the model and extract the metrics that we need, to evaluate the training.

In [1]:
#import config from file
import json

with open("config.json", 'r') as f:
    config = json.load(f)

print(config)

{'model': 'sequence', 'submodel': 'implicit', 'loss': 'pointwise', 'representation': 'pooling', 'embedding_dim': 32, 'n_iter': 10, 'batch_size': 256, 'l2': 0.0, 'lr': 0.01, 'optim': None, 'use_cuda': False, 'sparse': False, 'random_state': None, 'num_negative_samples': 5, 'dataset': 'Movielens', 'size': '100K'}


In [22]:
# or define it manually

# config for sequence
config = {
    # About the model
    "model": "sequence",    # sequence or factorizer
    "submodel": "implicit",    # implicit or explicit
    "loss": "bpr",    # one of ‘pointwise’, ‘bpr’, ‘hinge’, ‘adaptive_hinge’ or 'regression', 'poisson','logistic'
    "representation": "pooling",    # for sequence one of ‘pooling’, ‘cnn’, ‘lstm’, ‘mixture’, for factorizer always NONE !!!
    "embedding_dim": 32, 
    "n_iter": 10,
    "batch_size": 256,
    "l2": 0.0,
    "lr": 0.01,
    "optim": None,
    "use_cuda": False, 
    "sparse": False, 
    "random_state": None, 
    "num_negative_samples": 5,

    # About the database
    "dataset": "Movielens",    # Movielens, Synthetic or Goodbooks
    "size": "100K",
    # synthetic has a lot of different parameters but I'm not sure we are going to use it
}

In [25]:
# config for factorizer
config = {
    # About the model
    "model": "factorizer",    # sequence or factorizer
    "submodel": "implicit",    # implicit or explicit
    "loss": "bpr",    # one of ‘pointwise’, ‘bpr’, ‘hinge’, ‘adaptive_hinge’ or 'regression', 'poisson','logistic'
    "representation": "None",    # for sequence one of ‘pooling’, ‘cnn’, ‘lstm’, ‘mixture’, for factorizer always None !!!
    "embedding_dim": 32, 
    "n_iter": 10,
    "batch_size": 256,
    "l2": 0.0,
    "lr": 0.01,
    "optim": None,
    "use_cuda": False, 
    "sparse": False, 
    "random_state": None, 
    "num_negative_samples": 5,

    # About the database
    "dataset": "Movielens",    # Movielens, Synthetic or Goodbooks
    "size": "100K",
    # synthetic has a lot of different parameters but I'm not sure we are going to use it
}

In [9]:
from settings_definition import *

In [17]:
def evaluate_model(model, test):
    """This function evaluates the metric agreed by the team.
    At the moment we have not decided which ones so I just pick 2 as an example."""
    from spotlight.evaluation import mrr_score, precision_recall_score, rmse_score, sequence_mrr_score, sequence_precision_recall_score
    eval = {"mrr": mrr_score(model, test), "precision_recall": precision_recall_score(model, test), #"rmse": rmse_score(model, test),
            #"sequence_mrr":sequence_mrr_score(model, test), "sequence_precision_recall": sequence_precision_recall_score(model, test)
           }
    return eval

In [18]:
import os
os.chdir("../spotlight")
# for me, it only works if I'm in the spotlight directory

In [28]:
# Execution

from spotlight.cross_validation import random_train_test_split

dataset = define_dataset(config)
train, test = random_train_test_split(dataset)
if config["model"] == "sequence":
    train, test = train.to_sequence(), test
model = define_model(config)
model.fit(train)

eval = evaluate_model(model, test)
eval

AttributeError: 'str' object has no attribute 'parameters'

In [15]:
import torch

savePath = "../pipeline/trained_model_" + config['model']
torch.save(model, savePath)