In [1]:
"""Example for Disentangled Multimodal Recommendation, with only feedback and textual modality.
For an example including image modality please see dmrl_clothes_example.py"""

import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality

# The necessary data can be loaded as follows
docs, item_ids = citeulike.load_text()
feedback = citeulike.load_feedback(reader=Reader(item_set=item_ids))

item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
)

# Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=feedback,
    test_size=0.2,
    exclude_unknowns=True,
    verbose=True,
    seed=123,
    rating_threshold=0.5,
    item_text=item_text_modality,
)

# Instantiate DMRL recommender
dmrl_recommender = cornac.models.dmrl.DMRL(
    batch_size=4096,
    epochs=20,
    log_metrics=False,
    learning_rate=0.01,
    num_factors=2,
    decay_r=0.5,
    decay_c=0.01,
    num_neg=3,
    embedding_dim=100,
)

# Use Recall@300 for evaluations
rec_300 = cornac.metrics.Recall(k=300)
prec_30 = cornac.metrics.Precision(k=30)

# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split, models=[dmrl_recommender], metrics=[prec_30, rec_300]
).run()

  from .autonotebook import tqdm as notebook_tqdm


Data from https://static.preferred.ai/cornac/datasets/citeulike/text.zip
will be cached into C:\Users\User\.cornac\citeulike/raw-data.csv


8.06MB [00:01, 5.01MB/s]                            


Unzipping ...
File cached!
Data from https://static.preferred.ai/cornac/datasets/citeulike/users.zip
will be cached into C:\Users\User\.cornac\citeulike/users.dat


467kB [00:00, 511kB/s]                             


Unzipping ...
File cached!
rating_threshold = 0.5
exclude_unknowns = True




---
Training data:
Number of users = 5551
Number of items = 16949
Number of ratings = 168396
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 5551
Number of items = 16949
Number of ratings = 42053
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5551
Total items = 16949

[DMRL] Training started!


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Pre-encoding the entire corpus. This might take a while.
Using device cpu for training
  batch 5 loss: 2840.7529296875
  batch 10 loss: 2840.68525390625
  batch 15 loss: 2827.552294921875
  batch 20 loss: 2791.51708984375
  batch 25 loss: 2779.209716796875
  batch 30 loss: 2778.14609375
  batch 35 loss: 2774.850927734375
  batch 40 loss: 2762.80126953125
Epoch: 0 is done
  batch 5 loss: 2714.612890625
  batch 10 loss: 2691.10478515625
  batch 15 loss: 2656.2841796875
  batch 20 loss: 2618.62783203125
  batch 25 loss: 2548.287841796875
  batch 30 loss: 2466.146435546875
  batch 35 loss: 2372.91669921875
  batch 40 loss: 2266.306396484375
Epoch: 1 is done
  batch 5 loss: 1900.1217041015625
  batch 10 loss: 1783.2693359375
  batch 15 loss: 1664.2704345703125
  batch 20 loss: 1598.8288818359374
  batch 25 loss: 1522.9539794921875
  batch 30 loss: 1465.861572265625
  batch 35 loss: 1423.40009765625
  batch 40 loss: 1373.8553955078125
Epoch: 2 is done
  batch 5 loss: 989.97392578125
  batch 

Ranking: 100%|██████████| 5444/5444 [10:51<00:00,  8.36it/s]


TEST:
...
     | Precision@30 | Recall@300 | Train (s) | Test (s)
---- + ------------ + ---------- + --------- + --------
DMRL |       0.0642 |     0.6266 | 1870.7587 | 651.5259






# Exploring what each modality in the example look like

### ratings

In [3]:
import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality

# The necessary data can be loaded as follows
docs, item_ids = citeulike.load_text()
feedback = citeulike.load_feedback(reader=Reader(item_set=item_ids))

item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
feedback

[('1', '70', 1.0),
 ('1', '495', 1.0),
 ('1', '1631', 1.0),
 ('1', '2317', 1.0),
 ('1', '2526', 1.0),
 ('1', '2846', 1.0),
 ('1', '2931', 1.0),
 ('1', '3171', 1.0),
 ('1', '3297', 1.0),
 ('1', '3332', 1.0),
 ('1', '3404', 1.0),
 ('1', '3591', 1.0),
 ('1', '3595', 1.0),
 ('1', '3770', 1.0),
 ('1', '3950', 1.0),
 ('1', '4626', 1.0),
 ('1', '4662', 1.0),
 ('1', '4871', 1.0),
 ('1', '4889', 1.0),
 ('1', '5114', 1.0),
 ('1', '5324', 1.0),
 ('1', '5325', 1.0),
 ('1', '5614', 1.0),
 ('1', '5991', 1.0),
 ('1', '6103', 1.0),
 ('1', '6874', 1.0),
 ('1', '6968', 1.0),
 ('1', '7106', 1.0),
 ('1', '7801', 1.0),
 ('1', '7867', 1.0),
 ('1', '8903', 1.0),
 ('1', '9907', 1.0),
 ('1', '10008', 1.0),
 ('1', '10204', 1.0),
 ('1', '10272', 1.0),
 ('1', '10288', 1.0),
 ('1', '10508', 1.0),
 ('1', '10588', 1.0),
 ('1', '11009', 1.0),
 ('1', '11105', 1.0),
 ('1', '11226', 1.0),
 ('1', '11320', 1.0),
 ('1', '11650', 1.0),
 ('1', '11853', 1.0),
 ('1', '11919', 1.0),
 ('1', '12684', 1.0),
 ('1', '12716', 1.0),
 

### Text

In [3]:
item_text_modality

<cornac.data.text.TextModality at 0x2331df78c10>

In [4]:
docs

["The metabolic world of Escherichia coli is not small. To elucidate the organizational and evolutionary principles of the metabolism of living organisms, recent studies have addressed the graph-theoretic analysis of large biochemical networks responsible for the synthesis and degradation of cellular building blocks [Jeong, H., Tombor, B., Albert, R., Oltvai, Z. N. \\& Barab\\{\\'a\\}si, A. L. (2000) Nature 407, 651-654; Wagner, A. \\& Fell, D. A. (2001) Proc. R. Soc. London Ser. B 268, 1803-1810; and Ma, H.-W. \\& Zeng, A.-P. (2003) Bioinformatics 19, 270-277]. In such studies, the global properties of the network are computed by considering enzymatic reactions as links between metabolites. However, the pathways computed in this manner do not conserve their structural moieties and therefore do not correspond to biochemical pathways on the traditional metabolic map. In this work, we reassessed earlier results by digitizing carbon atomic traces in metabolic reactions annotated for Esche

docs is the list of all the text strings

In [6]:
item_ids

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '126',
 '127',
 '128',
 '129',
 '130',
 '131',
 '132',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138',
 '13

In [9]:
import torch

In [10]:
encoded_corpus = torch.load('temp/encoded_corpus.pt')

In [11]:
encoded_corpus

tensor([[-0.1556,  0.1022, -0.0095,  ..., -0.2524, -0.2963, -0.0900],
        [-0.4095,  0.1304,  0.0308,  ...,  0.4947,  0.2034,  0.0846],
        [ 0.4423,  0.1621,  0.4447,  ..., -0.0036, -0.3720,  0.0240],
        ...,
        [-0.2899, -0.1725,  0.0080,  ...,  0.1382,  0.3344, -0.3326],
        [-0.4259, -0.0597, -0.2117,  ...,  0.0700,  0.3439, -0.0537],
        [-0.1999,  0.0034, -0.1960,  ..., -0.2166,  0.5256, -0.0830]])

In [12]:
len(encoded_corpus)

16980

In [15]:
encoded_corpus.size()

torch.Size([16980, 384])

In [7]:
"""Try without text modality"""

# The necessary data can be loaded as follows
# docs, item_ids = citeulike.load_text()
feedback = citeulike.load_feedback(reader=Reader(item_set=item_ids))

# item_text_modality = TextModality(
#     corpus=docs,
#     ids=item_ids,
# )

# Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=feedback,
    test_size=0.2,
    exclude_unknowns=True,
    verbose=True,
    seed=123,
    rating_threshold=0.5,
    # item_text=item_text_modality,
)

# Instantiate DMRL recommender
dmrl_recommender = cornac.models.dmrl.DMRL(
    batch_size=4096,
    epochs=20,
    log_metrics=False,
    learning_rate=0.01,
    num_factors=2,
    decay_r=0.5,
    decay_c=0.01,
    num_neg=3,
    embedding_dim=100,
)

# Use Recall@300 for evaluations
rec_300 = cornac.metrics.Recall(k=300)
prec_30 = cornac.metrics.Precision(k=30)

# Put everything together into an experiment and run it
cornac.Experiment(
    eval_method=ratio_split, models=[dmrl_recommender], metrics=[prec_30, rec_300]
).run()

rating_threshold = 0.5
exclude_unknowns = True




---
Training data:
Number of users = 5551
Number of items = 16949
Number of ratings = 168396
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 5551
Number of items = 16949
Number of ratings = 42053
Number of unknown users = 0
Number of unknown items = 0
---
Total users = 5551
Total items = 16949

[DMRL] Training started!
Using device cpu for training




RuntimeError: mat1 and mat2 shapes cannot be multiplied (16384x100 and 150x2)