In [1]:
import pandas as pd
from torchctr.layers import EmbeddingLayer
from torchctr.datasets import (FeatureDict, get_movielens, make_datasets, read_data, defaults, fillna, make_dataloader)
from torchctr.datasets.data import RecommendDataset

In [2]:
# step 1: download dataset
root = get_movielens('../datasets', 'ml-1m')

Downloading...
Using downloaded and verified file: ../datasets\ml-1m\raw\ml-1m.zip
Extracting...
Done!


In [3]:
# step 2: read data
users = read_data(root / 'users.dat', sep='::', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
movies = read_data(root / 'movies.dat', sep='::', names=['MovieID', 'Title', 'Genres'])
ratings = read_data(root / 'ratings.dat', sep='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

In [4]:
# step 3: make dataset
dataset = pd.merge(ratings, users, on='UserID')
dataset = pd.merge(dataset, movies, on='MovieID')

In [5]:
# subsample
dataset = dataset.iloc[5000:10000, :]

In [6]:
# step 4: make features and dataloader
sparse_features = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code', 'MovieID']
sequence_features = ['Genres']
dataset = fillna(dataset, dataset.columns, fill_v='unk')
features = FeatureDict(sparse_features, None, sequence_features)
input, _ = make_datasets(dataset, features, sep='|')
loader = make_dataloader(input, dataset['Rating'].values, batch_size=64, shuffle=True)

Making dataset Done!


In [7]:
# step 5: build model
model = EmbeddingLayer(input).to(defaults.device)
print(model)
out = model(input)
print(out.shape, out, sep='\n')
# print(input)

EmbeddingLayer(
  (sparse_embeds): ModuleList(
    (0): Embedding(3205, 147)
    (1): Embedding(2, 2)
    (2): Embedding(7, 5)
    (3): Embedding(21, 9)
    (4): Embedding(2153, 118)
    (5): Embedding(4, 3)
  )
  (sequence_embeds): ModuleList(
    (0): EmbeddingBag(7, 5, mode=mean)
  )
  (drop): Dropout(p=0.0)
)
torch.Size([5000, 289])
tensor([[ 1.0832, -0.3852,  0.9774,  ...,  0.4901,  0.2720,  0.2515],
        [-2.9299,  1.2940, -0.9595,  ...,  0.4901,  0.2720,  0.2515],
        [ 2.9813,  0.2656,  0.1590,  ...,  0.4901,  0.2720,  0.2515],
        ...,
        [ 0.6574,  0.1386,  0.7176,  ...,  1.2335,  0.4204,  0.3841],
        [ 0.0121, -0.4749, -0.2445,  ...,  1.2335,  0.4204,  0.3841],
        [-0.6250,  1.1999,  0.7947,  ...,  1.2335,  0.4204,  0.3841]],
       grad_fn=<CatBackward>)


In [8]:
len(input.sequence_data.data[0])

16244

In [9]:
len(input.sequence_data.bag_offsets[0])

5000

In [10]:
targets = dataset['Rating'].values

In [11]:
data = RecommendDataset(input, targets)

In [12]:
len(data)

5000

In [13]:
len(input.sequence_data.data[0])

16244

In [14]:
import numpy as np

In [15]:
%%time
data1, offsets = [], np.zeros((data.lens, len(input.sequence_data.bag_offsets)), dtype=int)
for x, y in zip(input.sequence_data.data, input.sequence_data.bag_offsets):
    tmp = []
    for idx, item in enumerate(y):
        tmp1 = []
        if idx == data.lens - 1:
            tmp1.extend(x[item:])
        else:
            tmp1.extend(x[item:y[idx + 1]])
    data1.append(tmp)

Wall time: 3.99 ms


In [16]:
# input.sequence_data.data/

In [17]:
data1[0][3:8]

[]

In [18]:
%%time
data1, offsets = [], []
for i in range(data.lens):
    tmp = []
    for x, y in zip(input.sequence_data.data, input.sequence_data.bag_offsets): 
        if i == data.lens - 1:
            t = x[y[-1]:]
            t = [t] if isinstance(t, int) else t
            tmp.append(t)
        else:
            t = x[y[i]:y[i + 1]]
            t = [t] if isinstance(t, int) else t
            tmp.append(t)
    data1.append(tmp)

Wall time: 6.98 ms


In [19]:
offsets=np.zeros((data.lens, len(input.sequence_data.bag_offsets)), dtype=int)

In [20]:
offsets[3:7]

array([[0],
       [0],
       [0],
       [0]])

In [21]:
data1[3:7]

[[[2, 3, 4]], [[2, 3, 4]], [[2, 3, 4]], [[2, 3, 4]]]

In [22]:
data1[3:7][1][0]

[2, 3, 4]

In [23]:
for i in len(input.sequence_data.bag_offsets):
    y = []
    for t in range(4):
        y.extend(data1[3:7][t][i])

TypeError: 'int' object is not iterable

In [None]:
input.sequence_data.bag_offsets[0][235]

In [None]:
# input.sequence_data

In [None]:
# data.sequence_data

In [None]:
# for data, target in loader:
#     print(data, target)