In [1]:
import pandas as pd
import torch
from torchctr.layers import EmbeddingLayer, EmbeddingDropout
from torchctr.datasets import (FeatureDict, get_movielens, make_datasets, read_data, defaults, fillna, make_dataloader, DataMeta)

## step 1: download dataset

In [2]:
root = get_movielens('../datasets', 'ml-1m')

Downloading...
Using downloaded and verified file: ../datasets/ml-1m/raw/ml-1m.zip
Extracting...
Done!


## step 2: read data

In [3]:
# Load the three MovieLens tables. The raw files use '::' as the field
# separator, and column names are supplied explicitly via `names`.
users = read_data(root / 'users.dat', sep='::', names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
movies = read_data(root / 'movies.dat', sep='::', names=['MovieID', 'Title', 'Genres'])
ratings = read_data(root / 'ratings.dat', sep='::', names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

## step 3: make dataset

In [4]:
# Attach user attributes to every rating (join on UserID), then movie
# attributes (join on MovieID). pd.merge defaults to an inner join, so a
# rating without a matching user or movie row would be dropped.
dataset = pd.merge(ratings, users, on='UserID')
dataset = pd.merge(dataset, movies, on='MovieID')

## subsample (optional)

In [5]:
# Keep only the rows at positions 5000..9999 (5,000 rows).
# NOTE(review): this rebinds `dataset`, so the cell is not idempotent --
# re-running it on the already-sliced 5,000-row frame yields an empty frame.
dataset = dataset.iloc[5000:10000, :]

## step 4: make features and dataloader

In [6]:
# Columns passed to FeatureDict as sparse (single-valued categorical) features.
sparse_features = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code', 'MovieID']
# Multi-valued column: each Genres cell holds several labels joined by '|'.
sequence_features = ['Genres']
# Presumably replaces missing values in every column with the placeholder
# 'unk' -- TODO confirm against torchctr's fillna.
dataset = fillna(dataset, dataset.columns, fill_v='unk')
# The second positional argument is None here -- presumably the slot for
# dense features; TODO confirm against FeatureDict's signature.
features = FeatureDict(sparse_features, None, sequence_features)
# NOTE(review): `input` shadows the Python builtin of the same name. Later
# cells reference it, so any rename has to be done across the whole notebook.
input, _ = make_datasets(dataset, features, sep='|')
# loader = make_dataloader(input, dataset['Rating'].values, batch_size=64, shuffle=True)

Making dataset Done!


## step 5: build model

In [7]:
# Build the embedding layer from the encoded inputs and move it to the
# library's default device. emb_drop=0.1 is presumably the embedding dropout
# rate -- TODO confirm against EmbeddingLayer.
model = EmbeddingLayer(input, emb_drop=0.1).to(defaults.device)
print(model)
# Single forward pass over the whole encoded dataset (no batching).
out = model(input)

EmbeddingLayer(
  (sparse_embeds): ModuleList(
    (0): EmbeddingDropout(
      (emb): Embedding(3205, 147)
    )
    (1): EmbeddingDropout(
      (emb): Embedding(2, 2)
    )
    (2): EmbeddingDropout(
      (emb): Embedding(7, 5)
    )
    (3): EmbeddingDropout(
      (emb): Embedding(21, 9)
    )
    (4): EmbeddingDropout(
      (emb): Embedding(2153, 118)
    )
    (5): EmbeddingDropout(
      (emb): Embedding(4, 3)
    )
  )
  (sequence_embeds): ModuleList(
    (0): Embedding(7, 5)
  )
)


In [8]:
out[:, -5:]

tensor([[-0.8287,  0.3714, -0.7944,  0.5302, -0.1847],
        [-0.8287,  0.3714, -0.7944,  0.5302, -0.1847],
        [-0.8287,  0.3714, -0.7944,  0.5302, -0.1847],
        ...,
        [-0.4850, -0.0608,  1.1737,  0.4636, -0.4604],
        [-0.4850, -0.0608,  1.1737,  0.4636, -0.4604],
        [-0.4850, -0.0608,  1.1737,  0.4636, -0.4604]], device='cuda:0',
       grad_fn=<SliceBackward>)

In [9]:
ly = EmbeddingDropout(torch.nn.Embedding(7, 5), 0.1)

In [10]:
ly

EmbeddingDropout(
  (emb): Embedding(7, 5)
)

In [11]:
ly.emb.weight

Parameter containing:
tensor([[ 0.0122,  0.7963, -0.9860,  2.5023,  0.9121],
        [-0.2414, -1.1864, -0.0428,  1.4428,  0.6048],
        [-3.1064, -0.8661, -0.4674, -0.6350, -0.0244],
        [-1.4281, -0.2473,  1.4546,  0.1025, -0.1300],
        [-2.0995,  0.1254,  0.0183, -0.6482,  0.9680],
        [ 0.2651, -2.6695, -0.7403, -1.3880,  0.3184],
        [-0.6377,  0.6056,  0.6045, -0.6367, -0.1732]], requires_grad=True)

In [12]:
input.sequence_data.data

array([[0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]], dtype=int64)

In [13]:
# Convert the integer genre-count matrix to a float tensor so it can be
# matrix-multiplied with the (float) embedding weights.
y = torch.as_tensor(input.sequence_data.data).float()

In [14]:
# Manual mean pooling: y @ W sums the embedding rows selected by each row of
# y, and dividing by the row sum averages them. `@` and `/` share precedence
# and associate left-to-right, so this evaluates as (y @ W) / row_sum.
# NOTE(review): a row of y that sums to zero would divide by zero here.
y @ ly.emb.weight/y.sum(1).view(-1,1)

tensor([[-2.2113, -0.3294,  0.3352, -0.3936,  0.2712],
        [-2.2113, -0.3294,  0.3352, -0.3936,  0.2712],
        [-2.2113, -0.3294,  0.3352, -0.3936,  0.2712],
        ...,
        [-0.9172, -1.2720, -0.3610, -1.0181,  0.6432],
        [-0.9172, -1.2720, -0.3610, -1.0181,  0.6432],
        [-0.9172, -1.2720, -0.3610, -1.0181,  0.6432]], grad_fn=<DivBackward0>)

In [15]:
input.sequence_data.nunique

[7]

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
def sequence_feature_encoding(data, features_names, sep: str = ','):
    """Count-encode separator-delimited sequence features.

    Every cell of a sequence feature is read as a string of tokens joined by
    ``sep`` (for example ``'Action|Comedy'`` with ``sep='|'``). For each
    feature one column is produced per distinct token, in sorted token order,
    holding how many times that token occurs in the cell.

    The vocabulary and the encoding use the same tokenisation: a
    case-preserving split on ``sep``. Relying on ``CountVectorizer``'s
    defaults instead would lowercase the text and apply a word regex that
    breaks tokens such as "Children's", so a case-preserved vocabulary would
    never match and the encoding would be all zeros.

    Args:
        data: Object indexable by feature name whose values are iterable
            (e.g. a pandas DataFrame). Cells are converted with ``str`` and
            stripped of surrounding whitespace before splitting.
        features_names: Names of the sequence features to encode. A falsy
            value (``None`` or an empty list) short-circuits to ``None``.
        sep: Separator between tokens inside one cell.

    Returns:
        ``None`` when ``features_names`` is falsy; otherwise a ``DataMeta``
        built from the horizontally stacked per-feature count matrices,
        ``None``, ``features_names`` and the list of per-feature vocabulary
        sizes.

    Raises:
        ValueError: Raised by ``CountVectorizer`` when a feature has no rows
            and therefore an empty vocabulary.
    """

    if not features_names:
        return None

    def _split(text):
        # Tokenise on the caller's separator only; no lowercasing, no regex.
        return text.split(sep)

    data_value, nuniques = [], []
    for feature in features_names:
        # Normalise once so vocabulary building and encoding see the same text.
        docs = [str(x).strip() for x in data[feature]]
        # `set().union(...)` also works for an empty column, where
        # `set.union(*[])` raises an unrelated TypeError.
        vocab = sorted(set().union(*(_split(doc) for doc in docs)))
        vec = CountVectorizer(
            vocabulary=vocab,
            tokenizer=_split,
            lowercase=False,
            token_pattern=None,  # unused when a tokenizer is given
        )
        multi_hot = vec.transform(docs)
        data_value.append(multi_hot.toarray())
        nuniques.append(len(vocab))
    data_meta = DataMeta(np.hstack(data_value), None, features_names, nuniques)
    return data_meta

In [17]:
dataset.sample(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Gender,Age,Occupation,Zip-code,Title,Genres
8756,2396,1,5,999463533,1,2,17,54,Ben-Hur (1959),Action|Adventure|Drama
8845,2867,1,4,959889599,1,3,1,581,Ben-Hur (1959),Action|Adventure|Drama
7664,2417,0,5,964113264,1,2,14,2061,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
9482,1127,3,5,974177311,1,2,7,1637,"Christmas Story, A (1983)",Comedy|Drama
7709,2491,0,3,963196099,0,5,0,133,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
8167,3133,0,3,957361499,1,2,1,1483,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
5151,1674,2,4,967416107,1,1,12,1123,"Bug's Life, A (1998)",Animation|Children's|Comedy
6165,319,0,4,975607581,1,3,16,614,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
8179,3148,0,5,957213786,0,4,15,1476,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
9677,1652,3,5,967468418,1,2,14,217,"Christmas Story, A (1983)",Comedy|Drama


In [18]:
x = sequence_feature_encoding(dataset, ['Genres'], '|')

In [19]:
# Collect every distinct genre label: split each Genres cell on '|' and take
# the union of the per-row token sets. The original capitalisation is kept.
vocab = set.union(*[set(str(x).strip().split(sep='|')) for x in dataset['Genres']])
vocab

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Drama',
 'Romance'}

In [20]:
# NOTE(review): `vocab` holds capitalised labels ('Action', "Children's", ...)
# but CountVectorizer lowercases documents by default (lowercase=True), so
# the lowercased tokens never match this vocabulary. That is why the
# transform of 'Action Comedy' further down comes back all zeros.
vec = CountVectorizer(vocabulary=vocab)

In [21]:
# [','.join(str(x).strip().split(sep='|')) for x in dataset['Genres']]

In [22]:
vec.fit([' '.join(str(x).strip().split(sep='|')) for x in dataset['Genres']])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary={"Children's", 'Romance', 'Adventure', 'Drama', 'Animation', 'Action', 'Comedy'})

In [23]:
multi_hot = vec.transform(['Action Comedy', 'Action'])

In [24]:
list(multi_hot.toarray())

[array([0, 0, 0, 0, 0, 0, 0]), array([0, 0, 0, 0, 0, 0, 0])]

In [25]:
# Experiment: r'(?u)\b\w+\b' accepts single-character tokens ('1', 'a', '中'),
# unlike the default pattern r'(?u)\b\w\w+\b', which needs at least two word
# characters per token.
CountVectorizer(token_pattern=r'(?u)\b\w+\b', analyzer='word').fit_transform(['1 2 31', 'a, b, c3', '中 0']).toarray()

array([[0, 1, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [26]:
# Re-join each row's genre labels with spaces instead of '|', giving one
# whitespace-delimited document per row for CountVectorizer.
corpus = [' '.join(str(x).strip().split(sep='|')) for x in dataset['Genres']]

In [27]:
vocab = set.union(*[set(x.split(' ')) for x in corpus])

In [28]:
# Allow apostrophes inside a token so that "Children's" stays one token.
# The vocabulary is learned from the corpus rather than passed in; tokens are
# still lowercased by default, as the learned vocabulary_ shows.
vec = CountVectorizer(token_pattern=r'(?u)\b[\w\']+\b')
# vec = CountVectorizer(vocabulary=vocab)

In [29]:
vec.fit_transform(corpus).toarray()

array([[0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]], dtype=int64)

In [30]:
vec.vocabulary_

{'animation': 2,
 "children's": 3,
 'comedy': 4,
 'action': 0,
 'adventure': 1,
 'romance': 6,
 'drama': 5}