In [None]:
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip
!unzip ml-1m.zip
!pip install lightfm

In [2]:
import time
import math
import itertools
import numpy as np
import pandas as pd
import torch as torch
import torch.utils.data as data
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
from sklearn.manifold import TSNE
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k, auc_score

# 1) Factorization Machines (FM)


Factorization Machines (FM) is a generalization of the linear regression model and the matrix factorization model. It is a generic supervised learning model that map arbitrary real-valued features into a low-dimensional latent factor space and can be applied naturally to a wide variety of prediction tasks including regression, classification, and ranking.

<img src='https://miro.medium.com/max/1400/1*QMTYo7IjZOYS3LW2boSZJw.webp'>


https://towardsdatascience.com/factorization-machines-for-item-recommendation-with-implicit-feedback-data-5655a7c749db

## 1.1 Data Loading

For a start, we want to represent the user-item interaction as a one-hot encoding vector, where each row of the transformed will only have a single active user and item. We can then add in auxiliary features (e.g. other movies the user has rated, last movie rated, time he consumed that movie, etc) either as one-hot encodings or normalized vectors.

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
movies_df = pd.read_csv(
    'ml-1m/movies.dat', 
    sep='::',
    names=['movieId', 'title', 'genres'],
    encoding='latin-1', 
    engine='python'
)
movies_df['movieId_index'] = movies_df['movieId'].astype('category').cat.codes

users_df = pd.read_csv(
    'ml-1m/users.dat',sep='::',
    header=None,
    names=['userId', 'gender', 'age', 'occupation', 'zipcode'],
    engine='python'
    )
users_df['gender_index'] = users_df['gender'].astype('category').cat.codes
users_df['age_index'] = users_df['age'].astype('category').cat.codes
users_df['occupation_index'] = users_df['occupation'].astype('category').cat.codes
users_df['userId_index'] = users_df['userId'].astype('category').cat.codes

ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    names=['userId', 'movieId', 'rating', 'time'],
    engine='python'
    )
ratings = ratings.join(movies_df.set_index('movieId'), on='movieId')
ratings = ratings.join(users_df.set_index('userId'), on='userId')
feature_columns = [
    'userId_index', 'movieId_index', 'age_index', 'gender_index', 'occupation_index'
]

features_sizes = {
    'userId_index': len(ratings['userId_index'].unique()),
    'movieId_index': len(ratings['movieId_index'].unique()),
    'age_index': len(ratings['age_index'].unique()),
    'gender_index': len(ratings['gender_index'].unique()),
    'occupation_index': len(ratings['occupation_index'].unique()),
}

next_offset = 0
features_offsets = {}
for k, v in features_sizes.items():
    features_offsets[k] = next_offset
    next_offset += v

for column in feature_columns:
    ratings[column] = ratings[column].apply(lambda c: c + features_offsets[column]) 
ratings[[*feature_columns, 'rating']].head(5)

Unnamed: 0,userId_index,movieId_index,age_index,gender_index,occupation_index,rating
0,0,7216,9746,9753,9765,5
1,0,6695,9746,9753,9765,3
2,0,6942,9746,9753,9765,3
3,0,9379,9746,9753,9765,4
4,0,8326,9746,9753,9765,5


In [4]:
data_x = torch.tensor(ratings[feature_columns].values)
data_y = torch.tensor(ratings['rating'].values).float()
dataset = data.TensorDataset(data_x, data_y)

batch_size = 256
train_n = int(len(dataset) * 0.9)
valid_n = len(dataset) - train_n
splits = [train_n, valid_n]
trainset, devset = torch.utils.data.random_split(dataset, splits)
train_dataloader = data.DataLoader(trainset, batch_size=batch_size, shuffle=True)
dev_dataloader = data.DataLoader(devset, batch_size=batch_size, shuffle=False)
x, y = next(iter(train_dataloader))
print(x)

tensor([[ 766, 9067, 9748, 9754, 9767],
        [2899, 7055, 9747, 9754, 9767],
        [5285, 7657, 9750, 9754, 9771],
        ...,
        [3069, 6775, 9747, 9753, 9759],
        [2250, 9498, 9749, 9754, 9769],
        [3506, 9052, 9748, 9754, 9755]])


## 1.2 Modeling

In [5]:
# The embeddings are initialized with a truncated normal function
# from the fastai library and improves learning speed a lot
def trunc_normal_(x, mean=0., std=1.):
    "Truncated normal initialization."
    return x.normal_().fmod_(2).mul_(std).add_(mean)

class FMModel(nn.Module):
    def __init__(self, num_inputs, emb_dim):
        super().__init__()

        self.embeddings = nn.Embedding(num_inputs, emb_dim)
        self.w0 = nn.Parameter(torch.zeros(1))
        self.bias = nn.Embedding(num_inputs, 1)
        
        with torch.no_grad(): trunc_normal_(self.embeddings.weight, std=0.01)
        with torch.no_grad(): trunc_normal_(self.bias.weight, std=0.01)

    def forward(self, X):
        
        # [m, feat_size, k]
        emb = self.embeddings(X)

        # calculate the matrix factorization in complexity of O(num_inputs * emb_dim)
        pow_of_sum = emb.sum(dim=1).pow(2) # [m, emb_dim]
        sum_of_pow = emb.pow(2).sum(dim=1) # [m, emb_dim]
        pairwise = (pow_of_sum - sum_of_pow).sum(1) * 0.5 # [m]
        bias = self.bias(X).squeeze().sum(1) # [m]
        return torch.sigmoid(self.w0 + bias + pairwise)

### testing
model = FMModel(data_x.max() + 1, 120).to(device)
x, y = next(iter(train_dataloader))
result = model(x)
print(result.size())

torch.Size([256])


In [6]:
def fit(iterator, model, optimizer, criterion):
    train_loss = 0
    model.train()
    for x, y in iterator:
        optimizer.zero_grad()
        y_hat = model(x.to(device))
        loss = criterion(y_hat, y.to(device))
        train_loss += loss.item()*x.shape[0]
        loss.backward()
        optimizer.step()
    return train_loss / len(iterator.dataset)

def test(iterator, model, criterion):
    train_loss = 0
    model.eval()
    for x,y in iterator:                    
        with torch.no_grad():
            y_hat = model(x.to(device))
        loss = criterion(y_hat, y.to(device))
        train_loss += loss.item()*x.shape[0]
    return train_loss / len(iterator.dataset)

def train_n_epochs(model, n, optimizer,scheduler):
    criterion = nn.MSELoss().to(device)
    for epoch in range(n):
        start_time = time.time()
        train_loss = fit(train_dataloader, model, optimizer, criterion)
        valid_loss = test(dev_dataloader, model, criterion)
        scheduler.step()
        secs = int(time.time() - start_time)
        print(f'epoch {epoch}. time: {secs}[s]')
        print(f'\ttrain rmse: {(math.sqrt(train_loss)):.4f}')
        print(f'\tvalidation rmse: {(math.sqrt(valid_loss)):.4f}')

### testing
model = FMModel(data_x.max() + 1, 120).to(device)
decay = 1e-5
lr = 0.001
epochs = 10
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=decay)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[7], gamma=0.1)
criterion = nn.MSELoss().to(device)

for epoch in range(epochs):
    train_loss = fit(train_dataloader, model, optimizer, criterion)
    valid_loss = test(dev_dataloader, model, criterion)
    scheduler.step()
    break

## 1.3 Embedding Generation

In [7]:
movies = ratings.drop_duplicates('movieId_index').copy()
movie_embeddings = model.embeddings(
    torch.tensor(movies['movieId_index'].values,device=device).long())

movies['embedding'] = movie_embeddings.tolist()
movie_biases = model.bias(torch.tensor(movies['movieId_index'].values,device=device).long())
movies['bias'] = movie_biases.cpu().detach().numpy()
movies[['title', 'movieId_index', 'embedding', 'bias']].head()

Unnamed: 0,title,movieId_index,embedding,bias
0,One Flew Over the Cuckoo's Nest (1975),7216,"[-0.02224455028772354, -0.029453467577695847, ...",0.034443
1,James and the Giant Peach (1996),6695,"[-0.011640001088380814, -0.01070540864020586, ...",0.028304
2,My Fair Lady (1964),6942,"[-0.017789075151085854, -0.01796894334256649, ...",0.038714
3,Erin Brockovich (2000),9379,"[-0.02009616047143936, -0.021841367706656456, ...",0.032325
4,"Bug's Life, A (1998)",8326,"[-0.03267267718911171, -0.028340822085738182, ...",0.029687


# 2) Field Aware Factorization Machines (FFM)

Comparing FFM versus FM, FFM learns multiple latent vector for each feature while FM learns a latent vector for each feature. One can interpret the former as trying to represent the interactions in a more granular-level. As such, the number of latent features k needed to represent such granular interactions is lesser i.e. k in FFM << k in FM.

In the official FFM paper, it is empirically proven that for large, sparse datasets with many categorical features, FFM performs better. Conversely, for small and dense datasets or numerical datasets, FFM may not be as effective as FM. FFM is also prone to overfitting on the training dataset, hence one should use a standalone validation set and use early stopping when the loss increases.

https://towardsdatascience.com/an-intuitive-explanation-of-field-aware-factorization-machines-a8fee92ce29f

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
movies_df = pd.read_csv(
    'ml-1m/movies.dat', 
    sep='::',
    names=['movieId', 'title', 'genres'],
    encoding='latin-1', 
    engine='python'
)
movies_df['movieId_index'] = movies_df['movieId'].astype('category').cat.codes

users_df = pd.read_csv(
    'ml-1m/users.dat',sep='::',
    header=None,
    names=['userId', 'gender', 'age', 'occupation', 'zipcode'],
    engine='python'
    )
users_df['gender_index'] = users_df['gender'].astype('category').cat.codes
users_df['age_index'] = users_df['age'].astype('category').cat.codes
users_df['occupation_index'] = users_df['occupation'].astype('category').cat.codes
users_df['userId_index'] = users_df['userId'].astype('category').cat.codes

ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    names=['userId', 'movieId', 'rating', 'time'],
    engine='python'
    )
ratings = ratings.join(movies_df.set_index('movieId'), on='movieId')
ratings = ratings.join(users_df.set_index('userId'), on='userId')
feature_columns = [
    'userId_index', 'movieId_index', 'age_index', 'gender_index', 'occupation_index'
]

features_sizes = {
    'userId_index': len(ratings['userId_index'].unique()),
    'movieId_index': len(ratings['movieId_index'].unique()),
    'age_index': len(ratings['age_index'].unique()),
    'gender_index': len(ratings['gender_index'].unique()),
    'occupation_index': len(ratings['occupation_index'].unique()),
}

ratings[[*feature_columns, 'rating']].head(5)

Unnamed: 0,userId_index,movieId_index,age_index,gender_index,occupation_index,rating
0,0,1176,0,0,10,5
1,0,655,0,0,10,3
2,0,902,0,0,10,3
3,0,3339,0,0,10,4
4,0,2286,0,0,10,5


In [9]:
class FeaturesLinear(torch.nn.Module):

    def __init__(self, field_dims, output_dim=1):
        super().__init__()
        self.fc = torch.nn.Embedding(field_dims, output_dim)
        self.bias = torch.nn.Parameter(torch.zeros((output_dim,)))
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return torch.sum(self.fc(x), dim=1) + self.bias

class FieldAwareFactorizationMachine(torch.nn.Module):

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.num_fields = len(field_dims)
        self.embeddings = torch.nn.ModuleList([
            torch.nn.Embedding(sum(field_dims), embed_dim) for _ in range(self.num_fields)
        ])
        self.offsets = np.array((0, *np.cumsum(field_dims)[:-1]), dtype=np.long)
        for embedding in self.embeddings:
            torch.nn.init.xavier_uniform_(embedding.weight.data)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        xs = [self.embeddings[i](x) for i in range(self.num_fields)]
        ix = list()
        for i in range(self.num_fields - 1):
            for j in range(i + 1, self.num_fields):
                ix.append(xs[j][:, i] * xs[i][:, j])
        ix = torch.stack(ix, dim=1)
        return ix

class FieldAwareFactorizationMachineModel(torch.nn.Module):

    def __init__(self, field_dims, embed_dim):
        super().__init__()
        self.linear = FeaturesLinear(field_dims)
        self.ffm = FieldAwareFactorizationMachine(field_dims, embed_dim)

    def forward(self, x):
        """
        :param x: Long tensor of size ``(batch_size, num_fields)``
        """
        ffm_term = torch.sum(torch.sum(self.ffm(x), dim=1), dim=1, keepdim=True)
        x = self.linear(x) + ffm_term
        return torch.sigmoid(x.squeeze(1))

# 3) LightFM

One of the strong points for LightFM is that the model does not suffer from cold start problem, both user and item cold start. The reason is that LightFM allows building a hybrid recommender system.

[WARP and BPR](https://www.ethanrosenthal.com/2016/11/07/implicit-mf-part-2/)

## 3.1 Data Loader

In [10]:
from lightfm.data import Dataset

In [11]:
movies_df = pd.read_csv(
    'ml-1m/movies.dat', 
    sep='::',
    names=['movieId', 'title', 'genres'],
    encoding='latin-1', 
    engine='python'
)
movies_df['genres_index'] = movies_df['genres'].astype('category').cat.codes

users_df = pd.read_csv(
    'ml-1m/users.dat',sep='::',
    header=None,
    names=['userId', 'gender', 'age', 'occupation', 'zipcode'],
    engine='python'
    )
users_df['gender_index'] = users_df['gender'].astype('category').cat.codes
users_df['age_index'] = users_df['age'].astype('category').cat.codes
users_df['occupation_index'] = users_df['occupation'].astype('category').cat.codes

ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    sep='::',
    names=['userId', 'movieId', 'rating', 'time'],
    engine='python'
    )

users_df = users_df[users_df['userId'].isin(ratings['userId'].tolist())]
movies_df = movies_df[movies_df['movieId'].isin(ratings['movieId'].tolist())]
movies_df['movieId'] = movies_df['movieId'].astype('category').cat.codes
users_df['userId'] = users_df['userId'].astype('category').cat.codes

ratings = ratings.join(movies_df.set_index('movieId'), on='movieId')
ratings = ratings.join(users_df.set_index('userId'), on='userId')
ratings.rename(columns={'userId' : 'user_id', 'movieId' : 'product_id'}, inplace=True)

user_feature_columns = ['age_index', 'gender_index', 'occupation_index']
item_feature_columns = ['genres_index']
ratings.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['movieId'] = movies_df['movieId'].astype('category').cat.codes


Unnamed: 0,user_id,product_id,rating,time,title,genres,genres_index,gender,age,occupation,zipcode,gender_index,age_index,occupation_index
0,1,1193,5,978300760,Heathers (1989),Comedy,176.0,M,56.0,16.0,70072,1.0,6.0,16.0
1,1,661,3,978302109,Operation Dumbo Drop (1995),Action|Adventure|Comedy|War,15.0,M,56.0,16.0,70072,1.0,6.0,16.0


In [12]:
def create_rate_matrix(df, shuffle=True, split_ratio=0.8):

    if shuffle:
        df = df.sample(frac=1).reset_index(drop = True)
    split_point = np.int(np.round(df.shape[0] * split_ratio))
    # df_train = df.iloc[0: split_point]
    df_train = df.copy()
    df_test = df.iloc[split_point::]
    df_test = df_test[
        (df_test['user_id'].isin(df_train['user_id'])) & \
        (df_test['product_id'].isin(df_train['product_id']))
        ]

    id_cols = ['user_id', 'product_id']
    trans_cat_train = dict()
    trans_cat_test = dict()
  
    encoder = dict()
    for k in id_cols:
        le = LabelEncoder()
        trans_cat_train[k] = le.fit_transform(df_train[k].values)
        trans_cat_test[k] = le.transform(df_test[k].values)
        encoder[k] = le

    trans_cat_train['rating'] = df_train['rating']
    trans_cat_test['rating'] = df_test['rating']
    
    users = np.unique(trans_cat_train['user_id'])
    items = np.unique(trans_cat_train['product_id'])
    n_users = len(users)
    n_items = len(items)    
    
    rate_matrix = dict()
    rate_matrix['train'] = coo_matrix((trans_cat_train['rating'],
                                       (trans_cat_train['user_id'],
                                        trans_cat_train['product_id'])),
                                      shape = (n_users, n_items))
    
    rate_matrix['test'] = coo_matrix((trans_cat_test['rating'],
                                      (trans_cat_test['user_id'],
                                       trans_cat_test['product_id'])),
                                     shape = (n_users, n_items))
    
    return rate_matrix, users, items, encoder

rating_matrix, users, items, encoder_dict = create_rate_matrix(
    ratings, shuffle=True, split_ratio=0.8)
pd.DataFrame(rating_matrix['train'].toarray())

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  split_point = np.int(np.round(df.shape[0] * split_ratio))


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3696,3697,3698,3699,3700,3701,3702,3703,3704,3705
0,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6035,0,0,0,2,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6036,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
def generate_feature_list(df, columns):
    '''
    Generate the list of features of corresponding columns to list
    In order to fit the lightdm Dataset
    '''
    features = df[columns].apply(lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = features.apply(pd.Series).stack().reset_index(drop=True)
    return features

def prepare_item_features(df, columns, id_col_name):
    '''
    Prepare the corresponding feature formats for 
    the lightdm.dataset's build_item_features function
    '''
    features = df[columns].apply(lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = list(zip(df[id_col_name], features))
    return features

fitting_user_features = generate_feature_list(users_df, user_feature_columns)
lightdm_user_features = prepare_item_features(users_df, user_feature_columns, 'userId')

fitting_item_features = generate_feature_list(movies_df, item_feature_columns)
lightdm_item_features = prepare_item_features(movies_df, item_feature_columns, 'movieId')

In [25]:
dataset = Dataset()
dataset.fit(
    users, 
    items, 
    item_features=fitting_item_features,
    user_features=fitting_user_features
    )

item_feature = dataset.build_item_features(
    lightdm_item_features, 
    normalize = True
)

user_feature = dataset.build_user_features(
    lightdm_user_features, 
    normalize = True
)

## 3.2 Model Training

In [31]:
# collaborative filtering
model_cf = LightFM(
    loss='warp',
    no_components=160,
    item_alpha=1e-7,
    learning_rate=0.02,
    max_sampled=10
)

model_cf.fit(rating_matrix['train'], epochs=10, num_threads=4)

<lightfm.lightfm.LightFM at 0x7fb6246cac70>

In [33]:
df_result = pd.DataFrame(columns = ['Method', 'Evaluation Metric', 'Train', 'Test'])
auc_train = auc_score(model_cf, rating_matrix['train']).mean()
auc_test = auc_score(model_cf, rating_matrix['test']).mean()
precision_train = precision_at_k(model_cf, rating_matrix['train'], k = 10).mean()
precision_test = precision_at_k(model_cf, rating_matrix['test'], k = 10).mean()
precision_train = precision_at_k(model, rating_matrix['train'], item_features = item_feature, k = 10).mean()
precision_test = precision_at_k(model, rating_matrix['test'], item_features = item_feature, k = 10).mean()

print(
    f'auc_train: {auc_train}',
    f'auc_test: {auc_test}',
    f'precision_train: {precision_train}',
    f'precision_test: {precision_test}',
)

In [34]:
# no_components (int, optional) – the dimensionality of the feature latent embeddings
# item_alpha (float, optional) – L2 penalty on item features. 
# max_sampled (int, optional) – maximum number of negative samples used during WARP fitting.

# hybrid
model_hybrid = LightFM(
    loss= 'warp',
    no_components=160,
    item_alpha=1e-7,
    learning_rate=0.02,
    max_sampled=10)

model_hybrid.fit(
    rating_matrix['train'], 
    item_features=item_feature, 
    user_feature=user_feature, 
    epochs=10, 
    num_threads=4
)

In [35]:
df_result = pd.DataFrame(columns = ['Method', 'Evaluation Metric', 'Train', 'Test'])
auc_train = auc_score(model_hybrid, rating_matrix['train']).mean()
auc_test = auc_score(model_hybrid, rating_matrix['test']).mean()
precision_train = precision_at_k(model_hybrid, rating_matrix['train'], k = 10).mean()
precision_test = precision_at_k(model_hybrid, rating_matrix['test'], k = 10).mean()
precision_train = precision_at_k(model_hybrid, rating_matrix['train'], item_features = item_feature, k=10).mean()
precision_test = precision_at_k(model_hybrid, rating_matrix['test'], item_features = item_feature, k=10).mean()

print(
    f'auc_train: {auc_train}',
    f'auc_test: {auc_test}',
    f'precision_train: {precision_train}',
    f'precision_test: {precision_test}',
)

### 3.3 Prediction

In [None]:
user_index = None
target_item_indices = 100

if user_index is not None:
    predictions = model.predict([user_index, ], np.array(target_item_indices))
else:
    predictions = model.predict(0, np.array(target_item_indices), user_features=user_feature[0])
print(predictions)