In [None]:
# Built-in Python libraries
from pathlib import Path
import logging
import sys
from typing import Tuple, List, Dict
import os

# PyData libraries
import pandas as pd
import numpy as np
import seaborn as sns

# LensKit libraries
from lenskit.datasets import MovieLens
import lenskit.crossfold as xf
from lenskit.algorithms import basic, bias, user_knn, item_knn, als, svd, ranking
from lenskit import Recommender, topn, batch, util, Predictor
from lenskit.metrics.predict import rmse
from lenskit.metrics.topn import ndcg, hit

In [None]:
# Load the train/test interaction splits and the title metadata.
train_df = pd.read_csv('./Data/train.csv')
titles_df = pd.read_csv('./Data/anime.csv')
# Keep only titles that have a genre string. The explicit .copy() makes the
# filtered result an independent frame, so adding columns to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
titles_df = titles_df[~titles_df['genre'].isna()].copy()
test_df = pd.read_csv('./Data/test.csv')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7020008 entries, 0 to 7020007
Data columns (total 4 columns):
 #   Column      Dtype
---  ------      -----
 0   Unnamed: 0  int64
 1   item        int64
 2   user        int64
 3   rating      int64
dtypes: int64(4)
memory usage: 214.2 MB


In [None]:
class RunAlgorithmTags():
    """Fit a tag-aware algorithm once and generate top-N lists from it.

    Parameters
    ----------
    algo
        The configured (unfitted) algorithm. It is kept on ``self.algo`` and
        is not fitted itself; a clone is fitted instead.
    train
        Training interactions passed as the first argument to ``fit``.
    train_tags
        Tag data forwarded to ``fit`` through the ``tags`` keyword.
    n_jobs
        Number of parallel jobs handed to ``batch.recommend`` (default 10).
    """
    def __init__(self, algo, train, train_tags, n_jobs=10):
        self.algo = algo
        # Clone first, then adapt the clone. The previous code discarded the
        # clone and adapted/fitted ``algo`` directly, mutating the caller's
        # object.
        self.fittable = Recommender.adapt(util.clone(self.algo))
        self.fittable.fit(train, tags=train_tags)
        self.n_jobs = n_jobs

    def make_recs(self, test, n=20):
        """Return ``n`` recommendations for every distinct user in ``test``.

        ``test`` must be indexable by ``'user'``; the unique user ids are
        also stored on ``self.users``.
        """
        self.users = test['user'].unique()
        return batch.recommend(self.fittable, self.users, n, n_jobs=self.n_jobs)

In [None]:
# Build the tag tables: split each comma-separated genre string into a list,
# produce one (item, tag) row per genre, then attach those tags to every
# (user, item) pair in the training interactions.
titles_df['tag'] = titles_df['genre'].str.split(pat=', ')
genres = titles_df.loc[:, ['anime_id', 'tag']].rename(columns={'anime_id': 'item'})
tag_df = genres.explode(column='tag')
train_tag_df = train_df[['user', 'item']].merge(tag_df, on='item')

In [None]:
from lenskit.algorithms import torchtag as tt

# Candidate values for the TorchTagMF search below; each list is swept one
# parameter at a time while the others are held at their current best.
n_features = [50, 100, 200]  # values tried for the `n_features` argument
epochs = [3, 5]              # values tried for the `epochs` argument
reg = [0.01, 0.1, 0.5]       # values tried for the `reg` argument
lr = 0.001                   # `lr` argument, held fixed for every run

def calcNDCG(algorithm,test_df):
    """Generate recommendations and score them with mean nDCG@20.

    Parameters
    ----------
    algorithm
        Object exposing ``make_recs(test_df)`` that returns the
        recommendation frame.
    test_df
        Held-out interactions used both to pick the users to recommend for
        and as the ground truth for scoring.

    Returns
    -------
    tuple
        ``(recs, avg_ndcg)``: the recommendation frame and the mean nDCG
        over all test users.

    Side effects
    ------------
    Writes the recommendations to ``recs.csv`` in the working directory,
    overwriting any previous file.
    """
    recs = algorithm.make_recs(test_df)
    recs.to_csv('recs.csv')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg,k=20)
    scores = rla.compute(recs.copy(), test_df.copy(),include_missing=True)
    # include_missing=True adds rows for test users that received no
    # recommendations; any score left as NaN for them would be skipped by
    # mean(), inflating the average. Count those users as 0 instead.
    avg_ndcg = scores['ndcg'].fillna(0).mean()
    return recs, avg_ndcg

In [None]:
def _fit_and_score(n_feat, n_epochs, reg_value):
    """Train one TorchTagMF configuration and score it on ``test_df``.

    Returns the raw algorithm, its fitted runner, the recommendation frame
    and the mean nDCG, in that order.
    """
    model = tt.TorchTagMF(n_features=n_feat, epochs=n_epochs, reg=reg_value, lr=lr)
    runner = RunAlgorithmTags(model, train_df, train_tag_df)
    rec_frame, score = calcNDCG(runner, test_df)
    return model, runner, rec_frame, score


# Coordinate-wise search: start from the first value of every grid and sweep
# one hyperparameter at a time, keeping whatever improves nDCG.
best_features, best_epochs, best_reg = n_features[0], epochs[0], reg[0]
best_ndcg = 0

# Sweep the number of features with epochs and reg at their starting values.
for run in n_features:
    algo, algorithm, recs, ndcg = _fit_and_score(run, best_epochs, best_reg)
    print(f'{run}: {ndcg:.4f}')
    if ndcg > best_ndcg:
        best_features, best_ndcg = run, ndcg

# Test epochs: epochs[0] was already covered by the sweep above.
algo, algorithm, recs, ndcg = _fit_and_score(best_features, epochs[1], best_reg)
print(f'{epochs[1]}: {ndcg:.4f}')
if ndcg > best_ndcg:
    best_epochs, best_ndcg = epochs[1], ndcg

# Test reg: reg[0] was already covered by the runs above.
for run in reg[1:]:
    algo, algorithm, recs, ndcg = _fit_and_score(best_features, best_epochs, run)
    print(f'{run}: {ndcg:.4f}')
    if ndcg > best_ndcg:
        best_reg, best_ndcg = run, ndcg

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

50: 0.0492


  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

100: 0.0660


  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

200: 0.0823


  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

5: 0.0777


  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

0.1: 0.0936


  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

0.5: 0.0743


In [None]:
# Final fit with the configuration chosen from the search (200 features,
# reg 0.1) and a longer training run of 7 epochs.
final_features, final_epochs, final_reg = 200, 7, .1
algo = tt.TorchTagMF(n_features=final_features,epochs=final_epochs,reg=final_reg,lr=lr)
algorithm = RunAlgorithmTags(algo,train_df,train_tag_df)
recs, ndcg = calcNDCG(algorithm,test_df)
print(f'{ndcg:.4f}')
# best_ndcg is normally set by the tuning cell. Default it to 0 so this cell
# also runs when that (expensive) cell was skipped, instead of raising
# NameError.
best_ndcg = globals().get('best_ndcg', 0)
if ndcg > best_ndcg:
    # Record the whole winning configuration and its score, not just reg.
    best_features, best_epochs, best_reg = final_features, final_epochs, final_reg
    best_ndcg = ndcg

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

  0%|          | 0/6856 [00:00<?, ?it/s]

0.1163


NameError: name 'best_ndcg' is not defined

In [None]:
# Recommend for the users in the evaluation split with the most recently
# fitted `algorithm`, then save the lists as parquet without the index.
eval_df = pd.read_csv(filepath_or_buffer='./Data/eval.csv')
eval_recs = algorithm.make_recs(test=eval_df)
eval_recs.to_parquet(path='torchtag_eval.parquet', index=False)