# 0. Configuration

In [213]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [228]:
# just to make it available to download w/o SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import numpy as np
import pandas as pd

from itertools import islice, cycle, product
from more_itertools import pairwise

from lightfm.data import Dataset
from lightfm import LightFM
from datetime import datetime
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')


## 1. 1. Helper functions to avoid copy paste

In [215]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

# 2. Main

## 2.1. Load Data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [348]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [349]:
interactions["timestamp"] = pd.to_datetime(interactions["timestamp"], unit="s")
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,2009-12-14 02:52:24
1,1,1029,3.0,2009-12-14 02:52:59
2,1,1061,3.0,2009-12-14 02:53:02
3,1,1129,2.0,2009-12-14 02:53:05
4,1,1172,4.0,2009-12-14 02:53:25


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [350]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [351]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [352]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['id'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


## 2.2 Data preparation using LightFM Dataset

To use this class we need the in the following format:
- userId
- movieId
- user_features - user feature names
- item_features - item feature names

It has several methods:
- build_interactions - definition of user / item interactions matrix using iterators on top of tuples:
1. (userId, movieId);
2. (userId, movieId, weight / rating)
- build_user_features/build_item_features - defition of user/item features using iterators on top of tuples:
1. (userId, [user_feature_name1, user_feature_name2, ...]);
2. (userId, {user_feature_name1: weight});
3. The same goes for item features


In [353]:
# init class
dataset = Dataset()

In [354]:
# fit tuple of user and movie interactions
dataset.fit(interactions['userId'].unique(), interactions['movieId'].unique())

We do not have users data in MovieLens dataset so let's skip part features generation

In [355]:
# now, we define lightfm mapper to use it later for checks
lightfm_mapping = dataset.mapping()
# lightfm_mapping

In [356]:
lightfm_mapping = {
    'users_mapping': lightfm_mapping[0],
    'user_features_mapping': lightfm_mapping[1],
    'items_mapping': lightfm_mapping[2],
    'item_features_mapping': lightfm_mapping[3],
}
print('user mapper length - ', len(lightfm_mapping['users_mapping']))
print('user features mapper length - ', len(lightfm_mapping['user_features_mapping']))
print('movies mapper length - ', len(lightfm_mapping['items_mapping']))
print('Users movie features mapper length - ', len(lightfm_mapping['item_features_mapping']))

user mapper length -  671
user features mapper length -  671
movies mapper length -  9066
Users movie features mapper length -  9066


As we do not have user / movie features their length are the same as userId and movieId

In [357]:
# here we create inverted mappers to check recommendations later
lightfm_mapping['users_inv_mapping'] = {v: k for k, v in lightfm_mapping['users_mapping'].items()}
lightfm_mapping['items_inv_mapping'] = {v: k for k, v in lightfm_mapping['items_mapping'].items()}

As we mentioned earlier, we need to create iterators

In [358]:
def df_to_tuple_iterator(df: pd.DataFrame):
    '''
    :df: pd.DataFrame, interactions dataframe
    returs iterator
    '''
    return zip(*df.values.T)

def concat_last_to_list(t):
    return (t[0], list(t[1:])[0])

def df_to_tuple_list_iterator(df):
    return map(concat_last_to_list, zip(*df.values.T))

In [359]:
class TimeRangeSplit():
        
    def __init__(self, 
                 start_date, 
                 end_date=None, 
                 freq='M', 
                 periods=None, 
                 tz=None, 
                 normalize=False, 
                 closed=None, 
                 train_min_date=None,
                 filter_cold_users=True, 
                 filter_cold_items=True, 
                 filter_already_seen=True):
        
        self.start_date = start_date
        if end_date is None and periods is None:
            raise ValueError("Either 'end_date' or 'periods' must be non-zero, not both at the same time.")

        self.end_date = end_date
        self.freq = freq
        self.periods = periods
        self.tz = tz
        self.normalize = normalize
        self.closed = closed
        self.train_min_date = pd.to_datetime(train_min_date, errors='raise')
        self.filter_cold_users = filter_cold_users
        self.filter_cold_items = filter_cold_items
        self.filter_already_seen = filter_already_seen

        self.date_range = pd.date_range(
            start=start_date, 
            end=end_date, 
            freq=freq, 
            periods=periods, 
            tz=tz, 
            normalize=normalize, 
            closed=closed)

        self.max_n_splits = max(0, len(self.date_range) - 1)
        if self.max_n_splits == 0:
            raise ValueError("Provided parametrs set an empty date range.") 

    def split(self, 
              df, 
              user_column='user_id',
              item_column='item_id',
              datetime_column='date',
              fold_stats=False):
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            train_min_mask = df_datetime >= self.train_min_date
        else:
            train_min_mask = df_datetime.notnull()

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        for start, end in pairwise(date_range):
            fold_info = {
                'Start date': start,
                'End date': end
            }
            train_mask = train_min_mask & (df_datetime < start)
            train_idx = df.index[train_mask]
            if fold_stats:
                fold_info['Train'] = len(train_idx)

            test_mask = (df_datetime >= start) & (df_datetime < end)
            test_idx = df.index[test_mask]
            
            if self.filter_cold_users:
                new = np.setdiff1d(
                    df.loc[test_idx, user_column].unique(), 
                    df.loc[train_idx, user_column].unique())
                new_idx = df.index[test_mask & df[user_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New users'] = len(new)
                    fold_info['New users interactions'] = len(new_idx)

            if self.filter_cold_items:
                new = np.setdiff1d(
                    df.loc[test_idx, item_column].unique(), 
                    df.loc[train_idx, item_column].unique())
                new_idx = df.index[test_mask & df[item_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New items'] = len(new)
                    fold_info['New items interactions'] = len(new_idx)

            if self.filter_already_seen:
                user_item = [user_column, item_column]
                train_pairs = df.loc[train_idx, user_item].set_index(user_item).index
                test_pairs = df.loc[test_idx, user_item].set_index(user_item).index
                intersection = train_pairs.intersection(test_pairs)
                test_idx = test_idx[~test_pairs.isin(intersection)]
                # test_mask = rd.df.index.isin(test_idx)
                if fold_stats:
                    fold_info['Known interactions'] = len(intersection)

            if fold_stats:
                fold_info['Test'] = len(test_idx)

            yield (train_idx, test_idx, fold_info)

    def get_n_splits(self, df, datetime_column='date'):
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            df_datetime = df_datetime[df_datetime >= self.train_min_date]

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        return max(0, len(date_range) - 1)

In [368]:
last_date = interactions_filtered['timestamp'].max().normalize()
folds = 20
start_date = last_date - pd.Timedelta(weeks=folds)
start_date, last_date

(Timestamp('2016-05-29 00:00:00'), Timestamp('2016-10-16 00:00:00'))

In [369]:
cv = TimeRangeSplit(start_date=start_date, periods=folds+1)
cv.max_n_splits, cv.get_n_splits(interactions_filtered, datetime_column='timestamp')


(20, 4)

In [370]:
cv.date_range

DatetimeIndex(['2016-05-31', '2016-06-30', '2016-07-31', '2016-08-31',
               '2016-09-30', '2016-10-31', '2016-11-30', '2016-12-31',
               '2017-01-31', '2017-02-28', '2017-03-31', '2017-04-30',
               '2017-05-31', '2017-06-30', '2017-07-31', '2017-08-31',
               '2017-09-30', '2017-10-31', '2017-11-30', '2017-12-31',
               '2018-01-31'],
              dtype='datetime64[ns]', freq='M')

In [371]:
folds_with_stats = list(cv.split(
    interactions_filtered, 
    user_column='userId',
    item_column='movieId',
    datetime_column='timestamp',
    fold_stats=True
))

folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])

In [372]:
folds_info_with_stats.head()

Unnamed: 0,Start date,End date,Train,New users,New users interactions,New items,New items interactions,Known interactions,Test
0,2016-05-31,2016-06-30,43982,4,199,9,9,0,39
1,2016-06-30,2016-07-31,44229,3,147,6,6,0,52
2,2016-07-31,2016-08-31,44434,4,318,8,8,0,85
3,2016-08-31,2016-09-30,44845,1,26,1,1,0,16


We will test our data sequnetially for these periods, frequency set as M

In [373]:
fold_dates = [(info['Start date'], info['End date']) for _, _, info in folds_with_stats]
fold_dates

[(Timestamp('2016-05-31 00:00:00', freq='M'),
  Timestamp('2016-06-30 00:00:00', freq='M')),
 (Timestamp('2016-06-30 00:00:00', freq='M'),
  Timestamp('2016-07-31 00:00:00', freq='M')),
 (Timestamp('2016-07-31 00:00:00', freq='M'),
  Timestamp('2016-08-31 00:00:00', freq='M')),
 (Timestamp('2016-08-31 00:00:00', freq='M'),
  Timestamp('2016-09-30 00:00:00', freq='M'))]

In [374]:
train_ds, test_ds, info = folds_with_stats[0]

train = interactions_filtered.loc[train_ds]
test = interactions_filtered.loc[test_ds]
train.shape, test.shape

((43982, 4), (39, 4))

In [376]:
# defining train set on the whole interactions dataset (as HW you will have to split into test and train for evaluation)
train_mat, train_mat_weights = dataset.build_interactions(df_to_tuple_iterator(train[['userId', 'movieId']]))

In [377]:
train_mat

<671x9066 sparse matrix of type '<class 'numpy.int32'>'
	with 43982 stored elements in COOrdinate format>

In [378]:
train_mat_weights

<671x9066 sparse matrix of type '<class 'numpy.float32'>'
	with 43982 stored elements in COOrdinate format>

## 2.3. Model Training & Evaluation

### 2.3.1. Train Model

In [379]:
# set params
NO_COMPONENTS = 64
LEARNING_RATE = .03
LOSS = 'warp'
MAX_SAMPLED = 5
RANDOM_STATE = 42
EPOCHS = 20

In [380]:
# init model
lfm_model = LightFM(
    no_components = NO_COMPONENTS,
    learning_rate = LEARNING_RATE,
    loss = LOSS,
    max_sampled = MAX_SAMPLED,
    random_state = RANDOM_STATE
    )

In [381]:
# execute training
for _ in tqdm_notebook(range(EPOCHS), total = EPOCHS):
    lfm_model.fit_partial(
        train_mat, 
        num_threads = 4
    )

  0%|          | 0/20 [00:00<?, ?it/s]

### 2.3.2. Evaluate the Model

In [385]:
# let's check on test
top_N = 10
user_id = test['userId'].iloc[0]
row_id = lightfm_mapping['users_mapping'][user_id]
print(f'Rekko for user {user_id}, row number in matrix - {row_id}')

Rekko for user 15, row number in matrix - 14


In [386]:
all_cols = list(lightfm_mapping['items_mapping'].values())
len(all_cols)

9066

In [387]:
pred = lfm_model.predict(
    row_id,
    all_cols,
    num_threads = 4)
pred, pred.shape

(array([-4.510231 , -4.0927887, -4.5332355, ..., -2.9237242, -4.83543  ,
        -4.726346 ], dtype=float32),
 (9066,))

In [388]:
top_cols = np.argpartition(pred, -np.arange(top_N))[-top_N:][::-1]
top_cols

array([ 49,  99,  92,  72, 119, 480, 122, 157, 173,  64])

In [389]:
pred[top_cols]

array([4.3027835, 3.9803643, 3.9174566, 3.633384 , 3.3633723, 3.3061836,
       3.293293 , 3.0814886, 3.0187354, 2.6255205], dtype=float32)

In [193]:
# crate mapper for movieId and title names
item_name_mapper = dict(zip(movies_metadata['id'], movies_metadata['original_title']))

In [393]:
recs = pd.DataFrame({'col_id': top_cols})
recs['movieId'] = recs['col_id'].map(lightfm_mapping['items_inv_mapping'].get).astype(str)
recs['title'] = recs['movieId'].map(item_name_mapper)
recs

Unnamed: 0,col_id,movieId,title
0,49,296,Terminator 3: Rise of the Machines
1,99,318,The Million Dollar Hotel
2,92,593,Солярис
3,72,480,Monsoon Wedding
4,119,2762,Young and Innocent
5,480,4993,5 Card Stud
6,122,2959,License to Wed
7,157,858,Sleepless in Seattle
8,173,1089,Point Break
9,64,377,A Nightmare on Elm Street


In [433]:
recs = pd.DataFrame({
    'user_id': test['userId'].unique()
})

In [428]:
def lfm_recommend(model, item_ids, known_items, N, user_mapping, item_inv_mapping, num_threads=4):
    def _recs_mapper(user):
        user_id = user_mapping[user]
        recs = model.predict(user_id, item_ids, num_threads=num_threads)
        
        additional_N = len(known_items[user_id]) if user_id in known_items else 0
        total_N = N + additional_N
        top_cols = np.argpartition(recs, -np.arange(total_N))[-total_N:][::-1]
        
        final_recs = [item_inv_mapping[item] for item in top_cols]
        if additional_N > 0:
            filter_items = known_items[user_id]
            final_recs = [item for item in final_recs if item not in filter_items]
        return final_recs[:N]
    return _recs_mapper

In [429]:
known_items = train.groupby('userId')['movieId'].apply(list).to_dict()
known_items[180]

['24', '223', '527', '700', '1073', '1127', '1265', '2013', '2502', '2699']

In [434]:
mapper = lfm_recommend(
    lfm_model, 
    item_ids=all_cols, 
    known_items=known_items,
    N=top_N, 
    user_mapping=lightfm_mapping['users_mapping'],
    item_inv_mapping=lightfm_mapping['items_inv_mapping'],
    num_threads=4
)

In [435]:
recs['movies_id'] = recs['user_id'].map(mapper)
recs.head()

Unnamed: 0,user_id,movies_id
0,15,"[296, 318, 593, 480, 2762, 4993, 2959, 858, 10..."
1,62,"[2959, 4226, 2762, 318, 4973, 296, 4011, 858, ..."
2,72,"[4993, 2959, 58559, 8961, 4226, 4886, 2762, 31..."
3,73,"[1917, 1580, 4896, 1573, 364, 780, 293, 2115, ..."
4,262,"[2959, 4226, 296, 1732, 4973, 111, 2762, 4011,..."


Here we can see the recommended movies for 2 users (head) with the rank on the test dataset.

In [436]:
recs = recs.explode('movies_id')
recs['rank'] = recs.groupby('user_id').cumcount() + 1
recs.head(20)

Unnamed: 0,user_id,movies_id,rank
0,15,296,1
0,15,318,2
0,15,593,3
0,15,480,4
0,15,2762,5
0,15,4993,6
0,15,2959,7
0,15,858,8
0,15,1089,9
0,15,608,10


As we don't have y_true taking zero and one values for ndcg

In [474]:
from math import log2
def compute_gain(y_value: float, gain_scheme: str) -> float:
    
    gain = {'exp2': 2 ** y_value - 1,
            'const': y_value}

    return float(gain[gain_scheme])

In [475]:
def dcgk(y_true: np.array, y_pred: np.array, k: int, gain_scheme: str) -> float:
    
    dcg = 0
    argsort = np.argsort(y_pred)[::-1]
    y_true_sorted = np.take(y_true, argsort[:k])

    for idx, val in enumerate(y_true_sorted, 1):
        gain = compute_gain(val, gain_scheme)
        dcg += gain / log2(idx + 1)
        
    return dcg

def ndcgk(y_true: np.array, ys_pred: np.array, k: int, gain_scheme: str = 'const') -> float:
    
    # pred dcg then we calc the same to find max possible
    preds_dcg = dcgk(y_true, ys_pred, k, gain_scheme)
    max_possible_dcg = dcgk(y_true, y_true, k, gain_scheme)

    return preds_dcg / max_possible_dcg

In [476]:
y_pred = np.array([1,2,3,4,5,6,7,8,9,10])
y_true = np.array([0,0,0,1,0,0,1,0,0,1])

In [477]:
ndcgk(y_true, y_pred, 10, 'exp2')

0.8278123145308894

# TODO
- Make train/test split -- train the model appropiately and predict on test set;
- Wrap up in function recommendations - lfm_recommend();
- Calculate `NDCG@10` on test set