In [None]:
import os
import sys
import time
import typing as tp
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np

import optuna
import pandas as pd
import seaborn as sns
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import MAP, Precision, Recall, calc_metrics
from rectools.models import (
    ImplicitALSWrapperModel,
    LightFMWrapperModel,
    PopularModel,
    RandomModel,
)
from tqdm import tqdm

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!{sys.executable} - m pip install lightfm
!{sys.executable} - m  pip install rectools
!{sys.executable} - m  pip install seaborn
!{sys.executable} - m  pip install ipykernel
!{sys.executable} - m  pip install Jinja2
!{sys.executable} - m  pip install hnswlib
!{sys.executable} - m  pip install implicit
!{sys.executable} - m  pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!mkdir ../data
!wget https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip -O ../data/data_original.zip
!unzip ../data/data_original.zip -d ../data

mkdir: cannot create directory ‘../data’: File exists
--2022-12-13 16:15:25--  https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 78795385 (75M) [application/zip]
Saving to: ‘../data/data_original.zip’


2022-12-13 16:15:37 (7.90 MB/s) - ‘../data/data_original.zip’ saved [78795385/78795385]

Archive:  ../data/data_original.zip
replace ../data/kion_train/interactions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: ../data/kion_train/interactions.csv  
  inflating: ../data/__MACOSX/kion_train/._interactions.csv  
  inflating: ../data/kion_train/users.csv  
  inflating: ../data/__MACOSX/kion_train/._users.csv  
  inflating: ../data/kion_train/items.csv  
  inflating: ../data/__MACOSX/kion_train/._items.csv  


In [None]:
# Restrict OpenBLAS to one thread for the implicit ALS model.
# NOTE(review): numpy and implicit are already imported by the first cell; BLAS
# libraries usually read this variable when they are loaded, so it may need to be
# set before those imports to have any effect — confirm.
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

# Load data

In [None]:
# Read the three KION tables (interactions, users, items) from the unpacked archive.
interactions, users, items = (
    pd.read_csv(f"../data/kion_train/{table_name}.csv")
    for table_name in ("interactions", "users", "items")
)

CPU times: user 2.65 s, sys: 194 ms, total: 2.85 s
Wall time: 2.93 s


# Preprocess

In [None]:
# Re-point rectools' `Columns.Datetime` constant at this dataset's date column so
# the following cells can address "last_watch_dt" through `Columns.Datetime`.
# NOTE(review): this mutates a library-level class attribute for the whole kernel;
# renaming the DataFrame column instead would avoid the monkey-patch — confirm
# nothing relies on the library default before changing it.
Columns.Datetime = "last_watch_dt"

In [None]:
# Remove rows whose date string is not exactly 10 characters long
# (the length of a YYYY-MM-DD date); they cannot be parsed by the next cell.
has_bad_date = interactions[Columns.Datetime].str.len() != 10
interactions = interactions.drop(index=interactions.loc[has_bad_date].index)

In [None]:
# Convert the date strings to datetime64 values using the fixed YYYY-MM-DD format.
raw_dates = interactions[Columns.Datetime]
interactions[Columns.Datetime] = pd.to_datetime(raw_dates, format="%Y-%m-%d")

In [None]:
# Latest interaction date in the data; the train/test split below is anchored to it.
max_date = interactions[Columns.Datetime].max()

In [None]:
# Interaction weight: 3 when more than 10% of the item was watched, otherwise 1.
watched_over_10_pct = interactions["watched_pct"] > 10
interactions[Columns.Weight] = np.where(watched_over_10_pct, 3, 1)

In [None]:
# Time-based split: everything from the last 7 days (inclusive of the boundary)
# goes to test, everything strictly earlier goes to train.
split_date = max_date - pd.Timedelta(days=7)
is_before_split = interactions[Columns.Datetime] < split_date
is_from_split = interactions[Columns.Datetime] >= split_date

train = interactions.loc[is_before_split].copy()
test = interactions.loc[is_from_split].copy()

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [None]:
# Discard training interactions whose total_dur is below 300
# (only train is filtered; test is left as is).
is_short_view = train["total_dur"] < 300
train = train.drop(index=train.loc[is_short_view].index)

In [None]:
# Users that appear in test but have no interactions in train.
cold_users = set(test[Columns.User]).difference(train[Columns.User])

In [None]:
# Remove cold users from test: the models below are fitted on train only.
is_cold_user = test[Columns.User].isin(cold_users)
test = test.drop(index=test.loc[is_cold_user].index)

# Prepare features

## User features

In [None]:
# Number of missing values in each user column.
users.isna().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [None]:
# Treat missing demographic values as their own "Unknown" category.
users = users.fillna("Unknown")

In [None]:
# Number of distinct values per user column (after filling missing values).
users.nunique()

user_id     840197
age              7
income           7
sex              3
kids_flg         2
dtype: int64

In [None]:
# Keep only the users that have at least one interaction in train.
is_train_user = users[Columns.User].isin(train[Columns.User])
users = users.loc[is_train_user].copy()

In [None]:
# Display the filtered user table.
# NOTE(review): this renders the whole frame; `users.head()` would be enough for a preview.
users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
5,1037719,age_45_54,income_60_90,М,0
...,...,...,...,...,...
840184,529394,age_25_34,income_40_60,Ж,0
840186,80113,age_25_34,income_40_60,Ж,0
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1


In [None]:
# Reshape the user table into long format: one (id, value, feature) row
# per user and per demographic feature.
user_features = pd.concat(
    [
        users.reindex(columns=[Columns.User, feature_name])
        .rename(columns={Columns.User: "id", feature_name: "value"})
        .assign(feature=feature_name)
        for feature_name in ["sex", "age", "income"]
    ]
)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [None]:
# Spot-check: all feature rows of a single user.
user_features[user_features["id"] == 973171]

Unnamed: 0,id,value,feature
0,973171,М,sex
0,973171,age_25_34,age
0,973171,income_60_90,income


## Item features

In [None]:
# Number of missing values in each item column.
items.isna().sum()

item_id             0
content_type        0
title               0
title_orig       4745
release_year       98
genres              0
countries          37
for_kids        15397
age_rating          2
studios         14898
directors        1509
actors           2619
description         2
keywords          423
dtype: int64

In [None]:
# Keep only the items that occur in the training interactions.
is_train_item = items[Columns.Item].isin(train[Columns.Item])
items = items.loc[is_train_item].copy()

In [None]:
# Preview the first rows of the filtered item catalogue.
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю..."
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж..."


In [None]:
# Number of distinct values per item column.
items.nunique()

item_id         14019
content_type        2
title           13454
title_orig       9724
release_year      104
genres           2559
countries         666
for_kids            2
age_rating          6
studios            38
directors        7414
actors          11830
description     13791
keywords        13583
dtype: int64

### Genre

In [None]:
# Turn the comma-separated "genres" string into a list per item (stored on
# `items` as the "genre" column), then explode it to one row per (item, genre).
genre_lists = (
    items["genres"]
    .str.lower()
    .str.replace(", ", ",", regex=False)
    .str.split(",")
)
items["genre"] = genre_lists
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [None]:
# Long-format (id, value, feature) rows for the item's content type.
content_feature = (
    items.reindex(columns=[Columns.Item, "content_type"])
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)

In [None]:
# Display the content-type feature table.
# NOTE(review): renders the whole frame; `.head()` would suffice for a preview.
content_feature

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


In [None]:
# Stack genre and content-type rows into a single long-format item feature table.
item_features = pd.concat([genre_feature, content_feature])

In [None]:
# Display the combined item feature table.
# NOTE(review): renders the whole frame; `.head()` would suffice for a preview.
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,series,content_type
15959,2367,series,content_type
15960,10632,series,content_type
15961,4538,series,content_type


# Metrics

In [None]:
# Metric classes to evaluate, keyed by the prefix used in the metric names.
metrics_name = {"Precision": Precision, "Recall": Recall, "MAP": MAP}

# One metric instance per class and per cut-off k = 1..10, keyed as "<Name>@<k>".
metrics = {
    f'{metric_name}@{k}': metric(k=k)
    for metric_name, metric in metrics_name.items()
    for k in range(1, 11)
}

In [None]:
# Show the metric instances built above.
metrics

{'Precision@1': Precision(k=1),
 'Precision@2': Precision(k=2),
 'Precision@3': Precision(k=3),
 'Precision@4': Precision(k=4),
 'Precision@5': Precision(k=5),
 'Precision@6': Precision(k=6),
 'Precision@7': Precision(k=7),
 'Precision@8': Precision(k=8),
 'Precision@9': Precision(k=9),
 'Precision@10': Precision(k=10),
 'Recall@1': Recall(k=1),
 'Recall@2': Recall(k=2),
 'Recall@3': Recall(k=3),
 'Recall@4': Recall(k=4),
 'Recall@5': Recall(k=5),
 'Recall@6': Recall(k=6),
 'Recall@7': Recall(k=7),
 'Recall@8': Recall(k=8),
 'Recall@9': Recall(k=9),
 'Recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@2': MAP(k=2, divide_by_k=False),
 'MAP@3': MAP(k=3, divide_by_k=False),
 'MAP@4': MAP(k=4, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@6': MAP(k=6, divide_by_k=False),
 'MAP@7': MAP(k=7, divide_by_k=False),
 'MAP@8': MAP(k=8, divide_by_k=False),
 'MAP@9': MAP(k=9, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False)}

# Models

In [None]:
import dill as pickle
!mkdir models
!{sys.executable} - m pip install dill

mkdir: cannot create directory ‘models’: File exists
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Shared experiment settings.
K_RECOS = 10  # number of recommendations requested per user
RANDOM_STATE = 42  # seed passed to the LightFM and ALS models
NUM_THREADS = 1  # threads passed to LightFMWrapperModel
N_FACTORS = (32, 64, 128, 256)  # candidate embedding sizes for the Optuna searches
# NOTE(review): the four LightFM values below are not referenced by the visible
# cells (the Optuna search samples its own ranges) — confirm whether they are still needed.
N_EPOCHS = 1  # Lightfm
USER_ALPHA = 0  # Lightfm
ITEM_ALPHA = 0  # Lightfm
LEARNING_RATE = 0.05  # Lightfm

In [None]:
%%time
# Build the rectools Dataset from the training interactions plus the long-format
# user and item feature tables; every listed feature is declared categorical.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 1.24 s, sys: 50.7 ms, total: 1.29 s
Wall time: 1.45 s


In [None]:
# Distinct users present in test; recommendations are generated for these users only.
TEST_USERS = test[Columns.User].unique()

In [None]:
# Registry of models to compare, keyed by display name.
# Starts with the popularity baseline; the tuned models are added in a later cell.
models = {
    "popular": PopularModel(),
}

In [None]:
# NOTE(review): repeats the earlier `metrics` display cell; candidate for removal.
metrics

{'Precision@1': Precision(k=1),
 'Precision@2': Precision(k=2),
 'Precision@3': Precision(k=3),
 'Precision@4': Precision(k=4),
 'Precision@5': Precision(k=5),
 'Precision@6': Precision(k=6),
 'Precision@7': Precision(k=7),
 'Precision@8': Precision(k=8),
 'Precision@9': Precision(k=9),
 'Precision@10': Precision(k=10),
 'Recall@1': Recall(k=1),
 'Recall@2': Recall(k=2),
 'Recall@3': Recall(k=3),
 'Recall@4': Recall(k=4),
 'Recall@5': Recall(k=5),
 'Recall@6': Recall(k=6),
 'Recall@7': Recall(k=7),
 'Recall@8': Recall(k=8),
 'Recall@9': Recall(k=9),
 'Recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@2': MAP(k=2, divide_by_k=False),
 'MAP@3': MAP(k=3, divide_by_k=False),
 'MAP@4': MAP(k=4, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@6': MAP(k=6, divide_by_k=False),
 'MAP@7': MAP(k=7, divide_by_k=False),
 'MAP@8': MAP(k=8, divide_by_k=False),
 'MAP@9': MAP(k=9, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False)}

In [None]:
# Best MAP@10 seen so far in this study; updated by the objective below.
map_score = float("-inf")


def objective_lightfm(trial):
    """Optuna objective: fit one LightFM configuration and return its MAP@10.

    The trial samples the number of factors, learning rate, user/item
    regularisation and the number of epochs. The model is fitted on `dataset`,
    recommendations are produced for `TEST_USERS` and scored with
    `calc_metrics`. Whenever a trial beats the best MAP@10 seen so far, the
    fitted model itself is saved to models/LightFM.dill.

    Fixes over the previous version: the best score is now actually tracked
    (it used to stay at 0, so every trial overwrote the file), and the fitted
    model is serialised instead of the placeholder string "LightFM_best".
    The function also has a unique name so it no longer collides with the ALS
    objective defined in a later cell.

    NOTE(review): hyper-parameters are selected on the same `test` split that
    is used for the final comparison, so the reported metrics are optimistic.
    """
    global map_score

    model = LightFMWrapperModel(
        LightFM(
            no_components=trial.suggest_categorical("n_factors", N_FACTORS),
            loss="warp",
            random_state=RANDOM_STATE,
            learning_rate=trial.suggest_float(
                "LEARNING_RATE", 0.02, 0.05, step=0.01),
            user_alpha=trial.suggest_float("USER_ALPHA", 0.2, 1, step=0.4),
            item_alpha=trial.suggest_float("ITEM_ALPHA", 0.2, 1, step=0.4),
        ),
        epochs=trial.suggest_int("Epochs", 1, 2),
        num_threads=NUM_THREADS,
    )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    map_at_10 = metric_values["MAP@10"]

    # Persist the model only when it improves on the best trial so far.
    if map_at_10 > map_score:
        map_score = map_at_10
        with open("models/LightFM.dill", "wb") as file:
            pickle.dump(model, file)
    return map_at_10


# Seed the sampler so the sequence of suggested configurations is reproducible.
study_LightFM = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_LightFM.optimize(objective_lightfm, n_trials=3600, timeout=10800)
study_LightFM.best_params


[32m[I 2022-12-13 16:31:10,745][0m A new study created in memory with name: no-name-c812a614-4a87-44d8-b56a-be614a88687a[0m
[32m[I 2022-12-13 16:41:07,459][0m Trial 0 finished with value: 0.0005651075918644076 and parameters: {'n_factors': 256, 'LEARNING_RATE': 0.05, 'USER_ALPHA': 0.6000000000000001, 'ITEM_ALPHA': 0.2, 'Epochs': 1}. Best is trial 0 with value: 0.0005651075918644076.[0m
[32m[I 2022-12-13 16:51:07,950][0m Trial 1 finished with value: 0.0035573916053586556 and parameters: {'n_factors': 256, 'LEARNING_RATE': 0.04, 'USER_ALPHA': 0.6000000000000001, 'ITEM_ALPHA': 1.0, 'Epochs': 1}. Best is trial 1 with value: 0.0035573916053586556.[0m
[32m[I 2022-12-13 16:57:51,881][0m Trial 2 finished with value: 0.0006509227982602694 and parameters: {'n_factors': 128, 'LEARNING_RATE': 0.02, 'USER_ALPHA': 1.0, 'ITEM_ALPHA': 0.2, 'Epochs': 2}. Best is trial 1 with value: 0.0035573916053586556.[0m
[32m[I 2022-12-13 17:02:48,933][0m Trial 3 finished with value: 0.0741876054283642

{'n_factors': 64,
 'LEARNING_RATE': 0.02,
 'USER_ALPHA': 0.6000000000000001,
 'ITEM_ALPHA': 0.6000000000000001,
 'Epochs': 1}

In [None]:
# Best MAP@10 seen so far in this study; updated by the objective below.
map_score = float("-inf")


def objective_als(trial):
    """Optuna objective: fit one implicit-ALS configuration and return its MAP@10.

    The trial samples the number of factors and whether features are fitted
    together with the interactions. The model is fitted on `dataset`,
    recommendations are produced for `TEST_USERS` and scored with
    `calc_metrics`. Whenever a trial beats the best MAP@10 seen so far, the
    fitted model itself is saved to models/ALS.dill.

    Fixes over the previous version: the best score is now actually tracked
    (it used to stay at 0, so every trial overwrote the file), and the fitted
    model is serialised instead of the placeholder string "ALS_best". The
    function also has a unique name so it no longer shadows the LightFM
    objective defined in an earlier cell.

    NOTE(review): hyper-parameters are selected on the same `test` split that
    is used for the final comparison, so the reported metrics are optimistic.
    """
    global map_score

    model = ImplicitALSWrapperModel(
        model=AlternatingLeastSquares(
            factors=trial.suggest_categorical("n_factors", N_FACTORS),
            random_state=RANDOM_STATE,
            num_threads=4,
        ),
        fit_features_together=trial.suggest_categorical(
            "is_fitting_features", [True, False])
    )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    map_at_10 = metric_values["MAP@10"]

    # Persist the model only when it improves on the best trial so far.
    if map_at_10 > map_score:
        map_score = map_at_10
        with open("models/ALS.dill", "wb") as file:
            pickle.dump(model, file)
    return map_at_10


# Seed the sampler so the sequence of suggested configurations is reproducible.
study_als = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_als.optimize(objective_als, n_trials=8, timeout=1800)

study_als.best_params

[32m[I 2022-12-13 19:32:53,777][0m A new study created in memory with name: no-name-44c61a00-ed9f-4341-8e22-0b15afb0926a[0m
[32m[I 2022-12-13 19:35:34,590][0m Trial 0 finished with value: 0.06322509491192645 and parameters: {'n_factors': 256, 'is_fitting_features': False}. Best is trial 0 with value: 0.06322509491192645.[0m
[32m[I 2022-12-13 19:37:13,610][0m Trial 1 finished with value: 0.07388927306636595 and parameters: {'n_factors': 32, 'is_fitting_features': True}. Best is trial 1 with value: 0.07388927306636595.[0m
[32m[I 2022-12-13 19:39:09,624][0m Trial 2 finished with value: 0.06421853224681903 and parameters: {'n_factors': 128, 'is_fitting_features': False}. Best is trial 1 with value: 0.07388927306636595.[0m
[32m[I 2022-12-13 19:40:50,291][0m Trial 3 finished with value: 0.06382138704646514 and parameters: {'n_factors': 64, 'is_fitting_features': False}. Best is trial 1 with value: 0.07388927306636595.[0m
[32m[I 2022-12-13 19:42:33,578][0m Trial 4 finished wi

{'n_factors': 128, 'is_fitting_features': True}

In [None]:
# Pull the winning hyper-parameters out of both studies and show them.
lightfm_params, als_params = study_LightFM.best_params, study_als.best_params
print(lightfm_params)
print(als_params.get("is_fitting_features"))

{'n_factors': 64, 'LEARNING_RATE': 0.02, 'USER_ALPHA': 0.6000000000000001, 'ITEM_ALPHA': 0.6000000000000001, 'Epochs': 1}
True


In [None]:
# Register the best ALS and LightFM configurations under descriptive names.
# The names are built outside the subscript: the previous LightFM key nested
# double quotes inside a double-quoted f-string, which is a SyntaxError on
# Python versions before 3.12.
als_model_name = (
    f"ALS_n_factors_{als_params['n_factors']}_{als_params['is_fitting_features']}"
)
models[als_model_name] = ImplicitALSWrapperModel(
    model=AlternatingLeastSquares(
        factors=als_params["n_factors"],
        random_state=RANDOM_STATE,
        num_threads=4,
    ),
    fit_features_together=als_params["is_fitting_features"],
)

# Parameter values are appended in this fixed order to form the LightFM name.
lightfm_name_parts = [
    str(lightfm_params[param_name])
    for param_name in ("n_factors", "LEARNING_RATE", "USER_ALPHA", "ITEM_ALPHA", "Epochs")
]
lightfm_model_name = "LightFM_warp_" + "_".join(lightfm_name_parts)
models[lightfm_model_name] = LightFMWrapperModel(
    LightFM(
        no_components=lightfm_params["n_factors"],
        loss="warp",
        random_state=RANDOM_STATE,
        learning_rate=lightfm_params["LEARNING_RATE"],
        user_alpha=lightfm_params["USER_ALPHA"],
        item_alpha=lightfm_params["ITEM_ALPHA"],
    ),
    epochs=lightfm_params["Epochs"],
    num_threads=NUM_THREADS,
)

In [None]:
# Names of all registered models, in insertion order.
model_names = [*models]
model_names

['popular',
 'ALS_n_factors_128_True',
 'LightFM_warp_64_0.02_0.6000000000000001_0.6000000000000001_1']

In [None]:
%%time
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {"model": model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

Fitting model popular...
Fitting model ALS_n_factors_128_True...
Fitting model LightFM_warp_64_0.02_0.6000000000000001_0.6000000000000001_1...
CPU times: user 5min 48s, sys: 1min 30s, total: 7min 18s
Wall time: 4min 31s


In [None]:
# Metrics as rows, models as columns.
# Indexing by model name before transposing keeps the metric values numeric;
# transposing the mixed frame first (names + floats) turned every column into
# object dtype.
df_quality = pd.DataFrame(results).set_index("model").T

In [None]:
# Highlight, for every metric (row), the model with the highest value.
df_quality.style.highlight_max(color="lightgreen", axis=1)

model,popular,ALS_n_factors_128_True,LightFM_warp_64_0.02_0.6000000000000001_0.6000000000000001_1
Precision@1,0.069368,0.082504,0.075697
Recall@1,0.035863,0.042603,0.039784
Precision@2,0.063681,0.069634,0.066514
Recall@2,0.064597,0.070168,0.06743
Precision@3,0.059233,0.060006,0.063688
Recall@3,0.08808,0.08892,0.094912
Precision@4,0.057348,0.053298,0.057248
Recall@4,0.112881,0.103797,0.112635
Precision@5,0.051035,0.047603,0.050915
Recall@5,0.124184,0.114411,0.12379


In [None]:
# Serialise every registered model to models/<name>.dill.
for saved_name in model_names:
    fitted_model = models.get(saved_name)
    with open(f"models/{saved_name}.dill", "wb") as file:
        pickle.dump(fitted_model, file)

In [None]:
# Pick the registered LightFM model by its name prefix instead of a hard-coded
# key: the full name embeds the tuned hyper-parameters, so it changes whenever
# the Optuna search returns a different optimum, and `.get` then silently
# yielded None.
lightfm_model_names = [name for name in models if name.startswith("LightFM_warp_")]
if not lightfm_model_names:
    raise KeyError("No LightFM model registered in `models`")
LightFM_model = models[lightfm_model_names[-1]]

# Approximate Nearest Neighbors 

In [None]:
import hnswlib
import nmslib

In [None]:
# Extract the user and item embedding matrices of the fitted LightFM model.
user_embeddings, item_embeddings = LightFM_model.get_vectors(dataset)

In [None]:
# Sanity check: both matrices should have the same number of columns.
user_embeddings.shape, item_embeddings.shape

((756562, 66), (14019, 66))

In [None]:
def augment_inner_product(factors):
    """Pad each row of `factors` with one extra coordinate so all rows share a norm.

    The extra coordinate of a row is sqrt(max_norm**2 - row_norm**2), where
    max_norm is the largest Euclidean row norm, so every augmented row has
    norm max_norm.

    Returns:
        A tuple (max_norm, augmented_factors); augmented_factors has one more
        column than `factors`.
    """
    row_norms = np.linalg.norm(factors, axis=1)
    max_norm = row_norms.max()

    # Column vector holding the padding value of every row.
    padding = np.sqrt(np.square(max_norm) - np.square(row_norms))[:, np.newaxis]
    augmented_factors = np.concatenate((factors, padding), axis=1)
    return max_norm, augmented_factors

In [None]:
# Augment the item embeddings with the norm-equalising extra column and report
# the shape before/after together with the largest original row norm.
print("pre shape: ", item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
print(
    f"augmented item embedding shape: {augmented_item_embeddings.shape} \nmax_norm: {max_norm}")

pre shape:  (14019, 66)
augmented item embedding shape: (14019, 67) 
max_norm: 1.000000000000715


In [None]:
# Give the user vectors the same dimensionality as the augmented item vectors
# by appending a zero column, which leaves user-item inner products unchanged.
extra_zero = np.zeros((len(user_embeddings), 1))
augmented_user_embeddings = np.hstack((user_embeddings, extra_zero))
augmented_user_embeddings.shape

(756562, 67)

In [None]:
# Set index parameters
# These are the most important ones
# (presumably the HNSW graph degree and construction-time search width — see the
# nmslib documentation to confirm)
M = 64
efC = 100

num_threads = 4
# NOTE(review): this dict is rebuilt without "post" in the cell that calls
# createIndex, so the "post" value set here is never used for indexing.
index_time_params = {"M": M, "indexThreadQty": num_threads,
                     "efConstruction": efC, "post": 0}
print("Index-time parameters", index_time_params)

Index-time parameters {'M': 64, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [None]:
# Number of neighbors 
# NOTE(review): same value as K_RECOS defined in the models section.
K=10

In [None]:
# Space name should correspond to the space name 
# used for brute-force search
# Items were padded to a common norm and users padded with a zero, so ranking
# items by cosine similarity gives the same order as ranking by inner product.
space_name="cosinesimil"

In [None]:
# Initialize the library, specify the space, the type of the vector and add data points
# The indexed points are the augmented item embeddings; users are the queries.
index = nmslib.init(method="hnsw", space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

14019

In [None]:
# Show the index object (method and space) as a sanity check.
index

<nmslib.FloatIndex method='hnsw' space='cosinesimil' at 0x38c453c0>

In [None]:
# Create an index
# Build the HNSW index over the item vectors and report the wall-clock build time.
start = time.time()
# Rebuilt here without the "post" entry of the earlier parameter cell.
index_time_params = {
    "M": M, "indexThreadQty": num_threads, "efConstruction": efC}
index.createIndex(index_time_params)
end = time.time()
print("Index-time parameters", index_time_params)
print("Indexing time = %f" % (end-start))

Index-time parameters {'M': 64, 'indexThreadQty': 4, 'efConstruction': 100}
Indexing time = 263.465866


In [None]:
# Setting query-time parameters
# efSearch is passed to the index as its only query-time parameter.
efS = 100
query_time_params = {"efSearch": efS}
print("Setting query-time parameters", query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [None]:
# Querying
# Retrieve the K nearest items for every user vector and report total and
# per-query wall-clock time.
query_qty = augmented_user_embeddings.shape[0]
start = time.time() 
nbrs = index.knnQueryBatch(augmented_user_embeddings, k = K, num_threads = num_threads)
end = time.time() 
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' % 
      (end-start, float(end-start)/query_qty, num_threads*float(end-start)/query_qty)) 

kNN time total=39.949651 (sec), per query=0.000053 (sec), per query adjusted for thread number=0.000211 (sec)


In [None]:
# Map each query position to the list of neighbour ids returned for it
# (the first element of every knnQueryBatch result).
# NOTE(review): keys are positions in `augmented_user_embeddings` and values are
# positions in `augmented_item_embeddings`, i.e. presumably the dataset's internal
# indices rather than the original user_id/item_id values — confirm whether they
# must be mapped back to external ids before use. Items the user has already
# watched are not filtered out here.
# NOTE(review): this re-binds `recos`, which earlier cells use for a recommendations table.
recos = {user: list(reco[0]) for user, reco in enumerate(nbrs)}

In [None]:
# NOTE(review): displays the full dict (one entry per user); consider previewing
# only a few entries to keep the notebook output small.
recos

In [None]:
# Persist the ANN recommendations to lightfm_64_1.dill in the working directory.
with open('lightfm_64_1.dill', 'wb') as output_file:
    pickle.dump(recos, output_file)