In [1]:
import os
os.chdir("../")
import pickle
import pandas as pd
import numpy as np

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import recall_at_k, precision_at_k
from src.optimizer import OptunaMaximizer
from src.config import Config
from src.dataset import Interactions
from scipy.sparse import coo_matrix, csr_matrix
from typing import Iterable, Iterator, Optional


# First look

In [2]:
with Config.PREPROCESSED_INTERACTIONS_PATH.open("rb") as file_object:
    interactions = pickle.load(file_object)

In [3]:
with Config.PREPROCESSED_ITEMS_PATH.open("rb") as file_object:
    items = pickle.load(file_object)

In [4]:
with Config.PREPROCESSED_USERS_PATH.open("rb") as file_object:
    users = pickle.load(file_object)

In [5]:
items

Unnamed: 0,id,title,genres,authors,year
0,128115,Ворон-челобитчик,"Зарубежные детские книги,Сказки,Зарубежная кла...",Михаил Салтыков-Щедрин,1886
1,210979,Скрипка Ротшильда,"Классическая проза,Литература 19 века,Русская ...",Антон Чехов,1894
2,95632,Испорченные дети,"Зарубежная классика,Классическая проза,Литерат...",Михаил Салтыков-Щедрин,1869
3,247906,Странный человек,"Пьесы и драматургия,Литература 19 века",Михаил Лермонтов,1831
4,294280,Господа ташкентцы,"Зарубежная классика,Классическая проза,Литерат...",Михаил Салтыков-Щедрин,1873
...,...,...,...,...,...
59594,45640,МК Московский Комсомолец 291-2019,"Политология,Книги по экономике,Газеты",,2019
59595,321616,МК Московский Комсомолец 292-2019,"Политология,Книги по экономике,Газеты",,2019
59596,125582,Известия 248-249-2019,"Политология,Общая история,Газеты",,2019
59597,33188,Men's Health 01-2020,Журнальные издания,,2019


In [6]:
users

Unnamed: 0,user_id,age,sex
0,1,45_54,
1,2,18_24,0.0
2,3,65_inf,0.0
3,4,18_24,0.0
4,5,35_44,0.0
...,...,...,...
142883,159606,25_34,0.0
142884,159607,25_34,
142885,159609,18_24,0.0
142886,159610,35_44,0.0


In [7]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1332091 entries, 0 to 1532997
Data columns (total 6 columns):
 #   Column          Non-Null Count    Dtype               
---  ------          --------------    -----               
 0   user_id         1332091 non-null  int64               
 1   item_id         1332091 non-null  int64               
 2   progress        1332091 non-null  int8                
 3   rating          280762 non-null   Sparse[float32, nan]
 4   start_date      1332091 non-null  datetime64[ns]      
 5   interest_score  1332091 non-null  Sparse[float32, nan]
dtypes: Sparse[float32, nan](2), datetime64[ns](1), int64(2), int8(1)
memory usage: 54.2 MB


In [8]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142888 entries, 0 to 142887
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype               
---  ------   --------------   -----               
 0   user_id  142888 non-null  int64               
 1   age      142742 non-null  category            
 2   sex      136626 non-null  Sparse[float32, nan]
dtypes: Sparse[float32, nan](1), category(1), int64(1)
memory usage: 2.3 MB


In [9]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59599 entries, 0 to 59598
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   id       59599 non-null  int64   
 1   title    59599 non-null  object  
 2   genres   59568 non-null  category
 3   authors  52714 non-null  category
 4   year     46720 non-null  category
dtypes: category(3), int64(1), object(1)
memory usage: 2.3+ MB


In [10]:
interactions.isnull().sum()

user_id                 0
item_id                 0
progress                0
rating            1051329
start_date              0
interest_score          0
dtype: int64

In [11]:
items.isnull().sum()

id             0
title          0
genres        31
authors     6885
year       12879
dtype: int64

In [12]:
users.isnull().sum()

user_id       0
age         146
sex        6262
dtype: int64

# Preprocessing

In [13]:
items.rename(columns={"id": "item_id"}, inplace=True)

In [14]:
items = items[items["item_id"].isin(interactions["item_id"].unique())].copy()
users = users[users["user_id"].isin(interactions["user_id"].unique())].copy()
interactions = interactions[
    interactions["user_id"].isin(users["user_id"].unique())
    & interactions["item_id"].isin(items["item_id"].unique())
].copy()

In [15]:
print(set(items["item_id"]) - set(interactions["item_id"]))
print(set(users["user_id"]) - set(interactions["user_id"]))
print(set(interactions["item_id"]) - set(items["item_id"]))
print(set(interactions["user_id"]) - set(users["user_id"]))

set()
set()
set()
set()


In [16]:
unknown = "unknown"

items["genres"] = items["genres"].cat.add_categories(unknown)
items["authors"] = items["authors"].cat.add_categories(unknown)
items["year"] = items["year"].cat.add_categories(unknown)

users["age"] = users["age"].cat.add_categories(unknown)
users['sex'] = np.array(users['sex'].astype(str))
users['sex'] = users['sex'].replace("nan", np.nan).replace("1.0", "1").replace("0.0", "0").astype("category")
users["sex"] = users["sex"].cat.add_categories(unknown)

items = items.fillna(unknown)
users = users.fillna(unknown)

  users['sex'] = np.array(users['sex'].astype(str))


In [17]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1215180 entries, 0 to 1532997
Data columns (total 6 columns):
 #   Column          Non-Null Count    Dtype               
---  ------          --------------    -----               
 0   user_id         1215180 non-null  int64               
 1   item_id         1215180 non-null  int64               
 2   progress        1215180 non-null  int8                
 3   rating          258370 non-null   Sparse[float64, nan]
 4   start_date      1215180 non-null  datetime64[ns]      
 5   interest_score  1215180 non-null  Sparse[float32, nan]
dtypes: Sparse[float32, nan](1), Sparse[float64, nan](1), datetime64[ns](1), int64(2), int8(1)
memory usage: 50.5 MB


In [18]:
users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135480 entries, 0 to 142887
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype   
---  ------   --------------   -----   
 0   user_id  135480 non-null  int64   
 1   age      135480 non-null  category
 2   sex      135480 non-null  category
dtypes: category(2), int64(1)
memory usage: 2.3 MB


In [19]:
items.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 59599 entries, 0 to 59598
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   item_id  59599 non-null  int64   
 1   title    59599 non-null  object  
 2   genres   59599 non-null  category
 3   authors  59599 non-null  category
 4   year     59599 non-null  category
dtypes: category(3), int64(1), object(1)
memory usage: 2.7+ MB


In [20]:
users.isnull().sum()

user_id    0
age        0
sex        0
dtype: int64

In [21]:
items.isnull().sum()

item_id    0
title      0
genres     0
authors    0
year       0
dtype: int64

# Building lightfm dataset

In [22]:
unique_users = users["user_id"].unique()
unique_items = items["item_id"].unique()

In [23]:
item_features = np.array([
        *items["genres"].str.split(",").explode().unique(),
        *items["authors"].unique(),
        *items["year"].unique()
    ],
    dtype=object
)

user_features = np.array([
        *users["age"].unique(),
        *users["sex"].unique()    
    ],
    dtype=object
)

In [24]:
dataset = Dataset()
dataset.fit(
    users=unique_users,
    items=unique_items,
    user_features=user_features,
    item_features=item_features
)

In [25]:
dataset_mapper = dataset.mapping()
dataset_mapper = {
    'users_mapping': dataset_mapper[0],
    'user_features_mapping': dataset_mapper[1],
    'items_mapping': dataset_mapper[2],
    'item_features_mapping': dataset_mapper[3]
}
dataset_mapper['users_inv_mapping'] = {v: k for k, v in dataset_mapper['users_mapping'].items()}
dataset_mapper['items_inv_mapping'] = {v: k for k, v in dataset_mapper['items_mapping'].items()}
dataset_mapper

{'users_mapping': {1: 0,
  2: 1,
  3: 2,
  4: 3,
  5: 4,
  6: 5,
  7: 6,
  8: 7,
  9: 8,
  10: 9,
  11: 10,
  12: 11,
  13: 12,
  14: 13,
  16: 14,
  17: 15,
  18: 16,
  20: 17,
  21: 18,
  23: 19,
  24: 20,
  26: 21,
  27: 22,
  28: 23,
  29: 24,
  30: 25,
  31: 26,
  32: 27,
  33: 28,
  34: 29,
  35: 30,
  36: 31,
  37: 32,
  38: 33,
  40: 34,
  43: 35,
  44: 36,
  45: 37,
  46: 38,
  47: 39,
  48: 40,
  49: 41,
  51: 42,
  53: 43,
  54: 44,
  55: 45,
  56: 46,
  57: 47,
  58: 48,
  59: 49,
  61: 50,
  62: 51,
  63: 52,
  65: 53,
  66: 54,
  68: 55,
  70: 56,
  71: 57,
  72: 58,
  73: 59,
  74: 60,
  76: 61,
  77: 62,
  78: 63,
  79: 64,
  80: 65,
  82: 66,
  83: 67,
  84: 68,
  85: 69,
  87: 70,
  88: 71,
  89: 72,
  90: 73,
  91: 74,
  92: 75,
  93: 76,
  94: 77,
  95: 78,
  97: 79,
  98: 80,
  99: 81,
  100: 82,
  101: 83,
  102: 84,
  104: 85,
  105: 86,
  106: 87,
  107: 88,
  108: 89,
  109: 90,
  110: 91,
  111: 92,
  113: 93,
  114: 94,
  115: 95,
  116: 96,
  117: 97,
  118:

# Building train, test & valid datasets

In [26]:
interactions = Interactions(interactions)
interactions.train = interactions.train[["user_id", "item_id", "interest_score"]]
interactions.test = interactions.test[["user_id", "item_id", "interest_score"]]
interactions.valid = interactions.valid[["user_id", "item_id", "interest_score"]]

In [27]:
interactions.train

Unnamed: 0,user_id,item_id,interest_score
0,126706,14433,0.400
1,127290,140952,0.290
3,46791,83486,0.615
4,79313,188770,0.940
5,63454,78434,0.435
...,...,...,...
1532993,153908,98585,0.220
1532994,154008,251969,0.020
1532995,154892,298192,0.840
1532996,156948,38118,0.890


In [28]:
interactions.test

Unnamed: 0,user_id,item_id,interest_score
1517915,101642,319500,0.835
1517917,130425,193445,0.490
1517918,93986,80733,0.235
1517919,159466,124115,0.420
1517920,158775,223806,0.270
...,...,...,...
1530838,141930,219928,0.450
1530839,53358,42887,0.290
1530840,151170,284652,0.135
1530841,141293,273421,0.240


In [29]:
interactions.valid

Unnamed: 0,user_id,item_id,interest_score
1503047,22032,287219,0.275
1503048,84214,121609,1.000
1503049,28992,11482,0.110
1503050,23345,281921,1.000
1503052,147024,5165,0.095
...,...,...,...
1517908,81768,294370,0.850
1517910,158991,99669,0.815
1517911,77232,142149,0.020
1517912,17843,174535,0.060


In [30]:
def get_iterable_interactions(interactions: pd.DataFrame) -> Iterator:
    return zip(*interactions.values.T)

In [31]:
coo_train, train_weights = dataset.build_interactions(
    get_iterable_interactions(interactions.train[["user_id", "item_id", "interest_score"]])
)
coo_train

<135480x59599 sparse matrix of type '<class 'numpy.int32'>'
	with 1192524 stored elements in COOrdinate format>

In [32]:
coo_test, test_weights = dataset.build_interactions(
    get_iterable_interactions(interactions.test[["user_id", "item_id", "interest_score"]])
)
coo_test

<135480x59599 sparse matrix of type '<class 'numpy.int32'>'
	with 10518 stored elements in COOrdinate format>

In [33]:
coo_valid, valid_weights = dataset.build_interactions(
    get_iterable_interactions(interactions.valid[["user_id", "item_id", "interest_score"]])
)
coo_valid

<135480x59599 sparse matrix of type '<class 'numpy.int32'>'
	with 12138 stored elements in COOrdinate format>

# Preparing users & items features

In [34]:
def get_iterable_features(obj: pd.DataFrame) -> Iterator:
    def make_id_features(row: Iterable):
        id = row[0]
        features = []
        for feature in row[1: ]:
            if isinstance(feature, list):
                features.extend(feature)
                continue
            features.append(feature)
        return (id, features)
    return map(make_id_features, zip(*obj.values.T))

In [35]:
prepared_user_features = dataset.build_user_features(
    get_iterable_features(
        users[["user_id", "age", "sex"]]
    )
)
prepared_user_features

<135480x135489 sparse matrix of type '<class 'numpy.float32'>'
	with 406440 stored elements in Compressed Sparse Row format>

In [36]:
items["genres"] = items["genres"].str.split(",")
prepared_item_features = dataset.build_item_features(
    get_iterable_features(
        items[["item_id", "genres", "authors", "year"]]
    )
)
prepared_item_features

<59599x78558 sparse matrix of type '<class 'numpy.float32'>'
	with 304945 stored elements in Compressed Sparse Row format>

# Optimizing model

In [39]:
class Objective:
    def __init__(self,
                train: coo_matrix,
                train_weights: coo_matrix,
                valid: coo_matrix, 
                user_features: csr_matrix, 
                item_features: csr_matrix, 
                k: int,
                seed: Optional[int]
            ):
        self.train = train
        self.train_weights = train_weights
        self.valid = valid
        self.user_features = user_features
        self.item_features = item_features
        self.k = k
        self.seed = seed

    def __call__(self, trial) -> float:
        model_search_space = {
            "no_components": trial.suggest_int("no_components", 16, 256, 16),
            "learning_schedule": trial.suggest_categorical("learning_schedule", ["adagrad", "adadelta"]),
            "loss":  trial.suggest_categorical("loss", ["logistic", "bpr", "warp"]),
            "learning_rate": trial.suggest_float("learning_rate", .01, .05),
            "user_alpha": trial.suggest_float("user_alpha", 1e-9, 1e-8),
            "item_alpha": trial.suggest_float("item_alpha", 1e-9, 1e-8),
            "random_state": self.seed
            
        }
        fit_search_space = {
            "epochs": trial.suggest_int("epochs", 10, 60, 5)
        }
        model = LightFM(**model_search_space)
        model.fit(
            interactions=self.train,
            sample_weight=self.train_weights,
            user_features=self.user_features,
            item_features=self.item_features,
            num_threads=Config.NUM_THREADS,
            epochs=fit_search_space["epochs"]
        )
        return precision_at_k(
            model=model,
            train_interactions=self.train,
            test_interactions=self.valid,
            k=self.k,
            user_features=self.user_features,
            item_features=self.item_features,
            num_threads=12
        ).mean()
    

In [40]:
objective = Objective(coo_train, train_weights, coo_valid, prepared_user_features, prepared_item_features, Config.K, seed=42)
optimizer = OptunaMaximizer(objective, "LightFM study")
best_trial = optimizer.optimize(Config.ITERATIONS, Config.STOP_ITERATION)

[32m[I 2023-03-10 20:23:02,555][0m A new study created in memory with name: LightFM study[0m


  0%|          | 0/100 [00:00<?, ?it/s]

[32m[I 2023-03-10 20:26:38,427][0m Trial 0 finished with value: 0.001150350901298225 and parameters: {'no_components': 256, 'learning_schedule': 'adagrad', 'loss': 'warp', 'learning_rate': 0.049113071123620226, 'user_alpha': 4.146366237684012e-09, 'item_alpha': 4.020732251127043e-09, 'epochs': 20}. Best is trial 0 with value: 0.001150350901298225.[0m
[32m[I 2023-03-10 20:27:30,662][0m Trial 1 finished with value: 1.1503509085741825e-05 and parameters: {'no_components': 16, 'learning_schedule': 'adadelta', 'loss': 'logistic', 'learning_rate': 0.0398556923573678, 'user_alpha': 6.5185513105797e-09, 'item_alpha': 6.4253844922574445e-09, 'epochs': 40}. Best is trial 0 with value: 0.001150350901298225.[0m
[32m[I 2023-03-10 20:28:17,039][0m Trial 2 finished with value: 1.1503509085741825e-05 and parameters: {'no_components': 16, 'learning_schedule': 'adadelta', 'loss': 'logistic', 'learning_rate': 0.01825311961894409, 'user_alpha': 3.884060554749485e-09, 'item_alpha': 5.48328174126602

In [44]:
best_params = best_trial.params
model = LightFM(
    **{key: val for key, val in best_params.items() if key != "epochs"}
)
model.fit(
    interactions=coo_train,
    sample_weight=train_weights,
    user_features=prepared_user_features,
    item_features=prepared_item_features,
    num_threads=12,
    epochs=best_params["epochs"]
)

<lightfm.lightfm.LightFM at 0x7efe345a1910>

In [45]:
precision_at_k(
    model=model,
    train_interactions=coo_train,
    test_interactions=coo_test,
    k=Config.K,
    user_features=prepared_user_features,
    item_features=prepared_item_features,
    num_threads=12
).mean()

0.0007065355

In [46]:
recall_at_k(
    model=model,
    train_interactions=coo_train,
    test_interactions=coo_test,
    k=Config.K,
    user_features=prepared_user_features,
    item_features=prepared_item_features,
    num_threads=12
).mean()

0.003808547061764321

In [47]:
with open("models/lightfm_trial.pickle", "wb") as file_object:
    pickle.dump(best_trial, file_object) 