In [None]:
# Устанавливаем библиотеку LightFM:
# ------------------------------------------------------------------------------
# Installing the LightFM library:

!pip install -q lightfm

In [None]:
# Устанавливаем пакет LightFM Dataset helper для подготовки
# данных для модели LightFM:
# ------------------------------------------------------------------------------
# Installing the LightFM DatasetHelper package to prepare data
# for the LightFM model:

!pip install -q lightfm_dataset_helper

In [None]:
import os

import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm_dataset_helper.lightfm_dataset_helper import DatasetHelper

In [None]:
# Read the three tables from the ml-1m directory. Every file is parsed with
# '::' as the field separator (a multi-character separator, hence the Python
# parser engine), explicit column names, and only the listed columns.

dat_format = {'sep': '::', 'engine': 'python'}

ratings = pd.read_csv(
    'ml-1m/ratings.dat',
    names=['userId', 'movieId', 'rating'],
    usecols=[0, 1, 2],
    **dat_format,
)
movies = pd.read_csv(
    'ml-1m/movies.dat',
    names=['movieId', 'title', 'genres'],
    usecols=[0, 1, 2],
    encoding='latin-1',
    **dat_format,
)
users = pd.read_csv(
    'ml-1m/users.dat',
    names=['userId', 'gender', 'age', 'occupation', 'zip-code'],
    encoding='latin-1',
    **dat_format,
)

In [None]:
# Preview the first five rows of the ratings table.

ratings.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [None]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Preview the first five rows of the users table.
users.head(n=5)

Unnamed: 0,userId,gender,age,occupation,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


***Для обработки набора данных мы будем использовать пакет DatasetHelper  
библиотеки LightFM. Затем построим и обучим гибридную модель LightFM.***


---
***We will use the DatasetHelper package for the LightFM library to process  
the dataset. Then we will build and train a hybrid LightFM model.***


In [None]:
# Column names handed to DatasetHelper below: the id and rating columns
# first, then the feature columns of the movies and users tables.

items_column, user_column, ratings_column = 'movieId', 'userId', 'rating'

items_feature_columns = ['title', 'genres']
user_features_columns = ['gender', 'age', 'occupation', 'zip-code']

In [None]:
# Keep only the movies and users whose ids actually occur in the ratings
# table; rows without any rating are dropped.

rated_movie_mask = movies[items_column].isin(ratings[items_column])
rating_user_mask = users[user_column].isin(ratings[user_column])

movies = movies.loc[rated_movie_mask]
users = users.loc[rating_user_mask]

In [None]:
# Hand the three dataframes to DatasetHelper together with the role of each
# column (ids, interaction value, feature columns).
# NOTE(review): clean_unknown_interactions=True presumably discards ratings
# that reference users/items absent from the feature tables — confirm against
# the lightfm_dataset_helper documentation.

dataset_helper_instance = DatasetHelper(
    # source tables
    users_dataframe=users,
    items_dataframe=movies,
    interactions_dataframe=ratings,
    # id and interaction columns
    user_id_column=user_column,
    item_id_column=items_column,
    interaction_column=ratings_column,
    # feature columns
    user_features_columns=user_features_columns,
    items_feature_columns=items_feature_columns,
    clean_unknown_interactions=True,
)

In [None]:
# Build the dataset for the LightFM model.
# The `np.float` alias no longer exists in recent NumPy releases; it is
# re-created here as the built-in `float` because, without it, the routine
# below fails with an error (presumably some code it runs still refers to
# `np.float`).

setattr(np, 'float', float)
dataset_helper_instance.routine()

In [None]:
# Build and train the hybrid LightFM model (WARP loss, 30 components) on the
# interactions, sample weights and user/item feature matrices produced by
# DatasetHelper.
# The thread count is capped at the number of CPUs the machine reports:
# LightFM's documentation advises against using more threads than cores.
# On machines reporting 20 or more CPUs this is the original setting of 20.

model = LightFM(no_components=30, loss="warp", k=15, random_state=42)
model.fit(
    interactions=dataset_helper_instance.interactions,
    sample_weight=dataset_helper_instance.weights,
    item_features=dataset_helper_instance.item_features_list,
    user_features=dataset_helper_instance.user_features_list,
    verbose=True,
    epochs=10,
    num_threads=min(20, os.cpu_count() or 1),
)

Epoch: 100%|██████████| 10/10 [00:49<00:00,  4.98s/it]


<lightfm.lightfm.LightFM at 0x78fd53ba1480>

In [None]:
# Score every item for a single user.
#
# The model was fitted with user and item feature matrices, so the same
# matrices must be passed to `predict`. Without them LightFM falls back to
# identity features and the scores ignore the learned metadata embeddings.
#
# NOTE(review): `predict` takes LightFM's *internal* indices (row/column
# positions in the interaction matrix), not raw MovieLens ids. `user_id = 120`
# therefore selects row 120 of the interaction matrix; if the raw userId 120
# is intended, translate it through the dataset's user-id mapping first.

n_items = dataset_helper_instance.interactions.shape[1]
user_id = 120
scores = model.predict(
    user_id,
    np.arange(n_items),
    item_features=dataset_helper_instance.item_features_list,
    user_features=dataset_helper_instance.user_features_list,
)
scores

array([-3.2400053 , -0.00902378,  0.2650213 , ..., -1.3060505 ,
       -2.5871575 , -5.1769843 ], dtype=float32)

In [None]:
# Label each score with a movie title and list the scores from highest to
# lowest.
# NOTE(review): titles are attached by position, which assumes the model's
# internal item order matches the row order of `movies` — verify against the
# dataset's item-id mapping.

scores = pd.Series(scores).set_axis(movies['title'])
scores.sort_values(ascending=False)

title
Heavy Metal (1981)                          3.700969
Vampire in Brooklyn (1995)                  2.853658
Spawn (1997)                                2.818584
Robocop 2 (1990)                            2.784452
Hellraiser: Bloodline (1996)                2.740587
                                              ...   
Spanish Prisoner, The (1997)               -6.988829
Shall We Dance? (Shall We Dansu?) (1996)   -7.104220
Kolya (1996)                               -7.115179
Election (1999)                            -7.128723
Secrets & Lies (1996)                      -7.412042
Length: 3706, dtype: float32