In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.sparse.linalg import svds

pd.options.display.max_colwidth = 2000
pd.options.display.float_format = '{:.2f}'.format

# Данные

[Источник](https://grouplens.org/datasets/movielens/) исходных данных. 
Данные для скринкаста предобработаны

Извлечем доступные данные о фильмах и о взаимодействиях пользователей с фильмами

In [2]:
ratings = pd.read_csv('ratings_df_sample_2.csv')
movies = pd.read_csv('movies.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,54,2,3.0,974918176
1,54,32,5.0,974836809
2,54,47,4.0,974837760
3,54,50,4.0,974837760
4,54,223,5.0,974840217


In [4]:
ratings.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6040099 entries, 0 to 6040098
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     6040099 non-null  int64  
 1   movieId    6040099 non-null  int64  
 2   rating     6040099 non-null  float64
 3   timestamp  6040099 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 184.3 MB


  ratings.info(null_counts=True)


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


Для удобства, сразу приджоиним фильмы

In [7]:
ratings=(
    ratings
    .merge(movies, on='movieId')
)

In [8]:
ratings.shape

(6040099, 6)

In [9]:
ratings[:5]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,54,2,3.0,974918176,Jumanji (1995),Adventure|Children|Fantasy
1,91,2,3.5,1112061358,Jumanji (1995),Adventure|Children|Fantasy
2,116,2,2.0,1132728068,Jumanji (1995),Adventure|Children|Fantasy
3,124,2,2.0,1134476330,Jumanji (1995),Adventure|Children|Fantasy
4,129,2,3.0,1283448701,Jumanji (1995),Adventure|Children|Fantasy


In [10]:
n_users = ratings['userId'].nunique()
n_movies = ratings['movieId'].nunique()
(n_users, n_movies)

(20000, 1000)

In [11]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,6040099.0,6040099.0,6040099.0,6040099.0
mean,68804.82,4822.96,3.55,1115774334.98
std,40102.24,11368.03,1.0,135843321.3
min,7.0,1.0,0.5,824835410.0
25%,34180.0,919.0,3.0,995660158.0
50%,68914.0,1876.0,4.0,1111706240.0
75%,103281.0,3448.0,4.0,1213151458.5
max,138493.0,81845.0,5.0,1427780469.0


Для удобства отмасштабируем идентификаторы фильмов таким образом, чтобы они начинались с 0 и заканчивались на n_movies-1

Этот метод позволит находить фильм по индексу матрицы (i-й столбец матрицы это i-й фильм)

In [12]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,54,2,3.0,974918176,Jumanji (1995),Adventure|Children|Fantasy
1,91,2,3.5,1112061358,Jumanji (1995),Adventure|Children|Fantasy
2,116,2,2.0,1132728068,Jumanji (1995),Adventure|Children|Fantasy
3,124,2,2.0,1134476330,Jumanji (1995),Adventure|Children|Fantasy
4,129,2,3.0,1283448701,Jumanji (1995),Adventure|Children|Fantasy


In [13]:
%%time
movies_values = ratings['movieId'].unique()

ratings['movieId'] = ratings['movieId'].apply(lambda f: np.where(movies_values == f)[0][0])

CPU times: total: 36.7 s
Wall time: 37 s


Также отмасштабируем идентификаторы пользователей таким образом, чтобы они начинались с 0 и заканчивались на n_users-1

Этот метод позволит находить пользовтеля по индексу матрицы (i-я строка матрицы это i-й пользователь)

In [14]:
%%time
users_values = ratings['userId'].unique()

ratings['userId'] = ratings['userId'].apply(lambda f: np.where(users_values == f)[0][0])

CPU times: total: 1min 40s
Wall time: 1min 40s


In [15]:
ratings[:5]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,0,0,3.0,974918176,Jumanji (1995),Adventure|Children|Fantasy
1,1,0,3.5,1112061358,Jumanji (1995),Adventure|Children|Fantasy
2,2,0,2.0,1132728068,Jumanji (1995),Adventure|Children|Fantasy
3,3,0,2.0,1134476330,Jumanji (1995),Adventure|Children|Fantasy
4,4,0,3.0,1283448701,Jumanji (1995),Adventure|Children|Fantasy


In [16]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,6040099.0,6040099.0,6040099.0,6040099.0
mean,9115.02,422.9,3.55,1115774334.98
std,5563.83,283.85,1.0,135843321.3
min,0.0,0.0,0.5,824835410.0
25%,4365.0,165.0,3.0,995660158.0
50%,8679.0,415.0,4.0,1111706240.0
75%,13758.0,646.0,4.0,1213151458.5
max,19999.0,999.0,5.0,1427780469.0


# Контентная рекомендация фильмов

Реализуем алгоритм рекомендации, используя только информацию о фильмах. 

Используя название и жанр фильма, сформируем tf-idf ветора, и для каждого фильма найдем топ 5 похожих фильмов по косинусному расстоянию.

Извлечем фильмы, сделам текстовую фичу test_feature, которую будем кодировать

In [17]:
unique_movies = ratings[['movieId', 'title', 'genres']].drop_duplicates()

In [18]:
unique_movies.shape

(1000, 3)

In [19]:
unique_movies.head()

Unnamed: 0,movieId,title,genres
0,0,Jumanji (1995),Adventure|Children|Fantasy
9524,1,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
23118,2,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
36942,3,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
51035,4,Clerks (1994),Comedy


In [20]:
unique_movies['genres'] = unique_movies['genres'].apply(lambda x: ' '.join(x.split('|')))

In [21]:
unique_movies['text_feature'] = unique_movies.apply(lambda x: x['title'].lower() + ' ' + x['genres'].lower(), axis = 1)

In [22]:
unique_movies = unique_movies.sort_values('movieId')

In [23]:
unique_movies[:5]

Unnamed: 0,movieId,title,genres,text_feature
0,0,Jumanji (1995),Adventure Children Fantasy,jumanji (1995) adventure children fantasy
9524,1,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller,twelve monkeys (a.k.a. 12 monkeys) (1995) mystery sci-fi thriller
23118,2,Seven (a.k.a. Se7en) (1995),Mystery Thriller,seven (a.k.a. se7en) (1995) mystery thriller
36942,3,"Usual Suspects, The (1995)",Crime Mystery Thriller,"usual suspects, the (1995) crime mystery thriller"
51035,4,Clerks (1994),Comedy,clerks (1994) comedy


Сформируем tf-idf ветора для каждого text_feature  
Как считается [TF-IDF](https://ru.wikipedia.org/wiki/TF-IDF)

In [24]:
vectorizer = TfidfVectorizer()

In [25]:
tfidf_features = vectorizer.fit_transform(unique_movies['text_feature']).toarray()

In [26]:
tfidf_features.shape

(1000, 1646)

In [27]:
pd.DataFrame(tfidf_features)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1636,1637,1638,1639,1640,1641,1642,1643,1644,1645
0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,0.00,0.00,0.00,0.00,0.37,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
996,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
997,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
998,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [28]:
[u for u, v in vectorizer.vocabulary_.items() if v == 4]

['12']

# Косинусное расстояние

Рассмотрим функцию для попарных косинусных расстояния массива

Сosine_distances - это единица минус косинус, такая величина берется для того, чтобы выполнялась логика - чем меньше значение тем ближе
вектора

In [29]:
demo_data = np.array([[5,5,5,0,0], [4,1,0,5,3], [1,0,0,5,0], [5,0,5,0,4]])

cosine_distances(demo_data)

array([[0.        , 0.59577396, 0.8867723 , 0.28933095],
       [0.59577396, 0.        , 0.2036092 , 0.4484398 ],
       [0.8867723 , 0.2036092 , 0.        , 0.87929886],
       [0.28933095, 0.4484398 , 0.87929886, 0.        ]])

Проверим что, что на позиции 0 1 действитеьно 1 - косинус

In [30]:
1 - (demo_data[0]*demo_data[1]).sum()/( (demo_data[0]**2).sum() * (demo_data[1]**2).sum())**0.5

0.5957739582727783

Вычислим расстояния

In [31]:
cosine_similarity = cosine_distances(tfidf_features)

In [32]:
cosine_similarity.shape

(1000, 1000)

Зная попарные косинусные расстояния, найдем топ 5 кандидатов для каждого фильма

In [33]:
top=5
movies_sim = []
for i in range(len(cosine_similarity)):
    neighbors = (cosine_similarity[i]).argsort()[1:top+1] # считаем от 1 до top1 + 1, 
    # так как ближайшее расстояние у вектора до самого себя
    movies_sim.append(neighbors)
movies_sim = np.array(movies_sim)

In [34]:
movies_sim.shape

(1000, 5)

Составим датафрейм для того, чтобы понять по описаниям, какие фильмы похожи на какие

In [35]:
movie_id_name = dict(unique_movies.set_index('movieId')['text_feature'])
movies_top_df = pd.DataFrame(movies_sim, columns = ['top1', 'top2', 'top3', 'top4', 'top5'])
movies_top_df['top1'] = movies_top_df['top1'].apply(lambda x: movie_id_name[x])
movies_top_df['top2'] = movies_top_df['top2'].apply(lambda x: movie_id_name[x])
movies_top_df['top3'] = movies_top_df['top3'].apply(lambda x: movie_id_name[x])
movies_top_df['top4'] = movies_top_df['top4'].apply(lambda x: movie_id_name[x])
movies_top_df['top5'] = movies_top_df['top5'].apply(lambda x: movie_id_name[x])

movies_top_df['movieId'] = movies_top_df.index

Посмотрим на полученные рекомендации

In [36]:
movies_recs = (
    unique_movies.merge(
        movies_top_df,
        left_on='movieId', right_on='movieId'
    )
)

In [37]:
movies_recs[:5]

Unnamed: 0,movieId,title,genres,text_feature,top1,top2,top3,top4,top5
0,0,Jumanji (1995),Adventure Children Fantasy,jumanji (1995) adventure children fantasy,toy story (1995) adventure animation children comedy fantasy,casper (1995) adventure children,"indian in the cupboard, the (1995) adventure children fantasy",babe (1995) children drama,antz (1998) adventure animation children comedy fantasy
1,1,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller,twelve monkeys (a.k.a. 12 monkeys) (1995) mystery sci-fi thriller,ocean's twelve (2004) action comedy crime thriller,congo (1995) action adventure mystery sci-fi,strange days (1995) action crime drama mystery sci-fi thriller,12 angry men (1957) drama,outbreak (1995) action drama sci-fi thriller
2,2,Seven (a.k.a. Se7en) (1995),Mystery Thriller,seven (a.k.a. se7en) (1995) mystery thriller,copycat (1995) crime drama horror mystery thriller,"usual suspects, the (1995) crime mystery thriller",strange days (1995) action crime drama mystery sci-fi thriller,congo (1995) action adventure mystery sci-fi,snow white and the seven dwarfs (1937) animation children drama fantasy musical
3,3,"Usual Suspects, The (1995)",Crime Mystery Thriller,"usual suspects, the (1995) crime mystery thriller",copycat (1995) crime drama horror mystery thriller,"net, the (1995) action crime thriller",strange days (1995) action crime drama mystery sci-fi thriller,"negotiator, the (1998) action crime drama mystery thriller",heat (1995) action crime thriller
4,4,Clerks (1994),Comedy,clerks (1994) comedy,i.q. (1994) comedy romance,airheads (1994) comedy,junior (1994) comedy sci-fi,"mask, the (1994) action comedy crime fantasy","flintstones, the (1994) children comedy fantasy"


In [38]:
movies_recs[movies_recs['title'].apply(lambda x: 'Star Wars' in x)]

Unnamed: 0,movieId,title,genres,text_feature,top1,top2,top3,top4,top5
5,5,Star Wars: Episode IV - A New Hope (1977),Action Adventure Sci-Fi,star wars: episode iv - a new hope (1977) action adventure sci-fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax
20,20,Star Wars: Episode V - The Empire Strikes Back (1980),Action Adventure Sci-Fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode iv - a new hope (1977) action adventure sci-fi
51,51,Star Wars: Episode I - The Phantom Menace (1999),Action Adventure Sci-Fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode iv - a new hope (1977) action adventure sci-fi
64,64,Star Wars: Episode VI - Return of the Jedi (1983),Action Adventure Sci-Fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iv - a new hope (1977) action adventure sci-fi
560,560,Star Wars: Episode II - Attack of the Clones (2002),Action Adventure Sci-Fi IMAX,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iv - a new hope (1977) action adventure sci-fi
577,577,Star Wars: Episode III - Revenge of the Sith (2005),Action Adventure Sci-Fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iv - a new hope (1977) action adventure sci-fi


# Колаборативная фильтрация

Для оценки прогнозов с помощью колаборативной фильтрации, разобьем датасет на train и test

In [39]:
train_data, test_data = train_test_split(ratings, test_size=0.01)

print('Train shape: {}'.format(train_data.shape))
print('Test shape: {}'.format(test_data.shape))

Train shape: (5979698, 6)
Test shape: (60401, 6)


# Метрика качества

Посчитаем метрику на тесте, если бы мы предсказали всем средний рейтинг

In [40]:
train_data['rating'].mean()

3.5539676920138774

In [41]:
const_default = 3.5
test_data['constant_predict'] = const_default
print(f"RMSE metric: {np.sqrt(mean_squared_error(test_data['constant_predict'], test_data['rating']))}")

RMSE metric: 1.0022077911366267


# Сформируем матрицу user-item

In [42]:
n_users = train_data['userId'].nunique()
n_users

20000

In [43]:
n_movies = train_data['movieId'].nunique()
n_movies

1000

Создаём user-item матрицу – для обучения

Неопределенные ячейки будем заполнять не нулями, а 3.5, так как это средний рейтинг (заполняем пропуски средним)

In [44]:
%%time
train_data_matrix = const_default*np.ones((n_users, n_movies))
for line in train_data.to_dict(orient='records'):
    train_data_matrix[line['userId'], line['movieId']] = line['rating']  

CPU times: total: 24.9 s
Wall time: 27.5 s


In [45]:
train_data_matrix.shape

(20000, 1000)

In [46]:
train_data_matrix

array([[3. , 5. , 4. , ..., 3.5, 3.5, 3.5],
       [3.5, 4. , 4. , ..., 3.5, 3.5, 3.5],
       [2. , 4. , 4.5, ..., 3.5, 3.5, 3.5],
       ...,
       [3.5, 3.5, 3.5, ..., 3.5, 3.5, 3.5],
       [3.5, 3.5, 3.5, ..., 3.5, 3.5, 3.5],
       [3.5, 3.5, 3.5, ..., 3.5, 3. , 3.5]])

# Матрицы попарных косинусных расстояний

In [47]:
%%time
# считаем попарное косинусное расстояние для пользователей (строк матрицы)
user_similarity = cosine_distances(train_data_matrix)

CPU times: total: 20.3 s
Wall time: 7.61 s


In [48]:
# # считаем попарное косинусное расстояние для фильмов (столбцов матрицы)
movie_similarity = cosine_distances(train_data_matrix.T)

In [49]:
user_similarity

array([[0.        , 0.02441943, 0.05518208, ..., 0.01828911, 0.01566167,
        0.01884581],
       [0.02441943, 0.        , 0.04972277, ..., 0.0158517 , 0.01307998,
        0.01782062],
       [0.05518208, 0.04972277, 0.        , ..., 0.04909257, 0.04772508,
        0.05329033],
       ...,
       [0.01828911, 0.0158517 , 0.04909257, ..., 0.        , 0.0053888 ,
        0.01105112],
       [0.01566167, 0.01307998, 0.04772508, ..., 0.0053888 , 0.        ,
        0.00756106],
       [0.01884581, 0.01782062, 0.05329033, ..., 0.01105112, 0.00756106,
        0.        ]])

In [50]:
movie_similarity

array([[0.        , 0.03628174, 0.03689956, ..., 0.02398225, 0.02729845,
        0.02517229],
       [0.03628174, 0.        , 0.02541281, ..., 0.02075149, 0.02328947,
        0.02270976],
       [0.03689956, 0.02541281, 0.        , ..., 0.02178781, 0.02285471,
        0.02484129],
       ...,
       [0.02398225, 0.02075149, 0.02178781, ..., 0.        , 0.01166299,
        0.00877448],
       [0.02729845, 0.02328947, 0.02285471, ..., 0.01166299, 0.        ,
        0.01340963],
       [0.02517229, 0.02270976, 0.02484129, ..., 0.00877448, 0.01340963,
        0.        ]])

In [51]:
user_similarity.shape

(20000, 20000)

In [52]:
movie_similarity.shape

(1000, 1000)

# Алгоритм user-based

Для каждого пользователя находим топ 10 ближайших соседей, исключая себя самого (поэтому индекс от единицы до top + 1)

In [53]:
%%time
top=10
top_similar_users = []
for i in range(n_users):
    neighbors = (user_similarity[i]).argsort()[1:top + 1]
    top_similar_users.append(
        train_data_matrix[neighbors]
    )
top_similar_users = np.array(top_similar_users)

CPU times: total: 32.2 s
Wall time: 47.4 s


Количество  пользователей, количество соседей, количество фильмов

In [54]:
top_similar_users.shape

(20000, 10, 1000)

Для каждого пользователя и каждого фильма считаем средний рейтинг у фильма от топ похожих пользователей

Так делам матрицу с предсказанными рейтингами

In [55]:
predicted_ratings_user_based = top_similar_users.mean(1)

In [56]:
predicted_ratings_user_based.shape

(20000, 1000)

Делаем предикт на тестовом датасете, для каждого тестового userId и movieId заполняем предсказанный рейтинг из полученной матрицы

По userId строке и movieId столбцу, для этого и делали смену айдишников

In [57]:
def round_to_nearest_0_5(n):
    return round(n * 2) / 2

In [58]:
test_data['predict_user_based'] = test_data.apply(
    lambda f: round_to_nearest_0_5(predicted_ratings_user_based[f['userId'], f['movieId']]), axis = 1
)

In [59]:
print(f"RMSE metric: {np.sqrt(mean_squared_error(test_data['predict_user_based'], test_data['rating']))}")

RMSE metric: 0.946908995146132


# Алгоритм item-based

Для каждого фильма находим топ 10 ближайших соседей, исключая себя самого (поэтмоу индекс от единицы до top + 1)

In [60]:
top=10
top_similar_ratings = []
for i in range(n_movies):
    neighbors = (movie_similarity[i]).argsort()[1:top + 1]
    top_similar_ratings.append(
        train_data_matrix.T[neighbors]
    )
    
top_similar_ratings = np.array(top_similar_ratings)

Число фильмов, число соседей, число пользователей

In [61]:
top_similar_ratings.shape

(1000, 10, 20000)

Для каждого пользователя и каждого фильма считаем средний рейтинг от пользователя для топ соседних фильмов 

In [62]:
predicted_ratings_item_based = top_similar_ratings.mean(1).T

In [63]:
predicted_ratings_item_based.shape

(20000, 1000)

Делаем предикт на тестовом датасете, для каждого тестового userId и movieId заполняем предсказанный рейтинг из полученной матрицы

По userId строке и movieId столбцу, для этого и делали смену айдишников

In [64]:
test_data['predict_item_based'] = test_data.apply(
    lambda f: round_to_nearest_0_5(predicted_ratings_item_based[f['userId'], f['movieId']]), axis = 1
)

In [65]:
print(f"RMSE metric: {np.sqrt(mean_squared_error(test_data['predict_item_based'], test_data['rating']))}")

RMSE metric: 0.9548528739343812


Когда у вас много пользователей и мало фильмов, рекомендации, основанные на пользователях (user-based), могут быть более эффективными по сравнению с рекомендациями, основанными на элементах (item-based). Это связано с тем, что user-based рекомендации сосредотачиваются на схожести между пользователями и используют информацию о предпочтениях пользователей, чтобы сделать рекомендации.

В случае user-based рекомендаций для каждого пользователя вычисляется схожесть с другими пользователями на основе их предпочтений по фильмам. Когда множество пользователей больше, выше шанс найти похожие пользовательские профили и предложить релевантные рекомендации.

В отличие от этого, item-based рекомендации фокусируются на схожести между фильмами. Когда у вас мало фильмов, может быть трудно найти достаточно похожих элементов для выработки рекомендаций.

Таким образом, в вашем случае, user-based алгоритм рекомендаций фильмов скорее всего будет более эффективным и произведет более качественные рекомендации. Однако, рекомендации основанные на содержании фильмов (content-based) или гибридные методы, комбинирующие несколько подходов, также могут быть полезными вариантами в ситуации с небольшим количеством фильмов.

# Алгоритм на основе матричного разложения

In [66]:
train_data_matrix.shape

(20000, 1000)

[Документация](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.svds.html) SVDs разложения  
[Документация](https://docs.scipy.org/doc/scipy/reference/generated/scipy.linalg.svd.html) SVD разложения

In [67]:
%%time
# делаем SVD
u, s, vh = svds(train_data_matrix, k=20)
s_diag_matrix = np.diag(s)

users = np.dot(u, s_diag_matrix)
items = vh.T

CPU times: total: 7.61 s
Wall time: 1.43 s


In [68]:
np.diag(s).shape

(20, 20)

In [69]:
vh.shape

(20, 1000)

In [70]:
users.shape

(20000, 20)

In [71]:
items.shape

(1000, 20)

In [72]:
test_data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,constant_predict,predict_user_based,predict_item_based
4000604,6807,558,2.00,1032541752,Spider-Man (2002),Action|Adventure|Sci-Fi|Thriller,3.50,3.50,3.50
5035305,7198,748,5.00,955003481,Courage Under Fire (1996),Action|Crime|Drama|War,3.50,3.50,3.50
1801593,4716,211,4.00,992148701,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fantasy|Mystery,3.50,4.00,3.50
3322921,11452,457,4.00,1132453699,"Lord of the Rings: The Fellowship of the Ring, The (2001)",Adventure|Fantasy,3.50,4.00,3.50
3343487,2181,460,4.50,1071280094,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy,3.50,4.00,3.50
...,...,...,...,...,...,...,...,...,...
3939054,1958,549,3.50,1114464391,Rain Man (1988),Drama,3.50,3.50,3.50
5186313,5526,784,3.00,1300575845,Bruce Almighty (2003),Comedy|Drama|Fantasy|Romance,3.50,3.50,3.50
3502970,19830,482,4.50,1294695132,Strictly Ballroom (1992),Comedy|Romance,3.50,3.50,3.50
3391115,1471,465,4.00,1254493490,Kill Bill: Vol. 2 (2004),Action|Drama|Thriller,3.50,3.50,4.00


In [73]:
test_data['svd_predictions'] = test_data.apply(
    lambda f: round_to_nearest_0_5(np.dot(users[f['userId']], items[f['movieId']])), axis = 1
)

In [74]:
print(f"RMSE metric: {np.sqrt(mean_squared_error(test_data['svd_predictions'], test_data['rating']))}")

RMSE metric: 0.8295760287502457


матрица профилей (эмбедингов) пользователей

In [75]:
users.shape

(20000, 20)

матрица профилей (эмбедингов) фильмов

In [76]:
items.shape

(1000, 20)

посмотрим на соседей фильмов по эмбедингам (так же как по tf-idf)

In [77]:
cosine_similarity_items = cosine_distances(items)

In [78]:
top=5
movies_sim = []
for i in range(len(cosine_similarity_items)):
    neighbors = (cosine_similarity_items[i]).argsort()[1:top+1] # считаем от 1 до top1 + 1, 
    # так как ближайшее расстояние у вектора до самого себя
    movies_sim.append(neighbors)
movies_sim = np.array(movies_sim)

In [79]:
movie_id_name = dict(unique_movies[['movieId', 'text_feature']].values.tolist())
movies_top_df = pd.DataFrame(movies_sim, columns = ['top1', 'top2', 'top3', 'top4', 'top5'])
movies_top_df['top1'] = movies_top_df['top1'].apply(lambda x: movie_id_name[x])
movies_top_df['top2'] = movies_top_df['top2'].apply(lambda x: movie_id_name[x])
movies_top_df['top3'] = movies_top_df['top3'].apply(lambda x: movie_id_name[x])
movies_top_df['top4'] = movies_top_df['top4'].apply(lambda x: movie_id_name[x])
movies_top_df['top5'] = movies_top_df['top5'].apply(lambda x: movie_id_name[x])

movies_top_df = movies_top_df.reset_index()
movies_top_df['movieId'] = movies_top_df['index']

на основе матрично факторизации получили более неочевидные item рекомендации по сравнению с tf-idf

в рекомендациях появился star trek к star wars

In [80]:
movies_recs_matrix = (
    unique_movies.merge(
        movies_top_df,
        left_on='movieId', right_on='movieId'
    )
)

In [81]:
movies_recs_matrix[movies_recs_matrix['title'].apply(lambda x: 'Star Wars' in x)]

Unnamed: 0,movieId,title,genres,text_feature,index,top1,top2,top3,top4,top5
5,5,Star Wars: Episode IV - A New Hope (1977),Action Adventure Sci-Fi,star wars: episode iv - a new hope (1977) action adventure sci-fi,5,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,raiders of the lost ark (indiana jones and the raiders of the lost ark) (1981) action adventure,indiana jones and the last crusade (1989) action adventure,star wars: episode i - the phantom menace (1999) action adventure sci-fi
20,20,Star Wars: Episode V - The Empire Strikes Back (1980),Action Adventure Sci-Fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,20,star wars: episode iv - a new hope (1977) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,raiders of the lost ark (indiana jones and the raiders of the lost ark) (1981) action adventure,indiana jones and the last crusade (1989) action adventure,star wars: episode i - the phantom menace (1999) action adventure sci-fi
51,51,Star Wars: Episode I - The Phantom Menace (1999),Action Adventure Sci-Fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,51,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,"matrix revolutions, the (2003) action adventure sci-fi thriller imax","matrix reloaded, the (2003) action adventure sci-fi thriller imax",star trek: generations (1994) adventure drama sci-fi
64,64,Star Wars: Episode VI - Return of the Jedi (1983),Action Adventure Sci-Fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,64,star wars: episode iv - a new hope (1977) action adventure sci-fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,indiana jones and the last crusade (1989) action adventure,raiders of the lost ark (indiana jones and the raiders of the lost ark) (1981) action adventure,star wars: episode i - the phantom menace (1999) action adventure sci-fi
560,560,Star Wars: Episode II - Attack of the Clones (2002),Action Adventure Sci-Fi IMAX,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,560,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,"matrix revolutions, the (2003) action adventure sci-fi thriller imax","matrix reloaded, the (2003) action adventure sci-fi thriller imax",x-men: the last stand (2006) action sci-fi thriller
577,577,Star Wars: Episode III - Revenge of the Sith (2005),Action Adventure Sci-Fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,577,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode i - the phantom menace (1999) action adventure sci-fi,"matrix reloaded, the (2003) action adventure sci-fi thriller imax","matrix revolutions, the (2003) action adventure sci-fi thriller imax",pirates of the caribbean: at world's end (2007) action adventure comedy fantasy


tf-idf

In [82]:
movies_recs[movies_recs['title'].apply(lambda x: 'Star Wars' in x)]

Unnamed: 0,movieId,title,genres,text_feature,top1,top2,top3,top4,top5
5,5,Star Wars: Episode IV - A New Hope (1977),Action Adventure Sci-Fi,star wars: episode iv - a new hope (1977) action adventure sci-fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax
20,20,Star Wars: Episode V - The Empire Strikes Back (1980),Action Adventure Sci-Fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode iv - a new hope (1977) action adventure sci-fi
51,51,Star Wars: Episode I - The Phantom Menace (1999),Action Adventure Sci-Fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode iv - a new hope (1977) action adventure sci-fi
64,64,Star Wars: Episode VI - Return of the Jedi (1983),Action Adventure Sci-Fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iv - a new hope (1977) action adventure sci-fi
560,560,Star Wars: Episode II - Attack of the Clones (2002),Action Adventure Sci-Fi IMAX,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iv - a new hope (1977) action adventure sci-fi
577,577,Star Wars: Episode III - Revenge of the Sith (2005),Action Adventure Sci-Fi,star wars: episode iii - revenge of the sith (2005) action adventure sci-fi,star wars: episode i - the phantom menace (1999) action adventure sci-fi,star wars: episode vi - return of the jedi (1983) action adventure sci-fi,star wars: episode ii - attack of the clones (2002) action adventure sci-fi imax,star wars: episode v - the empire strikes back (1980) action adventure sci-fi,star wars: episode iv - a new hope (1977) action adventure sci-fi
