# Коллаборативная фильтрация

In [49]:
from surprise import Dataset, Reader, SVD, KNNWithMeans, KNNBasic
from surprise.model_selection import cross_validate, train_test_split
import pandas as pd
import numpy as np

movies = pd.read_csv('../../src/MovieLens/movies.csv')
ratings = pd.read_csv('../../src/MovieLens/ratings.csv')

## функция для рекомендации на основе модели

In [None]:
# функция рекомендации фильмов
def generate_recommendations(uid, model, dataset, amount=5, random_order=True):
    all_iids = dataset['iid'].unique()
    iids_user_seen = dataset[dataset['uid'] == uid]['iid'].values
    iids_to_predict = np.setdiff1d(all_iids, iids_user_seen)

    predictions = []
    for iid in iids_to_predict:
        prediction = model.predict(uid, iid)
        predictions.append((iid, prediction.est))
    predictions.sort(key=lambda x: x[1], reverse=True)

    if random_order:
        np.random.shuffle(predictions)
    return sorted(predictions[:amount], key=lambda x: x[1], reverse=True)

## Подготовка данных

In [50]:
# массив для рекомендательной системы
ratings.head()
movies_with_ratings = movies.merge(ratings, on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.head()

dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

dataset.head()
f"минимальный рейтинг: {ratings.rating.min()}, максимальный рейтинг: {ratings.rating.max()}"
f"минимальный рейтинг: {ratings.rating.min()}, максимальный рейтинг: {ratings.rating.max()}"
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

f"минимальный рейтинг: {ratings.rating.min()}, максимальный рейтинг: {ratings.rating.max()}"
trainset, testset = train_test_split(data, test_size=0.2, random_state=1)

## модель `KNNWithMeans`

In [110]:
def fit_recommend_system(trainset):
    algo_user_based = KNNWithMeans(k=50, sim_options={
        'name': 'cosine',
        'user_based': True  # compute similarities between users
    })

    algo_user_based.fit(trainset)

    algo_not_user_based = KNNWithMeans(k=50, sim_options={
        'name': 'cosine',
        'user_based': False  # compute similarities between users
    })

    algo_not_user_based.fit(trainset)
    return  algo_user_based, algo_not_user_based

algo_user_based, algo_not_user_based = fit_recommend_system(trainset)

# Кросс-валидация с 5 фолдами
cv_results_user = cross_validate(algo_user_based, data, measures=['RMSE'], cv=5, verbose=True)
print("Mean RMSE для модели user-based:", cv_results_user['test_rmse'].mean())

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9046  0.9032  0.9088  0.8945  0.8933  0.9009  0.0060  
Fit time          0.31    0.33    0.32    0.30    0.30    0.31    0.01    
Test time         0.80    0.72    0.76    0.72    0.77    0.75    0.03    
Mean RMSE для модели user-based: 0.9008665711082694


In [111]:
cv_results_not_user = cross_validate(algo_not_user_based, data, measures=['RMSE'], cv=5, verbose=True)
print("Mean RMSE для модели not-user:-based", cv_results_not_user['test_rmse'].mean())

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9121  0.9053  0.8962  0.9021  0.8997  0.9031  0.0054  
Fit time          5.62    6.05    5.71    5.59    5.82    5.76    0.17    
Test time         4.49    4.80    4.79    4.89    4.50    4.69    0.17    
Mean RMSE для модели not-user:-based 0.903078525644435


In [112]:
# Вывод рекомендаций в случайном порядке

uid_to_recommend = 5
recommendations_user_based = generate_recommendations(uid_to_recommend, algo_user_based, dataset)

print("Рекомендации (user-based) uid:{uid_to_recommend}")
for i, film in enumerate(recommendations_user_based, 1):
    print(f"{i}. {film[0]:<50} {'*' * int(film[1]):<5} {film[1]:.1f}")
    
recommendations_not_user_based = generate_recommendations(uid_to_recommend, algo_not_user_based, dataset)
print("Рекомендации (not_user-based) uid:{uid_to_recommend}")
for i, film in enumerate(recommendations_not_user_based, 1):
    print(f"{i}. {film[0]:<50} {'*' * int(film[1]):<5} {film[1]:.1f}")

Рекомендации (user-based) uid:{uid_to_recommend}
1. Vanishing, The (Spoorloos) (1988)                  ***   3.8
2. Little Miss Marker (1980)                          ***   3.5
3. Bolt (2008)                                        ***   3.4
4. 8 ½ Women (a.k.a. 8 1/2 Women) (a.k.a. Eight and a Half Women) (1999) ***   3.4
5. Crooklyn (1994)                                    **    2.2
Рекомендации (not_user-based) uid:{uid_to_recommend}
1. Night of the Creeps (1986)                         ***   3.7
2. Harrison Bergeron (1995)                           ***   3.5
3. Dark Star (1974)                                   ***   3.5
4. Backbeat (1993)                                    ***   3.3
5. Amityville 3-D (1983)                                    0.7


## `SVD` модель

In [138]:
def fit_recommend_system_SVD(trainset):
    model_svd = SVD()
    model_svd.fit(trainset)
    return model_svd

model_svd = fit_recommend_system_SVD(trainset)
# Кросс-валидация с 5 фолдами для обеих моделей
svd_result = cross_validate(model_svd, data, measures=['RMSE'], cv=5, verbose=True)
print("Mean RMSE для модели SVD:")
print(f"rmse SVD: {svd_result['test_rmse'].mean():.2f}")

uid_to_recommend = 5
recommendations_SVD = generate_recommendations(uid_to_recommend, model_svd, dataset)

print("Рекомендации svd uid:{uid_to_recommend}")
for i, film in enumerate(recommendations_SVD, 1):
    print(f"{i}. {film[0]:<50} {'*' * int(film[1]):<5} {film[1]:.1f}")

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8746  0.8746  0.8644  0.8844  0.8648  0.8726  0.0074  
Fit time          0.72    0.77    0.76    0.74    0.75    0.75    0.02    
Test time         0.07    0.07    0.07    0.23    0.07    0.10    0.07    
Mean RMSE для модели SVD:
rmse SVD: 0.87
Рекомендации svd uid:{uid_to_recommend}
1. Neo Tokyo (1987)                                   ***   3.5
2. Valentine's Day (2010)                             ***   3.4
3. Party, The (Boum, La) (1980)                       ***   3.4
4. Must Love Dogs (2005)                              ***   3.2
5. Tomcats (2001)                                     **    3.0


## `CoClustering` модель

In [127]:
from surprise import CoClustering
def fit_recommend_system_CoClustering(trainset):
    model_coclustering = CoClustering()
    model_coclustering.fit(trainset)
    return model_coclustering

model_coclustering = fit_recommend_system_CoClustering(trainset)
coclustering_result = cross_validate(model_coclustering, data, measures=['RMSE'], cv=5, verbose=True)

print("Mean RMSE для модели CoClustering:")
print(f"rmse CoClustering: {coclustering_result['test_rmse'].mean():.2f}")

uid_to_recommend = 5
recommendations_coclustering = generate_recommendations(uid_to_recommend, model_coclustering, dataset)

print("Рекомендации coclustering uid:{uid_to_recommend}")
for i, film in enumerate(recommendations_coclustering, 1):
    print(f"{i}. {film[0]:<50} {'*' * int(film[1]):<5} {film[1]:.1f}")

Evaluating RMSE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9407  0.9461  0.9425  0.9486  0.9451  0.9446  0.0028  
Fit time          1.81    1.89    1.78    1.82    1.89    1.84    0.05    
Test time         0.05    0.05    0.05    0.22    0.05    0.08    0.07    
Mean RMSE для модели CoClustering:
rmse CoClustering: 0.94
Рекомендации coclustering uid:{uid_to_recommend}
1. Ghost Graduation (2012)                            ***** 5.0
2. Tie Me Up! Tie Me Down! (¡Átame!) (1990)           ***   3.6
3. We're Back! A Dinosaur's Story (1993)              ***   3.4
4. Hotel Transylvania 2 (2015)                        ***   3.1
5. Hostel: Part II (2007)                             **    3.0


In [131]:
from surprise import NMF
from surprise.model_selection import cross_validate

def fit_recommend_system_NMF(trainset):
    model_nmf = NMF()
    model_nmf.fit(trainset)
    return model_nmf

model_nmf = fit_recommend_system_NMF(trainset)
nmf_result = cross_validate(model_nmf, data, measures=['RMSE'], cv=5, verbose=True)

print("Mean RMSE для модели NMF:")
print(f"rmse NMF: {nmf_result['test_rmse'].mean():.2f}")

Evaluating RMSE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9233  0.9148  0.9141  0.9289  0.9246  0.9211  0.0058  
Fit time          1.53    1.68    1.54    1.43    1.65    1.57    0.09    
Test time         0.23    0.06    0.06    0.06    0.06    0.09    0.07    
Mean RMSE для модели NMF:
rmse NMF: 0.92


In [152]:
from surprise import KNNBasic

def fit_recommend_system_KNNBasic(trainset):
    model_KNNBasic = KNNBasic()
    model_KNNBasic.fit(trainset)
    return model_KNNBasic

model_KNNBasic = fit_recommend_system_KNNBasic(trainset)
kNNBasic_result = cross_validate(model_KNNBasic, data, measures=['RMSE'], cv=5, verbose=True)
print("Mean RMSE для модели KNNBasic:")
print(f"rmse KNNBasic: {kNNBasic_result['test_rmse'].mean():.2f}")

uid_to_recommend = 5
predictions = generate_recommendations(uid_to_recommend, model_KNNBasic, dataset)

print("Рекомендации KNNBasic uid:{uid_to_recommend}")
for i, film in enumerate(predictions, 1):
    print(f"{i}. {film[0]:<50} {'*' * int(film[1]):<5} {film[1]:.1f}")

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9455  0.9409  0.9522  0.9504  0.9529  0.9484  0.0045  
Fit time          0.12    0.13    0.13    0.13    0.13    0.13    0.00    
Test time         0.87    0.65    0.65    0.65    0.64    0.69    0.09    
Mean RMSE для модели KNNBasic:
rmse KNNBasic: 0.95
Рекомендации KNNBasic uid:{uid_to_recommend}
1. Hustler, The (1961)                                ****  4.2
2. On the Waterfront (1954)                     

## Вывод

+ для рекомендации фильмов использовалась библиотека `surprise`
+ особенность коллаборативного подхода, в отсутсвии необходимости тщательной подготовки данных, но важно учитывать появление новых элементов которых не было в массиве
+ использовал сведущие модели

| Модель          | `RMSE` по 5 фолдам  | `>=0.87` | 
|-----------------|-------------------|--|
| `KNNWithMeans`    | `0.90`              |  |
| `SVD`             | `0.87`              | ✓ |
| `CoClustering`    | `0.94`              |  |
| `NMF`             | `0.92`              |  |
| `KNNBasic`        | `0.95 `             |  |

+ самый лучший `RMSE` у модели `SVD` для данного набора данных (попытка изменить параметры в целом не позволила повысить результат)
+ пример рекомендации фильмов для пользователя на основании модели 

| #  | фильм    | рейтинг `*` | рейтинг `float` | 
|----|------------------------------|-----|-----| 
| 1. | Neo Tokyo (1987)              | *** | 3.5 | 
| 2. | Valentine's Day (2010)        | *** | 3.4 | 
| 3. | Party, The (Boum, La) (1980)  | *** | 3.4 | 
| 4. | Must Love Dogs (2005)         | *** | 3.2 | 
| 5. | Tomcats (2001)                | ** | 3.0 |
