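# Cascade hybrid recommendation system
- <b>This notebook implements a cascade-type hybrid recommendation system</b>: it uses the recommendations from one method to enhance or refine the recommendations from another. For example, such a system may use CF (collaborative filtering) to generate initial recommendations and then use CB (content-based filtering) to filter or improve them further. In this notebook the order is the opposite: content-based filtering (genres + KNN) produces the candidate movies, and collaborative filtering (SVD) scores them to give the final ranking.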

In [8]:
# 0. Imports
import warnings
warnings.filterwarnings('ignore')

from surprise import SVD, SVDpp # SVD matrix factorization
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook


from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer # for vectorizing the genre strings
from sklearn.neighbors import NearestNeighbors # KNN algorithm

import pandas as pd
import numpy as np

In [9]:
# Load the CSV tables. DATA_DIR is the single place to change when the
# files live in a different directory.
DATA_DIR = 'dataset'

links = pd.read_csv(f'{DATA_DIR}/links.csv')
movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
ratings = pd.read_csv(f'{DATA_DIR}/ratings.csv')
tags = pd.read_csv(f'{DATA_DIR}/tags.csv')

In [10]:
# Attach the ratings to the movie rows via movieId, renumber the rows,
# then discard every row that has a missing value in any column.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .reset_index(drop=True)
    .dropna()
)

In [11]:
movies_with_ratings.head()
# genres - will be used for the content-based recommendations
# rating, userId - for the latent-factor (collaborative filtering) recommendations

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [None]:
# Step 1: Collaborative filtering - will be used after the content-based step

In [12]:
# Build the three-column frame (uid, iid, rating) that is handed to the
# Surprise library; the movie title is used as the item id.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [13]:
# Create the Surprise objects: Reader declares the rating scale (0.5 to 5.0),
# Dataset wraps the uid / iid / rating frame built in the previous cell.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [14]:
# Split the Dataset object: 15% of the ratings are held out as the test set;
# random_state is fixed to 42.
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [15]:
# Обучим алгоритм скрытых факторов SVD 
%%time
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

CPU times: user 189 ms, sys: 2.29 ms, total: 191 ms
Wall time: 190 ms


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2a7c320a0>

In [16]:
# Get the predictions of the fitted model for the held-out test set
test_pred = algo.test(testset)

In [17]:
# Calculate RMSE (roughly speaking, the predicted rating is off by about 0.87 on average)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8700


0.8699847645578735

In [18]:
# Get the predicted rating for one specific user and movie
algo.predict(uid=2.0, iid='Mortal Kombat (1995)').est

2.56845174299844

In [19]:
# Build the list of predicted ratings for every movie user 2 has not rated yet
current_user_id = 2.0
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique() # movies the user already rated

# Set membership is a constant-time lookup, whereas `in` on the NumPy array
# rescans the whole array for every candidate movie.
already_rated = set(user_movies)

scores = []
titles = []

for movie in movies_with_ratings.title.unique():
    if movie in already_rated: # already rated by the user - move on
        continue

    scores.append(algo.predict(uid=current_user_id, iid=movie).est) # predicted rating
    titles.append(movie)

In [30]:
# Pair every candidate title with its predicted score
movies_ratings = list(zip(titles, scores))

# Five highest predicted scores, best first
sorted(movies_ratings, key=lambda pair: pair[1], reverse=True)[:5]

[('Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
  4.535930677470675),
 ('Lawrence of Arabia (1962)', 4.462782476738638),
 ('Cool Hand Luke (1967)', 4.3648514258643845),
 ('Apocalypse Now (1979)', 4.336544547239249),
 ('Reservoir Dogs (1992)', 4.310649618433127)]

In [None]:
# Step 2. Content-based filtering

In [42]:
def change_string(s):
    """Normalize a '|'-separated genre string for the vectorizer.

    Spaces and hyphens are removed from the genre names and every '|'
    separator becomes a single space, e.g. 'Sci-Fi|Film-Noir' -> 'SciFi FilmNoir'.
    """
    # One pass over the string: '|' maps to ' ', the characters ' ' and '-' are dropped.
    return s.translate(str.maketrans('|', ' ', ' -'))

# One normalized, lower-cased genre string per row of `movies`, e.g.
# 'Adventure|Animation|Children|Comedy|Fantasy' -> 'adventure animation children comedy fantasy'
movie_genres = [change_string(raw_genres).lower() for raw_genres in movies.genres]

In [43]:
# Preview the first three normalized genre strings
movie_genres[0:3]

['adventure animation children comedy fantasy',
 'adventure children fantasy',
 'comedy romance']

In [44]:
# Apply CountVectorizer (encodes the words numerically: 1 word = 1 id)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)
# X_train_counts = [1, 0, 0, 1, 0, 1, 1, ....]

# apply tf-idf to the resulting numeric representation
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# feed the tf-idf result to the KNN algorithm
neigh = NearestNeighbors(n_neighbors=20, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

In [48]:
# Try the pipeline on a sample set of genres
test = change_string("Adventure|Comedy|Fantasy|Crime")

# same transformations as for the training data: counts, then tf-idf
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

# nearest neighbours of the sample among the fitted movies, with distances
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [71]:
# Show every intermediate value of the sample query
print(f"test = {test};\n\n\
predict = {predict.toarray()};\n\n\
X_tfidf2 = {X_tfidf2.toarray()};\n\n\
res = {res}\n\n\
recommended movies_ids = {res[1][0]}") # res: the first array holds the distances to the neighbours, the second array the row positions of the neighbouring movies

test = Adventure Comedy Fantasy Crime;

predict = [[0 1 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0]];

X_tfidf2 = [[0.         0.51372903 0.         0.         0.32977898 0.52250315
  0.         0.         0.59524681 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]];

res = (array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
        0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
        0.54288608, 0.54288608, 0.54288608, 0.54288608, 0.54288608,
        0.54288608, 0.54288608, 0.6188388 , 0.62682864, 0.62682864]]), array([[6774, 9096, 3576,  863, 2302, 2608, 7865, 3582, 8361, 3302, 5737,
        6723, 5636, 3376, 7496, 5627, 9717, 2206, 6133, 5832]]))

recommended movies_ids = [6774 9096 3576  863 2302 2608 7865 3582 8361 3302 5737 6723 5636 3376
 7496 5627 9717 2206 6133 5832]


In [64]:
# Show the movies found by KNN; res[1][0] holds row positions in `movies`
# (used with .iloc), not movieId values
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
3576,4899,Black Knight (2001),Adventure|Comedy|Fantasy
863,1136,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy
2302,3052,Dogma (1999),Adventure|Comedy|Fantasy
2608,3489,Hook (1991),Adventure|Comedy|Fantasy
7865,94015,Mirror Mirror (2012),Adventure|Comedy|Fantasy
3582,4911,Jabberwocky (1977),Adventure|Comedy|Fantasy
8361,109042,Knights of Badassdom (2013),Adventure|Comedy|Fantasy
3302,4467,"Adventures of Baron Munchausen, The (1988)",Adventure|Comedy|Fantasy


In [75]:
# Order movies_with_ratings by the timestamp field (the time the rating was
# given) - the cells below rely on this order
movies_with_ratings = movies_with_ratings.sort_values('timestamp')
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
15993,590,Dances with Wolves (1990),Adventure|Drama|Western,429.0,5.0,828124615.0
9792,349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller,429.0,3.0,828124615.0
15590,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,429.0,5.0,828124615.0
9480,343,"Baby-Sitters Club, The (1995)",Children,429.0,3.0,828124615.0
11853,421,Black Beauty (1994),Adventure|Children|Drama,429.0,4.0,828124615.0


In [85]:
# Unique movies rated by the user, in the order of movies_with_ratings
# (sorted by timestamp in the previous cell)
current_user_id = 5
user_movies = movies_with_ratings[movies_with_ratings.userId == current_user_id].title.unique()
user_movies

array(['Dances with Wolves (1990)', 'Batman (1989)', 'True Lies (1994)',
       'Pulp Fiction (1994)', 'Apollo 13 (1995)', 'Fugitive, The (1993)',
       'Aladdin (1992)', 'Ace Ventura: Pet Detective (1994)',
       'Batman Forever (1995)', 'Clear and Present Danger (1994)',
       'Beauty and the Beast (1991)', 'Stargate (1994)',
       'Braveheart (1995)', 'Shawshank Redemption, The (1994)',
       'Babe (1995)', 'Usual Suspects, The (1995)',
       'Lion King, The (1994)', "Schindler's List (1993)",
       'Clueless (1995)', 'Toy Story (1995)', 'Pretty Woman (1990)',
       'Remains of the Day, The (1993)',
       'Like Water for Chocolate (Como agua para chocolate) (1992)',
       'Mask, The (1994)', 'Get Shorty (1995)',
       'Postman, The (Postino, Il) (1994)', 'Little Women (1994)',
       'Snow White and the Seven Dwarfs (1937)',
       'Four Weddings and a Funeral (1994)', 'Fargo (1996)',
       'Terminator 2: Judgment Day (1991)', 'Secret Garden, The (1993)',
       'Eat Dri

In [78]:
# Dictionary {film_title_1: genres_of_movie_1, film_title_2: genres_of_movie_2, ...}
# mapping every movie title to its raw genre string.
# Zipping the two columns replaces the row-by-row iterrows() loop; when a title
# occurs more than once the last row still wins.
title_genres = dict(zip(movies['title'], movies['genres']))

In [100]:
# Genres of the last N movies the user rated
LAST_N_MOVIES = 1

# user_movies is in timestamp order because it was built from the sorted movies_with_ratings
genres = [title_genres.get(title) for title in user_movies[-LAST_N_MOVIES:]]
genres

['Action|Thriller']

In [101]:
# Collect the distinct genres over all selected movies
unique_genres = list({
    parsed_genre
    for genre in genres
    for parsed_genre in genre.split('|')
})

# Join the genres, alphabetically ordered, into one space-separated string
out_genre = " ".join(sorted(unique_genres))
print(f"unique_genres = {unique_genres}\n\
out_genre = {out_genre}")

unique_genres = ['Thriller', 'Action']
out_genre = Action Thriller


In [114]:
# Reproduce the logic shown above in a single function:
# step 1 - take the genres of the movie the user rated last
# step 2 - use these genres to get content-based recommendations (movies_to_score)
# step 3 - feed the output of step 2 to the more complex algorithm (SVD here) to get the final ranking

def recommend_for_user(user_id, top_n=10):
    """Print the cascade (content-based -> SVD) recommendations for one user.

    Args:
        user_id: value compared with ``movies_with_ratings.userId`` and passed
            to ``algo.predict`` as ``uid``.
        top_n: maximum number of recommendations to print (default 10).

    Returns:
        None. Each recommendation is printed as ``<title> <predicted rating>``,
        highest predicted rating first.

    Raises:
        IndexError: if the user has no rows in ``movies_with_ratings``.
    """
    # Titles rated by the user, in the row order of movies_with_ratings
    # (an earlier cell sorts that frame by timestamp, so the last title is
    # the most recently rated one).
    user_movies = movies_with_ratings[movies_with_ratings.userId == user_id]['title'].unique()
    if len(user_movies) == 0:
        # Without this guard the [-1] lookup below fails with an opaque
        # "index -1 is out of bounds" message.
        raise IndexError(f"user {user_id!r} has no rated movies to build recommendations from")

    # Step 1: genres of the last rated movie, normalized with change_string
    last_movie_genres = change_string(title_genres[user_movies[-1]])

    # Step 2: same transformations as for the KNN training data (counts, then
    # tf-idf), then look up the nearest movies by genre
    genre_counts = count_vect.transform([last_movie_genres])
    genre_tfidf = tfidf_transformer.transform(genre_counts)
    _, neighbor_rows = neigh.kneighbors(genre_tfidf, return_distance=True)

    # neighbor_rows[0] holds row positions in `movies` for the single query
    movies_to_score = movies.iloc[neighbor_rows[0]]['title'].values

    # Step 3: predict with SVD the rating the user would give to every
    # candidate that has not been rated yet (set lookup instead of scanning
    # the array for each candidate)
    already_rated = set(user_movies)
    titles = [title for title in movies_to_score if title not in already_rated]
    scores = [algo.predict(uid=user_id, iid=title).est for title in titles]

    # np.argsort returns the indices that would sort `scores` in ascending
    # order; reversing it puts the best score first, the slice keeps top_n.
    best_indexes = np.argsort(scores)[::-1][:max(top_n, 0)]
    for i in best_indexes:
        print(titles[i], scores[i])

In [115]:
# Cascade recommendations for user 2
recommend_for_user(2.0)

Few Good Men, A (1992) 3.927147673252832
Zodiac (2007) 3.822997975589508
Shadow of a Doubt (1943) 3.8215386402015254
Frailty (2001) 3.814850710737123
Cape Fear (1962) 3.78287771830989
Bully (2001) 3.7122351128772397
Lodger: A Story of the London Fog, The (1927) 3.655538872484813
Rope (1948) 3.6057424266674776
Performance (1970) 3.586613512508335
Citizen X (1995) 3.5687239717115213


In [104]:
# what user 2 has rated, ordered by rating
movies_with_ratings[movies_with_ratings.userId == 2.0].sort_values('rating')

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
97478,114060,The Drop (2014),Crime|Drama|Thriller,2.0,2.0,1445715000.0
93998,91658,"Girl with the Dragon Tattoo, The (2011)",Drama|Thriller,2.0,2.5,1445715000.0
8652,318,"Shawshank Redemption, The (1994)",Crime|Drama,2.0,3.0,1445715000.0
96746,109487,Interstellar (2014),Sci-Fi|IMAX,2.0,3.0,1445715000.0
91063,77455,Exit Through the Gift Shop (2010),Comedy|Documentary,2.0,3.0,1445715000.0
90135,71535,Zombieland (2009),Action|Comedy|Horror,2.0,3.0,1445715000.0
93833,91529,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX,2.0,3.5,1445715000.0
95272,99114,Django Unchained (2012),Action|Drama|Western,2.0,3.5,1445715000.0
97675,115713,Ex Machina (2015),Drama|Sci-Fi|Thriller,2.0,3.5,1445715000.0
76960,8798,Collateral (2004),Action|Crime|Drama|Thriller,2.0,3.5,1445715000.0


In [109]:
# argsort demo: the result lists the indices of the input in ascending order of value
np.argsort([1,9,5,7])

array([0, 2, 3, 1])

In [112]:
# Scratch data for checking np.argsort: 18 sample scores (several of them
# match the scores printed by recommend_for_user(2.0)).
# The bare integers that trailed some entries were notes typed into the list
# literal and made the cell a SyntaxError; they are kept here as comments.
a = [3.655538872484813,
     3.473914937052649,
     3.814850710737123,   # note: 1
     3.538910093862023,   # note: 4
     3.5018639174120985,
     3.822997975589508,
     3.927147673252832,
     3.5687239717115213,  # note: 3
     3.4002299625606582,
     3.7122351128772397,
     3.8215386402015254,
     3.586613512508335,   # note: 0
     3.78287771830989,
     3.6057424266674776,
     3.5308805173868194,  # note: 2
     3.5018639174120985,
     3.535810522206514,   # note: 5
     3.4481120134640317]  # note: 6

In [113]:
# Indices of `a` in ascending order of value; the last entries point at the highest scores
np.argsort(a)

array([ 8, 17,  1,  4, 15, 14, 16,  3,  7, 11, 13,  0,  9, 12,  2, 10,  5,
        6])