#### Задание 

Датасет ml-latest

1.Вспомнить подходы
3.Выбрать подход к гибридным системам
4.Написать свою рекомендательную систему

In [1]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

import pandas as pd
import numpy as np

In [2]:
links = pd.read_csv('ml-latest/links.csv')

genome_scores = pd.read_csv('ml-latest/genome-scores.csv')
genome_tags = pd.read_csv('ml-latest/genome-tags.csv')

movies = pd.read_csv('ml-latest/movies.csv')
ratings = pd.read_csv('ml-latest/ratings.csv')
tags = pd.read_csv('ml-latest/tags.csv')

### Info
* ratings - в строке одна оценка (0.5-5) одного фильма одним пользователем
* tags - в строке один тег, к одному фильму одним пользователем
* movies - в строке один фильм, его название и жанр
* links - id фильмов в разных рейтинговых системах
* genome - данные о релевантности тегов обзорам фильма

In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int64 
 1   title    58098 non-null  object
 2   genres   58098 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [4]:
ratings[ratings.userId == 1040]

Unnamed: 0,userId,movieId,rating,timestamp
99456,1040,1,2.5,1514919511
99457,1040,6,2.5,1466111934
99458,1040,32,2.5,1436766901
99459,1040,47,4.0,1434426257
99460,1040,50,3.5,1436679843
...,...,...,...,...
99919,1040,176101,3.0,1533791872
99920,1040,176419,4.5,1515990017
99921,1040,176937,3.0,1533791479
99922,1040,177689,2.0,1532418217


In [5]:
rat_with_movies = ratings.join(movies.set_index('movieId'), on='movieId')

# добавим в полученных tags_with_movies информацию об рейтинге фильма
#tags_with_movies_rat = pd.merge(tags_with_movies, ratings,  how='left', left_on=['userId','movieId'], right_on = ['userId','movieId'])
#tags_with_movies_rat = tags_with_movies_rat.dropna()

# удаляем ненужные столбцы
del rat_with_movies['timestamp']

In [6]:
rat_with_movies

Unnamed: 0,userId,movieId,rating,title,genres
0,1,307,3.5,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
1,1,481,3.5,Kalifornia (1993),Drama|Thriller
2,1,1091,1.5,Weekend at Bernie's (1989),Comedy
3,1,1257,4.5,Better Off Dead... (1985),Comedy|Romance
4,1,1449,4.5,Waiting for Guffman (1996),Comedy
...,...,...,...,...,...
27753439,283228,8542,4.5,"Day at the Races, A (1937)",Comedy|Musical
27753440,283228,8712,4.5,My Favorite Wife (1940),Comedy|Romance
27753441,283228,34405,4.5,Serenity (2005),Action|Adventure|Sci-Fi
27753442,283228,44761,4.5,Brick (2005),Crime|Drama|Film-Noir|Mystery


### Соберем датасет

In [7]:
#tags_with_movies = tags.join(movies.set_index('movieId'), on='movieId')
best_scores = genome_scores[genome_scores.relevance > 0.85] # релевантные теги
best_tags = best_scores.join(genome_tags.set_index('tagId'), on='tagId') # добавим содержание тегов

# соберем теги по фильмам
tag_strings = []
movies_ = []

for movie, group in tqdm_notebook(best_tags.groupby('movieId')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace(':', '').replace('-', '') for s in group.tag.values]))
    movies_.append(movie)

best_tags = zip(movies_, tag_strings)
best_tags = pd.DataFrame(best_tags, columns = ['movieId', 'best_tags']) 

movies_with_best_tags = movies.join(best_tags.set_index('movieId'), on='movieId') # добавим список релевантных тегов к фильмам

# соберем теги к одному фильму по каждому пользователю
tag_strings = []
user_movies = []

for user_movie, group in tqdm_notebook(tags.groupby('movieId')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace(':', '').replace('-', '') for s in group.tag.values]))
    user_movies.append(user_movie)

list_tags = zip(user_movies, tag_strings)
list_tags = pd.DataFrame(list_tags, columns = ['movieId', 'tags']) 

# объединим польз теги с данными по фильмам и жанрам
movies_usertags_besttags = movies_with_best_tags.join(list_tags.set_index('movieId'), on='movieId')

# очищаем данные по жанрам
def change_string(s):
    return ' '.join(s.replace(' ', '').replace(':', '').replace('-', '').split('|'))

movies_usertags_besttags['genres_new'] = movies_usertags_besttags.genres.apply(change_string)

# удаляем ненужные столбцы
del movies_usertags_besttags['genres']

# объединяем лучшие и пользовательские теги, там где нет лучших 
movies_usertags_besttags['all_tags'] = np.where(pd.isna(movies_usertags_besttags['best_tags']), 
                                                movies_usertags_besttags['tags'], movies_usertags_besttags['best_tags'])
# удаляем строки с Nan
movies_usertags_besttags = movies_usertags_besttags.dropna(thresh=5)

movies_usertags_besttags

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=12654.0), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=45981.0), HTML(value='')))




Unnamed: 0,movieId,title,best_tags,tags,genres_new,all_tags
0,1,Toy Story (1995),adventure animated animation cartoon childhood...,animated buddymovie Cartoon cgi comedy compute...,Adventure Animation Children Comedy Fantasy,adventure animated animation cartoon childhood...
1,2,Jumanji (1995),adventure animals children family fantasy fant...,fantasy adaptedfrombook animals badcgi basedon...,Adventure Children Fantasy,adventure animals children family fantasy fant...
2,3,Grumpier Old Men (1995),comedy goodsequel sequel sequels,moldy old AnnMargaret BurgessMeredith DarylHan...,Comedy Romance,comedy goodsequel sequel sequels
3,4,Waiting to Exhale (1995),chickflick women,characters girlmovie characters chickflick bas...,Comedy Drama Romance,chickflick women
4,5,Father of the Bride Part II (1995),family fatherdaughterrelationship goodsequel p...,stevemartin stevemartin pregnancy remake aging...,Comedy,family fatherdaughterrelationship goodsequel p...
...,...,...,...,...,...,...
58038,193761,Bel Canto (2018),,concert hostage kidnapping operasinger rebel r...,Drama Thriller,concert hostage kidnapping operasinger rebel r...
58062,193811,Burning Shadow (2018),,doppelganger JulieDelpy,Thriller,doppelganger JulieDelpy
58074,193837,Lily C.A.T. (1987),,aliens anime cat spacemarines virus,Animation Horror SciFi,aliens anime cat spacemarines virus
58087,193864,No somos de piedra (1968),,AlfredoLanda anticonception comic family house...,Comedy,AlfredoLanda anticonception comic family house...


### Обучим алгоритм скрытых факторов

In [8]:
dataset = pd.DataFrame({
    'uid': rat_with_movies.userId,
    'iid': rat_with_movies.title,
    'rating': rat_with_movies.rating
})

In [9]:
dataset.rating.unique()

array([3.5, 1.5, 4.5, 2.5, 4. , 3. , 2. , 5. , 1. , 0.5])

In [10]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [11]:
trainset, testset = train_test_split(data, test_size=.15, random_state=42)

In [12]:
algo = SVD(n_factors=20, n_epochs=20)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fa3502fa590>

In [13]:
test_pred = algo.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.7979


0.7979212276159571

### Создадим словари жанров и тегов по фильмам

In [14]:
title_genres = {}
title_tags = {}

for index, row in tqdm_notebook(movies_usertags_besttags.iterrows()):
    title_genres[row.title] = row.genres_new
    title_tags[row.title] = row.all_tags

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [15]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))
movie_genres = [change_string(g) for g in movies.genres.values]

### Подготовим модель отбора фильмов по жанрам

In [16]:
movie_genres = movies_usertags_besttags.genres_new.values

In [17]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=1000, n_jobs=-1, metric='euclidean') 
neigh_genres = neigh.fit(X_train_tfidf)

### Подготовим модель отбора фильмов по тегам

In [18]:
movie_tags = movies_usertags_besttags.all_tags.values

In [19]:
count_vect = CountVectorizer()
X_train_counts2 = count_vect.fit_transform(movie_tags)

tfidf_transformer = TfidfTransformer()
X_train_tfidf2 = tfidf_transformer.fit_transform(X_train_counts2)

neigh2 = NearestNeighbors(n_neighbors=30, n_jobs=-1, metric='euclidean') 
neigh_tags = neigh2.fit(X_train_tfidf2)

In [20]:
def recommend_for_user(user_id):
    current_user_id = user_id
    user_movies = rat_with_movies[rat_with_movies.userId == current_user_id].title.unique()
    
    # данные о последнем фильме пользователя
    last_user_movie = user_movies[-1]
    movie_genres_user = title_genres[last_user_movie]
    movie_tags_user = title_tags[last_user_movie]
    
    # преобразуем эти данные в tfidf
    predict = count_vect.transform([movie_genres_user])
    predict2 = count_vect.transform([movie_tags_user])
    X_tfidf = tfidf_transformer.transform(predict)
    X_tfidf2 = tfidf_transformer.transform(predict2)

    res = neigh_genres.kneighbors(X_tfidf, return_distance=True)
    res2 = neigh_tags.kneighbors(X_tfidf2, return_distance=True)
    
    movies_to_score = movies_usertags_besttags.iloc[res[1][0]].title.values
    movies_to_score2 = movies_usertags_besttags.iloc[res2[1][0]].title.values

    scores = []
    titles = []

    for movie in movies_to_score2:
        if movie in user_movies:
            continue
        
#        if movie in movies_to_score:
        scores.append(algo.predict(uid=current_user_id, iid=movie).est)
        titles.append(movie)

    best_indexes = np.argsort(scores)[-10:]
    for i in reversed(best_indexes):
        print(titles[i], scores[i])

In [21]:
recommend_for_user(14.0)

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 29590 while Y.shape[1] == 20