In [1]:
import numpy as np 
import pandas as pd 

In [2]:
movies_data = pd.read_csv("../input/movielens100k/movies.csv")
movies_data.drop_duplicates(subset ="title",keep='first',inplace=True,ignore_index=True) 
movies_data.head() 

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_data = pd.read_csv("../input/movielens100k/ratings.csv")
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [4]:
movies_with_ratings = movies_data.merge(ratings_data, on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0,851866703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0,938629179
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0,1331380058
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0,997938310
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,855190091


In [5]:
from surprise import Dataset
from surprise import Reader

In [6]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [7]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,7,Toy Story (1995),3.0
1,9,Toy Story (1995),4.0
2,13,Toy Story (1995),5.0
3,15,Toy Story (1995),2.0
4,19,Toy Story (1995),3.0


In [8]:
dataset.rating.min(), dataset.rating.max() 

(0.5, 5.0)

In [9]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [10]:
from surprise import SVD

In [11]:
trainset = data.build_full_trainset()
model = SVD(n_epochs = 50, lr_all = 0.02,n_factors = 200)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7ef85975c700>

In [12]:
def split_genres(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [13]:
movies_data['genres'] = [split_genres(g) for g in movies_data.genres.values]
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [17]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movies_data.genres.values)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = NearestNeighbors(n_neighbors=50, n_jobs=-1, metric='cosine') 
neigh.fit(X_train_tfidf)

In [18]:
# выбираем 50 наиболее похожих фильмов по жанрам
knn = NearestNeighbors(n_neighbors=50, metric='euclidean') 
knn.fit(X_train_tfidf)

In [40]:
genres = 'Adventure Animation Children Comedy Fantasy'
predict = count_vect.transform([genres])
X_tfidf2 = tfidf_transformer.transform(predict)

res = list(neigh.kneighbors(X_tfidf2, return_distance=False)[0])

In [42]:
movies_data.iloc[res].head()

Unnamed: 0,movieId,title,genres
3217,4016,"Emperor's New Groove, The (2000)",Adventure Animation Children Comedy Fantasy
7943,91355,Asterix and the Vikings (Astérix et les Viking...,Adventure Animation Children Comedy Fantasy
8364,103755,Turbo (2013),Adventure Animation Children Comedy Fantasy
8926,136016,The Good Dinosaur (2015),Adventure Animation Children Comedy Fantasy
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy


In [22]:
# получить жанры по названию фильма
def knn_film(f):
    film = movies_data[movies_data.title == f]
    return str(film.genres)

In [32]:
# находим наиболее похожие фильмы по жанрам введенного фильма,
# удаляем из списка сам этот фильм
def similar_films(film):
    predict = count_vect.transform([knn_film(film)])
    X_tfidf2 = tfidf_transformer.transform(predict)

    res = list(neigh.kneighbors(X_tfidf2, return_distance=False)[0])
    film_id = movies_data[movies_data.title == film].index
    if film_id in res:
        res.remove(film_id)
    return res    

In [35]:
film = 'Toy Story (1995)'
res = similar_films(film)

In [36]:
movies_data.iloc[res].head()

Unnamed: 0,movieId,title,genres
3217,4016,"Emperor's New Groove, The (2000)",Adventure Animation Children Comedy Fantasy
7943,91355,Asterix and the Vikings (Astérix et les Viking...,Adventure Animation Children Comedy Fantasy
8364,103755,Turbo (2013),Adventure Animation Children Comedy Fantasy
8926,136016,The Good Dinosaur (2015),Adventure Animation Children Comedy Fantasy
1815,2294,Antz (1998),Adventure Animation Children Comedy Fantasy


In [26]:
# последний понравившийся юзеру фильм(оценка >= 4) по номеру юзера
def last_liked_film(user):
    df_user = movies_with_ratings[movies_with_ratings.userId == user]
    last_film = df_user[df_user.rating >= 4].sort_values('timestamp',ascending = False)
    return last_film.iloc[0].title

In [37]:
film = last_liked_film(11)
res = similar_films(film)

In [38]:
movies_data.iloc[res].head()

Unnamed: 0,movieId,title,genres
3240,4042,"Alamo, The (1960)",Action Drama War Western
238,266,Legends of the Fall (1994),Drama Romance War Western
142,163,Desperado (1995),Action Romance Western
5138,7379,The Alamo (2004),Drama War Western
4632,6422,Shenandoah (1965),Drama War Western


In [29]:
# предсказываем рейтинг фильмов,отобранных на предыдущем этапе,с помомщью SVD
# не учитываем фильмы, которые юзер уже оценивал
# рекомендуем n фильмов с наибольшими предсказанными рейтингами
def rec_n_films(user,n = 3):
    user_movies = movies_with_ratings[movies_with_ratings.userId == user].title
    knn_films = movies_data.iloc[res].title.values
    scores = []
    titles = []

    for movie in knn_films:
        if movie in user_movies:
            continue

        scores.append(model.predict(uid=user, iid=movie).est)
        titles.append(movie)
    best_indexes = np.argsort(scores)[-n:]
    rec_films = []
    for i in reversed(best_indexes):
        rec_films.append(titles[i])
    rec = movies_data[movies_data.title.isin(rec_films)]
    return rec.set_index('movieId')

In [30]:
rec_n_films(11,5)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1224,Henry V (1989),Action Drama Romance War
1304,Butch Cassidy and the Sundance Kid (1969),Action Western
3368,"Big Country, The (1958)",Romance Western
4802,Operation Petticoat (1959),Action Comedy Romance War
99114,Django Unchained (2012),Action Drama Western
