In [73]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [107]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [120]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

def change_string1(s):
    return ' '.join(s.split(' '))

In [89]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [233]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(movie_genres)
tfidf_vectorizer.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 15,
 'drama': 7,
 'action': 0,
 'crime': 5,
 'thriller': 17,
 'horror': 10,
 'mystery': 13,
 'scifi': 16,
 'war': 18,
 'musical': 12,
 'documentary': 6,
 'imax': 11,
 'western': 19,
 'filmnoir': 9,
 'nogenreslisted': 14}

In [234]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [235]:
test = change_string("Adventure|Comedy|Fantasy|Crime")
X_tfidf2 = tfidf_vectorizer.transform([test])
res = neigh.kneighbors(X_tfidf2, return_distance=True)
res[1][0]

array([6774, 9096, 5636, 6723, 3376, 7496, 9717], dtype=int64)

In [14]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
5636,27368,Asterix & Obelix: Mission Cleopatra (Astérix &...,Adventure|Comedy|Fantasy
6723,58972,Nim's Island (2008),Adventure|Comedy|Fantasy
3376,4591,Erik the Viking (1989),Adventure|Comedy|Fantasy
7496,82854,Gulliver's Travels (2010),Adventure|Comedy|Fantasy
9717,188833,The Man Who Killed Don Quixote (2018),Adventure|Comedy|Fantasy


In [15]:
movies_and_tags = movies.join(tags.set_index('movieId'), on='movieId')
movies_and_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [16]:
movie_tags = movies_and_tags.tag.values
movie_tags

array(['pixar', 'pixar', 'fun', ..., nan, nan, nan], dtype=object)

In [104]:
tfidf_vectorizer = TfidfVectorizer()
x = tfidf_vectorizer.fit_transform(movie_tags.astype('U'))
tfidf_vectorizer.vocabulary_

{'pixar': 1187,
 'fun': 609,
 'fantasy': 548,
 'magic': 952,
 'board': 187,
 'game': 617,
 'robin': 1308,
 'williams': 1715,
 'moldy': 1023,
 'old': 1121,
 'nan': 1065,
 'pregnancy': 1217,
 'remake': 1280,
 'politics': 1201,
 'president': 1220,
 'mafia': 950,
 'jane': 837,
 'austen': 116,
 'hollywood': 736,
 'serial': 1380,
 'killer': 879,
 'alcoholism': 50,
 'shakespeare': 1390,
 'in': 785,
 'netflix': 1084,
 'queue': 1246,
 'kidnapping': 877,
 'high': 719,
 'school': 1354,
 'teacher': 1538,
 'time': 1579,
 'travel': 1605,
 'brad': 204,
 'pitt': 1186,
 'bruce': 221,
 'willis': 1716,
 'mindfuck': 1011,
 'post': 1208,
 'apocalyptic': 88,
 'twist': 1622,
 'ending': 507,
 'animal': 77,
 'movie': 1045,
 'pigs': 1184,
 'villain': 1665,
 'nonexistent': 1100,
 'or': 1130,
 'not': 1107,
 'needed': 1076,
 'for': 586,
 'good': 649,
 'story': 1481,
 'death': 414,
 'penalty': 1168,
 'nun': 1111,
 'twins': 1621,
 'chick': 283,
 'flick': 582,
 'funny': 610,
 'paul': 1163,
 'rudd': 1323,
 'quotable':

In [137]:
median = ratings.groupby('movieId')[['rating']].median()
tags ['tag'] = tags ['tag'] + ' '
tag = tags.groupby('movieId')[['tag']].sum()
median

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,4.0
2,3.5
3,3.0
4,3.0
5,3.0
...,...
193581,4.0
193583,3.5
193585,3.5
193587,3.5


In [145]:
movies_and_median = median.join(movies.set_index('movieId'), on='movieId')
movies_tags= tag.join(movies_and_median, on='movieId')
movies_tags

Unnamed: 0_level_0,tag,rating,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,pixar pixar fun,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,fantasy magic board game Robin Williams game,3.5,Jumanji (1995),Adventure|Children|Fantasy
3,moldy old,3.0,Grumpier Old Men (1995),Comedy|Romance
5,pregnancy remake,3.0,Father of the Bride Part II (1995),Comedy
7,remake,3.0,Sabrina (1995),Comedy|Romance
...,...,...,...,...
183611,Comedy funny Rachel McAdams,4.0,Game Night (2018),Action|Comedy|Crime|Horror
184471,adventure Alicia Vikander video game adaptation,3.0,Tomb Raider (2018),Action|Adventure|Fantasy
187593,Josh Brolin Ryan Reynolds sarcasm,4.0,Deadpool 2 (2018),Action|Comedy|Sci-Fi
187595,Emilia Clarke star wars,4.0,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi


## Выбираем пользователя и считываем Id фильмов, которые он смотрел и оценил выше 3.0

In [155]:
df_user = ratings.query(('userId == 505 '))
df_user = df_user.query(('rating >= 3.0 '))
del df_user['rating']
del df_user['userId']
del df_user['timestamp']
df_user

Unnamed: 0,movieId
80255,47
80256,273
80259,593
80260,968
80261,1022
80262,1029
80263,1033
80264,1215
80265,1241
80266,1261


In [156]:
df_user_tag = df_user.join(movies_tags, on='movieId')
df_user_tag

Unnamed: 0,movieId,tag,rating,title,genres
80255,47,mystery twist ending serial killer,4.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
80256,273,gothic,3.0,Mary Shelley's Frankenstein (Frankenstein) (1994),Drama|Horror|Sci-Fi
80259,593,Hannibal Lector disturbing drama gothic psycho...,4.0,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
80260,968,zombies,4.0,Night of the Living Dead (1968),Horror|Sci-Fi|Thriller
80261,1022,Disney,3.25,Cinderella (1950),Animation|Children|Fantasy|Musical|Romance
80262,1029,Disney,3.5,Dumbo (1941),Animation|Children|Drama|Musical
80263,1033,Disney,3.5,"Fox and the Hound, The (1981)",Animation|Children|Drama
80264,1215,,,,
80265,1241,,,,
80266,1261,,,,


## Фильмы со схожей медианной оценкой, как и те, которые понравились пользователю

In [206]:
df_user_tag['rating'].fillna(0, inplace=True)
m = np.median(df_user_tag['rating'])
m

3.125

In [205]:
m1 = movies_tags.query(('rating >=3'))
m1 = m1.query(('rating <= 3.25'))
m1


Unnamed: 0_level_0,tag,rating,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3,moldy old,3.00,Grumpier Old Men (1995),Comedy|Romance
5,pregnancy remake,3.00,Father of the Bride Part II (1995),Comedy
7,remake,3.00,Sabrina (1995),Comedy|Romance
22,serial killer,3.00,Copycat (1995),Crime|Drama|Horror|Mystery|Thriller
31,high school teacher,3.00,Dangerous Minds (1995),Drama
...,...,...,...,...
135536,Bad story Bad writing Batman Ben Affleck comic...,3.00,Suicide Squad (2016),Action|Crime|Sci-Fi
141890,Moving,3.00,Beasts of No Nation (2015),Drama|War
176419,allegorical uncomfortable unsettling,3.25,Mother! (2017),Drama|Horror|Mystery|Thriller
180985,bad music,3.00,The Greatest Showman (2017),Drama


In [111]:
movie_genres1 = [change_string(g) for g in df_user_tag.genres.values.astype('U')]
movie_tags1 = [change_string(g) for g in df_user_tag.tag.values.astype('U')]

## Ищем фильмы, ближайшие по жанру

In [127]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(movie_genres)
tfidf_vectorizer.vocabulary_

{'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'fantasy': 8,
 'romance': 15,
 'drama': 7,
 'action': 0,
 'crime': 5,
 'thriller': 17,
 'horror': 10,
 'mystery': 13,
 'scifi': 16,
 'war': 18,
 'musical': 12,
 'documentary': 6,
 'imax': 11,
 'western': 19,
 'filmnoir': 9,
 'nogenreslisted': 14}

In [236]:
X_tfidf2 = tfidf_vectorizer.transform(movie_genres1)
res1 = neigh.kneighbors(X_tfidf2, return_distance=True)
res1[1]

array([[2659, 4035, 3517, 7352, 2421,  221, 4902],
       [5099, 5048,  235, 5053, 1028, 7282, 9032],
       [9059, 6895, 8185, 6500,  510, 3946, 5093],
       [8430, 1926, 1700, 4092, 2822, 1925, 1193],
       [ 780, 7207,  812, 1550, 8723, 7907,  783],
       [ 786, 5766, 6804, 1559, 3990, 7766,  782],
       [1493, 6405,  790, 7510, 3938, 4230, 7552],
       [9286,   46, 3329, 6846,  831,  828, 1250],
       [9286,   46, 3329, 6846,  831,  828, 1250],
       [9286,   46, 3329, 6846,  831,  828, 1250],
       [9286,   46, 3329, 6846,  831,  828, 1250],
       [9286,   46, 3329, 6846,  831,  828, 1250],
       [9286,   46, 3329, 6846,  831,  828, 1250],
       [6263, 6951,  307, 2744, 4752, 7236, 9594],
       [2284, 5942, 8589, 5929, 8581, 2292, 5933],
       [8977, 6884, 2055, 2156, 9069, 7454, 2589],
       [9286,   46, 3329, 6846,  831,  828, 1250],
       [1732, 8682, 7569, 6195, 3061, 4961, 7329],
       [2659, 4035, 3517, 7352, 2421,  221, 4902],
       [4408, 3941, 9332, 6650,

## Ищем фильмы, ближайшие по тегам

In [239]:
movie_tags = [change_string1(g) for g in movies_tags.tag.values]

tfidf_vectorizer = TfidfVectorizer()
x = tfidf_vectorizer.fit_transform(movie_tags)
tfidf_vectorizer.vocabulary_

{'pixar': 1186,
 'fun': 609,
 'fantasy': 548,
 'magic': 952,
 'board': 187,
 'game': 617,
 'robin': 1307,
 'williams': 1714,
 'moldy': 1023,
 'old': 1120,
 'pregnancy': 1216,
 'remake': 1279,
 'politics': 1200,
 'president': 1219,
 'mafia': 950,
 'jane': 837,
 'austen': 116,
 'hollywood': 736,
 'serial': 1379,
 'killer': 879,
 'alcoholism': 50,
 'shakespeare': 1389,
 'in': 785,
 'netflix': 1083,
 'queue': 1245,
 'kidnapping': 877,
 'high': 719,
 'school': 1353,
 'teacher': 1537,
 'time': 1578,
 'travel': 1604,
 'brad': 204,
 'pitt': 1185,
 'bruce': 221,
 'willis': 1715,
 'mindfuck': 1011,
 'post': 1207,
 'apocalyptic': 88,
 'twist': 1621,
 'ending': 507,
 'animal': 77,
 'movie': 1045,
 'pigs': 1183,
 'villain': 1664,
 'nonexistent': 1099,
 'or': 1129,
 'not': 1106,
 'needed': 1075,
 'for': 586,
 'good': 649,
 'story': 1480,
 'death': 414,
 'penalty': 1167,
 'nun': 1110,
 'twins': 1620,
 'chick': 283,
 'flick': 582,
 'funny': 610,
 'paul': 1162,
 'rudd': 1322,
 'quotable': 1249,
 'seen'

In [238]:
neigh1 = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh1.fit(x)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
                 metric_params=None, n_jobs=-1, n_neighbors=7, p=2, radius=1.0)

In [240]:
X_tfidf3 = tfidf_vectorizer.transform(movie_tags1)
X_tfidf3
res2 = neigh1.kneighbors(X_tfidf3, return_distance=True)
res2[1]

array([[ 673,  990,   77,   51, 1216, 1104,  978],
       [  69,  673,  990,  146,  510,  103, 1035],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [1241,  231,  782, 1028, 1383,  990,  673],
       [ 246,  347,  243,  147,  247,  489,  149],
       [ 246,  347,  243,  147,  247,  489,  149],
       [ 246,  347,  243,  147,  247,  489,  149],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [ 673,  990,  639,   77,  527,  722, 1216],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [ 714,  634,  673,  990,   77,  488, 1363],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [1241,  231,  782, 1028, 1383,  990,  673],
       [ 673,  990,   77,   51, 1216, 1104,  978],
       [ 673,  990,   77,   51,

## Соединим полученные массивы и найдем фильмы, которые упоминаются в них чаще всего. 


In [241]:
a1 = res1[1].flatten()
a2 = res2[1].flatten()
a = a1 + a2
a

array([ 3332,  5025,  3594,  7403,  3637,  1325,  5880,  5168,  5721,
        1225,  5199,  1538,  7385, 10067,  9732,  7885,  8262,  6551,
        1726,  5050,  6071,  9671,  2157,  2482,  5120,  4205,  2915,
        1866,  1026,  7554,  1055,  1697,  8970,  8396,   932,  1032,
        6113,  7047,  1706,  4237,  8255,   931,  1739,  6752,  1033,
        7657,  4185,  4719,  7701,  9959,  1036,  3406,  6897,  2047,
        1932,  2228,  9959,  1036,  3406,  6897,  2047,  1932,  2228,
        9959,  1036,  3406,  6897,  2047,  1932,  2228,  9959,  1036,
        3406,  6897,  2047,  1932,  2228,  9959,  1036,  3406,  6897,
        2047,  1932,  2228,  9959,  1036,  3406,  6897,  2047,  1932,
        2228,  6936,  7941,   946,  2821,  5279,  7958, 10810,  2957,
        6932,  8666,  5980,  9797,  3396,  6911,  9691,  7518,  2728,
        3146,  9146,  7942,  3952,  9959,  1036,  3406,  6897,  2047,
        1932,  2228,  2973,  8913,  8351,  7223,  4444,  5951,  8002,
        3332,  5025,

In [251]:
from collections import Counter
import operator 

c = Counter(a)
b = [l for k,l in sorted([(j,i) for i,j in c.items()], reverse=True)]
b

[9959,
 6897,
 3406,
 2228,
 2047,
 1932,
 1036,
 7403,
 5880,
 5025,
 3637,
 3594,
 3332,
 1325,
 10810,
 10067,
 10032,
 9866,
 9797,
 9781,
 9734,
 9732,
 9691,
 9671,
 9480,
 9424,
 9409,
 9275,
 9146,
 9086,
 8970,
 8913,
 8666,
 8396,
 8387,
 8351,
 8262,
 8255,
 8002,
 7958,
 7942,
 7941,
 7885,
 7836,
 7827,
 7701,
 7687,
 7657,
 7554,
 7547,
 7518,
 7385,
 7384,
 7223,
 7047,
 6936,
 6932,
 6911,
 6853,
 6752,
 6701,
 6689,
 6551,
 6437,
 6173,
 6113,
 6071,
 5980,
 5951,
 5721,
 5573,
 5279,
 5199,
 5168,
 5120,
 5081,
 5050,
 4931,
 4719,
 4444,
 4237,
 4205,
 4185,
 3952,
 3639,
 3396,
 3146,
 2973,
 2957,
 2915,
 2821,
 2728,
 2724,
 2528,
 2491,
 2482,
 2157,
 2037,
 1866,
 1739,
 1726,
 1706,
 1697,
 1538,
 1225,
 1055,
 1033,
 1032,
 1026,
 946,
 932,
 931]

In [261]:
#выведем названия наиболее подходящих пользователю фильмов, отсортированные по счетчику
b = [l for k,l in sorted([(j,i) for i,j in c.items()], reverse=True)]

movies.iloc[b[1:10]]

Unnamed: 0,movieId,title,genres
6897,63515,The Island (2006),Drama|Mystery
3406,4634,Penn & Teller Get Killed (1989),Adventure|Comedy
2228,2962,Fever Pitch (1997),Comedy|Romance
2047,2726,"Killing, The (1956)",Crime|Film-Noir
1932,2561,True Crime (1999),Crime|Thriller
1036,1348,"Nosferatu (Nosferatu, eine Symphonie des Graue...",Horror
7403,80094,"Last Exorcism, The (2010)",Horror|Thriller
5880,33148,King's Ransom (2005),Comedy|Crime
5025,7815,True Stories (1986),Comedy|Musical
