In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import NearestNeighbors

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-','').split('|'))

### Построить рекомендации (регрессия, предсказываем оценку) на фичах:
    -TF-IDF на тегах и жанрах
    -Средние оценки (+ median, variance, etc.) пользователя и фильма

In [4]:
dup = movies[movies.title.duplicated()==True]['title'].to_list()
movies[movies.title.isin(dup)].sort_values(by='title')

Unnamed: 0,movieId,title,genres
4169,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
650,838,Emma (1996),Comedy|Drama|Romance
5601,26958,Emma (1996),Romance
5854,32600,Eros (2004),Drama
9135,147002,Eros (2004),Drama|Romance
2141,2851,Saturn 3 (1980),Adventure|Sci-Fi|Thriller
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller
5931,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
6932,64997,War of the Worlds (2005),Action|Sci-Fi


In [5]:
to_dtop = [4169, 5601, 5854, 9468, 6932]
movies.drop(to_dtop, inplace=True)

Теги

In [6]:
memento = {}

for index, row in tags.iterrows():
    
    if row['movieId'] in memento:
        memento[row['movieId']] += f" {row['tag']}" 
        
        
    else:
        memento[row['movieId']] = row['tag']


In [7]:
tags2 = pd.DataFrame.from_dict(memento, orient='index').reset_index()
tags2.columns = ['movieId', 'tags']

In [8]:
movies_with_genres_tags = movies.join(tags2.set_index('movieId'), on='movieId')
movies_with_genres_tags['genres'] = movies_with_genres_tags['genres'].apply(change_string)
movies_with_genres_tags.head()

Unnamed: 0,movieId,title,genres,tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old
3,4,Waiting to Exhale (1995),Comedy Drama Romance,
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake


In [9]:
mean_ratings = ratings.groupby('movieId').mean().drop(['userId','timestamp'],axis=1)
mean_ratings.rename(columns = {'rating':'mean_ratings'}, inplace = True)

median_ratings = ratings.groupby('movieId').median().drop(['userId','timestamp'],axis=1)
median_ratings.rename(columns = {'rating':'med_ratings'}, inplace = True)

variance_ratings = ratings.groupby('movieId').var().drop(['userId','timestamp'],axis=1)
variance_ratings.rename(columns = {'rating':'var_ratings'}, inplace = True)

movies_genres_tags_ratings = movies_with_genres_tags.join(mean_ratings, on='movieId')
movies_genres_tags_ratings = movies_genres_tags_ratings.join(median_ratings, on='movieId')
movies_genres_tags_ratings = movies_genres_tags_ratings.join(variance_ratings, on='movieId')

movies_genres_tags_ratings.head()

Unnamed: 0,movieId,title,genres,tags,mean_ratings,med_ratings,var_ratings
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun,3.92093,4.0,0.69699
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game,3.431818,3.5,0.777419
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old,3.259615,3.0,1.112651
3,4,Waiting to Exhale (1995),Comedy Drama Romance,,2.357143,3.0,0.72619
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,3.071429,3.0,0.822917


In [10]:
movies_genres_tags_ratings = movies_genres_tags_ratings.dropna()

In [11]:
tag_strings = movies_genres_tags_ratings.tags.to_list()

In [12]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [13]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [14]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [15]:
test_tags = 'magic board game fantasy game Robin Williams'

predict = count_vect.transform([test_tags])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [16]:
res

(array([[0.        , 1.        , 1.15891734, 1.19605292, 1.2089532 ,
         1.25917202, 1.25917202]]),
 array([[   1,  653, 1207, 1391,  836,  228,  756]], dtype=int64))

In [17]:
movies_genres_tags_ratings.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres,tags,mean_ratings,med_ratings,var_ratings
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game,3.431818,3.5,0.777419
2382,3160,Magnolia (1999),Drama,L.A.,3.711538,4.0,0.983786
6254,46972,Night at the Museum (2006),Action Comedy Fantasy IMAX,Ben Stiller Robin Williams,3.23913,3.0,0.79249
9692,184471,Tomb Raider (2018),Action Adventure Fantasy,adventure Alicia Vikander video game adaptation,2.5,3.0,1.833333
3638,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure Fantasy,fantasy high fantasy Magic mythology tolkien w...,4.106061,4.5,0.874481
744,971,Cat on a Hot Tin Roof (1958),Drama,Tennessee Williams,4.1,4.25,0.766667
3065,4113,"Glass Menagerie, The (1987)",Drama,Tennessee Williams,3.0,3.0,2.0


Жанры

In [18]:
genres_list = movies_genres_tags_ratings.genres.to_list()

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(genres_list)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neig_reg = KNeighborsRegressor(n_neighbors=7, n_jobs=-1, metric='euclidean')
neig_reg.fit(X_train_counts, X_train_tfidf)

KNeighborsRegressor(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [19]:
test = change_string('Adventure|Comedy|Fantasy|Crime')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

predicted_movies = neig_reg.kneighbors(X_tfidf2, return_distance=True)
predicted_movies

(array([[0.89410095, 0.8966645 , 0.8966645 , 0.8966645 , 0.8966645 ,
         0.8966645 , 1.00229273]]),
 array([[1120, 1346,  889,  836,  606,  894,  213]], dtype=int64))

In [20]:
movies_genres_tags_ratings.iloc[predicted_movies[1][0]].sort_values(by=['mean_ratings'], ascending=False)

Unnamed: 0,movieId,title,genres,tags,mean_ratings,med_ratings,var_ratings
5407,25771,"Andalusian Dog, An (Chien andalou, Un) (1929)",Fantasy,mindfuck surreal surrealism,4.5,4.5,0.333333
3638,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure Fantasy,fantasy high fantasy Magic mythology tolkien w...,4.106061,4.5,0.874481
4137,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure Fantasy,Myth Tolkein,4.021277,4.0,0.796336
721,941,"Mark of Zorro, The (1940)",Adventure,swashbuckler,3.75,3.75,0.125
2157,2872,Excalibur (1981),Adventure Fantasy,England King Arthur,3.64,3.5,1.031667
4076,5816,Harry Potter and the Chamber of Secrets (2002),Adventure Fantasy,Magic Wizards,3.598039,3.5,0.85663
8617,118696,The Hobbit: The Battle of the Five Armies (2014),Adventure Fantasy,hope,3.416667,3.25,1.183824


Оценить RMSE на тестовой выборке

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [22]:
print(movies_genres_tags_ratings.mean_ratings.isna().any())
movies_genres_tags_ratings3 = movies_genres_tags_ratings.dropna()

False


In [23]:
X_train, X_test, y_train, y_test = train_test_split(movies_genres_tags_ratings3.drop(['mean_ratings'], axis=1), 
                                                    movies_genres_tags_ratings3.mean_ratings, test_size=0.33, 
                                                    random_state=42)
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [24]:
train_genres_strings = train_df.genres.to_list()
count_vect_tags = CountVectorizer()
X_train_count4 = count_vect_tags.fit_transform(train_genres_strings)

tfidf_transformer = TfidfTransformer()
X_train_tfidf4 = tfidf_transformer.fit_transform(X_train_count4)

neig4 = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
neig4.fit(X_train_count4, train_df.mean_ratings)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [25]:
test_genres_strings = test_df.genres.to_list()

X_test_count4 = count_vect_tags.transform(test_genres_strings)
X_test_tfidf4 = tfidf_transformer.transform(X_test_count4)

predicted = neig4.predict(X_test_count4)

In [26]:
mean_squared_error(test_df.mean_ratings, predicted)

0.2210732001692797