## Домашнее задание по теме «Рекомендации на основе содержания»
Преподаватель: Наталья Баданина

    Использовать dataset MovieLens
    Построить рекомендации (регрессия, предсказываем оценку) на фичах:

    TF-IDF на тегах и жанрах
    Средние оценки (+ median, variance, etc.) пользователя и фильма

    Оценить RMSE на тестовой выборке



In [1]:
import os
import numpy as np
from path import Path
import pandas as pd
from tqdm.notebook import tqdm
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_validate, KFold, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
data_dir = Path('ml-latest-small')
os.listdir(data_dir)

['genome-scores.csv',
 'genome-tags.csv',
 'links.csv',
 'movies.csv',
 'ratings.csv',
 'README.txt',
 'tags.csv']

In [3]:
movies = pd.read_csv(data_dir / 'movies.csv')
tags = pd.read_csv(data_dir / 'tags.csv')
ratings = pd.read_csv(data_dir / 'ratings.csv')

In [4]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264
...,...,...,...,...
27753439,283228,8542,4.5,1379882795
27753440,283228,8712,4.5,1379882751
27753441,283228,34405,4.5,1379882889
27753442,283228,44761,4.5,1354159524


In [5]:
# Использую теги из большого набора данных
tags.dropna(inplace=True)
tags[tags.movieId == 1]

Unnamed: 0,userId,movieId,tag,timestamp
1775,1040,1,animated,1514919574
1776,1040,1,buddy movie,1514919577
1777,1040,1,Cartoon,1514919589
1778,1040,1,cgi,1514919556
1779,1040,1,comedy,1514919569
...,...,...,...,...
1098768,280018,1,witty,1516156302
1100103,280978,1,Pixar,1162989834
1105064,282405,1,Disney,1437927776
1105065,282405,1,friendship,1437927794


In [6]:
tags.shape

(1108981, 4)

In [7]:
def change_tag(tag):
    return ' '.join(list(set(tag))).lower()

tags = tags.groupby('movieId').agg(list)
# tags.tag = tags.tag.apply(change_tag)
tags.head()

Unnamed: 0_level_0,userId,tag,timestamp
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,"[1040, 1040, 1040, 1040, 1040, 1040, 1040, 104...","[animated, buddy movie, Cartoon, cgi, comedy, ...","[1514919574, 1514919577, 1514919589, 151491955..."
2,"[3679, 5510, 5510, 5510, 5510, 5510, 5510, 551...","[fantasy, adapted from:book, animals, bad cgi,...","[1454270092, 1303424411, 1303424415, 130342441..."
3,"[17224, 17224, 19007, 19007, 19007, 19007, 190...","[moldy, old, Ann Margaret, Burgess Meredith, D...","[1143424860, 1143424860, 1506741914, 150674193..."
4,"[2419, 24230, 54594, 54594, 73406, 73406, 7340...","[characters, girl movie, characters, chick fli...","[1335562009, 1187039053, 1360563411, 136276998..."
5,"[20315, 51495, 54271, 54271, 73406, 73406, 734...","[steve martin, steve martin, pregnancy, remake...","[1143209074, 1304427888, 1137373903, 113737390..."


In [8]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|')).lower()

# movies['genres'] = movies['genres'].apply(change_string)
# movies

In [9]:
movies[movies.title.duplicated()]

Unnamed: 0,movieId,title,genres
9142,26958,Emma (1996),Romance
9157,26982,Men with Guns (1997),Drama
13309,64997,War of the Worlds (2005),Action|Sci-Fi
13395,65665,Hamlet (2000),Drama
13614,67459,Chaos (2005),Crime|Drama|Horror
...,...,...,...
57269,191775,Berlin Calling (2008),Comedy|Drama
57305,191867,Let There Be Light (2017),Documentary
57361,192003,Journey to the Center of the Earth (2008),Action|Adventure|Fantasy|Sci-Fi
57463,192243,Contact (1992),Drama|Horror|Mystery|Thriller


In [10]:
# Есть несколько дубликатов
movies[movies.title == 'Confessions of a Dangerous Mind (2002)']

Unnamed: 0,movieId,title,genres
5905,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
36458,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller


In [11]:
movies.drop_duplicates(inplace=True)

In [12]:
ratings = ratings[['movieId', 'rating']].groupby('movieId').agg(np.sum)
ratings

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,266115.0
2,88122.0
3,49466.5
4,8592.0
5,47618.0
...,...
193876,3.0
193878,2.0
193880,2.0
193882,2.0


In [13]:
movies = movies.join(ratings, on='movieId')
movies = movies.join(tags, on='movieId')
movies.dropna(inplace=True)
movies.head()

Unnamed: 0,movieId,title,genres,rating,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,266115.0,"[1040, 1040, 1040, 1040, 1040, 1040, 1040, 104...","[animated, buddy movie, Cartoon, cgi, comedy, ...","[1514919574, 1514919577, 1514919589, 151491955..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,88122.0,"[3679, 5510, 5510, 5510, 5510, 5510, 5510, 551...","[fantasy, adapted from:book, animals, bad cgi,...","[1454270092, 1303424411, 1303424415, 130342441..."
2,3,Grumpier Old Men (1995),Comedy|Romance,49466.5,"[17224, 17224, 19007, 19007, 19007, 19007, 190...","[moldy, old, Ann Margaret, Burgess Meredith, D...","[1143424860, 1143424860, 1506741914, 150674193..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,8592.0,"[2419, 24230, 54594, 54594, 73406, 73406, 7340...","[characters, girl movie, characters, chick fli...","[1335562009, 1187039053, 1360563411, 136276998..."
4,5,Father of the Bride Part II (1995),Comedy,47618.0,"[20315, 51495, 54271, 54271, 73406, 73406, 734...","[steve martin, steve martin, pregnancy, remake...","[1143209074, 1304427888, 1137373903, 113737390..."


In [14]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41772 entries, 0 to 58089
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   movieId    41772 non-null  int64  
 1   title      41772 non-null  object 
 2   genres     41772 non-null  object 
 3   rating     41772 non-null  float64
 4   userId     41772 non-null  object 
 5   tag        41772 non-null  object 
 6   timestamp  41772 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 2.5+ MB


In [15]:
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:10]

['adventure animation children comedy fantasy',
 'adventure children fantasy',
 'comedy romance',
 'comedy drama romance',
 'comedy',
 'action crime thriller',
 'comedy romance',
 'adventure children',
 'action',
 'action adventure thriller']

In [16]:
count_vect = CountVectorizer()
genres_counts = count_vect.fit_transform(movie_genres)
tfidf_transformer = TfidfTransformer()
genres_tfidf = tfidf_transformer.fit_transform(genres_counts)
genres_tfidf.toarray().shape

(41772, 20)

In [17]:
genres_tfidf.toarray()[0][:10]

array([0.        , 0.44099886, 0.49247529, 0.49434753, 0.28038717,
       0.        , 0.        , 0.        , 0.48988942, 0.        ])

In [18]:
movie_tags = [change_tag(g) for g in movies.tag.values]
count_vect = CountVectorizer(min_df=0.001, max_df=0.999)
tags_counts = count_vect.fit_transform(movie_tags)
tfidf_transformer = TfidfTransformer()
tags_tfidf = tfidf_transformer.fit_transform(tags_counts)
tags_tfidf.toarray().shape

(41772, 2618)

In [19]:
X = np.hstack((tags_tfidf.toarray(), genres_tfidf.toarray()))
y = movies.rating
X.shape

(41772, 2638)

In [20]:
# try PCA
# scores = []
# for n in tqdm(range(10, 50)):
#     pca = PCA(n_components = n)
#     XPCA = pca.fit_transform(X)
#     reg = LinearRegression().fit(XPCA, y)
#     score = reg.score(XPCA, y)
#     scores.append(score.mean())
#     print(score.mean())

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# reg = LinearRegression().fit(X_train, y_train)
# reg.score(X_test, y_test), mean_squared_error(reg.predict(X_test), y_test, squared=False)

In [28]:
%%time
rf = RandomForestRegressor(max_depth=12, random_state=42, verbose=True, n_jobs=-1)
rf.fit(X_train, y_train)
rf.score(X_test, y_test), mean_squared_error(reg.predict(X_test), y_test, squared=False)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.8min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.1s finished


CPU times: user 1h 6min, sys: 17 s, total: 1h 6min 17s
Wall time: 7min 47s


(0.7421691483565127, 16900.838958443324)