In [61]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

%matplotlib inline

In [24]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [25]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [26]:
movies_modified = movies.copy()

In [27]:
movies_modified['list_genres'] = [change_string(g) for g in movies_modified.genres.values]
movies_modified 

Unnamed: 0,movieId,title,genres,list_genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Action Animation Comedy Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,Animation Comedy Fantasy
9739,193585,Flint (2017),Drama,Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Action Animation


In [112]:
l = movies_modified.merge(tags)
p = l.groupby('movieId').agg({'tag':list}).reset_index()
p['tag'] = [' '.join(map(str, l)) for l in p['tag']]
p

Unnamed: 0,movieId,tag
0,1,pixar pixar fun
1,2,fantasy magic board game Robin Williams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake
...,...,...
1567,183611,Comedy funny Rachel McAdams
1568,184471,adventure Alicia Vikander video game adaptation
1569,187593,Josh Brolin Ryan Reynolds sarcasm
1570,187595,Emilia Clarke star wars


In [113]:
movies_modified_new = movies_modified.join(p.set_index('movieId'), on='movieId')
movies_modified_new = movies_modified_new.fillna(' ')
movies_modified_new

Unnamed: 0,movieId,title,genres,list_genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance,moldy old
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance,
4,5,Father of the Bride Part II (1995),Comedy,Comedy,pregnancy remake
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,Action Animation Comedy Fantasy,
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,Animation Comedy Fantasy,
9739,193585,Flint (2017),Drama,Drama,
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,Action Animation,


In [114]:
# Насколько правильно использовать tfidf_transformer_genres.fit(movies_modified_new[['list_genres','tag']]) ? 

tfidf_transformer_genres = TfidfVectorizer()
tfidf_transformer_genres.fit(movies_modified_new[['list_genres','tag']])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [104]:
p = ratings.groupby('movieId')['rating'].median().reset_index()
k = ratings.groupby('movieId')['rating'].mean().reset_index()
p = p.rename(columns={'rating': 'rating_median'})
k = k.rename(columns={'rating': 'rating_mean'})
with_ratings = with_ratings.merge(p)
with_ratings = with_ratings.merge(k)
with_ratings = with_ratings

In [105]:
for_model_df = with_ratings[with_ratings['userId'] == 100]
for_model_df

Unnamed: 0,movieId,title,list_genres,tag,rating,userId,rating_median,rating_mean
337,3,Grumpier Old Men (1995),Comedy Romance,moldy old,3.5,100,3.00,3.259615
759,11,"American President, The (1995)",Comedy Drama Romance,politics president,4.0,100,4.00,3.671429
882,16,Casino (1995),Crime Drama,Mafia,4.5,100,4.00,3.926829
966,17,Sense and Sensibility (1995),Drama Romance,Jane Austen,4.5,100,4.00,3.776119
1056,19,Ace Ventura: When Nature Calls (1995),Comedy,,1.0,100,3.00,2.727273
...,...,...,...,...,...,...,...,...
72254,6785,Seven Brides for Seven Brothers (1954),Comedy Musical Romance Western,Lonesome Polecat,4.5,100,3.75,3.750000
72599,6873,Intolerable Cruelty (2003),Comedy Romance,,4.0,100,3.00,3.175000
73819,7149,Something's Gotta Give (2003),Comedy Drama Romance,,3.5,100,4.00,3.647059
76290,8529,"Terminal, The (2004)",Comedy Drama Romance,,4.0,100,3.50,3.319149


In [95]:
# pd.DataFrame(tfidf_transformer.transform(for_model_df['list_genres']).toarray())

In [108]:
# Как правильно присвоить значение переменной X? Чтобы было не только 2 значения и то те нулевые. 
X, y = tfidf_transformer.transform(for_model_df[['list_genres','tag']]), for_model_df['rating']

In [107]:
pd.DataFrame(X.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
X, y = tfidf_transformer.transform(for_model_df['tag']), for_model_df['rating']

In [86]:
X_tain, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 12)

ValueError: Found input variables with inconsistent numbers of samples: [2, 148]

In [76]:
lr = LinearRegression()
lr.fit(X_tain,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [77]:
lr.predict(X_test)

array([3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.56837835, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571,
       3.96428571, 3.96428571, 3.96428571, 3.96428571, 3.96428571])

In [78]:
y_test

72599    4.0
62640    3.0
21016    3.5
44048    3.5
10371    4.0
6487     4.0
42137    4.0
72254    4.5
76290    4.0
66344    3.5
11146    4.0
59324    3.0
69791    4.5
33885    3.5
30927    4.0
28598    4.0
57002    4.5
11349    3.5
69968    4.5
56631    4.0
20215    4.0
27553    4.5
60466    4.5
20175    4.5
15062    4.0
4961     4.5
28335    3.5
7114     4.5
66821    4.5
22410    2.0
759      4.0
31260    4.5
6380     1.0
58123    3.5
27950    3.5
6357     4.0
28980    4.5
19674    4.5
13530    4.0
3060     4.0
7909     3.5
5930     4.0
3250     3.5
16812    4.5
69396    4.5
2940     4.0
62771    3.5
59990    5.0
55257    4.0
21267    4.0
40096    4.5
52992    4.5
37180    3.5
52651    4.0
3109     4.5
39767    3.5
1413     4.5
70826    4.0
43654    5.0
46352    3.5
Name: rating, dtype: float64