In [1]:
import pandas as pd
import numpy as np

In [90]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')

In [91]:
# удалим колонки, которые не будут использоваться в модели
ratings = ratings.drop('timestamp', axis = 1)
tags = tags.drop(['timestamp','userId'], axis = 1)
# и приведем содержание к нужному виду
tags['tag'] = tags['tag'].apply(lambda x: x.replace(' ', '')).str.lower()
movies['genres'] = movies['genres'].str.replace('Sci-Fi','SciFi')
movies['genres'] = movies['genres'].str.replace('Film-Noir','Noir')
movies['genres'] = movies['genres'].apply(lambda x: x.replace('|', ' ')).str.lower()

In [92]:
#уберем повторяющиеся тэги
tags = tags.drop_duplicates()

In [93]:
tags.head()

Unnamed: 0,movieId,tag
0,60756,funny
1,60756,highlyquotable
2,60756,willferrell
3,89774,boxingstory
4,89774,mma


In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
X_tfidf = tfidf_vec.fit_transform(tags['tag'])

In [95]:
#соберем все тэги и их веса в отдельный список
idf = tfidf_vec.idf_
weights = dict(zip(tfidf_vec.get_feature_names(), idf))

weights_sort= sorted(weights.items(), key=lambda x: x[1], reverse = True)
weights_sort

[('1900s', 8.488293515159427),
 ('1960s', 8.488293515159427),
 ('1990s', 8.488293515159427),
 ('2001', 8.488293515159427),
 ('2danimation', 8.488293515159427),
 ('70mm', 8.488293515159427),
 ('80', 8.488293515159427),
 ('abortion', 8.488293515159427),
 ('absorbing', 8.488293515159427),
 ('abstract', 8.488293515159427),
 ('academyaward', 8.488293515159427),
 ('accident', 8.488293515159427),
 ('achronological', 8.488293515159427),
 ('acleverchefrat', 8.488293515159427),
 ('acting', 8.488293515159427),
 ('actionchoreography', 8.488293515159427),
 ('actionpacked', 8.488293515159427),
 ('addiction', 8.488293515159427),
 ('adingoatemybaby', 8.488293515159427),
 ('adorable', 8.488293515159427),
 ('adrienbrody', 8.488293515159427),
 ('adulthumor', 8.488293515159427),
 ('afghanistan', 8.488293515159427),
 ('aggressive', 8.488293515159427),
 ('aging', 8.488293515159427),
 ('alanrickman', 8.488293515159427),
 ('alcatraz', 8.488293515159427),
 ('aliciavikander', 8.488293515159427),
 ('allegorical'

In [96]:
#убираем из рассмотрения низкочастотные тэги, заменим все тэги с весом менее 7 на other
tags_to_replace = []
for i in range (0, len(weights_sort)):
    if weights_sort[i][1] < 7:
        tags_to_replace.append(weights_sort[i][0] )
        

for i in tags_to_replace:
    tags['tag'] = tags['tag'].apply(lambda x: x.replace(i, 'other'))

In [97]:
#уберем повторяющиеся тэги
tags = tags.drop_duplicates()

#объединяем тэги по фильму
tags = tags.groupby(['movieId']).agg(lambda row: ' '.join(row.dropna())).reset_index()
tags.head()

Unnamed: 0,movieId,tag
0,1,pixar fun
1,2,fantasy magicboardgame robinwilliams game
2,3,moldy old
3,5,pregnancy other
4,7,other


In [98]:
#обучим tf-idf по получившемуся списку тэгов 
matrix_tfidf = tfidf_vec.fit_transform(tags['tag'])
tags_tfidf = pd.DataFrame(matrix_tfidf.todense(), columns = tfidf_vec.get_feature_names(), index=tags['movieId']).reset_index()

tags_tfidf.head()

Unnamed: 0,movieId,06othernominatedbestmovie,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001,...,workplace,worldwari,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [99]:
#обучим tf-idf по списку жанров

tfidf_matrix = tfidf_vec.fit_transform(movies['genres'])
genres_tfidf = pd.DataFrame(tfidf_matrix.todense(), columns = tfidf_vec.get_feature_names(), index=movies['movieId']).reset_index()
genres_tfidf.head()

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,listed,musical,mystery,no,noir,romance,scifi,thriller,war,western
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
#добавим средние оценки по фильму и по пользователю
mean_rating = ratings.groupby(['movieId']).rating.agg(['mean'])
ratings = ratings.join(mean_rating , on='movieId')
ratings = ratings.rename({'mean': 'mean_for_movie'}, axis=1)

user_rat = ratings.groupby(['userId']).rating.agg(['mean'])
ratings = ratings.join(user_rat , on='userId')
ratings = ratings.rename({'mean': 'mean_for_user'}, axis=1) 

ratings.head()

Unnamed: 0,userId,movieId,rating,mean_for_movie,mean_for_user
0,1,1,4.0,3.92093,4.366379
1,1,3,4.0,3.259615,4.366379
2,1,6,4.0,3.946078,4.366379
3,1,47,5.0,3.975369,4.366379
4,1,50,5.0,4.237745,4.366379


In [103]:
#объединяем датафреймы
movie_info = genres_tfidf.merge(tags_tfidf, on='movieId')
finall = ratings.merge(movie_info, on = ['movieId'])
finall.head()

Unnamed: 0,userId,movieId,rating,mean_for_movie,mean_for_user,action,adventure,animation,children_x,comedy,...,workplace,worldwari,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,1,4.0,3.92093,4.366379,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,1,4.0,3.92093,3.636364,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7,1,4.5,3.92093,3.230263,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,15,1,2.5,3.92093,3.448148,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,1,4.5,3.92093,4.209524,0.0,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
#задаем Х и У, делим выборки

from sklearn.model_selection import train_test_split
X1 = finall.iloc[:, 3:]
y = finall.rating
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=42)

In [112]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [118]:
reg.score(X_test, y_test)

-1.1063969232976783e+22