In [1]:
import pandas as pd
import numpy as np

In [2]:
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')

In [3]:
# удалим колонки, которые не будут использоваться в модели
ratings = ratings.drop('timestamp', axis = 1)
tags = tags.drop(['timestamp','userId'], axis = 1)
# и приведем содержание к нужному виду
tags['tag'] = tags['tag'].apply(lambda x: x.replace(' ', '')).str.lower()
movies['genres'] = movies['genres'].apply(lambda x: x.replace('|', ' ')).str.lower()

In [4]:
#уберем повторяющиеся тэги
tags = tags.drop_duplicates()

In [5]:
tags.head()

Unnamed: 0,movieId,tag
0,60756,funny
1,60756,highlyquotable
2,60756,willferrell
3,89774,boxingstory
4,89774,mma


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer()
X_train_tfidf = tfidf_vec.fit_transform(tags['tag'])

In [7]:
#соберем все тэги и их веса в отдельный список
idf = tfidf_vec.idf_
weights = dict(zip(tfidf_vec.get_feature_names(), idf))

weights_sort= sorted(weights.items(), key=lambda x: x[1], reverse = True)
weights_sort

[('1900s', 8.488293515159427),
 ('1960s', 8.488293515159427),
 ('1990s', 8.488293515159427),
 ('2001', 8.488293515159427),
 ('2danimation', 8.488293515159427),
 ('70mm', 8.488293515159427),
 ('80', 8.488293515159427),
 ('abortion', 8.488293515159427),
 ('absorbing', 8.488293515159427),
 ('abstract', 8.488293515159427),
 ('academyaward', 8.488293515159427),
 ('accident', 8.488293515159427),
 ('achronological', 8.488293515159427),
 ('acleverchefrat', 8.488293515159427),
 ('acting', 8.488293515159427),
 ('actionchoreography', 8.488293515159427),
 ('actionpacked', 8.488293515159427),
 ('addiction', 8.488293515159427),
 ('adingoatemybaby', 8.488293515159427),
 ('adorable', 8.488293515159427),
 ('adrienbrody', 8.488293515159427),
 ('adulthumor', 8.488293515159427),
 ('afghanistan', 8.488293515159427),
 ('aggressive', 8.488293515159427),
 ('aging', 8.488293515159427),
 ('alanrickman', 8.488293515159427),
 ('alcatraz', 8.488293515159427),
 ('aliciavikander', 8.488293515159427),
 ('allegorical'

In [8]:
#убираем из рассмотрения низкочастотные тэги, заменим все тэги с весом менее 7 на other
tags_to_replace = []
for i in range (0, len(weights_sort)):
    if weights_sort[i][1] < 7:
        tags_to_replace.append(weights_sort[i][0] )
        

for i in tags_to_replace:
    tags['tag'] = tags['tag'].apply(lambda x: x.replace(i, 'other'))

In [9]:
#уберем повторяющиеся тэги
tags = tags.drop_duplicates()

In [10]:
#объединяем тэги по фильму
tags = tags.groupby(['movieId']).agg(lambda row: ' '.join(row.dropna()))
tags.head()

Unnamed: 0_level_0,tag
movieId,Unnamed: 1_level_1
1,pixar fun
2,fantasy magicboardgame robinwilliams game
3,moldy old
5,pregnancy other
7,other


In [49]:
X1_train_tfidf = tfidf_vec.fit_transform(tags['tag'])
tags['tags - tfidf'] = list((X1_train_tfidf).toarray())

In [50]:
tags.head()

Unnamed: 0_level_0,tag,tags - tfidf
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,pixar fun,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,fantasy magicboardgame robinwilliams game,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,moldy old,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,pregnancy other,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,other,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [51]:
#объединяем датафреймы
rat_movies = ratings.join(movies.set_index('movieId'), on='movieId')
finall = rat_movies.merge(tags, on = ['movieId'])
finall.head()

Unnamed: 0,userId,movieId,rating,title,genres,tag,tags - tfidf
0,1,1,4.0,Toy Story (1995),adventure animation children comedy fantasy,pixar fun,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,5,1,4.0,Toy Story (1995),adventure animation children comedy fantasy,pixar fun,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,7,1,4.5,Toy Story (1995),adventure animation children comedy fantasy,pixar fun,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,15,1,2.5,Toy Story (1995),adventure animation children comedy fantasy,pixar fun,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,17,1,4.5,Toy Story (1995),adventure animation children comedy fantasy,pixar fun,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
finall['genres - tfidf'] = list(tfidf_vec.fit_transform(finall['genres']).toarray())

In [53]:
finall = finall.drop(['genres', 'tag', 'title'], axis = 1)
finall.head()

Unnamed: 0,userId,movieId,rating,tags - tfidf,genres - tfidf
0,1,1,4.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,..."
1,5,1,4.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,..."
2,7,1,4.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,..."
3,15,1,2.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,..."
4,17,1,4.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,..."


In [54]:
#добавим средние оценки по фильму
mean_rating = finall.groupby(['movieId']).rating.agg(['mean'])
finall = finall.join(mean_rating , on='movieId')
finall = finall.rename({'mean': 'mean_for_movie'}, axis=1) 

In [55]:
#добавим средние оценки по пользователю
user_rat = finall.groupby(['userId']).rating.agg(['mean'])
finall = finall.join(user_rat , on='userId')
finall = finall.rename({'mean': 'mean_for_user'}, axis=1) 
finall.head()

Unnamed: 0,userId,movieId,rating,tags - tfidf,genres - tfidf,mean_for_movie,mean_for_user
0,1,1,4.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,...",3.92093,4.403509
1,5,1,4.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,...",3.92093,3.74359
2,7,1,4.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,...",3.92093,3.543956
3,15,1,2.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,...",3.92093,3.5625
4,17,1,4.5,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.36089327162674073, 0.5434373477167418,...",3.92093,4.27907


In [56]:
#отбираем данные по одному пользователю
user_474 = finall.loc[finall.userId == 474]

In [64]:
#задаем Х и У, делим выборки

from sklearn.model_selection import train_test_split
X = user_474.iloc[:,3:]
y = user_474.rating
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [65]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)

ValueError: setting an array element with a sequence.