In [1]:
import pandas as pd
import numpy as np

In [2]:
links = pd.read_csv('Dataset/links.csv')
movies = pd.read_csv('Dataset/movies.csv')
ratings = pd.read_csv('Dataset/ratings.csv')
tags = pd.read_csv('Dataset/tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# получим средний рейтинг фильмов по всем пользователям
rating = ratings.groupby('movieId')[['rating']].mean()
rating.head()

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.92093
2,3.431818
3,3.259615
4,2.357143
5,3.071429


In [6]:
movies_rating = movies.merge(rating, on='movieId')
movies_rating.head()

Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.431818
2,3,Grumpier Old Men (1995),Comedy|Romance,3.259615
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357143
4,5,Father of the Bride Part II (1995),Comedy,3.071429


In [7]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [8]:
movies_rating['genres'] = [change_string(g) for g in movies_rating.genres.values]
movies_rating.head()

Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,3.92093
1,2,Jumanji (1995),Adventure Children Fantasy,3.431818
2,3,Grumpier Old Men (1995),Comedy Romance,3.259615
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2.357143
4,5,Father of the Bride Part II (1995),Comedy,3.071429


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# получаем векторы tfidf по жанрам для каждого фильма
tfidf = TfidfVectorizer()
genres_tfidf = tfidf.fit_transform(movies_rating.genres)
genres_tfidf

<9724x20 sparse matrix of type '<class 'numpy.float64'>'
	with 22046 stored elements in Compressed Sparse Row format>

In [11]:
genres_tfidf_df = pd.DataFrame(genres_tfidf.toarray(), columns=tfidf.get_feature_names_out())
genres_tfidf_df.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.0,0.416817,0.51634,0.504733,0.267517,0.0,0.0,0.0,0.483048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.512358,0.0,0.620425,0.0,0.0,0.0,0.0,0.593769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.570532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.821275,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.504636,0.0,0.0,0.466539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726418,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# предсказание рейтинга фильма на основе его указанных жанров
X = genres_tfidf_df
y = rating.rating

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [15]:
rf = RandomForestRegressor(max_depth=7)

In [16]:
rf.fit(X_train,y_train)

RandomForestRegressor(max_depth=7)

In [17]:
pred = rf.predict(X_test)

In [18]:
mean_squared_error(y_test, pred,squared=False)

0.8348471214134994

In [19]:
# получим датафрейм с проставленными тегами для фильмов
movies_with_tags = movies.merge(tags, on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,62,magic board game,1528843932


In [20]:
# рейтинг и теги для каждого фильма
movies_tags = movies_with_tags.groupby('movieId')[['tag']].sum()
movies_tags_rate = movies_tags.merge(rating, on = 'movieId')

In [21]:
movies_tags_rate.head()

Unnamed: 0_level_0,tag,rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,pixarpixarfun,3.92093
2,fantasymagic board gameRobin Williamsgame,3.431818
3,moldyold,3.259615
5,pregnancyremake,3.071429
7,remake,3.185185


In [22]:
# список тегов для каждого фильма
def change_string_tags(s):
    return str(s).replace(' ', '').replace('-', '').lower()

tag_strings = []
mov = []

for movie, group in movies_with_tags.groupby('movieId'):
    if movie in movies_tags_rate.index.unique():
        tag_strings.append(' '.join([change_string_tags(s) for s in group.tag.values]))
        mov.append(movie)

In [23]:
# получаем векторы tfidf по тегам для каждого фильма
tfidf_tag = TfidfVectorizer()
X_train_tfidf_tag = tfidf_tag.fit_transform(tag_strings)
X_train_tfidf_tag

<1554x1469 sparse matrix of type '<class 'numpy.float64'>'
	with 3577 stored elements in Compressed Sparse Row format>

In [24]:
tags_tfidf_df = pd.DataFrame(X_train_tfidf_tag.toarray(), columns=tfidf_tag.get_feature_names_out())
tags_tfidf_df.head()

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# предсказание рейтинга фильма на основе поставленных тегов
X2 = tags_tfidf_df
y2 = movies_tags_rate.rating

In [26]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=12)

In [27]:
rf = RandomForestRegressor(max_depth=8)

In [28]:
rf.fit(X2_train,y2_train)

RandomForestRegressor(max_depth=8)

In [29]:
pred = rf.predict(X2_test)

In [30]:
mean_squared_error(y2_test, pred,squared=False)

0.5161248528846628

Теги являются более информативным признаком для предсказания рейтинга фильма и модель на их основе показывает значительно лучшие результаты, чем модель с использованием жанров фильма

In [36]:
genres_df = genres_tfidf_df.set_index(movies_rating['movieId'])
genres_df.head()

Unnamed: 0_level_0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.0,0.416817,0.51634,0.504733,0.267517,0.0,0.0,0.0,0.483048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.512358,0.0,0.620425,0.0,0.0,0.0,0.0,0.593769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.570532,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.821275,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.504636,0.0,0.0,0.466539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726418,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
tags_df = tags_tfidf_df.set_index(movies_tags_rate.index)
tags_df.head()

Unnamed: 0_level_0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# объединим данные по тегам и жанрам в общий датасет
genres_tags_df = pd.concat([genres_df, tags_df],axis=1,join='inner')

In [58]:
genres_tags_df.head()

Unnamed: 0_level_0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.416817,0.51634,0.504733,0.267517,0.0,0.0,0.0,0.483048,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.512358,0.0,0.620425,0.0,0.0,0.0,0.0,0.593769,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.570532,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.570532,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
# предсказание рейтинга фильма по тегам и жанрам
X3 = genres_tags_df
y3 = movies_tags_rate.rating

In [62]:
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=12)

In [63]:
rf.fit(X3_train,y3_train)

RandomForestRegressor(max_depth=8)

In [64]:
pred = rf.predict(X3_test)

In [65]:
# модель на тегах и жанрах показывает самый лучший результат
mean_squared_error(y3_test, pred,squared=False)

0.5004710885667832