In [4]:
import pandas as pd
import numpy as np
import requests

from tqdm.notebook import tqdm
from  sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

In [2]:
# максимальное число признаков tfidf
max_features_tfidf = 750

In [3]:
!wget 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'  # 100 000 оценок и 3600 тегов, применённых к 9000 фильмов 600 пользователями. Последнее обновление 9/2018.

--2025-05-10 13:50:54--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2025-05-10 13:50:56 (2.42 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]



In [None]:
#!wget https://files.grouplens.org/datasets/movielens/ml-latest.zip # Полная версия: около 33 000 000 оценок и 2 000 000 тегов, применённых к 86 000 фильмов 330 975 пользователями.
                                                                   # Включает данные о геноме тегов с 14 миллионами оценок релевантности по 1100 тегам. Последнее обновление: сентябрь 2018 г.

In [20]:
!unzip -o ml-latest-small.zip # !unzip -o ml-latest-small.zip или !yes A | unzip ml-latest-small.zip (архив автоматически распаковывался с заменой всех файлов без запроса подтверждения)

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [21]:
movies = pd.read_csv('ml-latest-small/movies.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
links = pd.read_csv('ml-latest-small/links.csv')

## feature_genres_tf_idf таблицa. Обработка жанров - преобразование в строку для TF-IDF

In [None]:
print(f'Размерность movies таблицы: {movies.shape}')
print(f'Пропуски movies: {movies.isnull().sum().values}')
print(f'Размерность tags таблицы: {tags.shape}')
print(f'Пропуски tags: {tags.isnull().sum().values}')

Размерность movies таблицы: (9742, 3)
Пропуски movies: [0 0 0]
Размерность tags таблицы: (3683, 4)
Пропуски tags: [0 0 0 0]


In [None]:
movies_with_tags = movies.merge(right=tags, on='movieId', how='left')
print(f'Размерность  таблицы: {movies_with_tags.shape}')
print(f'Пропуски: {movies_with_tags.isnull().sum().values}')
movies_with_tags = movies_with_tags[['movieId', 'userId', 'title', 'genres', 'tag', 'timestamp']]
movies_with_tags.head(3)

Размерность  таблицы: (11853, 6)
Пропуски: [   0    0    0 8170 8170 8170]


Unnamed: 0,movieId,userId,title,genres,tag,timestamp
0,1,336.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar,1139046000.0
1,1,474.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar,1137207000.0
2,1,567.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,fun,1525286000.0


In [None]:
# strip() - удаляет лишиние пробелы в начале и в конце строки, #upper - нормализует в единый регистр
movies_with_tags['genres'] = movies_with_tags['genres'].apply(lambda str_: " ".join(str_.replace(' ', '').replace('-', '').split('|')).strip().upper())

In [None]:
movies_with_tags.fillna(0, inplace=True) # заполнение пропусков
print(f'Пропуски: {movies_with_tags.isnull().sum().values}')
movies_with_tags.head(1)

Пропуски: [0 0 0 0 0 0]


Unnamed: 0,movieId,userId,title,genres,tag,timestamp
0,1,336.0,Toy Story (1995),ADVENTURE ANIMATION CHILDREN COMEDY FANTASY,pixar,1139046000.0


In [None]:
def feature_tfidf(df, column, get_df=None):
  """
  Функция преобразует текстовую колонку в матрицу TF-IDF
  """
  # Создаем CountVectorizer и трансформируем текст
  CountVectorizer_text = CountVectorizer(max_features=max_features_tfidf)
  x_countvectorizer = CountVectorizer_text.fit_transform(df[column])
  # Затем производим TF-IDF трансформацию
  tf_idf = TfidfTransformer()
  x_tf_idf = tf_idf.fit_transform(x_countvectorizer).toarray()

  if get_df==None:
    return x_tf_idf
  elif get_df == 'yes':
    return pd.DataFrame(data=x_tf_idf, columns=CountVectorizer_text.get_feature_names_out())

In [None]:
genres_tfidf = feature_tfidf(movies_with_tags, 'genres', get_df='yes')
print(f'Размерность genres_tfidf таблицы: {genres_tfidf.shape}')
genres_tfidf.head(1)

Размерность genres_tfidf таблицы: (11853, 20)


Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,0.481791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Присоединяем к таблице  ID-ки фильмов
genres_tfidf = pd.concat([movies_with_tags['movieId'], genres_tfidf], axis=1)
print(f'Размерность genres_tfidf таблицы: {genres_tfidf.shape}')
genres_tfidf.head(3)

Размерность genres_tfidf таблицы: (11853, 21)


Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,0.481791,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,0.481791,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,0.481791,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## feature_tag_tf_idf таблицa. Обработка тегов - объединение всех тегов для каждого фильма. TF-IDF

In [None]:
print(f'Размерность tags таблицы: {tags.shape}')
print(f'Пропуски tags: {tags.isnull().sum().values}')
print(f'Всего уникальных фильмов - {movies.movieId.nunique()}')
print(f'Уникальные фильмы у которых присутствуют теги - {tags.movieId.nunique()}')
tags.head(1)

Размерность tags таблицы: (3683, 4)
Пропуски tags: [0 0 0 0]
Всего уникальных фильмов - 9742
Уникальные фильмы у которых присутствуют теги - 1572


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994


In [None]:
movies_with_tags.head(1)

Unnamed: 0,movieId,userId,title,genres,tag,timestamp
0,1,336.0,Toy Story (1995),ADVENTURE ANIMATION CHILDREN COMEDY FANTASY,pixar,1139046000.0


In [None]:
# Применяем группировку и метод apply. str.replace не применял
tags_movie = movies_with_tags.groupby(['movieId'])['tag'].apply(
    lambda tags: " ".join([str(s).strip().upper() for s in tags])).reset_index()

In [None]:
print(tags_movie.shape)
tags_movie.head(3)

(9742, 2)


Unnamed: 0,movieId,tag
0,1,PIXAR PIXAR FUN
1,2,FANTASY MAGIC BOARD GAME ROBIN WILLIAMS GAME
2,3,MOLDY OLD


In [None]:
tags_tfidf = feature_tfidf(df=tags_movie, column='tag', get_df='yes')
print(f'Размерность tags_tfidf таблицы: {tags_tfidf.shape}')
tags_tfidf.head(1)

Размерность tags_tfidf таблицы: (9742, 750)


Unnamed: 0,06,1920s,1950s,1970s,1980s,250,aardman,abuse,acting,action,...,will,williams,willis,with,witty,wizards,world,writing,york,zombies
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# таблица tf_idf тегов
tags_tfidf = pd.concat([tags_movie['movieId'], tags_tfidf], axis=1)
print(f'Размерность tags_tfidf таблицы: {tags_tfidf.shape}')
tags_tfidf.head(1)

Размерность tags_tfidf таблицы: (9742, 751)


Unnamed: 0,movieId,06,1920s,1950s,1970s,1980s,250,aardman,abuse,acting,...,will,williams,willis,with,witty,wizards,world,writing,york,zombies
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Таблицы статистики

In [None]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [None]:
statistics = ['mean', 'std', 'min', 'max']
movies_stats = ratings.groupby('movieId').agg({'rating': statistics}).reset_index()
print(f'Размерность таблицы: {movies_stats.shape}')
print(f'Пропуски: {movies_stats.isnull().sum().values}')
movies_stats.columns = ['movieId', 'movie_mean_rating', 'movie_rating_std', 'movie_rating_min', 'movie_rating_max']
movies_stats.head(3)

Размерность таблицы: (9724, 5)
Пропуски: [   0    0 3446    0    0]


Unnamed: 0,movieId,movie_mean_rating,movie_rating_std,movie_rating_min,movie_rating_max
0,1,3.92093,0.834859,0.5,5.0
1,2,3.431818,0.881713,0.5,5.0
2,3,3.259615,1.054823,0.5,5.0


In [None]:
movies_stats.fillna(0, inplace=True)

In [None]:
user_stats = ratings.groupby('userId').agg({'rating': statistics}).reset_index()
print(f'Размерность таблицы: {user_stats.shape}')
print(f'Пропуски: {user_stats.isnull().sum().values}')
user_stats.columns = ['userId', 'userId_mean_rating', 'userId_rating_std', 'userId_rating_min', 'userId_rating_max']
user_stats.head(3)

Размерность таблицы: (610, 5)
Пропуски: [0 0 0 0 0]


Unnamed: 0,userId,userId_mean_rating,userId_rating_std,userId_rating_min,userId_rating_max
0,1,4.366379,0.800048,1.0,5.0
1,2,3.948276,0.805615,2.0,5.0
2,3,2.435897,2.090642,0.5,5.0


In [None]:
ratings_stats = ratings.merge(right=movies_stats, on='movieId', how='left').merge(right=user_stats, on='userId', how='left')
print(f'Размерность таблицы: {ratings_stats.shape}')
print(f'Пропуски: {ratings_stats.isnull().sum().values}')
ratings_stats.head(3)

Размерность таблицы: (100836, 12)
Пропуски: [0 0 0 0 0 0 0 0 0 0 0 0]


Unnamed: 0,userId,movieId,rating,timestamp,movie_mean_rating,movie_rating_std,movie_rating_min,movie_rating_max,userId_mean_rating,userId_rating_std,userId_rating_min,userId_rating_max
0,1,1,4.0,964982703,3.92093,0.834859,0.5,5.0,4.366379,0.800048,1.0,5.0
1,1,3,4.0,964981247,3.259615,1.054823,0.5,5.0,4.366379,0.800048,1.0,5.0
2,1,6,4.0,964982224,3.946078,0.817224,1.0,5.0,4.366379,0.800048,1.0,5.0


In [None]:
ratings_stats.fillna(0, inplace=True)
print(f'Размерность таблицы: {ratings_stats.shape}')
print(f'Пропуски: {ratings_stats.isnull().sum().values}')

Размерность таблицы: (100836, 12)
Пропуски: [0 0 0 0 0 0 0 0 0 0 0 0]


### Фильмы дупликаты с одинаковыми названиями, хотя id фильмов разное

In [None]:
movies[movies['title'].duplicated()] # Фильмы дупликаты с одинаковыми названиями, хотя id фильмов разное

Unnamed: 0,movieId,title,genres
5601,26958,Emma (1996),Romance
6932,64997,War of the Worlds (2005),Action|Sci-Fi
9106,144606,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Romance|Thriller
9135,147002,Eros (2004),Drama|Romance
9468,168358,Saturn 3 (1980),Sci-Fi|Thriller


## ИТОГОВЫЕ ТАБЛИЦЫ:

In [None]:
print(genres_tfidf.shape)
genres_tfidf.head(1)

(11853, 21)


Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,0.0,0.405293,0.513769,0.514673,0.273438,0.0,0.0,0.0,0.481791,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
print(tags_tfidf.shape)
tags_tfidf.head(1)

(9742, 751)


Unnamed: 0,movieId,06,1920s,1950s,1970s,1980s,250,aardman,abuse,acting,...,will,williams,willis,with,witty,wizards,world,writing,york,zombies
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
print(ratings_stats.shape)
ratings_stats.head(1)

(100836, 12)


Unnamed: 0,userId,movieId,rating,timestamp,movie_mean_rating,movie_rating_std,movie_rating_min,movie_rating_max,userId_mean_rating,userId_rating_std,userId_rating_min,userId_rating_max
0,1,1,4.0,964982703,3.92093,0.834859,0.5,5.0,4.366379,0.800048,1.0,5.0


In [None]:
movies_total = (ratings_stats.merge(right=genres_tfidf, on='movieId', how='left').
                      merge(right=tags_tfidf, on='movieId', how='left'))

In [None]:
print(movies_total.shape)
print(movies_total[['userId', 'movieId',	'rating']].nunique())
movies_total.head(3)

(285762, 782)
userId      610
movieId    9724
rating       10
dtype: int64


Unnamed: 0,userId,movieId,rating,timestamp,movie_mean_rating,movie_rating_std,movie_rating_min,movie_rating_max,userId_mean_rating,userId_rating_std,...,will,williams,willis,with,witty,wizards,world,writing,york,zombies
0,1,1,4.0,964982703,3.92093,0.834859,0.5,5.0,4.366379,0.800048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1,4.0,964982703,3.92093,0.834859,0.5,5.0,4.366379,0.800048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,4.0,964982703,3.92093,0.834859,0.5,5.0,4.366379,0.800048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Модель

In [None]:
from sklearn import metrics

In [None]:
from sklearn.linear_model import LogisticRegression, Ridge, Lasso

In [None]:
X_feature = movies_total.drop(labels=['rating', 'userId',	'movieId', 'timestamp'] , axis=1)
y_target = movies_total['rating']

In [None]:
model_Ridge = Ridge(random_state=1982)

In [None]:
model_Ridge.fit(X_feature, y_target)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
feature_importance = sorted(list(zip(abs(model_Ridge.coef_), model_Ridge.feature_names_in_)), key=lambda x: x[0], reverse=True)[:10]

In [None]:
def model_x_y(x, y, model=Ridge(), random_st=1982):
  """ Функция для ручного формирования датафрейма с признаками и целевой переменной """
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=random_st)
  model.fit(x_train, y_train)
  y_pred_train = model.predict(x_train)
  y_pred_test = model.predict(x_test)

  r2_train = metrics.r2_score(y_train, y_pred_train)
  r2_test = metrics.r2_score(y_test, y_pred_test)
  rmse_train = np.sqrt(metrics.mean_squared_error(y_train, y_pred_train))
  rmse_test = np.sqrt(metrics.mean_squared_error(y_test, y_pred_test))
  mae_train = metrics.mean_absolute_error(y_train, y_pred_train)
  mae_test = metrics.mean_absolute_error(y_test, y_pred_test)

  df_metrics_model=pd.DataFrame({'R2': [r2_train, r2_test], 'RMSE': [rmse_train, rmse_test], 'MAE': [mae_train, mae_test]}, index=['train', 'test'])

  return df_metrics_model

In [None]:
Ridge = model_x_y(x=X_feature, y=y_target)
Ridge

Unnamed: 0,R2,RMSE,MAE
train,0.313843,0.845784,0.635134
test,0.313746,0.84478,0.634996


In [None]:
feature_importance

[(np.float64(0.8929624189577858), 'movie_mean_rating'),
 (np.float64(0.7160414952712655), 'userId_mean_rating'),
 (np.float64(0.6436554052325922), 'symbolism'),
 (np.float64(0.6079879096673736), 'understated'),
 (np.float64(0.5384525685097689), 'psychedelic'),
 (np.float64(0.5179735866869435), 'epic'),
 (np.float64(0.4867297484010929), 'acting'),
 (np.float64(0.4344016061338799), 'awkward'),
 (np.float64(0.4109830722931717), 'touching'),
 (np.float64(0.4037581459923236), 'harsh')]

In [None]:
# Дерево решений
from sklearn.tree import DecisionTreeRegressor

In [None]:
model_x_y(X_feature, y_target, model=DecisionTreeRegressor(random_state=1982, min_samples_leaf=5, min_samples_split=10, max_depth=30))

Unnamed: 0,R2,RMSE,MAE
train,0.8821,0.350594,0.168244
test,0.70965,0.549493,0.2673


In [None]:
model_tree = DecisionTreeRegressor(random_state=1982, min_samples_leaf=5, min_samples_split=10, max_depth=30)
model_tree.fit(X_feature, y_target)

In [None]:
sorted(zip(abs(model_tree.feature_importances_), X_feature.columns), key=lambda x: x[0], reverse=True)[:10]

[(np.float64(0.2670478691949998), 'movie_mean_rating'),
 (np.float64(0.26014120652332995), 'userId_mean_rating'),
 (np.float64(0.1525398463225832), 'userId_rating_std'),
 (np.float64(0.04214490040823291), 'movie_rating_std'),
 (np.float64(0.024487995280983745), 'userId_rating_min'),
 (np.float64(0.01139388281094749), 'movie_rating_min'),
 (np.float64(0.010807937995280858), 'drama_x'),
 (np.float64(0.008766675512427023), 'action_x'),
 (np.float64(0.008737350412678544), 'adventure_x'),
 (np.float64(0.008659065686476187), 'thriller_x')]