In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.metrics import mean_absolute_error, root_mean_squared_error
from sklearn.neighbors import NearestNeighbors
import numpy as np

## 1. Загрузка данных

In [2]:
df = pd.read_csv('imdb_top_1000.csv')

## 2. Предобработка
### преобразование числовых полей

In [3]:
df.dtypes

Poster_Link       object
Series_Title      object
Released_Year     object
Certificate       object
Runtime           object
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross             object
dtype: object

In [4]:
df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce').fillna(0).astype(int)
df['No_of_Votes'] = (df['No_of_Votes'].astype(str).str.replace(',', '', regex=False).replace('nan', '0').astype(int))
df['Gross'] = (df['Gross'].astype(str).str.replace(',', '', regex=False).replace('nan', np.nan))
df['Gross'] = pd.to_numeric(df['Gross'], errors='coerce').fillna(0.0)
df['Meta_score'] = df['Meta_score'].fillna(df['Meta_score'].median())
df['IMDB_Rating'] = df['IMDB_Rating'].fillna(df['IMDB_Rating'].mean())
if df['Runtime'].dtype == object:
    df['Runtime'] = df['Runtime'].str.extract(r'(\d+)').fillna(0).astype(int)

In [5]:
df.dtypes

Poster_Link       object
Series_Title      object
Released_Year      int64
Certificate       object
Runtime            int64
Genre             object
IMDB_Rating      float64
Overview          object
Meta_score       float64
Director          object
Star1             object
Star2             object
Star3             object
Star4             object
No_of_Votes        int64
Gross            float64
dtype: object

In [6]:
df.describe()

Unnamed: 0,Released_Year,Runtime,IMDB_Rating,Meta_score,No_of_Votes,Gross
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,1989.226,122.891,7.9493,78.133,273692.9,56536880.0
std,67.135341,28.093671,0.275491,11.368225,327372.7,103238200.0
min,0.0,45.0,7.6,28.0,25088.0,0.0
25%,1976.0,103.0,7.7,72.0,55526.25,445709.8
50%,1999.0,119.0,7.9,79.0,138548.5,10702750.0
75%,2009.0,137.0,8.1,85.25,374161.2,61539890.0
max,2020.0,321.0,9.3,100.0,2343110.0,936662200.0


## 3. Кодировка жанров и актеров
### Жанры

In [7]:
df['Genre_List'] = df['Genre'].fillna('').str.split(',\s*')
mlb_genre = MultiLabelBinarizer()
genre_encoded = mlb_genre.fit_transform(df['Genre_List'])

  df['Genre_List'] = df['Genre'].fillna('').str.split(',\s*')


### Актеры

In [8]:
df['Stars_List'] = df[['Star1','Star2','Star3','Star4']].fillna('').values.tolist()
mlb_stars = MultiLabelBinarizer()
stars_encoded = mlb_stars.fit_transform(df['Stars_List'])

### Режиссеры

In [9]:
director_means = df.groupby('Director')['IMDB_Rating'].mean().to_dict()
global_mean = df['IMDB_Rating'].mean()
df['Director_Score'] = df['Director'].map(director_means).fillna(global_mean)

## 4. Создание признаков

In [10]:
X = np.hstack([df[['Released_Year', 'Runtime', 'Meta_score', 'No_of_Votes', 'Gross', 'Director_Score']].values, genre_encoded, stars_encoded])
y = df['IMDB_Rating'].values

## 5. Масштабирование числовых признаков

In [11]:
scaler = StandardScaler()
X[:, :6] = scaler.fit_transform(X[:, :6])

## 6. Разбиение на train/test

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 7. Обучение модели

In [13]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


## 8. Оценка

In [14]:
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
rmse = root_mean_squared_error(y_test, predictions)

print(f'MAE на тестовой выборке: {mae:.3f}')
print(f'RMSE на тестовой выборке: {rmse:.3f}')

MAE на тестовой выборке: 0.111
RMSE на тестовой выборке: 0.163


## 9. Функция предсказания оценки

In [15]:
def predict_rating(
    director: str,
    genres: list[str],
    stars: list[str],
    released_year: int | None = None,
    runtime: int | None = None,
    meta_score: int | None = None,
    votes: int | None = None,
    gross: int | None = None,
):
    median_year = int(df["Released_Year"].median())
    median_runtime = int(df["Runtime"].median())
    median_meta = float(df["Meta_score"].median())
    median_votes = int(df["No_of_Votes"].median())
    median_gross = float(df["Gross"].median())

    if not director:
        raise ValueError("Аргумент 'director' обязателен")
    if not genres:
        raise ValueError("Нужно указать хотя бы один жанр")
    if not stars:
        raise ValueError("Нужно указать хотя бы одного актёра")

    released_year = released_year or median_year
    runtime = runtime or median_runtime
    meta_score = meta_score or median_meta
    votes = votes or median_votes
    gross = gross or median_gross

    director_score = director_means.get(director, global_mean)

    # Параметры
    genre_vec = mlb_genre.transform([genres])
    stars_vec = mlb_stars.transform([stars])

    num = np.array([[released_year, runtime, meta_score, votes, gross, director_score]])
    num = scaler.transform(num)
    features = np.hstack([num, genre_vec, stars_vec])
    return model.predict(features)[0]

## 10. Рекомендации (content-based)

In [16]:
nn = NearestNeighbors(n_neighbors=6, metric='euclidean')
nn.fit(X)

0,1,2
,n_neighbors,6
,radius,1.0
,algorithm,'auto'
,leaf_size,30
,metric,'euclidean'
,p,2
,metric_params,
,n_jobs,


In [17]:
def recommend(liked_titles: list[str], n_recs: int = 5) -> list[str]:
    # Переводим список любимых в множество для О(1)-проверки
    liked_set = set(liked_titles)

    # Ищем индексы тех фильмов, что пользователь уже любит
    idxs = df.index[df["Series_Title"].isin(liked_titles)].tolist()
    if not idxs:
        return []

    # Для каждого любимого находим ближайших соседей
    neighbors = nn.kneighbors(X[idxs], return_distance=False)
    recs = []

    # Проходим по всем найденным соседям
    for row in neighbors:
        for i in row[1:]:  # пропускаем сам фильм (0-й элемент)
            title = df.loc[i, "Series_Title"]
            # если уже есть в liked — пропускаем
            if title in liked_set:
                continue
            # иначе добавляем рекомендацию
            recs.append(title)
            if len(recs) >= n_recs:
                return recs

    return recs

### пример использования

In [18]:
print("Пример предсказания рейтинга:")
print(
    predict_rating(
        director="David Fincher",
        genres=["Crime", "Drama", "Mystery", "Thriller"],
        stars=["Brad Pitt", "Morgan Freeman", "Kevin Spacey"],
        meta_score=65,
        runtime=127,
        released_year=1995,
        votes=2100,
        gross=101_040_643,
    )
)

Пример предсказания рейтинга:
7.8


In [19]:
print('Пример рекомендаций:')
print(recommend(['Gone Girl', 'The Godfather', 'Inception', 'The Lord of the Rings: The Fellowship of the Ring', 'The Matrix', "Arrival"], n_recs=5))

Пример рекомендаций:
['The Godfather: Part II', 'Pulp Fiction', 'The Departed', 'Django Unchained', 'Batman Begins']
