In [1]:
pip install --upgrade pip notebook scikit-learn pandas numpy matplotlib seaborn pickle ast

Defaulting to user installation because normal site-packages is not writeable
Collecting notebook
  Downloading notebook-7.4.5-py3-none-any.whl.metadata (10 kB)
Collecting pandas
  Using cached pandas-2.3.2-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting numpy
  Using cached numpy-2.3.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting matplotlib
  Using cached matplotlib-3.10.6-cp313-cp313-win_amd64.whl.metadata (11 kB)
Note: you may need to restart the kernel to use updated packages.


ERROR: Ignored the following versions that require a different python version: 1.21.2 Requires-Python >=3.7,<3.11; 1.21.3 Requires-Python >=3.7,<3.11; 1.21.4 Requires-Python >=3.7,<3.11; 1.21.5 Requires-Python >=3.7,<3.11; 1.21.6 Requires-Python >=3.7,<3.11; 1.26.0 Requires-Python >=3.9,<3.13; 1.26.1 Requires-Python >=3.9,<3.13
ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from ast import literal_eval
%matplotlib inline

**Загрузка данных**

In [3]:
movies = pd.read_csv('movies_metadata.csv', low_memory=False)
ratings = pd.read_csv('ratings_small.csv', low_memory=False)
keywords = pd.read_csv('keywords.csv', low_memory=False)
links = pd.read_csv('links_small.csv', low_memory=False)

Удаление пропусков в фильмах

In [4]:
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
movies = movies.dropna(subset=['id'])
movies['id'] = movies['id'].astype('int')

Объединение датасетов фильмов и рейтингов

In [5]:
ratings = ratings.merge(links[['movieId', 'tmdbId']], left_on='movieId', right_on='movieId', how='inner')
ratings['movieId'] = pd.to_numeric(ratings['tmdbId'], errors='coerce')  # Align with movies 'id'
ratings = ratings.merge(movies[['id', 'original_title', 'genres', 'overview', 'vote_average', 'vote_count']], left_on='movieId', right_on='id', how='inner')

Получаем жанры

In [6]:
def get_genres(x):
    try:
        return ' '.join([d['name'] for d in literal_eval(x)]) if pd.notnull(x) else ''
    except:
        return ''

movies['genres_str'] = movies['genres'].apply(get_genres)

Получаем ключевые слова

In [7]:
def get_keywords(x):
    try:
        return ' '.join([d['name'] for d in literal_eval(x)]) if pd.notnull(x) else ''
    except:
        return ''

keywords['keywords_str'] = keywords['keywords'].apply(get_keywords)
movies = movies.merge(keywords[['id', 'keywords_str']], on='id', how='left')

Создаем текст содержащий описание, жанры и ключевые слова

In [8]:
movies['soup'] = movies['overview'].fillna('') + ' ' + movies['genres_str'] + ' ' + movies['keywords_str'].fillna('')

Создаем векторайзер

In [9]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['soup'])
cosine_sim_content = cosine_similarity(tfidf_matrix, tfidf_matrix)

Создаем NearestNeighbors модель

In [10]:
nn_model = NearestNeighbors(n_neighbors=11, metric='cosine', algorithm='brute')
nn_model.fit(tfidf_matrix)

0,1,2
,n_neighbors,11
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,


Функция взвешенного рейтинга (в стиле imdb)

In [11]:
def weighted_rating(x, m=movies['vote_count'].quantile(0.8), C=movies['vote_average'].mean()):
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + m) * R) + (m / (m + v) * C)

movies['weighted_rating'] = movies.apply(weighted_rating, axis=1)

10 Самых популярных

In [12]:
top10_popular = movies.sort_values('weighted_rating', ascending=False)['original_title'].head(10).tolist()

Рекомендации по жанру

In [13]:
def recommend_by_genre(genre: str, n: int = 10):
    genre_movies = movies[movies['genres_str'].str.contains(genre, case=False, na=False)]
    genre_movies = genre_movies.sort_values('weighted_rating', ascending=False)
    return genre_movies['original_title'].head(n).tolist()

Рекомендации по названию

In [14]:
title_to_index = pd.Series(movies.index, index=movies['original_title'])
def recommend_by_title(title: str, n: int = 10):
    if title not in title_to_index:
        return []
    idx = title_to_index[title]
    # Get the TF-IDF vector for the movie (already in CSR format)
    movie_vector = tfidf_matrix[idx]
    # Find n+1 nearest neighbors (including the movie itself)
    distances, indices = nn_model.kneighbors(movie_vector, n_neighbors=n+1)
    # Exclude the movie itself (first result)
    movie_indices = indices[0][1:]
    return movies['original_title'].iloc[movie_indices].tolist()

Коллаборативная фильтрация

In [15]:
user_ratings = ratings.pivot_table(index='userId', columns='movieId', values='rating').fillna(0)
item_sim = cosine_similarity(user_ratings.T)
item_sim_df = pd.DataFrame(item_sim, index=user_ratings.columns, columns=user_ratings.columns)

def recommend_collaborative(movie_title: str, n: int = 10):
    if movie_title not in title_to_index:
        return []
    movie_id = movies[movies['original_title'] == movie_title]['id'].values
    if len(movie_id) == 0:
        return []
    movie_id = int(movie_id[0])
    if movie_id not in item_sim_df.columns:
        return []
    similar_movies = item_sim_df.loc[movie_id].sort_values(ascending=False).iloc[1:n+1].index
    similar_titles = movies[movies['id'].isin(similar_movies)]['original_title'].head(n).tolist()
    return similar_titles

In [16]:
print("Топ 10 популярных:", top10_popular)
print("Топ в жанре комедии':", recommend_by_genre('Comedy'))
print("Похожие на 'История игрушек':", recommend_by_title('Toy Story'))
print("Рекомендации по 'История игрушек':", recommend_collaborative('Toy Story'))

Топ 10 популярных: ['Dilwale Dulhania Le Jayenge', 'The Shawshank Redemption', 'The Godfather', '君の名は。', 'The Dark Knight', 'Fight Club', 'Pulp Fiction', "Schindler's List", 'Whiplash', '千と千尋の神隠し']
Топ в жанре комедии': ['Dilwale Dulhania Le Jayenge', 'La vita è bella', 'Forrest Gump', 'Intouchables', 'Back to the Future', 'The Grand Budapest Hotel', 'Modern Times', 'The Great Dictator', 'City Lights', 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb']
Похожие на 'История игрушек': ['Toy Story 3', 'Toy Story 2', 'Small Soldiers', 'Small Fry', 'Silent Night, Deadly Night 5: The Toy Maker', '玩具修理者', 'The 40 Year Old Virgin', 'Dolls', 'Toys', "Child's Play 3"]
Рекомендации по 'История игрушек': ['Star Wars', 'Pulp Fiction', 'Forrest Gump', 'Jurassic Park', 'Independence Day', 'Return of the Jedi', 'Groundhog Day', 'Back to the Future', 'Toy Story 2', 'Shrek']


In [17]:
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

with open('nn_model.pkl', 'wb') as f:
    pickle.dump(nn_model, f)

print("Model and vectorizer saved successfully!")

Model and vectorizer saved successfully!
