<a href="https://colab.research.google.com/github/FilippMaksimov/Net.RecSystems/blob/main/Maksimov_F_Rec_Sys02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install surprise



In [63]:
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd
import numpy as np

Гибридные рекомендательные системы

In [64]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [65]:
movies_ratings = ratings.merge(movies, on='movieId', how='left')
dataset = pd.DataFrame({
    'uid': movies_ratings.userId,
    'iid': movies_ratings.title,
    'rating': movies_ratings.rating
})

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

trainset, testset = train_test_split(data, test_size=0.2, random_state=1)
cf_model = SVD(n_factors=60, n_epochs=20, random_state=1)
cf_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a5c77de7910>

In [66]:
test_pred = cf_model.test(testset)
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8732


0.8731895239673345

In [67]:
y_pred_cf = [cf_model.predict(uid, iid).est for (uid, iid, _) in testset]

In [68]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

movies['genres'] = movies['genres'].fillna('')  # Заменяем NaN на пустую строку
# Применяем TF-IDF к жанрам
tfidf_genres = TfidfVectorizer(tokenizer=lambda x: x.split('|'))
genres_tfidf_matrix = tfidf_genres.fit_transform(movies['genres'])
# Преобразуем TF-IDF матрицу в DataFrame
genres_tfidf_df = pd.DataFrame(genres_tfidf_matrix.toarray(), columns=tfidf_genres.get_feature_names_out())

# Обработка тегов

tags['tag'] = tags['tag'].fillna('').astype(str)  # Заменяем NaN на пустую строку
tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
# Применяем TF-IDF к тегам
tfidf_tags = TfidfVectorizer()
tags_tfidf_matrix = tfidf_tags.fit_transform(tags_grouped['tag'])
# Преобразуем TF-IDF матрицу в DataFrame
tags_tfidf_df = pd.DataFrame(tags_tfidf_matrix.toarray(), columns=tfidf_tags.get_feature_names_out())
tags_tfidf_df['movieId'] = tags_grouped['movieId']

#Агрегация статистик по пользователям
user_stats = ratings.groupby('userId')['rating'].agg(['mean', 'median', 'var', 'count']).reset_index()
user_stats.columns = ['userId', 'user_mean_rating', 'user_median_rating', 'user_rating_variance', 'user_rating_count']

#Агрегация статистик по фильмам
movie_stats = ratings.groupby('movieId')['rating'].agg(['mean', 'median', 'var', 'count']).reset_index()
movie_stats.columns = ['movieId', 'movie_mean_rating', 'movie_median_rating', 'movie_rating_variance', 'movie_rating_count']

# 1. Добавляем TF-IDF жанров к таблице movies
movies = movies.join(genres_tfidf_df)

#Объединяем данные: рейтинги, статистики, TF-IDF
ratings_with_features = ratings.merge(user_stats, on='userId', how='left')
ratings_with_features = ratings_with_features.merge(movie_stats, on='movieId', how='left')
ratings_with_features = ratings_with_features.merge(movies, on='movieId', how='left')
ratings_with_features = ratings_with_features.merge(tags_tfidf_df, on='movieId', how='left').fillna(0)

#Подготовка признаков и целевой переменной
exclude_columns = ['userId', 'movieId', 'rating', 'timestamp', 'title', 'genres']
X = ratings_with_features.drop(columns=exclude_columns)
y = ratings_with_features['rating']


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Контентная модель (CB)
content_model = Ridge(alpha=1.0)
content_model.fit(X_train, y_train)
y_pred_content = content_model.predict(X_test)

# --- Гибридная модель ---
# Объединяем предсказания CB и CF через взвешенную сумму
alpha = 0.3  # Вес для коллаборативной фильтрации
y_pred_hybrid = alpha * np.array(y_pred_cf) + (1 - alpha) * np.array(y_pred_content[:len(y_pred_cf)])

# --- Оценка модели ---
rmse_hybrid = np.sqrt(mean_squared_error(y_test[:len(y_pred_cf)], y_pred_hybrid))
print(f"RMSE для гибридной модели: {rmse_hybrid}")



RMSE для гибридной модели: 0.8578311295768493
