# **Нейронная коллаборативная фильтрация**



1.   Обработка текстовых данных о фильмах
*   Лемматизация, стемминг
*   TF-IDF — преобразование текстов в векторное представление

2.   Формирование данных для обучения - каждому пользователю сопоставляются просмотренные фильмы (позитивные примеры) и случайные непросмотренные (негативные примеры)


3.   Модель нейронной сети:
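*   Вход: вектор фильма и one-hot представление пользователя (в коде ниже оно реализовано через индекс пользователя и Embedding-слой; для фильма аналогично используется индекс и Embedding-слой)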
*   Архитектура: несколько полносвязных слоев с активациями ReLU и выходным слоем с сигмоидной активацией
*   Оптимизатор: Adam
*   Задача: предсказание вероятности взаимодействия пользователя с фильмом





In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import Precision, Recall
from ast import literal_eval
from nltk.stem.snowball import SnowballStemmer
import spacy
import warnings;
warnings.simplefilter('ignore')

In [None]:
# Read the four source tables from the working directory, in the same order
# as the names on the left-hand side.
data, credits, keywords, links = (
    pd.read_csv(csv_name)
    for csv_name in (
        'movies_metadata.csv',
        'credits.csv',
        'keywords.csv',
        'links_small.csv',
    )
)


In [None]:
# User ratings table; later cells read its userId, movieId and rating columns.
ratings = pd.read_csv('ratings_small.csv')

Создадим колонку "soup", как и в content-based, содержащую всю необходимую информацию о фильме.

Затем векторизуем с помощью TF-IDF.

In [None]:
# Keep only rows whose `id` is a purely numeric string so the int cast below
# cannot fail. `astype(str)` lets the check tolerate missing (NaN) or
# already-numeric ids, and `.copy()` detaches the filtered frame so the column
# assignments that follow do not write into a slice of the original frame.
data = data[data['id'].astype(str).str.isnumeric()].copy()
data['id'] = data['id'].astype(int)

# `genres` is stored as the string form of a list of dicts: parse it and keep
# only each entry's name. Missing values become an empty list.
data['genres'] = (
    data['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda parsed: [g['name'] for g in parsed] if isinstance(parsed, list) else [])
)
# Unparseable release dates become NaT, hence a missing year.
data['year'] = pd.to_datetime(data['release_date'], errors='coerce').dt.year

# Attach the credits and keywords columns by movie id (default inner joins).
data = data.merge(credits, on='id').merge(keywords, on='id')

# Reduce `cast` to at most the first five names, each squashed into a single
# lowercase token (spaces removed) so a full name counts as one term later.
# Slicing with [:5] already handles lists shorter than five.
data['cast'] = data['cast'].apply(literal_eval).apply(
    lambda people: [p['name'].replace(' ', '').lower() for p in people[:5]]
    if isinstance(people, list) else []
)
data['crew'] = data['crew'].apply(literal_eval)

def get_director(x):
    """Return the name of the first entry in `x` whose job is 'Director'.

    `x` is an iterable of dicts that carry 'job' and 'name' keys. When no
    entry has the job 'Director', NaN is returned.
    """
    director_names = (member['name'] for member in x if member['job'] == 'Director')
    return next(director_names, np.nan)

# Extract each movie's director and squash the name into one lowercase token.
# get_director returns NaN when no director is listed; casting that with
# astype('str') would produce the literal token "nan", which then lands in the
# soup of every such movie. Those rows get an empty string instead.
data['director'] = (
    data['crew']
    .apply(get_director)
    .fillna('')
    .astype('str')
    .apply(lambda name: name.replace(" ", "").lower())
)

# `keywords` is stored as the string form of a list of dicts: parse it and
# keep only each entry's name.
data['keywords'] = data['keywords'].apply(literal_eval).apply(
    lambda parsed: [k['name'] for k in parsed] if isinstance(parsed, list) else []
)

# Count how often each keyword occurs over all rows. `explode` yields one row
# per keyword (NaN for an empty list, hence the dropna) without first building
# a wide per-row frame the way apply(pd.Series) + stack does.
s = data['keywords'].explode().dropna().rename('keyword').value_counts()
# Keep only keywords that occur more than once.
s = s[s > 1]

def filter_words(x):
    """Return the items of `x` that are labels of the module-level Series `s`.

    Order is preserved. `in` on a Series tests its index labels, so an item
    is kept exactly when it is one of the retained keywords in `s`.
    """
    return [word for word in x if word in s]

# Drop every keyword that is not a label of the keyword-count Series `s`.
data['keywords'] = data['keywords'].apply(filter_words)

# spaCy pipeline used by lemmatize_words; loaded once at module level.
nlp = spacy.load("en_core_web_sm")

def lemmatize_words(x):
    """Return the lemmatised form of every keyword in `x`.

    Each keyword is run through the module-level spaCy pipeline `nlp` and
    rebuilt from the lemmas of all of its tokens, keeping the original
    spacing between them. Taking only the first token (`doc[0]`) would cut a
    multi-word keyword down to its first word and raise IndexError for an
    empty string; here an empty keyword simply maps to an empty string.

    The returned list has the same length and order as `x`.
    """
    lemmatised = []
    for keyword in x:
        doc = nlp(keyword)
        # token.whitespace_ is the whitespace that followed the token in the
        # input, so the keyword's word boundaries are preserved.
        lemmatised.append(''.join(tok.lemma_ + tok.whitespace_ for tok in doc))
    return lemmatised

# Lemmatise each keyword list, then strip spaces and lowercase every entry so
# that one keyword becomes one token.
data['keywords'] = data['keywords'].apply(
    lambda kws: [lemma.replace(" ", "").lower() for lemma in lemmatize_words(kws)]
)

def concat(x):
    """Build the "soup" string for one movie row.

    Joins, with single spaces, the lowercased string form of every keyword,
    every cast entry, the director, and every genre of `x`, in that order.
    """
    parts = [*x['keywords'], *x['cast'], x['director'], *x['genres']]
    return ' '.join(str(part).lower() for part in parts)

# Row-wise: collapse keywords, cast, director and genres into one text field.
data['soup'] = data.apply(concat, axis=1)



In [None]:
# Give every distinct user id and movie id in `ratings` a contiguous integer,
# numbered in order of first appearance.
unique_users = ratings['userId'].unique()
unique_movies = ratings['movieId'].unique()
user_to_index = dict(zip(unique_users, range(len(unique_users))))
movie_to_index = dict(zip(unique_movies, range(len(unique_movies))))

# Overwrite the raw ids in `ratings` with those integers. NOTE: the raw ids
# are only recoverable through the two dicts afterwards, and re-running this
# cell would map the already-mapped values a second time.
ratings['userId'] = ratings['userId'].map(user_to_index)
ratings['movieId'] = ratings['movieId'].map(movie_to_index)

In [None]:
# Vectorise the soup column with TF-IDF, dropping English stop words.
# data_tfidf has one row per row of `data`, in the same order.
tfidf = TfidfVectorizer(stop_words='english')
data_tfidf = tfidf.fit_transform(data['soup'])

Теперь подготовим данные для обучения.

Создадим данные для обучения: положительные и отрицательные примеры.
- положительные примеры - фильмы, просмотренные пользователем и оцененные > 3.5
- отрицательные примеры - фильмы, оцененные пользователем ≤ 3.5 (непросмотренные фильмы в коде ниже в выборку не добавляются)

In [None]:
# Binary target: 1 when the rating is strictly above 3.5, otherwise 0.
ratings['label'] = (ratings['rating'] > 3.5).astype(int)
positive = ratings[ratings['label'] == 1]
negative = ratings[ratings['label'] == 0]

# Number of positives per user, computed once. Filtering `positive` inside the
# per-group callback would rescan the whole frame for every user.
positives_per_user = positive['userId'].value_counts()

# For each user keep at most as many negatives as that user has positives
# (none for a user without positives); the fixed seed makes the draw repeatable.
negative_sampled = negative.groupby('userId').apply(
    lambda group: group.sample(
        min(len(group), int(positives_per_user.get(group.name, 0))),
        random_state=42,
    )
).reset_index(drop=True)
dataset = pd.concat([positive, negative_sampled])

In [None]:
# Hold out 20% of the labelled interactions for testing (fixed seed).
train, test = train_test_split(dataset, test_size=0.2, random_state=42)

# Model inputs for the training part: user index, movie index, label, plus the
# dense TF-IDF row selected by the movie index.
# NOTE(review): data_tfidf rows follow the row order of `data`, while movieId
# here is the index built from `ratings` - confirm the two orderings are meant
# to line up.
train_user, train_movie, train_labels = (
    train[column].values for column in ('userId', 'movieId', 'label')
)
train_soup = data_tfidf[train_movie].toarray()

# Same four arrays for the held-out part.
test_user, test_movie, test_labels = (
    test[column].values for column in ('userId', 'movieId', 'label')
)
test_soup = data_tfidf[test_movie].toarray()

In [None]:
def create_hybrid_ncf(n_users, n_movies, soup_dim, embedding_dim=50):
    """Build and compile the hybrid network.

    The model takes three inputs, in this order: a user index of shape (1,),
    a movie index of shape (1,), and a feature vector of shape (soup_dim,).
    Both indices go through their own Embedding layer of width
    `embedding_dim` and are flattened; the two embeddings and the feature
    vector are concatenated and passed through Dense(128, relu),
    Dropout(0.2), Dense(64, relu) and a single sigmoid output unit.

    Args:
        n_users: size of the user embedding table.
        n_movies: size of the movie embedding table.
        soup_dim: length of the per-movie feature vector.
        embedding_dim: width of both embeddings.

    Returns:
        The model, compiled with Adam (learning rate 0.001), binary
        cross-entropy loss and the accuracy metric.
    """
    user_input = Input(shape=(1,))
    movie_input = Input(shape=(1,))
    soup_input = Input(shape=(soup_dim,))

    user_vector = Flatten()(
        Embedding(input_dim=n_users, output_dim=embedding_dim)(user_input)
    )
    movie_vector = Flatten()(
        Embedding(input_dim=n_movies, output_dim=embedding_dim)(movie_input)
    )

    hidden = Concatenate()([user_vector, movie_vector, soup_input])
    hidden_stack = (
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
    )
    for layer in hidden_stack:
        hidden = layer(hidden)
    probability = Dense(1, activation='sigmoid')(hidden)

    network = Model(inputs=[user_input, movie_input, soup_input], outputs=probability)
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# One embedding row per known user / movie; the feature width is the TF-IDF
# vocabulary size.
model = create_hybrid_ncf(len(user_to_index), len(movie_to_index), soup_dim=data_tfidf.shape[1])

# In the recorded run below, val_loss was lowest at epoch 2 (0.5908) and had
# risen to 0.6983 by epoch 6 while the training loss kept falling, i.e. the
# later epochs only overfit. Stop once val_loss has not improved for two
# epochs and keep the weights of the best epoch rather than the last one.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# validation_split holds out the last 10% of the training arrays for validation.
model.fit(
    [train_user, train_movie, train_soup],
    train_labels,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[early_stop],
)


Epoch 1/10
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 16ms/step - accuracy: 0.6360 - loss: 0.6348 - val_accuracy: 0.6822 - val_loss: 0.5932
Epoch 2/10
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7234 - loss: 0.5487 - val_accuracy: 0.6860 - val_loss: 0.5908
Epoch 3/10
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7438 - loss: 0.5121 - val_accuracy: 0.6888 - val_loss: 0.6049
Epoch 4/10
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 9ms/step - accuracy: 0.7577 - loss: 0.4852 - val_accuracy: 0.6856 - val_loss: 0.6141
Epoch 5/10
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7720 - loss: 0.4576 - val_accuracy: 0.6812 - val_loss: 0.6559
Epoch 6/10
[1m954/954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7864 - loss: 0.4337 - val_accuracy: 0.6776 - val_loss: 0.6983
Epoch 7/10
[1m954/954[0m

<keras.src.callbacks.history.History at 0x7dbfa688b1c0>

In [None]:
# Score the model on the held-out split; evaluate returns the loss followed by
# the compiled metric (accuracy).
loss, accuracy = model.evaluate([test_user, test_movie, test_soup], test_labels)
print(f"Точность модели: {accuracy:.4f}")

[1m530/530[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6663 - loss: 0.8647
Точность модели: 0.6597


In [None]:
def recommend_movies(user_id, top_n=10):
    """Recommend up to `top_n` movies the user has not rated, best first.

    Every movie index the user has no row for in `ratings` is scored with
    `model`, and candidates are walked from the highest score down until
    `top_n` movies with a metadata row have been collected.

    The ids in `ratings` and the `id` column of `data` come from different
    tables, so each candidate is translated through `links` before the
    metadata lookup. Comparing the ratings ids with `data['id']` directly
    matches unrelated movies.
    NOTE(review): assumes `links` has `movieId` and `tmdbId` columns and that
    `data['id']` holds the tmdbId - confirm against the CSV headers.

    Args:
        user_id: original (pre-mapping) user id, a key of `user_to_index`.
        top_n: maximum number of recommendations to return.

    Returns:
        DataFrame with columns `title` and `genres`, ordered from the highest
        predicted score down. It can have fewer than `top_n` rows when not
        enough candidates have a metadata row.

    Raises:
        KeyError: if `user_id` is not in `user_to_index`.
    """
    columns = ['title', 'genres']
    user_idx = user_to_index[user_id]

    all_movie_indices = np.array(list(movie_to_index.values()))
    rated_movies = ratings[ratings['userId'] == user_idx]['movieId'].values
    candidate_movies = np.setdiff1d(all_movie_indices, rated_movies)

    # Nothing to rank: return an empty frame instead of predicting on empty arrays.
    if top_n <= 0 or len(candidate_movies) == 0:
        return data.iloc[0:0][columns]

    user_array = np.full(len(candidate_movies), user_idx)
    # Feature rows are selected the same way as for the training arrays.
    # NOTE(review): data_tfidf rows follow the row order of `data`, not the
    # movie index - confirm this is intended before changing it here only.
    soup_array = data_tfidf[candidate_movies].toarray()
    scores = model.predict([user_array, candidate_movies, soup_array]).flatten()

    # Lookup tables built once: index -> ratings movie id -> metadata id.
    index_to_movie = {idx: movie for movie, idx in movie_to_index.items()}
    known_links = links.dropna(subset=['tmdbId'])
    movie_to_tmdb = dict(zip(known_links['movieId'], known_links['tmdbId'].astype(int)))
    metadata = data.drop_duplicates(subset='id').set_index('id')

    picked = []
    seen = set()
    for position in np.argsort(scores)[::-1]:
        tmdb_id = movie_to_tmdb.get(index_to_movie[candidate_movies[position]])
        if tmdb_id is None or tmdb_id in seen or tmdb_id not in metadata.index:
            continue
        seen.add(tmdb_id)
        picked.append(tmdb_id)
        if len(picked) == top_n:
            break

    # .loc with the ordered id list keeps the ranking order in the result.
    return metadata.loc[picked, columns].reset_index(drop=True)

In [None]:
# Recommendations for the user whose original (pre-mapping) userId is 1.
recommend_movies(1, top_n=10)

[1m283/283[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step


Unnamed: 0,title,genres
334,While You Were Sleeping,"[Comedy, Drama, Romance]"
2379,Dry Cleaning,[Drama]
2545,Frankenstein,"[Drama, Horror, Science Fiction]"
4864,Amélie,"[Comedy, Romance]"
8494,Letter from an Unknown Woman,[Drama]
18372,"Don't Worry, I'm Fine",[Drama]
