<a href="https://colab.research.google.com/github/AnastasiaDMW/RecSys/blob/main/RecSys.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Загрузка данных

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from google.colab import drive
from torch.utils.data import Dataset, DataLoader

drive.mount('/content/drive')

movies_path = "/content/drive/My Drive/Data/movies.csv"
ratings_path = "/content/drive/My Drive/Data/ratings.csv"
tags_path = "/content/drive/My Drive/Data/tags.csv"

Mounted at /content/drive


## Реализация RecSys на основе подхода Content-based recommender system

In [None]:
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)
tags = pd.read_csv(tags_path)

movies['genres'] = movies['genres'].str.split('|')
genres_expanded = movies['genres'].explode()
genres_dummies = pd.get_dummies(genres_expanded).groupby(level=0).max()
movies_features = movies[['movieId']].join(genres_dummies)

tags_grouped = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
tfidf = TfidfVectorizer(max_features=500)
tags_tfidf = tfidf.fit_transform(tags_grouped['tag'])
tags_tfidf_df = pd.DataFrame(tags_tfidf.toarray(), index=tags_grouped['movieId'])

movies_features = movies_features.set_index('movieId').join(tags_tfidf_df, how='left').fillna(0)

def recommend_movies(user_id, movies_features, ratings, top_n=10, batch_size=1000):
    user_ratings = ratings[ratings['userId'] == user_id]
    watched_movie_ids = user_ratings['movieId'].tolist()

    if not watched_movie_ids:
        return []

    user_vector = movies_features.loc[watched_movie_ids].values
    user_vector = np.mean(user_vector, axis=0).reshape(1, -1)

    all_movie_ids = movies_features.index.tolist()
    unseen_movie_ids = [mid for mid in all_movie_ids if mid not in watched_movie_ids]

    similarities = []
    for i in range(0, len(unseen_movie_ids), batch_size):
        batch_ids = unseen_movie_ids[i:i+batch_size]
        batch_vectors = movies_features.loc[batch_ids].values
        sim = cosine_similarity(user_vector, batch_vectors)[0]
        similarities.extend(zip(batch_ids, sim))

    similarities.sort(key=lambda x: x[1], reverse=True)
    recommended_ids = [mid for mid, _ in similarities[:top_n]]

    return recommended_ids

user_id = 1
top_recommendations = recommend_movies(user_id, movies_features, ratings, top_n=5)
recommended_movies = movies[movies['movieId'].isin(top_recommendations)]
print(recommended_movies[['title']])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                      title
3608                  Stunt Man, The (1980)
4005                       Flashback (1990)
4681         The Great Train Robbery (1978)
6570              Hunting Party, The (2007)
8597  Dragonheart 2: A New Beginning (2000)


## Реализация RecSys на основе подхода Collaborative Recommender system - Item-Based

In [None]:
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)
tags = pd.read_csv(tags_path)

user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()
user_id_to_index = {uid: idx for idx, uid in enumerate(user_ids)}
movie_id_to_index = {mid: idx for idx, mid in enumerate(movie_ids)}
index_to_movie_id = {idx: mid for mid, idx in movie_id_to_index.items()}

n_users = len(user_ids)
n_movies = len(movie_ids)
data = ratings['rating'].values
row = ratings['userId'].map(user_id_to_index).values
col = ratings['movieId'].map(movie_id_to_index).values
sparse_matrix = csr_matrix((data, (row, col)), shape=(n_users, n_movies))

item_similarity = cosine_similarity(sparse_matrix.T, dense_output=False)

def predict_rating(user_index, item_index, top_k=20):
    user_ratings = sparse_matrix[user_index].toarray().flatten()
    sim_scores = item_similarity[item_index].toarray().flatten()
    rated_indices = user_ratings.nonzero()[0]
    sim_scores[rated_indices] = 0
    top_indices = np.argsort(sim_scores)[::-1][:top_k]
    if np.sum(sim_scores[top_indices]) > 0:
        return np.dot(sim_scores[top_indices], user_ratings[top_indices]) / np.sum(sim_scores[top_indices])
    else:
        return 0

def recommend_movies(user_id, n_recommendations=10):
    user_index = user_id_to_index[user_id]
    unrated_items = np.where(sparse_matrix[user_index].toarray().flatten() == 0)[0]
    predictions = [(item, predict_rating(user_index, item)) for item in unrated_items]
    predictions.sort(key=lambda x: x[1], reverse=True)
    top_items = predictions[:n_recommendations]
    recommended_movie_ids = [index_to_movie_id[i] for i, _ in top_items]
    return movies[movies['movieId'].isin(recommended_movie_ids)]

print(recommend_movies(200, 10))

     movieId                       title                          genres
2          3     Grumpier Old Men (1995)                  Comedy|Romance
5          6                 Heat (1995)           Action|Crime|Thriller
46        50  Usual Suspects, The (1995)          Crime|Mystery|Thriller
62        70  From Dusk Till Dawn (1996)   Action|Comedy|Horror|Thriller
89       101        Bottle Rocket (1996)  Adventure|Comedy|Crime|Romance
124      151              Rob Roy (1995)        Action|Drama|Romance|War
130      157       Canadian Bacon (1995)                      Comedy|War
136      163            Desperado (1995)          Action|Romance|Western
184      216        Billy Madison (1995)                          Comedy
190      223               Clerks (1994)                          Comedy


## Простые Реализации RecSys (С этим фильмом также смотрят, Популярное среди всех)

In [None]:
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)
tags = pd.read_csv(tags_path)

def also_watched(movie_id, n=5):
    users_who_watched = ratings[ratings['movieId'] == movie_id]['userId'].unique()
    co_movies = ratings[ratings['userId'].isin(users_who_watched)]
    co_movies_count = co_movies.groupby('movieId').size().sort_values(ascending=False)
    co_movies_count = co_movies_count[co_movies_count.index != movie_id]
    top_movies = co_movies_count.head(n).index.tolist()
    return movies[movies['movieId'].isin(top_movies)]

def popular_movies(n=10):
    movie_popularity = ratings.groupby('movieId').size().sort_values(ascending=False)
    top_movies = movie_popularity.head(n).index.tolist()
    return movies[movies['movieId'].isin(top_movies)]

def baseline_recommendations(user_id, movie_id=None):
    recommendations = {}
    if movie_id is not None:
        recommendations['also_watched'] = also_watched(movie_id, n=5)
    recommendations['popular'] = popular_movies(n=10)
    return recommendations

if __name__ == "__main__":
    user_id_example = 1
    movie_id_example = 1

    recs = baseline_recommendations(user_id_example, movie_id_example)
    for key, df in recs.items():
        print(f"\n{key}")
        print(df[['movieId', 'title']])


also_watched
     movieId                                      title
224      260  Star Wars: Episode IV - A New Hope (1977)
257      296                        Pulp Fiction (1994)
277      318           Shawshank Redemption, The (1994)
314      356                        Forrest Gump (1994)
418      480                       Jurassic Park (1993)

popular
      movieId                                      title
97        110                          Braveheart (1995)
224       260  Star Wars: Episode IV - A New Hope (1977)
257       296                        Pulp Fiction (1994)
277       318           Shawshank Redemption, The (1994)
314       356                        Forrest Gump (1994)
418       480                       Jurassic Park (1993)
461       527                    Schindler's List (1993)
507       589          Terminator 2: Judgment Day (1991)
510       593           Silence of the Lambs, The (1991)
1939     2571                         Matrix, The (1999)


##Two-Tower

In [None]:
movies = pd.read_csv(movies_path)
ratings = pd.read_csv(ratings_path)

user_ids = ratings['userId'].unique()
movie_ids = ratings['movieId'].unique()
user2idx = {uid: idx for idx, uid in enumerate(user_ids)}
movie2idx = {mid: idx for idx, mid in enumerate(movie_ids)}
idx2movie = {idx: mid for mid, idx in movie2idx.items()}

ratings['user_idx'] = ratings['userId'].map(user2idx)
ratings['movie_idx'] = ratings['movieId'].map(movie2idx)

class MyDataset(Dataset):
    def __init__(self, df):
        self.users = df['user_idx'].values
        self.items = df['movie_idx'].values
        self.ratings = df['rating'].values.astype(np.float32)
    def __len__(self):
        return len(self.ratings)
    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

dataloader = DataLoader(MyDataset(ratings), batch_size=512, shuffle=True)

class MyTwoTower(nn.Module):
    def __init__(self, n_users, n_items, emb_dim=32):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)
    def forward(self, user_idx, item_idx):
        u = self.user_emb(user_idx)
        v = self.item_emb(item_idx)
        return (u * v).sum(dim=1)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MyTwoTower(len(user_ids), len(movie_ids), emb_dim=32).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

for epoch in range(5):
    total_loss = 0
    for u, i, r in dataloader:
        u, i, r = u.to(device), i.to(device), r.to(device)
        optimizer.zero_grad()
        pred = model(u, i)
        loss = loss_fn(pred, r)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(r)
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(ratings):.4f}")

def recommend(user_id, top_k=10):
    user_idx = torch.tensor([user2idx[user_id]]).to(device)
    all_items = torch.tensor(range(len(movie_ids))).to(device)
    user_idx_expand = user_idx.repeat(len(movie_ids))
    model.eval()
    with torch.no_grad():
        scores = model(user_idx_expand, all_items)
    top_indices = torch.topk(scores, top_k).indices.cpu().numpy()
    recommended_ids = [idx2movie[i] for i in top_indices]
    return movies[movies['movieId'].isin(recommended_ids)][['movieId','title']]

print(recommend(10, top_k=10))

Epoch 1, Loss: 34.3595
Epoch 2, Loss: 17.9131
Epoch 3, Loss: 10.7810
Epoch 4, Loss: 5.0339
Epoch 5, Loss: 2.4106
      movieId                                              title
390       449                         Fear of a Black Hat (1994)
2251     2988                           Melvin and Howard (1980)
2926     3925                      Stranger Than Paradise (1984)
3240     4380  Princess and the Warrior, The (Krieger und die...
5812    31973  Germany Year Zero (Germania anno zero) (Deutsc...
6474    52784                                  Sharkwater (2006)
6820    61073                                   Hell Ride (2008)
7338    78039                              Blue Valentine (2010)
7463    81819                                    Biutiful (2010)
9571   174055                                     Dunkirk (2017)
