In [22]:
import pandas as pd
from annoy import AnnoyIndex
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the ratings and the movie metadata (low_memory=False is passed for the metadata file only)
ratings = pd.read_csv('ratings_small.csv', encoding='latin1')
movies_metadata = pd.read_csv('movies_metadata.csv', encoding='latin1', low_memory=False)


# Build the user-item matrix: one row per userId, one column per movieId,
# cell values are ratings (pivot leaves NaN where a user has no rating for a movie)
user_item_matrix = ratings.pivot(index='userId', columns='movieId', values='rating')

# Compute item-item similarity
from sklearn.metrics.pairwise import cosine_similarity

# Missing ratings are replaced by 0 and the matrix is transposed so that each
# row handed to cosine_similarity is one movie's vector of user ratings
item_similarity = cosine_similarity(user_item_matrix.fillna(0).T)
# Re-label the raw similarity array with movieId on both axes
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)


In [23]:
def item_based_recommendations(user_id, user_item_matrix, item_similarity_df, num_recommendations=5):
    """Recommend unrated items by their summed similarity to the items a user rated.

    The rating values themselves are not used as weights; every rated item
    contributes its similarity column with weight 1.

    Args:
        user_id: Row label of the user in ``user_item_matrix``.
        user_item_matrix: DataFrame with one row per user and one column per
            item; NaN marks an item the user has not rated.
        item_similarity_df: Item-by-item similarity DataFrame whose index and
            columns both contain every item the user has rated.
        num_recommendations: Maximum number of item ids to return.

    Returns:
        A list of item ids ordered by descending summed similarity, at most
        ``num_recommendations`` long. The list is empty when the user has no
        ratings or has already rated every item.

    Raises:
        KeyError: If ``user_id`` is not a row of ``user_item_matrix`` or a
            rated item is missing from ``item_similarity_df``.
    """
    rated_items = user_item_matrix.loc[user_id].dropna().index
    if rated_items.empty:
        # Nothing to compare against, so there is nothing to recommend.
        return []

    # Take the similarity columns of the rated items, remove the rows of the
    # rated items themselves, and add up each remaining row in one pass
    # (replaces growing a Series with pd.concat inside a Python loop).
    candidate_scores = (
        item_similarity_df.loc[:, rated_items]
        .drop(index=rated_items)
        .sum(axis=1)
    )

    ranked = candidate_scores.sort_values(ascending=False)
    return ranked.head(num_recommendations).index.tolist()


# Generate item-based recommendations for user 1 and print the resulting list of movie ids
user_id = 1
recommendations = item_based_recommendations(user_id, user_item_matrix, item_similarity_df)
print(f"Recommendations for user {user_id}: {recommendations}")


Recommendations for user 1: [1387, 1266, 1214, 3108, 2194]


In [None]:

# Content-based recommendation: index construction
# Prepare the text data: TF-IDF over the movie overviews, with missing overviews
# replaced by the empty string (stored in a new 'description' column)
tfidf = TfidfVectorizer(stop_words='english')
movies_metadata['description'] = movies_metadata['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(movies_metadata['description'])

# Build an approximate nearest-neighbour index with Annoy.
# f is the number of TF-IDF columns, i.e. the length of every vector added below.
f = tfidf_matrix.shape[1]
annoy_index = AnnoyIndex(f, 'angular')

# Item ids in the index are the row positions 0..n-1 of tfidf_matrix.
# NOTE(review): every sparse row is converted to a dense array of length f before
# insertion; with a large vocabulary this may be very slow / memory-hungry —
# confirm this cell completes on the full dataset or reduce dimensionality first.
for i in range(tfidf_matrix.shape[0]):
    annoy_index.add_item(i, tfidf_matrix[i].toarray()[0])

annoy_index.build(10)  # 10 trees

def content_based_recommendations(movie_id, annoy_index, movies_metadata, num_recommendations=5):
    """Return the metadata rows of the movies nearest to ``movie_id`` in the Annoy index.

    Args:
        movie_id: Identifier to look up in ``movies_metadata['id']``; compared
            as a string, so ints and strings are both accepted.
        annoy_index: Object exposing ``get_nns_by_item(item, n)``. Its item ids
            are assumed to be row positions of ``movies_metadata`` (that is how
            the index is filled in this notebook).
        movies_metadata: DataFrame with an ``'id'`` column.
        num_recommendations: Maximum number of neighbours to return.

    Returns:
        The rows of ``movies_metadata`` for the neighbours, nearest first, never
        including the query movie itself. An empty DataFrame (after printing a
        message) when ``movie_id`` is not present.
    """
    movie_id = str(movie_id)

    # Compare on the string form so the lookup works whatever dtype the 'id'
    # column was parsed as, and work with row *positions*: both the Annoy item
    # ids and the .iloc call below are positional, not label-based.
    matches = (movies_metadata['id'].astype(str) == movie_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        print(f"Movie ID {movie_id} not found in metadata")
        return pd.DataFrame()

    position = int(matches[0])

    # Ask for one extra neighbour because the query item is normally part of the
    # result, then drop it explicitly: with an approximate index (or duplicate
    # vectors) it is not guaranteed to come back in first place.
    neighbours = annoy_index.get_nns_by_item(position, num_recommendations + 1)
    neighbours = [item for item in neighbours if item != position][:num_recommendations]
    return movies_metadata.iloc[neighbours]

# Generate content-based recommendations for movie id 1.
# An empty DataFrame comes back when the id is not found in the metadata,
# in which case the fallback message is printed instead.
movie_id = 1
recommendations = content_based_recommendations(movie_id, annoy_index, movies_metadata)
if not recommendations.empty:
    print(f"Recommendations for movie {movie_id}: {recommendations[['title', 'id']]}")
else:
    print(f"No recommendations available for movie ID {movie_id}")

In [None]:
# Hybrid recommendation function
def hybrid_recommendations(user_id, movie_id, user_item_matrix, item_similarity_df, annoy_index, movies_metadata, cf_weight=0.5, cb_weight=0.5, num_recommendations=5):
    """Blend item-based collaborative filtering with content-based neighbours.

    Each collaborative-filtering candidate receives ``cf_weight`` and each
    content-based candidate receives ``cb_weight``; a movie proposed by both
    sources gets the sum of the two weights.

    Args:
        user_id: User passed to ``item_based_recommendations``.
        movie_id: Seed movie passed to ``content_based_recommendations``.
        user_item_matrix: User-item rating matrix.
        item_similarity_df: Item-by-item similarity DataFrame.
        annoy_index: Nearest-neighbour index over the movie descriptions.
        movies_metadata: Movie metadata with an ``'id'`` column.
        cf_weight: Score given to every collaborative-filtering candidate.
        cb_weight: Score given to every content-based candidate.
        num_recommendations: Number of ids to return (default 5, as before).
            Each source contributes twice this many candidates.

    Returns:
        A list of movie ids ordered by descending combined score. When the
        content-based source yields nothing, the first ``num_recommendations``
        collaborative-filtering ids are returned unchanged.
    """
    candidate_pool = 2 * num_recommendations
    cf_recs = item_based_recommendations(user_id, user_item_matrix, item_similarity_df, num_recommendations=candidate_pool)
    cb_recs = content_based_recommendations(movie_id, annoy_index, movies_metadata, num_recommendations=candidate_pool)

    if cb_recs.empty:
        return cf_recs[:num_recommendations]

    # The metadata 'id' values may be strings while the collaborative-filtering
    # ids are the (integer) column labels of the rating matrix. Without a
    # common type the same movie shows up under two different group keys and
    # the weights are never added together. Unparseable ids are discarded.
    # NOTE(review): this assumes ratings' movieId and metadata 'id' identify the
    # same movies — confirm; some datasets need a separate link table for that.
    cb_ids = pd.to_numeric(cb_recs['id'], errors='coerce').dropna().astype('int64').to_numpy()

    cf_recs_df = pd.DataFrame({'movieId': cf_recs, 'score': [cf_weight] * len(cf_recs)})
    cb_recs_df = pd.DataFrame({'movieId': cb_ids, 'score': [cb_weight] * len(cb_ids)})

    combined = pd.concat([cf_recs_df, cb_recs_df]).groupby('movieId').sum()
    hybrid_recs = combined.sort_values(by='score', ascending=False)
    return hybrid_recs.head(num_recommendations).index.tolist()

# Generate hybrid recommendations for one user (collaborative part) and one
# seed movie (content-based part), using the default 0.5 / 0.5 weights
user_id = 1
movie_id = 1
hybrid_recs = hybrid_recommendations(user_id, movie_id, user_item_matrix, item_similarity_df, annoy_index, movies_metadata)
print(f"Hybrid Recommendations for user {user_id}: {hybrid_recs}")

In [16]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# Load the data
ratings = pd.read_csv('ratings_small.csv')
movies_metadata = pd.read_csv('movies_metadata.csv', encoding='latin1', low_memory=False)

# Pre-processing
# Keep only the three columns the model needs. The explicit copy makes the
# column assignments below act on an independent frame instead of a column
# subset of the frame that was just read (avoids pandas' chained-assignment warning).
ratings = ratings[['userId', 'movieId', 'rating']].copy()

# Map raw userId / movieId values to contiguous 0-based indices, in order of
# first appearance, so they can be used as embedding row numbers
user_id_mapping = {raw_id: index for index, raw_id in enumerate(ratings['userId'].unique())}
movie_id_mapping = {raw_id: index for index, raw_id in enumerate(ratings['movieId'].unique())}

# Replace the raw ids with their new indices
ratings['userId'] = ratings['userId'].map(user_id_mapping)
ratings['movieId'] = ratings['movieId'].map(movie_id_mapping)

# Convert to a single float32 tensor with columns (user index, movie index, rating);
# the index columns are cast back to integers when batches are consumed
ratings_tensor = torch.tensor(ratings.values, dtype=torch.float32)

# Train / test split (80 % / 20 %, fixed seed)
train_tensor, test_tensor = train_test_split(ratings_tensor, test_size=0.2, random_state=42)

# Sanity-check the remapped data
print(ratings.head())

   userId  movieId  rating
0       0        0     2.5
1       0        1     3.0
2       0        2     3.0
3       0        3     2.0
4       0        4     4.0


In [18]:
# Dataset wrapper around the ratings container
class RatingsDataset(Dataset):
    """Expose a two-dimensional ratings container to a ``DataLoader``."""

    def __init__(self, ratings):
        # The container is stored as given; nothing is copied or converted.
        self.ratings = ratings

    def __len__(self):
        """Number of entries along the first axis, as reported by ``len``."""
        sample_count = len(self.ratings)
        return sample_count

    def __getitem__(self, idx):
        """Return the first three columns of whatever ``idx`` selects."""
        leading_columns = slice(None, 3)
        return self.ratings[idx, leading_columns]

# Wrap the train / test tensors so DataLoader can batch them
train_dataset = RatingsDataset(train_tensor)
test_dataset = RatingsDataset(test_tensor)

# Batches of 64 rows; only the training data is reshuffled
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [19]:
# Model definition
class MatrixFactorization(nn.Module):
    """Score a (user, item) pair as the dot product of two learned embeddings."""

    def __init__(self, num_users, num_items, embedding_dim=20):
        super().__init__()
        # One embedding row per user index and per item index, same width for both.
        self.user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=embedding_dim)
        self.item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=embedding_dim)

    def forward(self, user, item):
        """Look up both embeddings and sum their element-wise product over dim 1."""
        user_vectors = self.user_embedding(user)
        item_vectors = self.item_embedding(item)
        elementwise = user_vectors * item_vectors
        return elementwise.sum(dim=1)

# Number of users and items.
# Taken from the id mappings (one entry per distinct raw id) rather than from
# the global `ratings` frame: the counts are the same, but this cell then keeps
# working when it is re-run after `ratings` has been modified or rebound.
num_users = len(user_id_mapping)
num_items = len(movie_id_mapping)

# Initialise the model, loss and optimiser; use the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MatrixFactorization(num_users, num_items, embedding_dim=50).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

print(model)


MatrixFactorization(
  (user_embedding): Embedding(671, 50)
  (item_embedding): Embedding(9066, 50)
)


In [20]:
# Train the model
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # Columns 0 and 1 hold the user / item indices (stored as floats in the
        # batch tensor, hence the cast); column 2 holds the rating.
        users = batch[:, 0].long().to(device)
        items = batch[:, 1].long().to(device)
        # Deliberately not called `ratings`: that name is the global ratings
        # DataFrame, and rebinding it to a tensor here breaks every cell that
        # still expects the DataFrame when it is re-run.
        batch_ratings = batch[:, 2].to(device)

        optimizer.zero_grad()
        outputs = model(users, items)
        loss = criterion(outputs, batch_ratings)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')


Epoch 1/20, Loss: 39.83473800564651
Epoch 2/20, Loss: 8.069735024853957
Epoch 3/20, Loss: 2.413070353672659
Epoch 4/20, Loss: 1.7365394844401845
Epoch 5/20, Loss: 2.1562769540208135
Epoch 6/20, Loss: 2.2979685268956693
Epoch 7/20, Loss: 1.7757424617842805
Epoch 8/20, Loss: 1.4013934999251727
Epoch 9/20, Loss: 1.2803011572213292
Epoch 10/20, Loss: 1.260276926411904
Epoch 11/20, Loss: 1.1655608230738712
Epoch 12/20, Loss: 1.0828681909304252
Epoch 13/20, Loss: 1.0190056783022832
Epoch 14/20, Loss: 0.9833268187076544
Epoch 15/20, Loss: 0.9526256298561462
Epoch 16/20, Loss: 0.9233448592831286
Epoch 17/20, Loss: 0.8937154515958328
Epoch 18/20, Loss: 0.8769234113698955
Epoch 19/20, Loss: 0.8646028796784121
Epoch 20/20, Loss: 0.8457783781510178


In [30]:
def recommend_movies(user_id, model, movies_metadata, user_id_mapping, movie_id_mapping, top_n=10):
    """Return metadata rows for the movies the model scores highest for a user.

    Args:
        user_id: Raw user id; must be a key of ``user_id_mapping``.
        model: Callable ``torch.nn.Module`` taking ``(user_indices, item_indices)``
            and returning one score per pair.
        movies_metadata: DataFrame with an ``'id'`` column; ids are compared as
            strings.
        user_id_mapping: Raw user id -> model user index.
        movie_id_mapping: Raw movie id -> model item index. Assumed to cover
            the indices ``0 .. len(movie_id_mapping) - 1`` (as built in this
            notebook), which is also taken as the number of items to score.
        top_n: Maximum number of distinct movie ids to include.

    Returns:
        The concatenated metadata rows of the selected movies, best score
        first. Movies whose id is absent from ``movies_metadata`` are skipped.
        An empty frame with the metadata columns when nothing qualifies.

    Raises:
        KeyError: If ``user_id`` is not in ``user_id_mapping``.
    """
    model.eval()

    # Take the item count and the device from the arguments instead of
    # notebook globals, so the function works wherever it is called from.
    item_count = len(movie_id_mapping)
    first_parameter = next(model.parameters(), None)
    model_device = first_parameter.device if first_parameter is not None else torch.device('cpu')

    # One-element tensor with the user's index, and every item index
    user_id_tensor = torch.tensor([user_id_mapping[user_id]]).long().to(model_device)
    item_ids = torch.arange(item_count).long().to(model_device)

    # Score every item for this user
    with torch.no_grad():
        predictions = model(user_id_tensor.repeat(item_count), item_ids)

    # Item indices from best to worst score (item_ids is 0..n-1, so the sort
    # order *is* the list of item indices)
    _, order = torch.sort(predictions, descending=True)

    # Translate model indices back to raw movie ids
    index_to_movie_id = {index: raw_id for raw_id, index in movie_id_mapping.items()}
    ranked_movie_ids = [index_to_movie_id[index] for index in order.tolist()]

    # String form of the metadata ids, computed once instead of per candidate
    metadata_ids = movies_metadata['id'].astype(str)
    known_ids = set(metadata_ids.unique())

    selected = []
    for raw_id in ranked_movie_ids:
        if len(selected) >= top_n:
            break
        key = str(raw_id)
        if key in known_ids:
            selected.append(movies_metadata[metadata_ids == key])

    if not selected:
        # pd.concat([]) raises; hand back an empty frame that keeps the columns
        return movies_metadata.iloc[0:0]
    return pd.concat(selected)

# Recommend movies for one user. The value is the raw userId (3 here), which
# recommend_movies translates to a model index through user_id_mapping.
user_id = 3
recommended_movies = recommend_movies(user_id, model, movies_metadata, user_id_mapping, movie_id_mapping)
print(recommended_movies[['title', 'id']])

                              title     id
5755                 Making Contact   7011
6655            The Meaning of Life   4543
8728              The Holy Mountain   8327
8452   Letter from an Unknown Woman    946
34393                   Kurukshetra  81782
16221                  Doppelganger  32234
6282        Man with a Movie Camera  26317
28180                At Point Blank   8743
11148      The Secret Life of Words    148
7235           Mother, Jugs & Speed  26176
