In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from surprise import KNNWithMeans

# 加载数据
ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
movies = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='ISO-8859-1')
users = pd.read_csv('./ml-1m/users.dat', sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python')

# 数据预处理
# 将用户的年龄和职业转化为类别特征
age_encoder = LabelEncoder()
occupation_encoder = LabelEncoder()
users['Age'] = age_encoder.fit_transform(users['Age'])
users['Occupation'] = occupation_encoder.fit_transform(users['Occupation'])

# 合并用户数据和评分数据
merged_data = pd.merge(ratings, users, on='UserID')

# 加载Surprise数据集
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(merged_data[['UserID', 'MovieID', 'Rating']], reader)

# 训练和测试数据集划分
trainset, testset = train_test_split(data, test_size=0.2)

# 使用SVD进行协同过滤
svd = SVD()
svd.fit(trainset)

# 评价模型性能
predictions = svd.test(testset)
print("RMSE: ", accuracy.rmse(predictions))
print("MAE: ", accuracy.mae(predictions))

# 使用TF-IDF进行内容过滤
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['Genres'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 协同过滤推荐
def collaborative_filtering_recommendations(user_id, n=10):
    user_ratings = ratings[ratings['UserID'] == user_id]
    user_watched = user_ratings['MovieID'].tolist()
    movie_ids = movies['MovieID'].tolist()
    recommendations = []
    for movie_id in movie_ids:
        if (movie_id not in user_watched):
            est = svd.predict(user_id, movie_id).est
            recommendations.append((movie_id, est))
    recommendations = sorted(recommendations, key=lambda x: x[1], reverse=True)
    top_n_recommendations = [movies[movies['MovieID'] == movie_id]['Title'].values[0] for movie_id, _ in recommendations[:n]]
    return top_n_recommendations

# 内容过滤推荐
def content_based_recommendations(title, n=10):
    idx = movies[movies['Title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_n_similar = [movies['Title'].iloc[i[0]] for i in sim_scores[1:n+1]]
    return top_n_similar

# 混合推荐
def hybrid_recommendations(user_id, title, alpha=0.5, n=10):
    content_recs = content_based_recommendations(title, n)
    collab_recs = collaborative_filtering_recommendations(user_id, n)
    
    content_recs_set = set(content_recs)
    collab_recs_set = set(collab_recs)
    
    common_recs = content_recs_set.intersection(collab_recs_set)
    unique_recs = content_recs_set.union(collab_recs_set) - common_recs
    
    final_recs = list(common_recs) + list(unique_recs)
    return final_recs[:n]

# 模型评估
def evaluate_model():
    y_true = []
    y_pred = []
    for uid, iid, true_r, _ in ratings.itertuples(index=False):
        y_true.append(true_r)
        y_pred.append(round(svd.predict(uid, iid).est))  # 将预测值四舍五入以转化为离散值
    precision = precision_score(y_true, y_pred, average='macro', zero_division=1)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=1)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=1)
    print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')

# 示例使用
user_id = 1
movie_title = "Toy Story (1995)"
print("协同过滤推荐:", collaborative_filtering_recommendations(user_id))
print("内容过滤推荐:", content_based_recommendations(movie_title))
print("混合推荐:", hybrid_recommendations(user_id, movie_title))

# 评估模型
evaluate_model()


RMSE: 0.8798
RMSE:  0.8798213069999258
MAE:  0.6909
MAE:  0.6908804763490413
协同过滤推荐: ['Usual Suspects, The (1995)', 'Rear Window (1954)', 'Almost Famous (2000)', 'Sling Blade (1996)', 'Shawshank Redemption, The (1994)', 'Sanjuro (1962)', 'Blade Runner (1982)', 'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)', "It's a Wonderful Life (1946)", 'Sting, The (1973)']
内容过滤推荐: ['Aladdin and the King of Thieves (1996)', 'American Tail, An (1986)', 'American Tail: Fievel Goes West, An (1991)', 'Rugrats Movie, The (1998)', "Bug's Life, A (1998)", 'Toy Story 2 (1999)', 'Saludos Amigos (1943)', 'Chicken Run (2000)', 'Adventures of Rocky and Bullwinkle, The (2000)', 'Balto (1995)']
混合推荐: ["Bug's Life, A (1998)", 'Sling Blade (1996)', 'American Tail, An (1986)', 'American Tail: Fievel Goes West, An (1991)', "It's a Wonderful Life (1946)", 'Toy Story 2 (1999)', 'Rugrats Movie, The (1998)', 'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)', 'Chicken Run (2000)',

In [2]:
# ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
# movies = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='ISO-8859-1')
# users = pd.read_csv('./ml-1m/users.dat', sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python')

In [3]:
import torch
torch.cuda.empty_cache()

In [9]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm import tqdm

# 加载数据
ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None, names=['UserID', 'MovieID', 'Rating', 'Timestamp'], engine='python')
movies = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None, names=['MovieID', 'Title', 'Genres'], engine='python', encoding='ISO-8859-1')
users = pd.read_csv('./ml-1m/users.dat', sep='::', header=None, names=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'], engine='python')

# 数据预处理
# 将用户的年龄和职业转化为类别特征
age_encoder = LabelEncoder()
occupation_encoder = LabelEncoder()
gender_encoder = LabelEncoder()
users['Age'] = age_encoder.fit_transform(users['Age'])
users['Occupation'] = occupation_encoder.fit_transform(users['Occupation'])
users['Gender'] = gender_encoder.fit_transform(users['Gender'])

# 合并用户数据和评分数据
merged_data = pd.merge(ratings, users, on='UserID')

# 创建用户-物品评分矩阵
user_movie_matrix = ratings.pivot(index='UserID', columns='MovieID', values='Rating').fillna(0)
user_movie_matrix_np = user_movie_matrix.values

# 定义Dataset类
class AutoencoderDataset(Dataset):
    def __init__(self, matrix):
        self.matrix = matrix
    
    def __len__(self):
        return len(self.matrix)
    
    def __getitem__(self, idx):
        return torch.tensor(self.matrix[idx], dtype=torch.float)

# 定义自编码器模型
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, input_dim)
        )
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

# 设置设备为GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 实例化数据集
dataset = AutoencoderDataset(user_movie_matrix_np)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# 实例化模型
input_dim = user_movie_matrix_np.shape[1]
model = Autoencoder(input_dim).to(device)

# 定义损失函数和优化器
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 训练模型
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    with tqdm(dataloader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch+1}")
        for data in tepoch:
            data = data.to(device)
            
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, data)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * data.size(0)
            tepoch.set_postfix(loss=train_loss / len(dataloader.dataset))
    train_loss /= len(dataloader.dataset)
    print(f'Epoch {epoch+1}, Loss: {train_loss:.4f}')

# 评估模型
def evaluate_model():
    model.eval()
    reconstructed = []
    with torch.no_grad():
        for data in dataloader:
            data = data.to(device)
            outputs = model(data)
            reconstructed.extend(outputs.cpu().numpy())
    
    reconstructed = np.array(reconstructed)
    rmse = sqrt(mean_squared_error(user_movie_matrix_np, reconstructed))
    print(f'RMSE: {rmse:.4f}')

# 评估模型
evaluate_model()

# 示例使用
def recommend_movies(user_id, n=10):
    user_index = user_id - 1  # 假设用户ID从1开始
    user_vector = torch.tensor(user_movie_matrix_np[user_index], dtype=torch.float).to(device)
    with torch.no_grad():
        reconstructed_vector = model(user_vector).cpu().numpy()
    
    recommended_movie_indices = reconstructed_vector.argsort()[-n:][::-1]
    recommended_movie_ids = [user_movie_matrix.columns[i] for i in recommended_movie_indices]
    recommended_movies = movies[movies['MovieID'].isin(recommended_movie_ids)]
    return recommended_movies

# 示例使用
user_id = 1
recommended_movies = recommend_movies(user_id)
print(f'Recommended movies for user {user_id}:\n', recommended_movies[['Title', 'Genres']])


Epoch 1: 100%|██████████| 95/95 [00:00<00:00, 104.15batch/s, loss=0.48] 


Epoch 1, Loss: 0.4800


Epoch 2: 100%|██████████| 95/95 [00:00<00:00, 230.56batch/s, loss=0.41] 


Epoch 2, Loss: 0.4098


Epoch 3: 100%|██████████| 95/95 [00:00<00:00, 217.04batch/s, loss=0.39]  


Epoch 3, Loss: 0.3901


Epoch 4: 100%|██████████| 95/95 [00:00<00:00, 226.84batch/s, loss=0.38]  


Epoch 4, Loss: 0.3804


Epoch 5: 100%|██████████| 95/95 [00:00<00:00, 247.29batch/s, loss=0.373] 


Epoch 5, Loss: 0.3732


Epoch 6: 100%|██████████| 95/95 [00:00<00:00, 248.43batch/s, loss=0.368] 


Epoch 6, Loss: 0.3679


Epoch 7: 100%|██████████| 95/95 [00:00<00:00, 245.02batch/s, loss=0.363] 


Epoch 7, Loss: 0.3628


Epoch 8: 100%|██████████| 95/95 [00:00<00:00, 243.10batch/s, loss=0.359] 


Epoch 8, Loss: 0.3586


Epoch 9: 100%|██████████| 95/95 [00:00<00:00, 241.03batch/s, loss=0.356] 


Epoch 9, Loss: 0.3556


Epoch 10: 100%|██████████| 95/95 [00:00<00:00, 242.41batch/s, loss=0.353] 


Epoch 10, Loss: 0.3532


Epoch 11: 100%|██████████| 95/95 [00:00<00:00, 227.43batch/s, loss=0.351] 


Epoch 11, Loss: 0.3512


Epoch 12: 100%|██████████| 95/95 [00:00<00:00, 224.47batch/s, loss=0.35]  


Epoch 12, Loss: 0.3497


Epoch 13: 100%|██████████| 95/95 [00:00<00:00, 233.99batch/s, loss=0.348] 


Epoch 13, Loss: 0.3479


Epoch 14: 100%|██████████| 95/95 [00:00<00:00, 239.68batch/s, loss=0.347] 


Epoch 14, Loss: 0.3472


Epoch 15: 100%|██████████| 95/95 [00:00<00:00, 231.63batch/s, loss=0.346] 


Epoch 15, Loss: 0.3460


Epoch 16: 100%|██████████| 95/95 [00:00<00:00, 237.59batch/s, loss=0.345] 


Epoch 16, Loss: 0.3447


Epoch 17: 100%|██████████| 95/95 [00:00<00:00, 230.35batch/s, loss=0.344] 


Epoch 17, Loss: 0.3438


Epoch 18: 100%|██████████| 95/95 [00:00<00:00, 227.15batch/s, loss=0.344] 


Epoch 18, Loss: 0.3436


Epoch 19: 100%|██████████| 95/95 [00:00<00:00, 232.91batch/s, loss=0.342] 


Epoch 19, Loss: 0.3423


Epoch 20: 100%|██████████| 95/95 [00:00<00:00, 227.13batch/s, loss=0.342] 


Epoch 20, Loss: 0.3416


Epoch 21: 100%|██████████| 95/95 [00:00<00:00, 237.01batch/s, loss=0.341] 


Epoch 21, Loss: 0.3406


Epoch 22: 100%|██████████| 95/95 [00:00<00:00, 212.69batch/s, loss=0.34]  


Epoch 22, Loss: 0.3400


Epoch 23: 100%|██████████| 95/95 [00:00<00:00, 220.17batch/s, loss=0.339] 


Epoch 23, Loss: 0.3392


Epoch 24: 100%|██████████| 95/95 [00:00<00:00, 216.34batch/s, loss=0.338] 


Epoch 24, Loss: 0.3384


Epoch 25: 100%|██████████| 95/95 [00:00<00:00, 229.36batch/s, loss=0.338] 


Epoch 25, Loss: 0.3381


Epoch 26: 100%|██████████| 95/95 [00:00<00:00, 233.09batch/s, loss=0.338] 


Epoch 26, Loss: 0.3376


Epoch 27: 100%|██████████| 95/95 [00:00<00:00, 226.47batch/s, loss=0.337] 


Epoch 27, Loss: 0.3368


Epoch 28: 100%|██████████| 95/95 [00:00<00:00, 241.09batch/s, loss=0.336] 


Epoch 28, Loss: 0.3361


Epoch 29: 100%|██████████| 95/95 [00:00<00:00, 239.01batch/s, loss=0.336] 


Epoch 29, Loss: 0.3356


Epoch 30: 100%|██████████| 95/95 [00:00<00:00, 224.18batch/s, loss=0.335] 


Epoch 30, Loss: 0.3352


Epoch 31: 100%|██████████| 95/95 [00:00<00:00, 214.64batch/s, loss=0.335] 


Epoch 31, Loss: 0.3345


Epoch 32: 100%|██████████| 95/95 [00:00<00:00, 239.20batch/s, loss=0.334] 


Epoch 32, Loss: 0.3338


Epoch 33: 100%|██████████| 95/95 [00:00<00:00, 234.07batch/s, loss=0.333] 


Epoch 33, Loss: 0.3330


Epoch 34: 100%|██████████| 95/95 [00:00<00:00, 235.39batch/s, loss=0.333] 


Epoch 34, Loss: 0.3325


Epoch 35: 100%|██████████| 95/95 [00:00<00:00, 237.86batch/s, loss=0.332] 


Epoch 35, Loss: 0.3319


Epoch 36: 100%|██████████| 95/95 [00:00<00:00, 231.79batch/s, loss=0.331] 


Epoch 36, Loss: 0.3314


Epoch 37: 100%|██████████| 95/95 [00:00<00:00, 232.16batch/s, loss=0.331] 


Epoch 37, Loss: 0.3311


Epoch 38: 100%|██████████| 95/95 [00:00<00:00, 243.29batch/s, loss=0.331] 


Epoch 38, Loss: 0.3306


Epoch 39: 100%|██████████| 95/95 [00:00<00:00, 224.27batch/s, loss=0.331] 


Epoch 39, Loss: 0.3307


Epoch 40: 100%|██████████| 95/95 [00:00<00:00, 229.15batch/s, loss=0.33]  


Epoch 40, Loss: 0.3302


Epoch 41: 100%|██████████| 95/95 [00:00<00:00, 234.06batch/s, loss=0.329] 


Epoch 41, Loss: 0.3294


Epoch 42: 100%|██████████| 95/95 [00:00<00:00, 228.74batch/s, loss=0.329] 


Epoch 42, Loss: 0.3290


Epoch 43: 100%|██████████| 95/95 [00:00<00:00, 242.12batch/s, loss=0.328] 


Epoch 43, Loss: 0.3285


Epoch 44: 100%|██████████| 95/95 [00:00<00:00, 242.53batch/s, loss=0.328] 


Epoch 44, Loss: 0.3282


Epoch 45: 100%|██████████| 95/95 [00:00<00:00, 242.48batch/s, loss=0.328] 


Epoch 45, Loss: 0.3278


Epoch 46: 100%|██████████| 95/95 [00:00<00:00, 252.75batch/s, loss=0.328] 


Epoch 46, Loss: 0.3277


Epoch 47: 100%|██████████| 95/95 [00:00<00:00, 252.99batch/s, loss=0.327] 


Epoch 47, Loss: 0.3273


Epoch 48: 100%|██████████| 95/95 [00:00<00:00, 253.13batch/s, loss=0.327] 


Epoch 48, Loss: 0.3271


Epoch 49: 100%|██████████| 95/95 [00:00<00:00, 235.51batch/s, loss=0.327] 


Epoch 49, Loss: 0.3266


Epoch 50: 100%|██████████| 95/95 [00:00<00:00, 249.39batch/s, loss=0.327] 


Epoch 50, Loss: 0.3266
RMSE: 0.8689
Recommended movies for user 1:
                                   Title                               Genres
0                      Toy Story (1995)          Animation|Children's|Comedy
33                          Babe (1995)              Children's|Comedy|Drama
315    Shawshank Redemption, The (1994)                                Drama
360               Lion King, The (1994)         Animation|Children's|Musical
584                      Aladdin (1992)  Animation|Children's|Comedy|Musical
591         Beauty and the Beast (1991)         Animation|Children's|Musical
907            Wizard of Oz, The (1939)   Adventure|Children's|Drama|Musical
1081  E.T. the Extra-Terrestrial (1982)      Children's|Drama|Fantasy|Sci-Fi
2286               Bug's Life, A (1998)          Animation|Children's|Comedy
3045                 Toy Story 2 (1999)          Animation|Children's|Comedy
