In [None]:
import numpy as np
import pandas as pd
import math
from scipy.stats import pearsonr
import random
from concurrent.futures import ThreadPoolExecutor
import concurrent

返回根据Jaccard系数高低排列的物品序列

In [None]:
def rcm_result_protol(some_anime_id, some_result, df):
    def calculate_similarity(anime_id, result):
        return {
            "rcm_anime_id": result,
            "protol_anime_id": anime_id,
            "weight": anime_Jaccard(df, anime_id, result),
        }

    with ThreadPoolExecutor() as executor:
        similarity_list = list(executor.map(calculate_similarity, some_anime_id, some_result))

    unique_dict = {item["rcm_anime_id"]: item for item in sorted(similarity_list, key=lambda x: x['weight'], reverse=True)}

    return list(unique_dict.values())


根据Jaccard系数排列的物品序列，通过SlopeOne算法计算其预测评分，在返回根据预测评分高低排列的物品序列

In [None]:
def rcm_result_with_slopeone(some_protol_result,user_id,df):
    list = []
    for i in range(len(some_protol_result)):
        list.append({
            "rcm_anime_id":some_protol_result[i]["rcm_anime_id"],
            "protol_anime_id":some_protol_result[i]["protol_anime_id"],
            "predict_rating":df.loc[(df['user_id'] == user_id) & (df['anime_id'] == some_protol_result[i]["protol_anime_id"])]['rating'].tolist()[0]+anime_rating_deficiency(df,some_protol_result[i]['protol_anime_id'],some_protol_result[i]['rcm_anime_id'])
        })
    return sorted(list,key=lambda x: x['predict_rating'], reverse=True) 
    

寻找相似的用户

In [None]:
def find_similiar_user(vector, df):
    def calculate_pearsonr(row):
        vector_or = row.fillna(0)
        pc = pearsonr(vector, vector_or)
        return pd.Series({"user_id": row.name, "statistic": pc[0], "pvalue": pc[1]})

    # 使用 ThreadPoolExecutor 并行计算皮尔逊相关系数
    with ThreadPoolExecutor() as executor:
        result = list(executor.map(calculate_pearsonr, [row for _, row in df.iterrows()]))

    result = pd.DataFrame(result)
    sorted_result = result.sort_values(by="statistic", ascending=False)

    top_users = sorted_result.head(100)["user_id"].tolist()

    # 使用向量化操作替换循环
    matrix = df.loc[top_users].fillna(0).values

    return pd.DataFrame(matrix).mean().sort_values(ascending=False).index



移除alist中的blist元素

In [None]:
def remove_dumplicate_element(alist,blist):
    result = [x for x in alist if x not in blist]
    return result

Jaccard系数计算

In [None]:
def anime_Jaccard(df,anime_x,anime_y):
    anime1_users = set(df[df['anime_id'] == anime_x]['user_id'])
    anime2_users = set(df[df['anime_id'] == anime_y]['user_id'])
    intersection = len(anime1_users.intersection(anime2_users))
    union = len(anime1_users.union(anime2_users))
    return intersection / union

两物体的平均评分差计算

In [None]:
def anime_rating_deficiency(df,anime_x,anime_y):
    book1_ratings = df[df['anime_id'] == anime_x]
    book2_ratings = df[df['anime_id'] == anime_y]
    merged_ratings = pd.merge(book1_ratings, book2_ratings, on='user_id')
    return merged_ratings.mean()["rating_x"]-merged_ratings.mean()["rating_y"]

读取数据集

In [None]:
anime = pd.read_csv("./anime_data/anime.csv")
rating = pd.read_csv("./anime_data/rating.csv")

组合生成用户-物品-评分表格

In [None]:
rating = rating.drop_duplicates(subset=['user_id', 'anime_id'])
rating['rating'] = rating['rating'].clip(lower=random.randint(1,10))
user_rating_matrix = rating.pivot(index="user_id",columns="anime_id",values="rating")

获取用户向量

In [None]:
user_id = 3
vector=user_rating_matrix.loc[user_id].fillna(0)

获取相似用户的高分animeID以及用户打分及格的animeId

In [None]:
some_list = find_similiar_user(vector,user_rating_matrix)
rated_anime_ids = rating[(rating['user_id'] == user_id)& (rating['rating']>=6)]['anime_id'].tolist()
# some_list = remove_dumplicate_element(some_list,rated_anime_ids)

根据及格animeID与相似用户高分ID获得推荐列表

In [None]:
protol_result=rcm_result_protol(rated_anime_ids,some_list,rating)

In [None]:
final_list = rcm_result_with_slopeone(protol_result,user_id,rating)


In [None]:
result = [item["rcm_anime_id"] for item in final_list]
len(set(rated_anime_ids)&set(result))/len(set(rated_anime_ids))

In [None]:
testList = []
for i in range(100):
    testList.append(1+i)

In [24]:
totalBack = 0
i_count = 0
for i in range(10):
    user_id = testList[i+1]
    vector=user_rating_matrix.loc[user_id].fillna(0)
    rated_anime_ids = rating[(rating['user_id'] == user_id)& (rating['rating']>=6)]['anime_id'].tolist()
    if len(rated_anime_ids) <= 0:
        continue
    result = [item["rcm_anime_id"] for item in rcm_result_with_slopeone(rcm_result_protol(rated_anime_ids,find_similiar_user(vector,user_rating_matrix),rating),user_id,rating)]
    # result = [x + 1 for x in result]
    totalBack+=len(set(rated_anime_ids)&set(result))/len(set(rated_anime_ids))
    i_count+=1


In [25]:
totalBack/i_count

0.008645622394482227

In [None]:
vector=user_rating_matrix.loc[1].fillna(0)
rated_anime_ids = rating[(rating['user_id'] == 1)& (rating['rating']>=6)]['anime_id'].tolist()
result = [item["rcm_anime_id"] for item in rcm_result_with_slopeone(rcm_result_protol(rated_anime_ids,find_similiar_user(vector,user_rating_matrix),rating),1,rating)]

In [None]:
result