In [53]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-small/movies.csv
/kaggle/input/movielens-small/ratings.csv
/kaggle/input/movielens-small/tags.csv
/kaggle/input/movielens-small/links.csv
/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv
/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv


In [54]:
df = pd.read_csv('/kaggle/input/movielens-small/ratings.csv')

In [None]:
df.shape

In [None]:
df.head()

# Pre Work

划分训练集和测试集

In [55]:
from sklearn.model_selection import train_test_split
#data:需要进行分割的数据集
#random_state:设置随机种子，保证每次运行生成相同的随机数
#test_size:将数据分割成训练集的比例

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [56]:
train_set.to_csv('movies_train.csv')
test_set.to_csv('movies_test.csv')

## Loading

In [57]:
ratings = pd.read_csv('/kaggle/input/movielens-small/ratings.csv')
movies = pd.read_csv('/kaggle/input/movielens-small/movies.csv')
links = pd.read_csv('/kaggle/input/movielens-small/links.csv')
credits = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')

train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

In [58]:
def load_node_csv(path, index_col):
    """Loads csv containing node information

    Args:
        path (str): path to csv file
        index_col (str): column name of index column

    Returns:
        dict: mapping of csv row to node id
    """
    df = pd.read_csv(path, index_col=index_col)
    mapping = {index: i for i, index in enumerate(df.index.unique())}
    return mapping


user_mapping = load_node_csv('/kaggle/input/movielens-small/ratings.csv', index_col='userId')
movie_mapping = load_node_csv('/kaggle/input/movielens-small/ratings.csv', index_col='movieId')

# 把rating中的userId换成mapping之后的userID
ratings['userId'] = ratings['userId'].apply(lambda x : user_mapping[x])
ratings['movieId'] = ratings['movieId'].apply(lambda x : movie_mapping[x])



In [61]:
train_set, test_set = train_test_split(ratings, test_size=0.2, random_state=42)
train_set.to_csv('movies_train.csv')
test_set.to_csv('movies_test.csv')

## Metrics

Given rec list and true list, calculate recall and NDCG

In [187]:
# computes recall@K and precision@K
def RecallPrecision_ATk(groundTruth, r, k):
    """Computers recall @ k and precision @ k

    Args:
        groundTruth (list): list of lists containing highly rated items of each user
        r (list): list of lists indicating whether each top k item recommended to each user
            is a top k ground truth item or not
        k (intg): determines the top k items to compute precision and recall on

    Returns:
        tuple: recall @ k, precision @ k
    """
        
    num_correct_pred = (np.array(r)).sum(axis=1)  # number of correctly predicted items per user
    # number of items liked by each user in the test set
    user_num_liked = np.array([len(groundTruth[i])
                                  for i in range(len(groundTruth))])
    recall = np.mean(num_correct_pred / user_num_liked)
    precision = np.mean(num_correct_pred) / k
    return recall.item(), precision.item()

def get_Recall_Precision(top_k_pred_list,k):
    r = []
    for i in range(0, len(top_k_pred_list)):
        label = list(map(lambda x: x in true_interact_list_1[i], top_k_pred_list[i]))
        r.append(label)
    recall, precision = RecallPrecision_ATk(true_interact_list_1, r, k)
    return recall, precision

def get_Recall_Precision_svd(top_k_item,k):
    top_k_pred_list = []
    for uid, user_ratings in top_k_item.items():
        top_k_pred_list.append([iid for (iid, _) in user_ratings])
        
    return get_Recall_Precision(top_k_pred_list,k)

In [130]:
# computes NDCG@K
def NDCGatK_r(groundTruth, r, k):
    """Computes Normalized Discounted Cumulative Gain (NDCG) @ k

    Args:
        groundTruth (list): list of lists containing highly rated items of each user
        r (list): list of lists indicating whether each top k item recommended to each user
            is a top k ground truth item or not 即只包括0,1的序列，长度为k；如果第i个item in recommendation list在ground truth里面，那么就为1
        k (int): determines the top k items to compute ndcg on

    Returns:
        float: ndcg @ k
    """
    assert len(r) == len(groundTruth)

    test_matrix = np.zeros((len(r), k))

    for i, items in enumerate(groundTruth):
        length = min(len(items), k)
        test_matrix[i, :length] = 1
    max_r = test_matrix
    idcg = np.sum(max_r * 1. / np.log2(np.arange(2, k + 2)), axis=1)
    dcg = r * (1. / np.log2(np.arange(2, k + 2)))
    dcg = np.sum(dcg, axis=1)
    idcg[idcg == 0.] = 1.
    ndcg = dcg / idcg
    ndcg[np.isnan(ndcg)] = 0.
    return np.mean(ndcg).item()

def get_NDCG_with_pred(top_k_item,k):
    top_k_pred_list = []
    for uid, user_ratings in top_k_item.items():
        top_k_pred_list.append([iid for (iid, _) in user_ratings])
    
    r = []
#     print(top_k_pred_list)
    for i in range(0, len(top_k_pred_list)):
        label = list(map(lambda x: x in true_interact_list_1[i], top_k_pred_list[i]))
        r.append(label)
#     print(len(top_k_pred_list))
#     print(len(r))
#     print(len(r[0]))
#     print(r[0])
    
#     return r
    return NDCGatK_r(true_interact_list_1,r,k)

In [132]:
# hit

def getHit(top_n_pred_list, true_interact_list):
    assert len(top_n_pred_list) == len(true_interact_list)
    
    N = len(top_n_pred_list)
    sum = 0
    for i in range(0, N):
        if len(set(top_n_pred_list[i])& set(true_interact_list[i])) != 0 :
            sum +=1
            
    
    return sum/N 

def get_hit_with_pred(top_k_item):
    top_k_pred_list = []
    for uid, user_ratings in top_k_item.items():
        top_k_pred_list.append([iid for (iid, _) in user_ratings])
        
    return getHit(top_k_pred_list,true_interact_list_1)

# SVD only

- CF: collaborative filtering

- CBF: content based filtering

#### 最火热的20部电影（avg rating降序）

In [190]:
# get df of the top 50 rated movies on average; 每个电影至少有30人评分
rating_df = pd.read_csv("/kaggle/input/movielens-small/ratings.csv")
size_df = rating_df.groupby('movieId').size().to_frame('size')
rating_df = rating_df.groupby('movieId').agg('mean').join(other=size_df)
rating_df = rating_df[rating_df['size'] >= 30].sort_values(by='rating', ascending=False)
rating_df['movieId'] = rating_df.index
rating_df = rating_df.head(20)
top_movie_id_list = rating_df['movieId'].tolist()

### SVD for Collaborative Filter

In [111]:
train_set, test_set = train_test_split(df, test_size=0.05, random_state=42)
train_set = train_set[train_set['rating']>=4]
test_set = test_set[test_set['rating']>=4]

In [112]:
from surprise import Reader, Dataset, SVD

reader = Reader()
train_data = Dataset.load_from_df(train_set[['userId', 'movieId', 'rating']], reader) # data: ratings
test_data = Dataset.load_from_df(test_set[['userId', 'movieId', 'rating']], reader) # data: ratings

trainset = train_data.build_full_trainset()
# trainset_for_pick_item = train_data.build_full_trainset().build_testset()
testset = test_data.build_full_trainset().build_testset()

svd = SVD()
svd.fit(trainset)

svd.predict(1,1).est # svd.predict(userID,movieID).est   est表示estimate

4.516213974704435

In [165]:
whole_movie_id_list = train_set['movieId'].unique() # 训练集中的电影id
whole_pred_user_list = test_set['userId'].unique() # 测试集中的user ID


# 获得测试集中真正的interact列表 （设置threshold rating=4； 4分以上表示交互了）
true_interact_list_1 = []
uid_ = -1
temp = []
predictions = svd.test(testset)
for uid, iid, true_r, est, _ in predictions:
    if uid_ == -1:
        uid_ = uid
    if uid == uid_:
        temp.append(iid)
    else:
        true_interact_list_1.append(temp)
        uid_ = uid
        temp = []
        temp.append(iid)
        
true_interact_list_1.append(temp)


In [123]:
# 接下来定义get_top_n()函数，它能根据predictions结果进行解析，获取top_n字典，该字典的key是user-id，value是该user打分（预测值）最高的n个item-id。
# predictions的数据结构，是surprise中的算法自带接口algo.test()的输出值。
# 使用：predictions = algo.test(testset)

from collections import defaultdict

def get_top_n(n=10):

    # First map the predictions to each user.
    top_n = defaultdict(list)
    
    # uid： 用户ID
    # iid： item ID
    # true_r： 真实得分
    # est：估计得分
    
    
    for uid in whole_pred_user_list:
        for i in range(0,len(whole_movie_id_list)):
            iid = whole_movie_id_list[i]
            est = svd.predict(uid,iid).est
            top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    # 为每一个用户都寻找K个得分最高的item
    
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


#### top5

In [116]:
def get_hit_with_pred(top_k_item):
    top_k_pred_list = []
    for uid, user_ratings in top_k_item.items():
        top_k_pred_list.append([iid for (iid, _) in user_ratings])
        
    return getHit(top_k_pred_list,true_interact_list_1)

In [None]:
# # Print the recommended items for each user
# top_n = get_top_n(n=5)

In [None]:
# count = 0
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings], [rating for (_,rating) in user_ratings])
    
#     count += 1
#     if count > 5:
#         break


In [None]:
# get_hit_with_pred(top_n)

In [None]:
# get_NDCG_with_pred(top_n,5)

#### top20

In [138]:
top_20 = get_top_n(n=20)

In [126]:
get_hit_with_pred(top_20)

0.10647181628392484

In [140]:
get_NDCG_with_pred(top_20,20)

0.015052341395727602

In [188]:
get_Recall_Precision_svd(top_20,20)

(0.033253877739558964, 0.0059498956158663885)

**if we simply recommend top 20 popular movies?**

In [191]:
# determine the correctness of top20 predictions
r = []
for i in range(0, len(true_interact_list_1)):
    label = list(map(lambda x: x in true_interact_list_1[i], top_movie_id_list))
    r.append(label)


ndcg = NDCGatK_r(true_interact_list_1, r, 20)

print(f'NDCG: {ndcg}')

NDCG: 0.04330991831696389


In [193]:
getHit([top_movie_id_list]*len(true_interact_list_1), true_interact_list_1)

0.2045929018789144

# SVD + Content Based Filtering + Popularity based recommend

In [None]:
movies

- genres

In [194]:
genres = set([])

for i in movies['genres'].apply(lambda x: x.split('|')):
    genres = genres.union(set(i))

genres.remove('(no genres listed)') # 是inplace操作

In [195]:
genres_dict = dict(zip(genres,range(0,len(genres))))

In [196]:
def get_genres_array(x):
    l = x.split('|')
    if l[0].startswith('('):
        return np.zeros(len(genres_dict))
    
    t = np.zeros(len(genres_dict))
    for _ in l:
        t[genres_dict[_]] = 1
    return t
            
        
movies['genres_array'] = movies['genres'].apply(get_genres_array)
movies

Unnamed: 0,movieId,title,genres,genres_array
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
4,5,Father of the Bride Part II (1995),Comedy,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,"[0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
9739,193585,Flint (2017),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


- cast crews and directors

In [197]:
movies_df = movies.merge(links,on='movieId',how='inner').drop(columns=['imdbId'])
movies_df = movies_df.dropna()
movies_df['tmdbId']=movies_df['tmdbId'].astype('int32')

In [83]:
movies_df.head(10)

Unnamed: 0,movieId,title,genres,genres_array,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",862
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8844
2,3,Grumpier Old Men (1995),Comedy|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",31357
4,5,Father of the Bride Part II (1995),Comedy,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",11862
5,6,Heat (1995),Action|Crime|Thriller,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...",949
6,7,Sabrina (1995),Comedy|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",11860
7,8,Tom and Huck (1995),Adventure|Children,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",45325
8,9,Sudden Death (1995),Action,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9091
9,10,GoldenEye (1995),Action|Adventure|Thriller,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",710


In [198]:
tmdb = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv').drop(columns=['title'])
movies_df = movies_df.merge(tmdb,left_on='tmdbId',right_on='movie_id').drop(columns=['movie_id','tmdbId'])
movies_df

Unnamed: 0,movieId,title,genres,genres_array,cast,crew
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[{""cast_id"": 14, ""character"": ""Woody (voice)"",...","[{""credit_id"": ""52fe4284c3a36847f8024f55"", ""de..."
1,10,GoldenEye (1995),Action|Adventure|Thriller,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""52fe426ec3a36847f801e16f"", ""de..."
2,11,"American President, The (1995)",Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[{""cast_id"": 1, ""character"": ""Andrew Shepherd""...","[{""credit_id"": ""52fe44dac3a36847f80adfa3"", ""de..."
3,14,Nixon (1995),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[{""cast_id"": 1, ""character"": ""Richard Nixon"", ...","[{""credit_id"": ""52fe43c59251416c7501d705"", ""de..."
4,15,Cutthroat Island (1995),Action|Adventure|Romance,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[{""cast_id"": 1, ""character"": ""Morgan Adams"", ""...","[{""credit_id"": ""52fe42f4c3a36847f802f69f"", ""de..."
...,...,...,...,...,...,...
3532,160644,Indignation (2016),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[{""cast_id"": 0, ""character"": ""Marcus Messner"",...","[{""credit_id"": ""58514b91c3a3682dfe017405"", ""de..."
3533,160954,Nerve (2016),Drama|Thriller,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[{""cast_id"": 4, ""character"": ""Vee Delmonico"", ...","[{""credit_id"": ""57993c2f925141234800341d"", ""de..."
3534,161127,The Infiltrator (2016),Crime|Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[{""cast_id"": 2, ""character"": ""Robert Mazur"", ""...","[{""credit_id"": ""578af3a79251417aca003525"", ""de..."
3535,161580,Bad Moms (2016),Comedy,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[{""cast_id"": 0, ""character"": ""Amy Mitchell"", ""...","[{""credit_id"": ""5690c7adc3a3686b52001c68"", ""de..."


In [199]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew']
for feature in features:
    movies_df[feature] = movies_df[feature].apply(literal_eval)

In [200]:
# Get the director's name from the crew feature. If director is not listed, return NaN
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan


# Returns the list top 3 elements or entire list; whichever is more.
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [201]:
# Define new director, cast, genres and keywords features that are in a suitable form.
movies_df['director'] = movies_df['crew'].apply(get_director)
movies_df['cast'] = movies_df['cast'].apply(get_list)

movies_df

Unnamed: 0,movieId,title,genres,genres_array,cast,crew,director
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f55', 'de...",John Lasseter
1,10,GoldenEye (1995),Action|Adventure|Thriller,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[Pierce Brosnan, Sean Bean, Izabella Scorupco]","[{'credit_id': '52fe426ec3a36847f801e16f', 'de...",Martin Campbell
2,11,"American President, The (1995)",Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Michael Douglas, Annette Bening, Michael J. Fox]","[{'credit_id': '52fe44dac3a36847f80adfa3', 'de...",Rob Reiner
3,14,Nixon (1995),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[Anthony Hopkins, Joan Allen, Powers Boothe]","[{'credit_id': '52fe43c59251416c7501d705', 'de...",Oliver Stone
4,15,Cutthroat Island (1995),Action|Adventure|Romance,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[Geena Davis, Matthew Modine, Frank Langella]","[{'credit_id': '52fe42f4c3a36847f802f69f', 'de...",Renny Harlin
...,...,...,...,...,...,...,...
3532,160644,Indignation (2016),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[Logan Lerman, Sarah Gadon, Tracy Letts]","[{'credit_id': '58514b91c3a3682dfe017405', 'de...",James Schamus
3533,160954,Nerve (2016),Drama|Thriller,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[Emma Roberts, Dave Franco, Emily Meade]","[{'credit_id': '57993c2f925141234800341d', 'de...",Henry Joost
3534,161127,The Infiltrator (2016),Crime|Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[Bryan Cranston, Diane Kruger, John Leguizamo]","[{'credit_id': '578af3a79251417aca003525', 'de...",Brad Furman
3535,161580,Bad Moms (2016),Comedy,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Mila Kunis, Kristen Bell, Kathryn Hahn]","[{'credit_id': '5690c7adc3a3686b52001c68', 'de...",Jon Lucas


In [89]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

# Apply clean_data function to your features.
features = ['cast', 'director']
for feature in features:
    movies_df[feature] = movies_df[feature].apply(clean_data)

In [202]:
def create_soup(x):
    return ' '.join(x['cast']) + ' ' + x['director']
movies_df['soup'] = movies_df.apply(create_soup, axis=1)
movies_df.head(5)

Unnamed: 0,movieId,title,genres,genres_array,cast,crew,director,soup
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f55', 'de...",John Lasseter,Tom Hanks Tim Allen Don Rickles John Lasseter
1,10,GoldenEye (1995),Action|Adventure|Thriller,"[0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[Pierce Brosnan, Sean Bean, Izabella Scorupco]","[{'credit_id': '52fe426ec3a36847f801e16f', 'de...",Martin Campbell,Pierce Brosnan Sean Bean Izabella Scorupco Mar...
2,11,"American President, The (1995)",Comedy|Drama|Romance,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...","[Michael Douglas, Annette Bening, Michael J. Fox]","[{'credit_id': '52fe44dac3a36847f80adfa3', 'de...",Rob Reiner,Michael Douglas Annette Bening Michael J. Fox ...
3,14,Nixon (1995),Drama,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[Anthony Hopkins, Joan Allen, Powers Boothe]","[{'credit_id': '52fe43c59251416c7501d705', 'de...",Oliver Stone,Anthony Hopkins Joan Allen Powers Boothe Olive...
4,15,Cutthroat Island (1995),Action|Adventure|Romance,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[Geena Davis, Matthew Modine, Frank Langella]","[{'credit_id': '52fe42f4c3a36847f802f69f', 'de...",Renny Harlin,Geena Davis Matthew Modine Frank Langella Renn...


获得电影之间的相似度，based on genres, director AND casts

- genres
- soup

In [203]:
arr = np.array(movies_df.genres_array)
# 将每个数组按行堆叠起来，形成一个(3560, 19)的矩阵
mat_genre = np.stack(arr, axis=0)
print(mat_genre)

from sklearn.metrics.pairwise import cosine_similarity

cosine_sim1 = cosine_similarity(mat_genre, mat_genre)

print(cosine_sim1.shape)

[[0. 1. 0. ... 0. 1. 0.]
 [0. 0. 1. ... 0. 1. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 1. 0.]]
(3537, 3537)


In [204]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies_df['soup'])

count_matrix.shape

(3537, 5682)

In [205]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [206]:
# Combine Cosine Similarity
LAMBDA = 0.5
cosine_sim = cosine_sim1 * LAMBDA + cosine_sim2 * (1-LAMBDA)

In [207]:
indices = pd.Series(movies_df.index, index=movies_df['movieId']).drop_duplicates()
indices # (index in cosine_sim_matrix , movieId) 

movieId
1            0
10           1
11           2
14           3
15           4
          ... 
160644    3532
160954    3533
161127    3534
161580    3535
163056    3536
Length: 3537, dtype: int64

In [208]:
def get_two_movies_sim(movieId1,movieId2):
    return cosine_sim[indices[movieId1],indices[movieId2]]

- Weighted RATING (pop)

In [209]:
tmdb_movies = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
C= tmdb_movies['vote_average'].mean()
m= tmdb_movies['vote_count'].quantile(0.9)

def weighted_rating(x, m=m, C=C):
    if x['vote_count'] < m:
        return 0
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

tmdb_movies['pop'] = tmdb_movies.apply(weighted_rating, axis=1)


In [210]:
tmdb_movies[['pop','popularity']].corr()
# WR 和 popularity 的线性相关性达到了0.613，非常高！

Unnamed: 0,pop,popularity
pop,1.0,0.612795
popularity,0.612795,1.0


In [211]:
tmdb_movies.sort_values('pop',ascending=False)['title'].head(5)

1881    The Shawshank Redemption
662                   Fight Club
65               The Dark Knight
3232                Pulp Fiction
96                     Inception
Name: title, dtype: object

In [212]:
tmdb_movies.sort_values('popularity',ascending=False)['title'].head(5)

546                    Minions
95                Interstellar
788                   Deadpool
94     Guardians of the Galaxy
127         Mad Max: Fury Road
Name: title, dtype: object

In [213]:
WR = tmdb_movies[['id','pop']].merge(links,left_on='id',right_on='tmdbId')[['movieId','pop']]
WR.head(5)

Unnamed: 0,movieId,pop
0,72998,7.050669
1,53125,6.665696
2,136020,6.239396
3,91529,7.346721
4,93363,6.096368


### content-pop-based 协同过滤推荐：

- Step1: 遍历user的看过的电影的列表（评分4及以上的），找到每个电影的TOP__相似的电影，将这些电影放到推荐列表中。如果他/她看过这个电影，则移除。
    - 推荐列表中的电影 score = rating * similarity_coeff （只算一次，和相似度最大的那个算）
        
- Step2：在上述推荐列表中，每一个电影的scores += log(它们自己的WR+1)

- Step3: 把TOP20 popular的电影也加入到候选的推荐列表中，计算score = rating*simi_coeff + log(WR+1)
```{python}
        simi = []
        
        for i in top20 popular movies：
            
            for j in users' interacted movies：
                
               simi.append((simi（i,j),movieId))
               
        score += rating * similarity_coeff （只算一次，和相似度最大的那个算） + log(WR+1)
```

- Step3：按照推荐列表中的电影的scores进行降序排序，选择TOP-K个电影。

----
**问题**： pop的数据有缺失

In [None]:
# # 对similarity的行做归一化。
# cosine_sim_ = cosine_sim / np.sum(cosine_sim, axis=1, keepdims=True)

In [214]:
len(df['movieId'].unique())

9724

In [215]:
df_ = df.merge(movies_df,on='movieId')[['userId','movieId','rating']]
len(df_['movieId'].unique())

3536

In [216]:
train_set, test_set = train_test_split(df_, test_size=0.01, random_state=42)
train_set_th4 = train_set[train_set['rating']>=4]
test_set_th4 = test_set[test_set['rating']>=4]

In [217]:
test_set_th4['userId'].unique()

array([483, 525, 312, 578, 452, 160, 356, 111, 301,  64, 113, 140, 288,
       227, 453,  80, 118, 608, 256, 200,  89, 266, 105, 169, 281, 504,
       157, 484, 517, 606, 291,  47,  21, 426, 219, 103,  68, 248, 119,
       176, 182, 450, 317, 522,  36,  63, 177,  98,  15, 503, 554,  84,
       318,  18, 275, 477, 514, 168, 573,  72, 297, 362, 590, 448, 232,
       380, 309, 474, 100, 599, 220, 137,   1, 222,  45, 580, 357, 263,
       353, 495, 114, 527,  59, 166, 520, 154,  44, 122, 130,  11, 560,
       361, 104, 352,  57, 226, 469, 247, 432,  66, 332, 597, 553, 526,
       340, 440, 305, 414, 123, 279,   4, 542, 570, 274, 438, 261, 186,
       205, 488, 387, 276, 326, 193, 550, 240, 184, 125, 475, 610, 603,
       543, 359, 417, 339, 319,  40, 500, 233,  62, 534,  27,  19,  50,
       427,  92, 214, 505, 491, 595, 230,  56, 485, 480, 460, 135, 563,
       323,  23, 556, 249, 286, 391, 334, 151, 165, 159, 587, 601, 486,
       304, 513, 600, 225, 425, 456, 581, 399, 211, 178, 199,  1

In [218]:
top_20_wr = WR.sort_values('pop',ascending=False).head(20)

In [219]:
def get_topK_simiar_movies(movieId, cosine_sim = cosine_sim, K=3):
    idx = indices[movieId]
#     rating = train_set_th4[train_set_th4['movieId']==movieId][train_set_th4['userId']==userId]['rating'].values[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:K+1]
    movie_indices = [i[0] for i in sim_scores]

    return movies_df['movieId'].iloc[movie_indices].values

def get_user_allPos(userId):
    if len(train_set_th4[train_set_th4['userId']==userId])==0:
        return 'Newer'
    return train_set_th4[train_set_th4['userId']==userId]['movieId'].unique()
    
def CPB_TOPK_REC(userId, K=20, k_movie = 10, pop_penalty = 0.2):
    rec = pd.DataFrame(columns=['movieId','score'])
    
    all_pos = get_user_allPos(userId)
    if all_pos == 'Newer': #如果是新来的user,直接推荐top20的电影
        print('--------------NEW user--------',userId)
        for m_top20_pop_id in top_20_wr['movieId'].values:
            rec = rec.append({'movieId': m_top20_pop_id, 'score': 5}, ignore_index=True)
        return rec.head(K)['movieId'].values
    
    for m_watched_id in get_user_allPos(userId):
        for m_simi_id in get_topK_simiar_movies(m_watched_id):
          if m_simi_id not in all_pos:
              simi = get_two_movies_sim(m_simi_id, m_watched_id)
              rating = train_set_th4[train_set_th4['userId']==userId][train_set_th4['movieId']==m_watched_id]['rating'].values[0]
              rec = rec.append({'movieId': m_simi_id, 'score': simi*rating + pop_penalty*np.log(WR[WR['movieId']==m_simi_id]['pop'].values[0]+1)}, ignore_index=True)
    
    # 把TOP20 popular的电影也加入到候选的推荐列表中，计算score = rating*simi_coeff + log(WR+1)
    for m_top20_pop_id in top_20_wr['movieId'].values:
        for m_watched_id in get_user_allPos(userId):
          if m_top20_pop_id not in all_pos:
              simi = get_two_movies_sim(m_top20_pop_id, m_watched_id)
              rating = train_set_th4[train_set_th4['userId']==userId][train_set_th4['movieId']==m_watched_id]['rating'].values[0]
              rec = rec.append({'movieId': m_top20_pop_id, 'score': simi*rating + pop_penalty*np.log(WR[WR['movieId']==m_top20_pop_id]['pop'].values[0]+1)}, ignore_index=True)

    rec = rec.groupby('movieId').max().reset_index() # （只算一次，和相似度最大的那个算）
    
    rec = rec.sort_values('score',ascending=False)
    return rec.head(K)['movieId'].values
#     return rec

In [220]:
true_interact_list = []
uid_ = -1
temp = []

from surprise import Reader, Dataset, SVD

reader = Reader()

train_data = Dataset.load_from_df(train_set_th4[['userId', 'movieId', 'rating']], reader) # data: ratings
test_data = Dataset.load_from_df(test_set_th4[['userId', 'movieId', 'rating']], reader) # data: ratings

trainset = train_data.build_full_trainset()
testset = test_data.build_full_trainset().build_testset()

svd = SVD()
svd.fit(trainset)
predictions = svd.test(testset)

for uid, iid, true_r, est, _ in predictions:
    if uid_ == -1:
        uid_ = uid
    if uid == uid_:
        if true_r >= 4:
            temp.append(iid)
    else:
        true_interact_list.append(temp)
        uid_ = uid
        temp = []
        if true_r >= 4:
            temp.append(iid)
        
true_interact_list.append(temp)

def get_NDCG_with_pred(top_k_pred_list,k):
    r = []
    for i in range(0, len(top_k_pred_list)):
        label = list(map(lambda x: x in true_interact_list[i], top_k_pred_list[i]))
        r.append(label)

    return NDCGatK_r(true_interact_list,r,k)

################################################
def test(K=20, k_movie=10, pop_penalty=0.1):
    pred = []
    for u in test_set_th4['userId'].unique():
        pred.append(list(CPB_TOPK_REC(u,  K=K, k_movie = k_movie, pop_penalty = pop_penalty)))
    
    return pred
#     h = getHit(pred, true_interact_list)
#     ndcg = get_NDCG_with_pred(pred,k=20)
#     print(h)
#     print(ndcg)

In [None]:
pred2 = test()



In [221]:
# computes recall@K and precision@K
def RecallPrecision_ATk(groundTruth, r, k):
    """Computers recall @ k and precision @ k

    Args:
        groundTruth (list): list of lists containing highly rated items of each user
        r (list): list of lists indicating whether each top k item recommended to each user
            is a top k ground truth item or not
        k (intg): determines the top k items to compute precision and recall on

    Returns:
        tuple: recall @ k, precision @ k
    """
        
    num_correct_pred = (np.array(r)).sum(axis=1)  # number of correctly predicted items per user
    # number of items liked by each user in the test set
    user_num_liked = np.array([len(groundTruth[i])
                                  for i in range(len(groundTruth))])
    recall = np.mean(num_correct_pred / user_num_liked)
    precision = np.mean(num_correct_pred) / k
    return recall.item(), precision.item()

def get_Recall_Precision(top_k_pred_list,k):
    r = []
    for i in range(0, len(top_k_pred_list)):
        label = list(map(lambda x: x in true_interact_list[i], top_k_pred_list[i]))
        r.append(label)
    recall, precision = RecallPrecision_ATk(true_interact_list, r, k)
    return recall, precision

# computes NDCG@K
def NDCGatK_r(groundTruth, r, k):
    """Computes Normalized Discounted Cumulative Gain (NDCG) @ k

    Args:
        groundTruth (list): list of lists containing highly rated items of each user
        r (list): list of lists indicating whether each top k item recommended to each user
            is a top k ground truth item or not 即只包括0,1的序列，长度为k；如果第i个item in recommendation list在ground truth里面，那么就为1
        k (int): determines the top k items to compute ndcg on

    Returns:
        float: ndcg @ k
    """
    assert len(r) == len(groundTruth)

    test_matrix = np.zeros((len(r), k))

    for i, items in enumerate(groundTruth):
        length = min(len(items), k)
        test_matrix[i, :length] = 1
    max_r = test_matrix
    idcg = np.sum(max_r * 1. / np.log2(np.arange(2, k + 2)), axis=1)
    dcg = r * (1. / np.log2(np.arange(2, k + 2)))
    dcg = np.sum(dcg, axis=1)
    idcg[idcg == 0.] = 1.
    ndcg = dcg / idcg
    ndcg[np.isnan(ndcg)] = 0.
    return np.mean(ndcg).item()

def get_NDCG_with_pred(top_k_item,k):
    top_k_pred_list = []
    for uid, user_ratings in top_k_item.items():
        top_k_pred_list.append([iid for (iid, _) in user_ratings])
    
    r = []
#     print(top_k_pred_list)
    for i in range(0, len(top_k_pred_list)):
        label = list(map(lambda x: x in true_interact_list[i], top_k_pred_list[i]))
        r.append(label)
#     print(len(top_k_pred_list))
#     print(len(r))
#     print(len(r[0]))
#     print(r[0])
    
#     return r
    return NDCGatK_r(true_interact_list,r,k)

# hit

def getHit(top_n_pred_list, true_interact_list):
    assert len(top_n_pred_list) == len(true_interact_list)
    
    N = len(top_n_pred_list)
    sum = 0
    for i in range(0, N):
        if len(set(top_n_pred_list[i])& set(true_interact_list[i])) != 0 :
            sum +=1
            
    
    return sum/N 

def get_hit_with_pred(top_k_item):
    top_k_pred_list = []
    for uid, user_ratings in top_k_item.items():
        top_k_pred_list.append([iid for (iid, _) in user_ratings])
        
    return getHit(top_k_pred_list,true_interact_list)

In [None]:
print('hit',getHit(pred2,true_interact_list))
print('ndcg:',get_NDCG_with_pred(pred2,k=20))
print('recall:',get_Recall_Precision(pred2,k=20)[0])
print('precision:',get_Recall_Precision(pred2,k=20)[1])