# 一、导包取数

In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

from sklearn.metrics.pairwise import pairwise_distances
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from math import sqrt

import warnings
warnings.filterwarnings('ignore')

In [5]:
pd.set_option('display.max_columns', None)

In [6]:
movie_metadata = pd.read_csv('data/movies_metadata.csv')
print(movie_metadata.shape)
ratings_small = pd.read_csv('data/ratings_small.csv')
print(ratings_small.shape)

(45466, 24)
(100004, 4)


In [7]:
movie_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [8]:
ratings_small.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

# 二、数据分析处理

### 2.1、获取指定数据列，并合并电影数据和评论数据

In [9]:
movie_df = movie_metadata[['id','title']]
rated_df = ratings_small[['userId', 'movieId', 'rating']]

In [10]:
# 转为数值
movie_df['id'] = pd.to_numeric(movie_df['id'], errors='coerce')
rated_df['movieId'] = pd.to_numeric(rated_df['movieId'], errors='coerce')

In [11]:
# 删除有空值的行
movie_df.dropna(axis=0,subset=['id'], inplace=True)
print(movie_df.shape)
movie_df.drop_duplicates(subset=['id'], inplace=True)
print(movie_df.shape)

(45463, 2)
(45433, 2)


In [12]:
# 使用join合并，合并后的数据条数为两个df的和，需要去除userId为空的数据。
join_data = movie_df.set_index('id').join(rated_df.set_index('movieId'))
print(join_data.shape)
join_data.dropna(axis=0,subset=['userId'], inplace=True)
print(join_data.shape)

(87592, 3)
(44989, 3)


In [13]:
# 使用merge合并，合并后的数据无需剔除
merge_df = movie_df.merge(rated_df, left_on='id', right_on='movieId')
print(join_data.shape)

(44989, 3)


In [14]:
# 统计每个电影的评论的个数
rating_count = join_data.groupby('title')['rating'].count()
all_data = join_data.join(rating_count, on='title', rsuffix='_count').reset_index()

## 2.2、筛选数据

In [15]:
# 查看评论数量的分位数：有21%的电影评论数大于20 
rating_count.quantile(np.arange(0.6,1,0.01))

0.60      7.00
0.61      7.00
0.62      7.00
0.63      8.00
0.64      8.00
0.65      9.00
0.66      9.00
0.67     10.00
0.68     10.00
0.69     11.00
0.70     12.00
0.71     12.00
0.72     13.00
0.73     14.00
0.74     14.00
0.75     15.75
0.76     17.00
0.77     18.00
0.78     19.00
0.79     20.00
0.80     21.00
0.81     22.33
0.82     24.00
0.83     26.00
0.84     27.00
0.85     29.00
0.86     31.00
0.87     34.00
0.88     37.00
0.89     41.77
0.90     45.00
0.91     49.00
0.92     52.56
0.93     59.00
0.94     64.42
0.95     71.00
0.96     83.28
0.97     98.21
0.98    119.14
0.99    168.49
Name: rating, dtype: float64

In [16]:
# 选择评论数量大于等于20的电影
movie_top = all_data.query('rating_count > 20')
movie_top.shape

(34552, 5)

In [17]:
# movie_top.isnull().sum()
movie_top = movie_top.drop_duplicates(['userId', 'title'])
movie_top.duplicated(['userId', 'title']).sum()

0

In [18]:
# 重塑数据，并将有效评论置为1，无效评论置为0
df_apriori = movie_top.pivot(index='userId', columns='title', values='rating').fillna(0)
df_apriori = df_apriori.applymap(lambda x: x if x<=0 else 1)
df_apriori.shape

(671, 580)

# 三、关联规则推荐

In [19]:
# 挖掘频繁项集
frequent_itemsets = apriori(df_apriori, min_support=0.1, use_colnames=True)
# frequent_itemsets.sort_values('support', ascending=False)
# 关联规则过滤
results = association_rules(frequent_itemsets, metric='lift', min_threshold=1)
# results.sort_values('lift', ascending=False)

results['antecedent_len'] = results['antecedents'].apply(lambda x: len(x))
results['consequent_len'] = results['consequents'].apply(lambda x: len(x))
results.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len
0,(5 Card Stud),(48 Hrs.),0.298063,0.298063,0.108793,0.365,1.224575,0.019952,1.105413,1,1
1,(48 Hrs.),(5 Card Stud),0.298063,0.298063,0.108793,0.365,1.224575,0.019952,1.105413,1,1
2,(A Clockwork Orange),(48 Hrs.),0.152012,0.298063,0.102832,0.676471,2.269559,0.057523,2.169625,1,1
3,(48 Hrs.),(A Clockwork Orange),0.298063,0.152012,0.102832,0.345,2.269559,0.057523,1.294638,1,1
4,(A Nightmare on Elm Street),(48 Hrs.),0.268256,0.298063,0.156483,0.583333,1.957083,0.076526,1.68465,1,1


In [20]:
# 根据用户的观看记录'Batman Returns'推荐多部电影
results_len1 = results.query('antecedent_len == 1')
results_len1['is_Batman_Returns'] = results_len1['antecedents'].map(lambda x: 1 if list(x)[0]=='Batman Returns' else 0)
df_recomendation_list = results_len1.query('is_Batman_Returns == 1').sort_values('lift', ascending=False)
df_recomendation_list.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len,is_Batman_Returns
63984,(Batman Returns),"(Silent Hill, The Hours, Reservoir Dogs, Monso...",0.298063,0.107303,0.102832,0.345,3.215208,0.070849,1.362897,1,4,1
36085,(Batman Returns),"(Silent Hill, Wag the Dog, Reservoir Dogs)",0.298063,0.105812,0.101341,0.34,3.213239,0.069803,1.35483,1,3,1
63893,(Batman Returns),"(Silent Hill, Sissi, Reservoir Dogs, Monsoon W...",0.298063,0.107303,0.101341,0.34,3.168611,0.069358,1.352572,1,4,1
63353,(Batman Returns),"(Silent Hill, Rain Man, Reservoir Dogs, Monsoo...",0.298063,0.107303,0.101341,0.34,3.168611,0.069358,1.352572,1,4,1
36016,(Batman Returns),"(Silent Hill, The Hours, Reservoir Dogs)",0.298063,0.116244,0.108793,0.365,3.139936,0.074145,1.391741,1,3,1


In [21]:
# 根据用户的观看记录'Batman Returns'推荐单部电影
df_recomendation_one = results_len1.query('is_Batman_Returns==1 and consequent_len==1').sort_values('lift', ascending=False)
df_recomendation_one.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,antecedent_len,consequent_len,is_Batman_Returns
628,(Batman Returns),(Reservoir Dogs),0.298063,0.228018,0.177347,0.595,2.609444,0.109384,1.906129,1,1,1
407,(Batman Returns),(Ariel),0.298063,0.159463,0.120715,0.405,2.539766,0.073185,1.412666,1,1,1
668,(Batman Returns),(Wag the Dog),0.298063,0.19225,0.14307,0.48,2.496744,0.085767,1.553365,1,1,1
664,(Batman Returns),(To Kill a Mockingbird),0.298063,0.262295,0.193741,0.65,2.478125,0.11556,2.107728,1,1,1
631,(Batman Returns),(Romeo + Juliet),0.298063,0.163934,0.120715,0.405,2.4705,0.071853,1.405152,1,1,1


# 四、协同过滤推荐（基于用户）

## 4.1、数据准备

In [22]:
data_small = pd.read_csv('data/ratings_small.csv')
print(data_small.shape)
data_small.head()

(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [23]:
# 为movie重新分配id
movie_id = pd.DataFrame(data_small['movieId'].unique())
movie_id['movidid'] = range(len(movie_id))
movie_id.set_index(0, inplace=True)

data_small = data_small.join(movie_id, on='movieId', how='left')
data_small.drop(['movieId'], axis=1, inplace=True)
data_small.head()

Unnamed: 0,userId,rating,timestamp,movidid
0,1,2.5,1260759144,0
1,1,3.0,1260759179,1
2,1,3.0,1260759182,2
3,1,2.0,1260759185,3
4,1,4.0,1260759205,4


In [24]:
# 获取用户和movie信息
users = data_small['userId'].nunique()
movies = data_small['movidid'].nunique()
print(users, movies)

671 9066


In [67]:
# 构建用户和movie矩阵——训练数据
user_item_matrix = np.zeros((users, movies))
print(len(user_item_matrix))

671


671

In [26]:
# 拆分数据集
train_data, test_data = train_test_split(data_small, test_size=0.3)
print(train_data.shape, test_data.shape)

(70002, 4) (30002, 4)


In [68]:
# 将用户评分填写到矩阵中——训练数据
for line in train_data.itertuples():
    user_item_matrix[line[1]-1, line[4]] = line[2]   # 因为userId是从1开始的，所以要减1
print(user_item_matrix.shape)

(671, 9066)


(671, 9066)

## 4.2、训练

In [28]:
# 计算用户相似度
# 当前矩阵的行向量表示的是用户，每一行表示为每个用户的向量
# 在使用pairwise_distances计算距离时，是计算每个用户向量之间的距离
# 可以用类似的方法来计算每个电影向量之间的距离（需要在构建矩阵时注意行和列的设置）
user_similarity = pairwise_distances(user_item_matrix, metric='cosine')
user_similarity.shape

(671, 671)

In [29]:
# 查看数据分位数
user_similarity_k = np.triu(user_similarity,k=1)
user_similarity_k_non = np.round(user_similarity_k[user_similarity_k.nonzero()],3)
np.percentile(user_similarity_k_non, [0, 25, 50, 75, 100])

array([0.347, 0.898, 0.947, 0.985, 1.   ])

In [64]:
# 预测用户评分
mean_movie_rating = np.mean(user_item_matrix, axis=1)
rating_diff = user_item_matrix - mean_movie_rating[:, np.newaxis]

# 预测公式
user_precdiction = mean_movie_rating[:,np.newaxis] + user_similarity.dot(rating_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

In [66]:
# 进行结果评估
user_precdiction_notnon = user_precdiction[user_item_matrix.nonzero()]
user_actial_notnon = user_item_matrix[user_item_matrix.nonzero()]


# 计算均方误差
erro_train = sqrt(mean_squared_error(user_actial_notnon, user_precdiction_notnon))
erro_train


3.3871491574989117

## 4.3、预测

In [76]:

# 构建用户和movie矩阵——测试数据
user_item_matrix_test = np.zeros((users, movies))
len(user_item_matrix_test)
# 将用户评分填写到矩阵中——测试数据
for line in test_data.itertuples():
    user_item_matrix_test[line[1]-1, line[4]] = line[2]   # 因为userId是从1开始的，所以要减1
user_item_matrix_test.shape

(671, 9066)

In [77]:
rating_diff_test = user_item_matrix_test - mean_movie_rating[:, np.newaxis]
# 预测公式
user_precdiction_test = mean_movie_rating[:,np.newaxis] + user_similarity.dot(rating_diff_test) / np.array([np.abs(user_similarity).sum(axis=1)]).T

In [78]:
# 进行结果评估
user_precdiction_notnon_test = user_precdiction_test[user_item_matrix_test.nonzero()]
print(user_precdiction_notnon_test.shape)
user_actial_notnon_test = user_item_matrix_test[user_item_matrix_test.nonzero()]
print(user_actial_notnon_test.shape)

# 计算均方误差
erro_train_test = sqrt(mean_squared_error(user_precdiction_notnon_test, user_actial_notnon_test))
erro_train_test

(30002,)
(30002,)


3.5180228632464368

## 4.4、协同过滤代码整合

In [None]:
data_small = pd.read_csv('data/ratings_small.csv')

# 为movie重新分配id
movie_id = pd.DataFrame(data_small['movieId'].unique())
movie_id['movidid'] = range(len(movie_id))
movie_id.set_index(0, inplace=True)
data_small = data_small.join(movie_id, on='movieId', how='left')
data_small.drop(['movieId'], axis=1, inplace=True)

# 获取用户和movie信息
users = data_small['userId'].nunique()
movies = data_small['movidid'].nunique()

# 拆分数据集
train_data, test_data = train_test_split(data_small, test_size=0.3)


def get_matrix(data):
    matrix = np.zeros((users, movies))
    for line in data.itertuples():
        matrix[line[1]-1, line[4]] = line[2]   # 因为userId是从1开始的，所以要减1
    return matrix


mean_movie_rating = np.mean(user_item_matrix, axis=1)
# 计算用户相似度
user_similarity = pairwise_distances(user_item_matrix, metric='cosine')
def train_model(metrix):
    # 预测用户评分
    rating_diff = metrix - mean_movie_rating[:, np.newaxis]
    # 预测公式
    user_precdiction = mean_movie_rating[:,np.newaxis] + user_similarity.dot(rating_diff) / np.array([np.abs(user_similarity).sum(axis=1)]).T

    return user_precdiction


def assess(user_precdiction):
    user_precdiction_notnon = user_precdiction[user_item_matrix.nonzero()]
    user_actial_notnon = user_item_matrix[user_item_matrix.nonzero()]
    # 计算均方误差
    erro_train = sqrt(mean_squared_error(user_actial_notnon, user_precdiction_notnon))
    
    return erro_train

# 训练
user_item_matrix = get_matrix(train_data)   # 构建矩阵
user_precdiction = train_model(user_item_matrix)   # 计算预测结果
erro_train = assess(user_precdiction)   # 评估

# 预测
user_item_matrix_test = get_matrix(test_data)   # 构建矩阵
user_precdiction_test = train_model(user_item_matrix_test)   # 计算预测结果
erro_train_test = assess(user_precdiction_test)   # 评估