### **1. Import Libraries**

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import ast
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings
warnings.simplefilter('ignore')

### **2. Load Dataset**

**Small MovieLens Dataset**： 631个用户对9,742部电影的评价数据,共包含100,836个评分与3683个标签.

In [43]:
movies = pd.read_csv('./input_data/small/movies.csv')
tags = pd.read_csv('./input_data/small/tags.csv')
links = pd.read_csv('./input_data/small/links.csv')
ratings = pd.read_csv('./input_data/small/ratings.csv')

### **3. Understand Dataset**

#### movies Dataframe

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [5]:
movies.shape

(9742, 3)

In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
title      9742 non-null object
genres     9742 non-null object
dtypes: int64(1), object(2)
memory usage: 228.4+ KB


#### Tags Dataframe

In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
tags.columns

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [9]:
tags.shape

(3683, 4)

In [10]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
userId       3683 non-null int64
movieId      3683 non-null int64
tag          3683 non-null object
timestamp    3683 non-null int64
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


#### Ratings Dataframe

In [11]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [12]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [13]:
ratings.shape

(100836, 4)

In [14]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


#### Links Datafram

In [15]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [16]:
links.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [17]:
links.shape

(9742, 3)

In [18]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
imdbId     9742 non-null int64
tmdbId     9734 non-null float64
dtypes: float64(1), int64(2)
memory usage: 228.4 KB


### **4. Build Recommendation System**

#### 4.1 Simple Recommendation System

In [44]:
def reset_title(x):
    if ',' in x:
        return ' '.join(x[:-7].split(',')[::-1]).strip()
    else:
        return x[:-7]

In [45]:
# movies表中title列格式为：电影名字(上映年份)，将年份拆出新增year列
movies['year'] = movies['title'].apply(lambda x: x[-5:-1])
# 由于数据集本身原因,部分电影名字出现词语顺序错误,如‘Shining, The’
movies['title'] = movies['title'].apply(reset_title)
# 将genres转换为list格式
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

In [46]:
movies[movies['movieId'] == 1258]

Unnamed: 0,movieId,title,genres,year
957,1258,The Shining,[Horror],1980


In [47]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [57]:
# 计算每部电影的平均评分(average_raing),其等于总分数(sum_rating)/评分人数(num_rating)
# trick: 当一部电影的评分人数过少时,其平均评分并不客观,因此,只考虑推荐评分人数超过50人的电影
average_rating = dict()
rate_count = dict()
for movieId in set(ratings['movieId']):
    sum_rating = sum(ratings[ratings['movieId'] == movieId]['rating'])
    num_rating = len(ratings[ratings['movieId'] == movieId]['rating'])
    average_rating[movieId] = round(sum_rating / num_rating, 2)
    rate_count[movieId] = num_rating

In [58]:
# 建表统计每部电影的被评次数,表列名columns = ['movieId', 'vote_count']
movie_rate_count = pd.DataFrame(rate_count, index=[0]).transpose().reset_index()
movie_rate_count.columns = ['movieId', 'rate_count']
# 建立电影平均评分表,其列名columns = ['movieId', 'average_rating']
movie_average_rating = pd.DataFrame(average_rating, index=[0]).transpose().reset_index()
movie_average_rating.columns = ['movieId', 'average_rating']

In [59]:
# 表格合并,采用默认连接方式：how = inner
movie_ratings = pd.merge(movie_rate_count,movie_average_rating,on='movieId')
df = pd.merge(movies,movie_ratings,on='movieId')

In [60]:
df.head()

Unnamed: 0,movieId,title,genres,year,rate_count,average_rating
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215,3.92
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,110,3.43
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,52,3.26
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,7,2.36
4,5,Father of the Bride Part II,[Comedy],1995,49,3.07


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9724 entries, 0 to 9723
Data columns (total 6 columns):
movieId           9724 non-null int64
title             9724 non-null object
genres            9724 non-null object
year              9724 non-null object
rate_count        9724 non-null int64
average_rating    9724 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 531.8+ KB


**评分权重**
- 当两部电影平均得分一样时,该推荐哪一部呢？评分权重可以解决这样的问题.
- 评分权重综合衡量了每部电影的平均评分以及被评次数,其计算公式为:  
&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;$Weighted\ Rating = \frac{v}{v+m}\cdot r+\frac{m}{v+m}\cdot c$  
  * $v$ 为每部电影被评次数,即 df 表格中的 rate_count  
  * $m$ 为每部电影至少被评 $m$ 次才会被推荐  
  * $r$ 为每部电影的平均评分,即 df 表格中的 average_rating  
  * $c$ 为所有电影的平均评分
- 如何决定合适的 $m$ 值呢？由于相当数量的电影被评次数为1, 因此我们这里认为每部电影的被评次数超过80%电影的被评次数时,该电影才有可能会被推荐,即选取所有电影被评次数的80%分位为 $m$ 值.

In [67]:
m = df['rate_count'].quantile(0.8)
c = df['average_rating'].mean()
c, m

(3.2624598930481303, 12.0)

In [68]:
def weighted_rating(x):
    v = x['rate_count']
    r = x['average_rating']
    return (v / (v + m) * r + m / (v + m) * c)

In [69]:
df['weighted_rating'] = df.apply(weighted_rating, axis = 1)

In [70]:
df = df.sort_values('weighted_rating', ascending=False)

In [71]:
df.head()

Unnamed: 0,movieId,title,genres,year,rate_count,average_rating,weighted_rating
277,318,The Shawshank Redemption,"[Crime, Drama]",1994,317,4.43,4.387415
659,858,The Godfather,"[Crime, Drama]",1972,192,4.29,4.229556
2224,2959,Fight Club,"[Action, Crime, Drama, Thriller]",1999,218,4.27,4.217433
224,260,Star Wars: Episode IV - A New Hope,"[Action, Adventure, Sci-Fi]",1977,251,4.23,4.185854
46,50,The Usual Suspects,"[Crime, Mystery, Thriller]",1995,204,4.24,4.185692


In [72]:
# 将df表格中的genres进行拆分, 缩小电影分类粒度
'''
>>> s
    a  b
one 1  2
two 3  4
>>> s.stack()
one a  1
    b  2
two c  3
    d  4
'''
series = df.apply(lambda x: pd.Series(x['genres']), axis=1
                 ).stack().reset_index(level=1, drop=True)
series.name = 'genre'
recommendation = df.drop('genres', axis=1).join(series)

In [73]:
recommendation.head(5)

Unnamed: 0,movieId,title,year,rate_count,average_rating,weighted_rating,genre
0,1,Toy Story,1995,215,3.92,3.88524,Adventure
0,1,Toy Story,1995,215,3.92,3.88524,Animation
0,1,Toy Story,1995,215,3.92,3.88524,Children
0,1,Toy Story,1995,215,3.92,3.88524,Comedy
0,1,Toy Story,1995,215,3.92,3.88524,Fantasy


In [74]:
def rating_recommendation_engine(genre):
    df = recommendation[recommendation['genre'] == genre]
    movies_recommended = df.sort_values('weighted_rating', ascending=False
                                       ).drop('genre', axis=1).head(250)
    return movies_recommended

In [75]:
genres_option = ['Action','Adventure','Animation','Children','Comedy',
                 'Crime','Documentary','Drama','Fantasy','Horror',
                 'Mystery','Romance','Sci-Fic','Thriller'
                 'War','Wertern']
genre = 'Horror'
movies_recommended = rating_recommendation_engine(genre)
movies_recommended.head(10)

Unnamed: 0,movieId,title,year,rate_count,average_rating,weighted_rating
510,593,The Silence of the Lambs,1991,279,4.16,4.122988
956,1258,The Shining,1980,109,4.08,3.998922
919,1219,Psycho,1960,83,4.04,3.941784
1066,1387,Jaws,1975,91,4.01,3.922908
1615,2160,Rosemary's Baby,1968,32,4.17,3.922489
914,1214,Alien,1979,146,3.97,3.916263
5324,8874,Shaun of the Dead,2004,77,4.01,3.909208
901,1200,Aliens,1986,126,3.96,3.899344
915,1215,Army of Darkness,1993,51,4.04,3.891897
2077,2762,The Sixth Sense,1999,179,3.89,3.850573


In [76]:
genre = 'Sci-Fi'
movies_recommended = rating_recommendation_engine(genre)
movies_recommended.head(10)

Unnamed: 0,movieId,title,year,rate_count,average_rating,weighted_rating
224,260,Star Wars: Episode IV - A New Hope,1977,251,4.23,4.185854
897,1196,Star Wars: Episode V - The Empire Strikes Back,1980,211,4.22,4.168473
1938,2571,The Matrix,1999,278,4.19,4.151619
910,1210,Star Wars: Episode VI - Return of the Jedi,1983,196,4.14,4.089373
4900,7361,Eternal Sunshine of the Spotless Mind,2004,131,4.16,4.084682
474,541,Blade Runner,1982,124,4.1,4.026099
900,1199,Brazil,1985,59,4.18,4.024923
7355,79132,Inception,2010,143,4.07,4.007481
968,1270,Back to the Future,1985,171,4.04,3.989014
6755,60069,WALL·E,2008,104,4.06,3.977496


In [77]:
genre = 'Action'
movies_recommended = rating_recommendation_engine(genre)
movies_recommended.head(10)

Unnamed: 0,movieId,title,year,rate_count,average_rating,weighted_rating
2224,2959,Fight Club,1999,218,4.27,4.217433
224,260,Star Wars: Episode IV - A New Hope,1977,251,4.23,4.185854
897,1196,Star Wars: Episode V - The Empire Strikes Back,1980,211,4.22,4.168473
6693,58559,The Dark Knight,2008,149,4.24,4.16714
899,1198,Raiders of the Lost Ark (Indiana Jones and the...,1981,200,4.21,4.156366
898,1197,The Princess Bride,1987,142,4.23,4.154607
1938,2571,The Matrix,1999,278,4.19,4.151619
908,1208,Apocalypse Now,1979,107,4.22,4.123441
1502,2028,Saving Private Ryan,1998,188,4.15,4.096748
910,1210,Star Wars: Episode VI - Return of the Jedi,1983,196,4.14,4.089373


In [78]:
genre = 'Romance'
movies_recommended = rating_recommendation_engine(genre)
movies_recommended.head(10)

Unnamed: 0,movieId,title,year,rate_count,average_rating,weighted_rating
898,1197,The Princess Bride,1987,142,4.23,4.154607
694,912,Casablanca,1942,100,4.24,4.135264
314,356,Forrest Gump,1994,329,4.16,4.128415
3617,4973,Le) Amelie (Fabuleux destin d'Amélie Poulain,2001,120,4.18,4.096587
4900,7361,Eternal Sunshine of the Spotless Mind,2004,131,4.16,4.084682
1729,2324,Life Is Beautiful (La Vita è bella),1997,88,4.15,4.043495
690,908,North by Northwest,1959,57,4.18,4.020428
1283,1704,Good Will Hunting,1997,141,4.08,4.015879
2144,2858,American Beauty,1999,204,4.06,4.015692
680,898,The Philadelphia Story,1940,29,4.31,4.003403
