### **1. Import Libraries**

In [73]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.stem.snowball import SnowballStemmer
import warnings
warnings.simplefilter('ignore')

### **2. Load Dataset**

**Small MovieLens Dataset**： 631个用户对9,742部电影的评价数据,共包含100,836个评分与3683个标签.

In [240]:
movies = pd.read_csv('./input_data/small/movies.csv')
tags = pd.read_csv('./input_data/small/tags.csv')
links = pd.read_csv('./input_data/small/links.csv')
ratings = pd.read_csv('./input_data/small/ratings.csv')

### **3. Understand Dataset**

#### movies Dataframe

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [5]:
movies.shape

(9742, 3)

In [6]:
movies[movies.isnull().values == True]

Unnamed: 0,movieId,title,genres


In [6]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
title      9742 non-null object
genres     9742 non-null object
dtypes: int64(1), object(2)
memory usage: 228.4+ KB


#### tags Dataframe

In [7]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [8]:
tags.columns

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [9]:
tags.shape

(3683, 4)

In [10]:
tags[tags.isnull().values == True]

Unnamed: 0,userId,movieId,tag,timestamp


In [11]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
userId       3683 non-null int64
movieId      3683 non-null int64
tag          3683 non-null object
timestamp    3683 non-null int64
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


#### ratings Dataframe

In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [14]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [15]:
ratings.shape

(100836, 4)

In [16]:
ratings[ratings.isnull().values == True]

Unnamed: 0,userId,movieId,rating,timestamp


In [17]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


#### links Datafram

In [18]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [19]:
links.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [20]:
links.shape

(9742, 3)

In [22]:
links[links.isnull().values == True]

Unnamed: 0,movieId,imdbId,tmdbId
624,791,113610,
843,1107,102336,
2141,2851,81454,
3027,4051,56600,
5532,26587,92337,
5854,32600,377059,
6059,40697,105946,
7382,79299,874957,


In [23]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
imdbId     9742 non-null int64
tmdbId     9734 non-null float64
dtypes: float64(1), int64(2)
memory usage: 228.4 KB


### **4. Build Recommendation System**

In [241]:
# 剔除tmdbIb值为NaN的电影
links = links[links['tmdbId'].notnull()]['movieId']
movies = movies[movies['movieId'].isin(links)]

In [242]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9734 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9734 non-null int64
title      9734 non-null object
genres     9734 non-null object
dtypes: int64(1), object(2)
memory usage: 304.2+ KB


In [243]:
def reset_title(x):
    if ',' in x:
        return ' '.join(x[:-7].split(',')[::-1]).strip()
    else:
        return x[:-7]

In [244]:
# movies表中title列格式为：电影名字(上映年份)，将年份拆出新增year列
movies['year'] = movies['title'].apply(lambda x: x[-5:-1])
# 由于数据集本身原因,部分电影名字出现词语顺序错误,如‘Shining, The’
movies['title'] = movies['title'].apply(reset_title)
# 将genres转换为list格式
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

In [245]:
movies[movies['movieId'] == 1258]

Unnamed: 0,movieId,title,genres,year
957,1258,The Shining,[Horror],1980


In [246]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


- 计算每部电影的平均评分(average_raing),其等于总分数(sum_rating) $\div$ 被评次数(rate_count).
- trick: 当一部电影被评次数过少时,其平均评分并不客观,因此,只考虑推荐被评次数超过m的电影.
- 如何决定合适的 $m$ 值呢？由于相当数量的电影被评次数为1, 因此我们这里认为每部电影的被评次数超过80%电影的被评次数时,该电影才有可能会被推荐,即选取所有电影被评次数的80%分位为 $m$ 值.

In [247]:
average_rating = dict()
rate_count = dict()
for movieId in set(ratings['movieId']):
    sum_rating = sum(ratings[ratings['movieId'] == movieId]['rating'])
    num_rating = len(ratings[ratings['movieId'] == movieId]['rating'])
    average_rating[movieId] = round(sum_rating / num_rating, 2)
    rate_count[movieId] = num_rating

In [248]:
# 建表统计每部电影的被评次数,表列名columns = ['movieId', 'vote_count']
movie_rate_count = pd.DataFrame(rate_count, index=[0]).transpose().reset_index()
movie_rate_count.columns = ['movieId', 'rate_count']
# 建立电影平均评分表,其列名columns = ['movieId', 'average_rating']
movie_average_rating = pd.DataFrame(average_rating, index=[0]).transpose().reset_index()
movie_average_rating.columns = ['movieId', 'average_rating']

In [249]:
# 表格合并,采用默认连接方式：how = inner
movie_ratings = pd.merge(movie_rate_count,movie_average_rating,on='movieId')
df = pd.merge(movies,movie_ratings,on='movieId')

In [252]:
m = df['rate_count'].quantile(0.95)
df = df[df['rate_count'] >= 47]
df.head()

Unnamed: 0,movieId,title,genres,year,rate_count,average_rating
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215,3.92
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,110,3.43
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,52,3.26
4,5,Father of the Bride Part II,[Comedy],1995,49,3.07
5,6,Heat,"[Action, Crime, Thriller]",1995,102,3.95


In [253]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 491 entries, 0 to 8853
Data columns (total 6 columns):
movieId           491 non-null int64
title             491 non-null object
genres            491 non-null object
year              491 non-null object
rate_count        491 non-null int64
average_rating    491 non-null float64
dtypes: float64(1), int64(2), object(3)
memory usage: 26.9+ KB


In [254]:
# 将不同形式的标签词语统一转换为其次干形式, 如['dogs','Dog','Dogs','dog'],为同一个词dog
stemmer = SnowballStemmer('english')
s = tags[['movieId', 'tag']]

In [255]:
# 将属于同一部电影的标签提取出来并集中
def tag_extraction(df):
    d = dict()
    for movieId in set(df['movieId']):
        tag = df[df['movieId'] == movieId]['tag']
        d[movieId] = str(list(set([stemmer.stem(i) for i in tag])))
    return d

In [256]:
movie_tags = pd.DataFrame(tag_extraction(s), index=[0]).transpose().reset_index()
movie_tags.columns = ['movieId', 'tag']
movie_tags['tag'] = movie_tags['tag'].apply(literal_eval)

In [257]:
movie_tags.head()

Unnamed: 0,movieId,tag
0,1,"[fun, pixar]"
1,2,"[fantasi, magic board gam, robin william, game]"
2,3,"[moldi, old]"
3,122882,"[cinematographi, beauti, visually app]"
4,5,"[pregnanc, remak]"


In [258]:
df2 = pd.merge(df, movie_tags, how = 'left',on = 'movieId')

In [259]:
df2.head()

Unnamed: 0,movieId,title,genres,year,rate_count,average_rating,tag
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215,3.92,"[fun, pixar]"
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,110,3.43,"[fantasi, magic board gam, robin william, game]"
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,52,3.26,"[moldi, old]"
3,5,Father of the Bride Part II,[Comedy],1995,49,3.07,"[pregnanc, remak]"
4,6,Heat,"[Action, Crime, Thriller]",1995,102,3.95,


In [260]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 491 entries, 0 to 490
Data columns (total 7 columns):
movieId           491 non-null int64
title             491 non-null object
genres            491 non-null object
year              491 non-null object
rate_count        491 non-null int64
average_rating    491 non-null float64
tag               334 non-null object
dtypes: float64(1), int64(2), object(4)
memory usage: 30.7+ KB


In [261]:
df2['tag'] = df2['tag'].apply(lambda x: [] if str(x) == 'nan' else x)
df2['description'] = df2['genres'] + df2['tag']
df2['description'] = df2['description'].apply(lambda x: ' '.join(x))

In [262]:
df2.head()

Unnamed: 0,movieId,title,genres,year,rate_count,average_rating,tag,description
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,215,3.92,"[fun, pixar]",Adventure Animation Children Comedy Fantasy fu...
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,110,3.43,"[fantasi, magic board gam, robin william, game]",Adventure Children Fantasy fantasi magic board...
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,52,3.26,"[moldi, old]",Comedy Romance moldi old
3,5,Father of the Bride Part II,[Comedy],1995,49,3.07,"[pregnanc, remak]",Comedy pregnanc remak
4,6,Heat,"[Action, Crime, Thriller]",1995,102,3.95,[],Action Crime Thriller


In [263]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1,2),min_df=0,stop_words='english')
tfidf_matrix = tf.fit_transform(df2['description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [264]:
titles = df2['title']
indices = pd.Series(df2.index, index=df2['title'])

In [265]:
indices.head()

title
Toy Story                      0
Jumanji                        1
Grumpier Old Men               2
Father of the Bride Part II    3
Heat                           4
dtype: int64

In [266]:
def get_recommendation(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [269]:
get_recommendation('The Dark Knight').head(10)

194                Batman Returns
326            Gone in 60 Seconds
346                          Blow
382             Gangs of New York
452        No Country for Old Men
425                 Batman Begins
102                        Batman
302                       RoboCop
4                            Heat
34     Die Hard: With a Vengeance
Name: title, dtype: object

In [267]:
get_recommendation('Inception').head(10)

302                   RoboCop
432            V for Vendetta
387       The Matrix Reloaded
397    The Matrix Revolutions
471                Iron Man 2
372           Minority Report
465                District 9
358              Donnie Darko
36            Johnny Mnemonic
146                 The Abyss
Name: title, dtype: object