### **1. Import Libraries**

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import ast
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings
warnings.simplefilter('ignore')

### **2. Load Dataset**

**Small MovieLens Dataset**： 631个用户对9,742部电影的评价数据,共包含100,836个评分与3683个标签.

In [359]:
movies = pd.read_csv('./input_data/small/movies.csv')
tags = pd.read_csv('./input_data/small/tags.csv')
links = pd.read_csv('./input_data/small/links.csv')
ratings = pd.read_csv('./input_data/small/ratings.csv')

### **3. Understand Dataset**

#### movies Dataframe

In [108]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [109]:
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [110]:
movies.shape

(9742, 3)

In [111]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
title      9742 non-null object
genres     9742 non-null object
dtypes: int64(1), object(2)
memory usage: 228.4+ KB


#### Tags Dataframe

In [112]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [113]:
tags.columns

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [114]:
tags.shape

(3683, 4)

In [115]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
userId       3683 non-null int64
movieId      3683 non-null int64
tag          3683 non-null object
timestamp    3683 non-null int64
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


#### Ratings Dataframe

In [116]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [117]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [118]:
ratings.shape

(100836, 4)

In [119]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


#### Links Datafram

In [120]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [121]:
links.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

In [122]:
links.shape

(9742, 3)

In [123]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
imdbId     9742 non-null int64
tmdbId     9734 non-null float64
dtypes: float64(1), int64(2)
memory usage: 228.4 KB


### **4. Build Recommendation System**

#### 4.1 Simple Recommendation System

In [360]:
# movies表中title列格式为：电影名字(上映年份)，将年份拆出新增year列
movies['year'] = movies['title'].apply(lambda x: x[-5:-1])
# 由于数据集本身原因,部分电影名字出现词语顺序错误,如‘Shining, The’
movies['title'] = movies['title'].apply(lambda x: 
    ' '.join(x[:-7].split(',')[::-1]).lstrip() if ',' in x else x[:-7])
# 将genres转换为list格式
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))

In [364]:
movies[movies['movieId'] == 1258]

Unnamed: 0,movieId,title,genres,year
957,1258,The Shining,[Horror],1980


In [365]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [366]:
# 计算每部电影的平均评分(average_raing),其等于总分数(sum_rating)/评分人数(num_rating)
# trick: 当一部电影的评分人数过少时,其平均评分并不客观,因此,只考虑推荐评分人数超过50人的电影
average_rating = dict()
for movieId in set(ratings['movieId']):
    sum_rating = sum(ratings[ratings['movieId'] == movieId]['rating'])
    num_rating = len(ratings[ratings['movieId'] == movieId]['rating'])
    if num_rating > 50:
        average_rating[movieId] = round(sum_rating / num_rating, 2)

In [367]:
# 建立电影平均评分表,其列名columns = ['movieId', 'average_rating']
movie_average_rating = pd.DataFrame(average_rating, index=range(1)).transpose()
movie_average_rating.columns = ['average_rating']
movie_average_rating['movieId'] = movie_average_rating.index
movie_average_rating.index = list(range(len(movie_average_rating)))
movie_average_rating = movie_average_rating[['movieId', 'average_rating']]

In [368]:
movie_average_rating.head()

Unnamed: 0,movieId,average_rating
0,1,3.92
1,2,3.43
2,3,3.26
3,6,3.95
4,7,3.19


In [369]:
# 合并movies与movie_average_rating两个表格, 连接方式：inner, 合并键：movieId
df = pd.merge(movies, movie_average_rating, on='movieId')

In [370]:
df.head()

Unnamed: 0,movieId,title,genres,year,average_rating
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,3.92
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,3.43
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,3.26
3,6,Heat,"[Action, Crime, Thriller]",1995,3.95
4,7,Sabrina,"[Comedy, Romance]",1995,3.19


In [371]:
# 将df表格中的genres进行拆分, 缩小电影分类粒度
'''
>>> s
    a  b
one 1  2
two 3  4
>>> s.stack()
one a  1
    b  2
two c  3
    d  4
'''
series = df.apply(lambda x: pd.Series(x['genres']), axis=1
                 ).stack().reset_index(level=1, drop=True)
series.name = 'genre'
recommendation = df.drop('genres', axis=1).join(series)

In [372]:
recommendation.head(5)

Unnamed: 0,movieId,title,year,average_rating,genre
0,1,Toy Story,1995,3.92,Adventure
0,1,Toy Story,1995,3.92,Animation
0,1,Toy Story,1995,3.92,Children
0,1,Toy Story,1995,3.92,Comedy
0,1,Toy Story,1995,3.92,Fantasy


In [373]:
def rating_recommendation_engine(genre):
    df = recommendation[recommendation['genre'] == genre]
    movies_recommended = df.sort_values('average_rating', ascending=False
                                       ).drop('genre', axis=1).head(10)
    return movies_recommended

In [374]:
genres_option = ['Action','Adventure','Animation','Children','Comedy',
                 'Crime','Documentary','Drama','Fantasy','Horror',
                 'Mystery','Romance','Sci-Fic','Thriller'
                 'War','Wertern']
genre = 'Horror'
movies_recommended = rating_recommendation_engine(genre)
movies_recommended

Unnamed: 0,movieId,title,year,average_rating
99,593,The Silence of the Lambs,1991,4.16
165,1258,The Shining,1980,4.08
153,1215,Army of Darkness,1993,4.04
154,1219,Psycho,1960,4.04
182,1387,Jaws,1975,4.01
373,8874,Shaun of the Dead,2004,4.01
152,1214,Alien,1979,3.97
354,6502,28 Days Later,2002,3.97
144,1200,Aliens,1986,3.96
265,2762,The Sixth Sense,1999,3.89


In [378]:
genre = 'Sci-Fi'
movies_recommended = rating_recommendation_engine(genre)
movies_recommended

Unnamed: 0,movieId,title,year,average_rating
44,260,Star Wars: Episode IV - A New Hope,1977,4.23
140,1196,Star Wars: Episode V - The Empire Strikes Back,1980,4.22
248,2571,The Matrix,1999,4.19
143,1199,Brazil,1985,4.18
365,7361,Eternal Sunshine of the Spotless Mind,2004,4.16
150,1210,Star Wars: Episode VI - Return of the Jedi,1983,4.14
88,541,Blade Runner,1982,4.1
424,79132,Inception,2010,4.07
410,60069,WALL·E,2008,4.06
434,112852,Guardians of the Galaxy,2014,4.05


In [382]:
genre = 'Action'
movies_recommended = rating_recommendation_engine(genre)
movies_recommended

Unnamed: 0,movieId,title,year,average_rating
273,2959,Fight Club,1999,4.27
407,58559,The Dark Knight,2008,4.24
141,1197,The Princess Bride,1987,4.23
44,260,Star Wars: Episode IV - A New Hope,1977,4.23
149,1208,Apocalypse Now,1979,4.22
140,1196,Star Wars: Episode V - The Empire Strikes Back,1980,4.22
142,1198,Raiders of the Lost Ark (Indiana Jones and the...,1981,4.21
248,2571,The Matrix,1999,4.19
120,908,North by Northwest,1959,4.18
145,1201,Il) il cattivo il brutto The (Buono the Ba...,1966,4.15
