In [1]:
%config ZMQInteractiveShell.ast_node_interactivity='all'
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 以下表格记录了用户对电影的评分
ratings = pd.read_csv("./data/ratings_small.csv")
ratings.shape
ratings.head()

(100004, 4)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
# 每部电影打分的人数
count = ratings.groupby("movieId").agg({ "timestamp": "count"}).reset_index()
type(count)
count.head()

pandas.core.frame.DataFrame

Unnamed: 0,movieId,timestamp
0,1,247
1,2,107
2,3,59
3,4,13
4,5,56


In [4]:
# 每部电影的平均分，以及有多少人打分
ratings_mean = ratings.groupby("movieId").agg({"rating": np.mean, "timestamp": "count"}).rename(columns={"timestamp": "count"}).reset_index()
ratings_mean.head()

Unnamed: 0,movieId,rating,count
0,1,3.87247,247
1,2,3.401869,107
2,3,3.161017,59
3,4,2.384615,13
4,5,3.267857,56


In [5]:
# 下一张表格是每部电影的具体信息
meta = pd.read_csv("./data/movies_metadata.csv")
meta.shape
meta.head()

(45466, 24)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
# 只需要电影的ID和名字
# meta.loc[:,["id", "title"]].head()
movie_titles = meta[["id", "title"]]
movie_titles.shape
movie_titles.head()

(45466, 2)

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [7]:
ratings_mean.info()
movie_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9066 entries, 0 to 9065
Data columns (total 3 columns):
movieId    9066 non-null int64
rating     9066 non-null float64
count      9066 non-null int64
dtypes: float64(1), int64(2)
memory usage: 212.6 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
id       45466 non-null object
title    45460 non-null object
dtypes: object(2)
memory usage: 710.5+ KB


**脏数据处理，把字符串id转化为int**

In [8]:
for i in range(movie_titles.shape[0]):
    try:
        id = int(movie_titles.loc[i, "id"])
    except ValueError:
        movie_titles.loc[i, "id"] = 99999999999999
        
movie_titles.loc[:, "id"] = pd.to_numeric(movie_titles.loc[:, "id"])
movie_titles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 2 columns):
id       45466 non-null int64
title    45460 non-null object
dtypes: int64(1), object(1)
memory usage: 710.5+ KB


**merge**

In [9]:
ratings_mean.shape
movie_titles.shape

(9066, 3)

(45466, 2)

In [10]:
# 把电影的评分和电影名称表格拼接起来
# 默认是inner内连接，连接两边都有的值,左表以index作为连接关键字,右表用id作为关键字
movie_name_ratings = pd.merge(ratings_mean, movie_titles, how="inner", left_index=True, right_on="id")
movie_name_ratings.shape
movie_name_ratings.head()

(3140, 5)

Unnamed: 0,movieId,rating,count,id,title
4342,3,3.161017,59,2,Ariel
12947,4,2.384615,13,3,Shadows in Paradise
17,6,3.884615,104,5,Four Rooms
474,7,3.283019,53,6,Judgment Night
256,12,2.861111,18,11,Star Wars


In [11]:
# 先按count排序，count一样，按rating排序
movie_name_ratings.sort_values(by=["count", "rating"], ascending=False).head(5)

Unnamed: 0,movieId,rating,count,id,title
6620,356,4.054252,341,321,Mambo Italiano
1501,296,4.256173,324,266,Contempt
882,318,4.487138,311,284,The Apartment
1177,593,4.138158,304,525,The Blues Brothers
8828,260,4.221649,291,232,Rumble Fish
