# 布尔索引

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# %matplotlib inline 是 Jupyter Notebook 的魔法命令：让 matplotlib 绘制的图形直接嵌入显示在 Notebook 单元格输出中，而不是弹出单独的窗口显示
%matplotlib inline  

## 计算布尔值统计信息


In [5]:
# Let wide frames display up to 50 columns.
pd.set_option('display.max_columns', 50)

# Load the movie dataset, using the film title as the row index.
movie = pd.read_csv('movie.csv', index_col='title')
movie.head()

Unnamed: 0_level_0,year,color,content_rating,duration,director_name,director_fb,actor1,actor1_fb,actor2,actor2_fb,actor3,actor3_fb,gross,genres,num_reviews,num_voted_users,plot_keywords,language,country,budget,imdb_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,2009.0,Color,PG-13,178.0,James Cameron,0.0,CCH Pounder,1000.0,Joel David Moore,936.0,Wes Studi,855.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,723.0,886204,avatar|future|marine|native|paraplegic,English,USA,237000000.0,7.9
Pirates of the Caribbean: At World's End,2007.0,Color,PG-13,169.0,Gore Verbinski,563.0,Johnny Depp,40000.0,Orlando Bloom,5000.0,Jack Davenport,1000.0,309404152.0,Action|Adventure|Fantasy,302.0,471220,goddess|marriage ceremony|marriage proposal|pi...,English,USA,300000000.0,7.1
Spectre,2015.0,Color,PG-13,148.0,Sam Mendes,0.0,Christoph Waltz,11000.0,Rory Kinnear,393.0,Stephanie Sigman,161.0,200074175.0,Action|Adventure|Thriller,602.0,275868,bomb|espionage|sequel|spy|terrorist,English,UK,245000000.0,6.8
The Dark Knight Rises,2012.0,Color,PG-13,164.0,Christopher Nolan,22000.0,Tom Hardy,27000.0,Christian Bale,23000.0,Joseph Gordon-Levitt,23000.0,448130642.0,Action|Thriller,813.0,1144337,deception|imprisonment|lawlessness|police offi...,English,USA,250000000.0,8.5
Star Wars: Episode VII - The Force Awakens,,,,,Doug Walker,131.0,Doug Walker,131.0,Rob Walker,12.0,,,,Documentary,,8,,,,,7.1


In [6]:
# Flag each film whose duration exceeds two hours (120).
movie_2_hours = movie['duration'].gt(120)
movie_2_hours.head(10)

title
Avatar                                         True
Pirates of the Caribbean: At World's End       True
Spectre                                        True
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
John Carter                                    True
Spider-Man 3                                   True
Tangled                                       False
Avengers: Age of Ultron                        True
Harry Potter and the Half-Blood Prince         True
Name: duration, dtype: bool

In [7]:
# Count the films longer than two hours: summing a boolean Series counts its True values.
movie_2_hours.sum()

np.int64(1039)

In [8]:
# Share of films longer than two hours: the mean of a boolean Series is the fraction of True values.
movie_2_hours.mean()

np.float64(0.2113506916192026)

In [9]:
# Summarise the boolean Series with describe() (count, unique, top, freq).
movie_2_hours.describe()

count      4916
unique        2
top       False
freq       3877
Name: duration, dtype: object

In [10]:
# The duration column actually contains missing values. To get the true share
# of films longer than two hours, drop the missing entries before comparing.
known_durations = movie['duration'].dropna()
(known_durations > 120).mean()  # element-wise "greater than 120" yields True/False per film

np.float64(0.21199755152009794)

In [11]:
# Proportion of False and True values.
movie_2_hours.value_counts(normalize=True)  # normalize=True returns proportions instead of absolute counts

duration
False    0.788649
True     0.211351
Name: proportion, dtype: float64

In [12]:
# Compare two columns of the same DataFrame: keep only the rows where both
# values are present, then measure how often actor1_fb exceeds actor2_fb.
actors = movie.loc[:, ['actor1_fb', 'actor2_fb']].dropna()

actors['actor1_fb'].gt(actors['actor2_fb']).mean()

np.float64(0.9777687130328371)

## 构建多个布尔条件

In [14]:
# Build several independent boolean conditions.
criteria1 = movie['imdb_score'].gt(8)
criteria2 = movie['content_rating'].eq('PG-13')
criteria3 = movie['year'].lt(2000) | movie['year'].ge(2010)

criteria2.head()

title
Avatar                                         True
Pirates of the Caribbean: At World's End       True
Spectre                                        True
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
Name: content_rating, dtype: bool

In [15]:
# Combine the individual conditions into one: a row passes only if all three hold.
criteria_final = (
    criteria1
    & criteria2
    & criteria3
)
criteria_final.head()

title
Avatar                                        False
Pirates of the Caribbean: At World's End      False
Spectre                                       False
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
dtype: bool

In [16]:
# Python's bitwise operators (& | ~) bind more tightly than comparison operators,
# so without the parentheses used for criteria3 the expression below is parsed as
#   movie.year < (2000 | movie.year) >= 2010
# and `2000 | movie.year` fails. The error is the point of this cell, so it is
# caught and printed: the lesson stays visible, "Run All" is not interrupted,
# and the valid criteria3 built earlier is left untouched.
try:
    movie.year < 2000 | movie.year >= 2010
except TypeError as exc:
    print(f'{type(exc).__name__}: {exc}')

TypeError: Cannot perform 'ror_' with a dtyped [float64] array and scalar of type [bool]

## 用布尔索引过滤

In [18]:
# First filter: score above 8, rated PG-13, and year before 2000 or after 2009.
crit_a1 = movie['imdb_score'].gt(8)
crit_a2 = movie['content_rating'].eq('PG-13')
crit_a3 = movie['year'].lt(2000) | movie['year'].gt(2009)
final_crit_a = crit_a1 & crit_a2 & crit_a3

In [19]:
# Second filter: score below 5, rated R, and year from 2000 to 2010.
crit_b1 = movie['imdb_score'].lt(5)
crit_b2 = movie['content_rating'].eq('R')
crit_b3 = movie['year'].between(2000, 2010)  # both endpoints are included
final_crit_b = crit_b1 & crit_b2 & crit_b3

In [20]:
# Join the two filters with the | operator: a row passes if it satisfies either one.
final_crit_all = final_crit_a | final_crit_b
final_crit_all.head()

title
Avatar                                        False
Pirates of the Caribbean: At World's End      False
Spectre                                       False
The Dark Knight Rises                          True
Star Wars: Episode VII - The Force Awakens    False
dtype: bool

In [21]:
# Keep only the rows for which the combined boolean condition is True.
movie.loc[final_crit_all].head()

Unnamed: 0_level_0,year,color,content_rating,duration,director_name,director_fb,actor1,actor1_fb,actor2,actor2_fb,actor3,actor3_fb,gross,genres,num_reviews,num_voted_users,plot_keywords,language,country,budget,imdb_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Dark Knight Rises,2012.0,Color,PG-13,164.0,Christopher Nolan,22000.0,Tom Hardy,27000.0,Christian Bale,23000.0,Joseph Gordon-Levitt,23000.0,448130642.0,Action|Thriller,813.0,1144337,deception|imprisonment|lawlessness|police offi...,English,USA,250000000.0,8.5
The Avengers,2012.0,Color,PG-13,173.0,Joss Whedon,0.0,Chris Hemsworth,26000.0,Robert Downey Jr.,21000.0,Scarlett Johansson,19000.0,623279547.0,Action|Adventure|Sci-Fi,703.0,995415,alien invasion|assassin|battle|iron man|soldier,English,USA,220000000.0,8.1
Captain America: Civil War,2016.0,Color,PG-13,147.0,Anthony Russo,94.0,Robert Downey Jr.,21000.0,Scarlett Johansson,19000.0,Chris Evans,11000.0,407197282.0,Action|Adventure|Sci-Fi,516.0,272670,based on comic book|knife|marvel cinematic uni...,English,USA,250000000.0,8.2
Guardians of the Galaxy,2014.0,Color,PG-13,121.0,James Gunn,571.0,Bradley Cooper,14000.0,Vin Diesel,14000.0,Djimon Hounsou,3000.0,333130696.0,Action|Adventure|Sci-Fi,653.0,682155,bounty hunter|outer space|raccoon|talking anim...,English,USA,170000000.0,8.1
Interstellar,2014.0,Color,PG-13,169.0,Christopher Nolan,22000.0,Matthew McConaughey,11000.0,Anne Hathaway,11000.0,Mackenzie Foy,6000.0,187991439.0,Adventure|Drama|Sci-Fi,712.0,928227,black hole|father daughter relationship|saving...,English,USA,165000000.0,8.6


In [22]:
# Use .loc to filter rows and select only the columns involved in the conditions,
# which makes it easy to see whether the filter worked.
cols = ['imdb_score', 'content_rating', 'year']
movie_filtered = movie.loc[final_crit_all, cols]
movie_filtered.head()

Unnamed: 0_level_0,imdb_score,content_rating,year
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Dark Knight Rises,8.5,PG-13,2012.0
The Avengers,8.1,PG-13,2012.0
Captain America: Civil War,8.2,PG-13,2016.0
Guardians of the Galaxy,8.1,PG-13,2014.0
Interstellar,8.6,PG-13,2014.0


In [26]:
# Write the first filter as one long boolean expression instead of combining
# the short conditions built earlier; equals() checks that both give the same result.
final_crit_a2 = (
    (movie.imdb_score > 8)
    & (movie.content_rating == 'PG-13')
    & ((movie.year < 2000) | (movie.year > 2009))
)
final_crit_a2.equals(final_crit_a)


True