In [1]:
import pandas as pd
import numpy as np

# 1.检查索引

In [2]:
# Load the college dataset and keep a reference to its column labels (a pandas Index).
college = pd.read_csv('data/college.csv')
columns = college.columns
columns

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL',
       'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS', 'UGDS_WHITE',
       'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI',
       'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF', 'CURROPER', 'PCTPELL',
       'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10', 'GRAD_DEBT_MDN_SUPP'],
      dtype='object')

In [3]:
# The .values attribute exposes the Index's underlying NumPy array.
columns.values

array(['INSTNM', 'CITY', 'STABBR', 'HBCU', 'MENONLY', 'WOMENONLY',
       'RELAFFIL', 'SATVRMID', 'SATMTMID', 'DISTANCEONLY', 'UGDS',
       'UGDS_WHITE', 'UGDS_BLACK', 'UGDS_HISP', 'UGDS_ASIAN', 'UGDS_AIAN',
       'UGDS_NHPI', 'UGDS_2MOR', 'UGDS_NRA', 'UGDS_UNKN', 'PPTUG_EF',
       'CURROPER', 'PCTPELL', 'PCTFLOAN', 'UG25ABV', 'MD_EARN_WNE_P10',
       'GRAD_DEBT_MDN_SUPP'], dtype=object)

In [4]:
# Scalar integer indexing: position 5 is the sixth label.
columns[5]

'WOMENONLY'

In [5]:
# A list of positions selects several labels at once (the 2nd, 9th and 11th)
# and returns a new Index.
columns[[1,8,10]]

Index(['CITY', 'SATMTMID', 'UGDS'], dtype='object')

In [6]:
# Slicing with negative positions counts from the end of the Index.
columns[-7:-4]

Index(['PPTUG_EF', 'CURROPER', 'PCTPELL'], dtype='object')

In [7]:
# An Index shares many methods with Series and DataFrame.
columns.min(),columns.max(),columns.isnull().sum()

('CITY', 'WOMENONLY', 0)

In [8]:
# Adding a string to an Index is applied element-wise and returns a new Index;
# `columns` itself is not modified.
columns + '_A'

Index(['INSTNM_A', 'CITY_A', 'STABBR_A', 'HBCU_A', 'MENONLY_A', 'WOMENONLY_A',
       'RELAFFIL_A', 'SATVRMID_A', 'SATMTMID_A', 'DISTANCEONLY_A', 'UGDS_A',
       'UGDS_WHITE_A', 'UGDS_BLACK_A', 'UGDS_HISP_A', 'UGDS_ASIAN_A',
       'UGDS_AIAN_A', 'UGDS_NHPI_A', 'UGDS_2MOR_A', 'UGDS_NRA_A',
       'UGDS_UNKN_A', 'PPTUG_EF_A', 'CURROPER_A', 'PCTPELL_A', 'PCTFLOAN_A',
       'UG25ABV_A', 'MD_EARN_WNE_P10_A', 'GRAD_DEBT_MDN_SUPP_A'],
      dtype='object')

In [10]:
# Comparison operators are element-wise too and yield a boolean NumPy array.
columns > 'G'

array([ True, False,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True])

In [12]:
# Index objects are immutable, so item assignment raises TypeError.
# The error is caught and printed so that "Restart & Run All" can continue
# past this demonstration instead of halting here.
try:
    columns[1] = 'city'
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: Index does not support mutable operations

In [13]:
# Index objects support set operations: union, intersection, difference and
# symmetric difference.
# Take a slice of the column labels to use as the first operand.
c1 = columns[:4]
c1

Index(['INSTNM', 'CITY', 'STABBR', 'HBCU'], dtype='object')

In [14]:
# Second operand: a slice that overlaps c1 on positions 2 and 3.
c2 = columns[2:5]
c2

Index(['STABBR', 'HBCU', 'MENONLY'], dtype='object')

In [15]:
# Union: every label that appears in either Index.
c1.union(c2)

Index(['CITY', 'HBCU', 'INSTNM', 'MENONLY', 'STABBR'], dtype='object')

In [16]:
# Union written with the explicit method. The `|` operator on Index objects
# was deprecated as a set operation and no longer computes a union in
# current pandas, so the method form is the portable spelling.
c1.union(c2)

Index(['CITY', 'HBCU', 'INSTNM', 'MENONLY', 'STABBR'], dtype='object')

In [17]:
# Symmetric difference: labels that appear in exactly one of the two Indexes.
c1.symmetric_difference(c2)

Index(['CITY', 'INSTNM', 'MENONLY'], dtype='object')

In [18]:
# Symmetric difference written with the explicit method. The `^` operator on
# Index objects was deprecated as a set operation and no longer computes a
# symmetric difference in current pandas.
c1.symmetric_difference(c2)

Index(['CITY', 'INSTNM', 'MENONLY'], dtype='object')

# 2.求笛卡尔积

In [19]:
# Build the first of two Series whose indexes differ but share some labels.
s1 = pd.Series(np.arange(4), index=list('aaab'))
s1

a    0
a    1
a    2
b    3
dtype: int32

In [20]:
# Second Series: labels overlap with the first one, plus an extra label 'c'.
s2 = pd.Series(np.arange(6), index=list('cababb'))
s2

c    0
a    1
b    2
a    3
b    4
b    5
dtype: int32

In [21]:
# Adding the two aligns on index labels: repeated labels pair up as a
# Cartesian product, and a label present in only one Series ('c') yields NaN.
s1 + s2

a    1.0
a    3.0
a    2.0
a    4.0
a    3.0
a    5.0
b    5.0
b    7.0
b    8.0
c    NaN
dtype: float64

In [22]:
# When both indexes hold the same labels in the same order, no Cartesian
# product is formed: the values are matched position by position.
# The two Series below are identical, and the result keeps an integer dtype.
s1 = pd.Series(np.arange(5), index=list('aaabb'))
s2 = pd.Series(np.arange(5), index=list('aaabb'))
s1 + s2

a    0
a    2
a    4
b    6
b    8
dtype: int32

In [23]:
# Same labels but in a different order: the addition does produce a
# Cartesian product per label.
s1 = pd.Series(np.arange(5), index=list('aaabb'))
s2 = pd.Series(np.arange(5), index=list('bbaaa'))
s1 + s2

a    2
a    3
a    4
a    3
a    4
a    5
a    4
a    5
a    6
b    3
b    4
b    4
b    5
dtype: int32

# 3.索引爆炸

In [24]:
# Load the employee dataset with RACE as the row index.
employee = pd.read_csv('data/employee.csv',index_col='RACE')
employee.head()

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,DEPARTMENT,BASE_SALARY,EMPLOYMENT_TYPE,GENDER,EMPLOYMENT_STATUS,HIRE_DATE,JOB_DATE
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Hispanic/Latino,0,ASSISTANT DIRECTOR (EX LVL),Municipal Courts Department,121862.0,Full Time,Female,Active,2006-06-12,2012-10-13
Hispanic/Latino,1,LIBRARY ASSISTANT,Library,26125.0,Full Time,Female,Active,2000-07-19,2010-09-18
White,2,POLICE OFFICER,Houston Police Department-HPD,45279.0,Full Time,Male,Active,2015-02-03,2015-02-03
White,3,ENGINEER/OPERATOR,Houston Fire Department (HFD),63166.0,Full Time,Male,Active,1982-02-08,1991-05-25
White,4,ELECTRICIAN,General Services Department,56347.0,Full Time,Male,Active,1989-06-19,1994-10-22


In [25]:
# Select BASE_SALARY twice and check whether both names refer to the same object.
salary1 = employee['BASE_SALARY']
salary2 = employee['BASE_SALARY']
salary1 is salary2

True

In [26]:
# The previous result is True: both names point to one object, so a change
# made through one would be visible through the other.
# Take independent copies with .copy() instead.
salary1 = employee['BASE_SALARY'].copy()
salary2 = employee['BASE_SALARY'].copy()
salary1 is salary2

False

In [27]:
# Sort one of the copies by its index so the two Series no longer share the
# same label order.
salary1 = salary1.sort_index()
salary1.head()

RACE
American Indian or Alaskan Native    78355.0
American Indian or Alaskan Native    26125.0
American Indian or Alaskan Native    98536.0
American Indian or Alaskan Native        NaN
American Indian or Alaskan Native    55461.0
Name: BASE_SALARY, dtype: float64

In [28]:
# salary2 was not sorted, so it still starts with the file's first rows.
salary2.head()

RACE
Hispanic/Latino    121862.0
Hispanic/Latino     26125.0
White               45279.0
White               63166.0
White               56347.0
Name: BASE_SALARY, dtype: float64

In [29]:
# Add the two Series.
salary_add = salary1 + salary2
salary_add.head()

RACE
American Indian or Alaskan Native    138702.0
American Indian or Alaskan Native    156710.0
American Indian or Alaskan Native    176891.0
American Indian or Alaskan Native    159594.0
American Indian or Alaskan Native    127734.0
Name: BASE_SALARY, dtype: float64

In [30]:
# Also add salary1 to itself, then compare lengths: the sum of the two
# differently ordered Series grows from 2,000 rows to about 1.17 million,
# while salary1 + salary1 stays at 2,000.
salary_add1 = salary1 + salary1
len(salary1),len(salary2),len(salary_add),len(salary_add1)

(2000, 2000, 1175424, 2000)

In [31]:
# Verify the size of salary_add: the Cartesian product is formed per index
# label, so the expected length is the sum of the squared label counts.
index_vc = salary1.index.value_counts(dropna=False)
index_vc

Black or African American            700
White                                665
Hispanic/Latino                      480
Asian/Pacific Islander               107
NaN                                   35
American Indian or Alaskan Native     11
Others                                 2
Name: RACE, dtype: int64

In [32]:
# Sum of squared label counts; it should equal len(salary_add).
index_vc.pow(2).sum()

1175424

# 4.用不等索引填充数值 

In [33]:
# Load the three baseball datasets with playerID as the row index.
baseball_14 = pd.read_csv('data/baseball14.csv',index_col='playerID')
baseball_15 = pd.read_csv('data/baseball15.csv',index_col='playerID')
baseball_16 = pd.read_csv('data/baseball16.csv',index_col='playerID')
baseball_14.head()

Unnamed: 0_level_0,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
altuvjo01,2014,1,HOU,AL,158,660,85,225,47,3,...,59.0,56.0,9.0,36,53.0,7.0,5.0,1.0,5.0,20.0
cartech02,2014,1,HOU,AL,145,507,68,115,21,1,...,88.0,5.0,2.0,56,182.0,6.0,5.0,0.0,4.0,12.0
castrja01,2014,1,HOU,AL,126,465,43,103,21,2,...,56.0,1.0,0.0,34,151.0,1.0,9.0,1.0,3.0,11.0
corpoca01,2014,1,HOU,AL,55,170,22,40,6,0,...,19.0,0.0,0.0,14,37.0,0.0,3.0,1.0,2.0,3.0
dominma01,2014,1,HOU,AL,157,564,51,121,17,0,...,57.0,0.0,1.0,29,125.0,2.0,5.0,2.0,7.0,23.0


In [34]:
# Index.difference lists the labels that are present in baseball_14 but
# absent from another frame; here the comparison is against baseball_15.
baseball_14.index.difference(baseball_15.index)

Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
       'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
      dtype='object', name='playerID')

In [35]:
# Players present in baseball_14 but absent from baseball_16.
baseball_14.index.difference(baseball_16.index)

Index(['cartech02', 'corpoca01', 'dominma01', 'fowlede01', 'grossro01',
       'guzmaje01', 'hoeslj01', 'krausma01', 'preslal01', 'singljo02',
       'villajo01'],
      dtype='object', name='playerID')

In [38]:
# Pull each player's hits for the three seasons; the H column holds that figure.
hits_14 = baseball_14['H']
hits_15 = baseball_15['H']
hits_16 = baseball_16['H']
hits_14.head()

playerID
altuvjo01    225
cartech02    115
castrja01    103
corpoca01     40
dominma01    121
Name: H, dtype: int64

In [39]:
# Add the 2014 and 2015 hits with the plain + operator.
(hits_14 + hits_15).head()

playerID
altuvjo01    425.0
cartech02    193.0
castrja01    174.0
congeha01      NaN
corpoca01      NaN
Name: H, dtype: float64

In [40]:
# congeha01 and corpoca01 each have hits in only one of the two seasons
# (congeha01 in 2015, corpoca01 in 2014), so the plain sum is NaN for both.
# Series.add with fill_value=0 substitutes 0 for the missing side instead.
hits_14.add(hits_15,fill_value=0).head()

playerID
altuvjo01    425.0
cartech02    193.0
castrja01    174.0
congeha01     46.0
corpoca01     40.0
Name: H, dtype: float64

In [41]:
# Add the 2016 hits in the same way.
hits_total = hits_14.add(hits_15,fill_value=0).add(hits_16,fill_value=0)
hits_total.head()

playerID
altuvjo01    641.0
bregmal01     53.0
cartech02    193.0
castrja01    243.0
congeha01     46.0
Name: H, dtype: float64

In [42]:
# Check whether the combined result contains any missing values.
hits_total.hasnans

False

In [43]:
# fill_value cannot help when a label is missing in BOTH Series: the sum is
# still NaN. Build the first of two small Series to show this.
s = pd.Series([np.nan, 3, np.nan, 1], index=['a', 'b', 'c', 'd'])
s

a    NaN
b    3.0
c    NaN
d    1.0
dtype: float64

In [44]:
# Second small Series: NaN at 'a', and no 'd' label at all.
s1 = pd.Series([np.nan, 6, 10], index=['a', 'b', 'c'])
s1

a     NaN
b     6.0
c    10.0
dtype: float64

In [45]:
# 'a' is NaN in both Series, so it stays NaN; 'c' (NaN only in s) and
# 'd' (absent from s1) are filled with 5 before adding.
s.add(s1,fill_value=5)

a     NaN
b     9.0
c    15.0
d     6.0
dtype: float64

In [46]:
# Swapping the operands gives the same result.
s1.add(s,fill_value=5)

a     NaN
b     9.0
c    15.0
d     6.0
dtype: float64

In [49]:
# Select a few columns from baseball_14.
df_14 = baseball_14[['G','AB','R','H']]
df_14.head()

Unnamed: 0_level_0,G,AB,R,H
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altuvjo01,158,660,85,225
cartech02,145,507,68,115
castrja01,126,465,43,103
corpoca01,55,170,22,40
dominma01,157,564,51,121


In [50]:
# Select a few columns from baseball_15: some shared with df_14, some not.
df_15 = baseball_15[['AB','R','H','HR']]
df_15.head()

Unnamed: 0_level_0,AB,R,H,HR
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altuvjo01,638,86,200,15
cartech02,391,50,78,24
castrja01,337,38,71,11
congeha01,201,25,46,11
correca01,387,52,108,22


In [51]:
# Adding the frames directly yields NaN wherever a row or a column fails to align.
# The Styler's highlight_null method highlights those missing values.
(df_14 + df_15).head(10).style.highlight_null('yellow')

Unnamed: 0_level_0,AB,G,H,HR,R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,1298.0,,425.0,,171.0
cartech02,898.0,,193.0,,118.0
castrja01,802.0,,174.0,,81.0
congeha01,,,,,
corpoca01,,,,,
correca01,,,,,
dominma01,,,,,
fowlede01,,,,,
gattiev01,,,,,
gomezca01,,,,,


In [53]:
# Even with fill_value=0 some cells stay NaN: those row/column combinations
# do not exist in either input frame.
df_14.add(df_15,fill_value=0).head(10).style.highlight_null('yellow')

Unnamed: 0_level_0,AB,G,H,HR,R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,1298,158.0,425,15.0,171
cartech02,898,145.0,193,24.0,118
castrja01,802,126.0,174,11.0,81
congeha01,201,,46,11.0,25
corpoca01,170,55.0,40,,22
correca01,387,,108,22.0,52
dominma01,564,157.0,121,,51
fowlede01,434,116.0,120,,61
gattiev01,566,,139,27.0,66
gomezca01,149,,36,4.0,19


# 5.从不同的DataFrame追加列

In [57]:
# Load the employee data and keep DEPARTMENT and BASE_SALARY, ordered by
# department (ascending) and, within each department, by salary (descending).
employee = pd.read_csv('data/employee.csv')
dept_sal = (
    employee[['DEPARTMENT', 'BASE_SALARY']]
    .sort_values(by=['DEPARTMENT', 'BASE_SALARY'], ascending=[True, False])
)
# drop_duplicates keeps the first row of each department after the sort.
max_dept_sal = dept_sal.drop_duplicates(subset='DEPARTMENT')
max_dept_sal.head()

Unnamed: 0,DEPARTMENT,BASE_SALARY
1494,Admn. & Regulatory Affairs,140416.0
149,City Controller's Office,64251.0
236,City Council,100000.0
647,Convention and Entertainment,38397.0
1500,Dept of Neighborhoods (DON),89221.0


In [58]:
# Use DEPARTMENT as the row index of both frames.
max_dept_sal = max_dept_sal.set_index('DEPARTMENT')
employee = employee.set_index('DEPARTMENT')
# The row indexes now hold matching labels, so the per-department value can be
# assigned to employee as a new column (pandas aligns on the index).
employee['MAX_DEPT_SALARY'] = max_dept_sal['BASE_SALARY']
pd.options.display.max_columns = 6
employee

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,BASE_SALARY,...,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Municipal Courts Department,0,ASSISTANT DIRECTOR (EX LVL),121862.0,...,2006-06-12,2012-10-13,121862.0
Library,1,LIBRARY ASSISTANT,26125.0,...,2000-07-19,2010-09-18,107763.0
Houston Police Department-HPD,2,POLICE OFFICER,45279.0,...,2015-02-03,2015-02-03,199596.0
Houston Fire Department (HFD),3,ENGINEER/OPERATOR,63166.0,...,1982-02-08,1991-05-25,210588.0
General Services Department,4,ELECTRICIAN,56347.0,...,1989-06-19,1994-10-22,89194.0
Houston Police Department-HPD,5,SENIOR POLICE OFFICER,66614.0,...,1984-11-26,2005-03-26,199596.0
Public Works & Engineering-PWE,6,ENGINEER,71680.0,...,2012-03-26,2012-03-26,178331.0
Houston Airport System (HAS),7,CARPENTER,42390.0,...,2013-11-04,2013-11-04,186192.0
Public Works & Engineering-PWE,8,DEPUTY ASSISTANT DIRECTOR (EXECUTIVE LEV,107962.0,...,1993-11-15,2013-01-05,178331.0
Houston Airport System (HAS),9,AIRPORT OPERATIONS COORDINATOR,44616.0,...,2016-03-14,2016-03-14,186192.0


In [59]:
# Use query to look for rows whose BASE_SALARY exceeds MAX_DEPT_SALARY.
employee.query('BASE_SALARY > MAX_DEPT_SALARY')

Unnamed: 0_level_0,UNIQUE_ID,POSITION_TITLE,BASE_SALARY,...,HIRE_DATE,JOB_DATE,MAX_DEPT_SALARY
DEPARTMENT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [60]:
# Draw 10 rows from dept_sal at random, without replacement (sample's default).
# Passing random_state=1234 seeds this one draw, so it is reproducible without
# changing NumPy's global random state.
random_salary = dept_sal.sample(n=10, random_state=1234).set_index('DEPARTMENT')
random_salary

Unnamed: 0_level_0,BASE_SALARY
DEPARTMENT,Unnamed: 1_level_1
Public Works & Engineering-PWE,50586.0
Houston Police Department-HPD,66614.0
Houston Police Department-HPD,66614.0
Housing and Community Devp.,78853.0
Houston Police Department-HPD,66614.0
Parks & Recreation,
Public Works & Engineering-PWE,37211.0
Public Works & Engineering-PWE,54683.0
Human Resources Dept.,58474.0
Health & Human Services,47050.0


In [61]:
# random_salary has duplicate index labels, so one employee label would map to
# several random_salary rows and the assignment raises ValueError.
# The error is caught and printed so that "Restart & Run All" can continue
# past this demonstration instead of halting here.
try:
    employee['RANDOM_SALARY'] = random_salary['BASE_SALARY']
except ValueError as err:
    print(f'ValueError: {err}')

ValueError: cannot reindex from a duplicate axis

In [62]:
# Assign only the first three rows of max_dept_sal['BASE_SALARY'] to a new
# column, employee['MAX_SALARY2'].
employee['MAX_SALARY2'] = max_dept_sal['BASE_SALARY'].head(3)
# Count how often each value occurs in MAX_SALARY2.
employee.MAX_SALARY2.value_counts()

140416.0    29
100000.0    11
64251.0      5
Name: MAX_SALARY2, dtype: int64

In [63]:
# Only three departments received a value, so every other row is NaN;
# this is the fraction of missing rows.
employee.MAX_SALARY2.isnull().mean()

0.9775

# 6.高亮每列的最大值

In [64]:
pd.options.display.max_rows = 8
# Load the college dataset with INSTNM as the row index, then inspect the dtypes.
college = pd.read_csv('data/college.csv',index_col='INSTNM')
college.dtypes

CITY                   object
STABBR                 object
HBCU                  float64
MENONLY               float64
                       ...   
PCTFLOAN              float64
UG25ABV               float64
MD_EARN_WNE_P10        object
GRAD_DEBT_MDN_SUPP     object
Length: 26, dtype: object

In [66]:
# MD_EARN_WNE_P10 and GRAD_DEBT_MDN_SUPP have object dtype; inspecting a value
# shows that they hold strings.
college.MD_EARN_WNE_P10.iloc[0]

'30300'

In [67]:
# Same check for the first value of GRAD_DEBT_MDN_SUPP.
college.GRAD_DEBT_MDN_SUPP.iloc[0]

'33888'

In [68]:
# Sort in descending order and look at the top entries.
college.MD_EARN_WNE_P10.sort_values(ascending=False).head()

INSTNM
Sharon Regional Health System School of Nursing    PrivacySuppressed
Northcoast Medical Training Academy                PrivacySuppressed
Success Schools                                    PrivacySuppressed
Louisiana Culinary Institute                       PrivacySuppressed
Bais Medrash Toras Chesed                          PrivacySuppressed
Name: MD_EARN_WNE_P10, dtype: object

In [71]:
# pd.to_numeric with errors='coerce' forces each column to a numeric dtype;
# values that cannot be parsed become NaN.
cols = ['MD_EARN_WNE_P10','GRAD_DEBT_MDN_SUPP']
for col in cols:
    college[col] = pd.to_numeric(college[col],errors='coerce')
college.dtypes.loc[cols]

MD_EARN_WNE_P10       float64
GRAD_DEBT_MDN_SUPP    float64
dtype: object

In [72]:
# Keep only the numeric columns with select_dtypes.
college_n = college.select_dtypes(include=[np.number])
college_n.head()

Unnamed: 0_level_0,HBCU,MENONLY,WOMENONLY,...,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama A & M University,1.0,0.0,0.0,...,0.1049,30300.0,33888.0
University of Alabama at Birmingham,0.0,0.0,0.0,...,0.2422,39700.0,21941.5
Amridge University,0.0,0.0,0.0,...,0.854,40100.0,23370.0
University of Alabama in Huntsville,0.0,0.0,0.0,...,0.264,45500.0,24097.0
Alabama State University,1.0,0.0,0.0,...,0.127,26600.0,33118.5


In [73]:
# Some columns hold only two distinct values; nunique() flags them.
criteria = college_n.nunique() == 2
criteria.head()

HBCU          True
MENONLY       True
WOMENONLY     True
RELAFFIL      True
SATVRMID     False
dtype: bool

In [75]:
# Index the column labels with the boolean Series to get the list of binary columns.
binary_cols = college_n.columns[criteria].tolist()
binary_cols

['HBCU', 'MENONLY', 'WOMENONLY', 'RELAFFIL', 'DISTANCEONLY', 'CURROPER']

In [77]:
# Drop those binary columns.
college_n2 = college_n.drop(labels=binary_cols,axis='columns')
college_n2.head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,...,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama A & M University,424.0,420.0,4206.0,...,0.1049,30300.0,33888.0
University of Alabama at Birmingham,570.0,565.0,11383.0,...,0.2422,39700.0,21941.5
Amridge University,,,291.0,...,0.854,40100.0,23370.0
University of Alabama in Huntsville,595.0,590.0,5451.0,...,0.264,45500.0,24097.0
Alabama State University,425.0,430.0,4811.0,...,0.127,26600.0,33118.5


In [78]:
# idxmax returns, for each column, the row label at which the maximum occurs.
max_cols = college_n2.idxmax()
max_cols

SATVRMID                      California Institute of Technology
SATMTMID                      California Institute of Technology
UGDS                               University of Phoenix-Arizona
UGDS_WHITE                Mr Leon's School of Hair Design-Moscow
                                         ...                    
PCTFLOAN                                  ABC Beauty College Inc
UG25ABV                           Dongguk University-Los Angeles
MD_EARN_WNE_P10                     Medical College of Wisconsin
GRAD_DEBT_MDN_SUPP    Southwest University of Visual Arts-Tucson
Length: 18, dtype: object

In [79]:
# unique() removes repeats, leaving the distinct row labels (institution names)
# that hold at least one column maximum.
unique_max_cols = max_cols.unique()
unique_max_cols[:5]

array(['California Institute of Technology',
       'University of Phoenix-Arizona',
       "Mr Leon's School of Hair Design-Moscow",
       'Velvatex College of Beauty Culture',
       'Thunderbird School of Global Management'], dtype=object)

In [80]:
# Select only the rows that hold a column maximum and highlight each column's
# maximum with the Styler's highlight_max().
college_n2.loc[unique_max_cols].style.highlight_max()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
California Institute of Technology,765.0,785.0,983.0,0.2787,0.0153,0.1221,0.4385,0.001,0.0,0.057,0.0875,0.0,0.0,0.1126,0.2303,0.0082,77800.0,11812.5
University of Phoenix-Arizona,,,151558.0,0.3098,0.1555,0.076,0.0082,0.0042,0.005,0.1131,0.0131,0.3152,0.0,0.6009,0.592,,,33000.0
Mr Leon's School of Hair Design-Moscow,,,16.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625,0.625,0.2,,15710.0
Velvatex College of Beauty Culture,,,25.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.7692,0.0,0.52,,
Thunderbird School of Global Management,,,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,118900.0,
Cosmopolitan Beauty and Tech School,,,110.0,0.0091,0.0,0.0182,0.9727,0.0,0.0,0.0,0.0,0.0,0.3182,0.7761,0.1244,0.9545,,
Haskell Indian Nations University,430.0,440.0,805.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0224,0.8396,0.0,0.2089,22800.0,
Palau Community College,,,602.0,0.0,0.0017,0.0,0.0,0.0,0.9983,0.0,0.0,0.0,0.3887,0.856,0.0,0.2616,24700.0,
LIU Brentwood,,,15.0,0.0,0.1333,0.2667,0.0,0.0,0.0,0.5333,0.0,0.0667,0.4,0.5652,0.7826,0.7826,44600.0,25499.0
California University of Management and Sciences,,,98.0,0.0102,0.0204,0.0,0.0408,0.0,0.0,0.0,0.9286,0.0,0.0,0.0926,0.0556,0.6852,,


In [81]:
# With axis='columns', highlight_max marks the maximum of each row instead.
college = pd.read_csv('data/college.csv',index_col='INSTNM')
college_ugds = college.filter(like='UGDS_').head()
college_ugds.style.highlight_max(axis='columns')

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [82]:
# Length of one mean calendar year (365.2425 days) as a Timedelta.
# The ambiguous unit='Y' is rejected by current pandas, so the duration is
# spelled out explicitly; the resulting value is the same.
pd.Timedelta(days=365, hours=5, minutes=49, seconds=12)

Timedelta('365 days 05:49:12')

# 7.用链式方法重现idxmax

In [4]:
# As before, keep only the numeric columns.
college = pd.read_csv('data/college.csv',index_col='INSTNM')
cols = ['MD_EARN_WNE_P10','GRAD_DEBT_MDN_SUPP']
for col in cols:
    college[col] = pd.to_numeric(college[col],errors='coerce')

college_n = college.select_dtypes(include=[np.number])
# Drop the columns that hold exactly two distinct values.
criteria = college_n.nunique() == 2
binary_cols = college_n.columns[criteria].tolist()
college_n = college_n.drop(labels=binary_cols,axis='columns')

# Maximum of each remaining column.
college_n.max().head()

SATVRMID         765.0
SATMTMID         785.0
UGDS          151558.0
UGDS_WHITE         1.0
UGDS_BLACK         1.0
dtype: float64

In [5]:
# college_n.max() gives each column's maximum; eq compares every cell of the
# DataFrame with the maximum of its own column.
college_n.eq(college_n.max()).head()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Alabama A & M University,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
Alabama State University,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
# any(axis='columns') flags the rows that contain at least one True.
has_row_max = college_n.eq(college_n.max()).any(axis='columns')
has_row_max.head()

INSTNM
Alabama A & M University               False
University of Alabama at Birmingham    False
Amridge University                     False
University of Alabama in Huntsville    False
Alabama State University               False
dtype: bool

In [7]:
# With 18 columns, has_row_max would hold at most 18 True values if every
# column maximum occurred only once; check the shape, then the actual count.
college_n.shape

(7535, 18)

In [8]:
# Number of rows that hold at least one column maximum.
has_row_max.sum()

401

In [10]:
# The count is surprisingly high because many percentage columns have a
# maximum of 1, which is reached in many rows.
# A cumulative sum over the boolean frame counts the maxima seen so far in each column.
college_n.eq(college_n.max()).cumsum()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Alabama A & M University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
University of Alabama at Birmingham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Amridge University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
University of Alabama in Huntsville,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Alabama State University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The University of Alabama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Central Alabama Community College,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Athens State University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Auburn University at Montgomery,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Auburn University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
# Taking the cumulative sum a second time leaves the value 1 only at the row
# where each column's maximum first appears.
college_n.eq(college_n.max()).cumsum().cumsum()

Unnamed: 0_level_0,SATVRMID,SATMTMID,UGDS,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Alabama A & M University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
University of Alabama at Birmingham,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Amridge University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
University of Alabama in Huntsville,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Alabama State University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The University of Alabama,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Central Alabama Community College,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Athens State University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Auburn University at Montgomery,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Auburn University,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# Compare the doubly accumulated frame with 1, then use any() to flag every
# row that contains at least one True.
has_row_max2 = (
    college_n.eq(college_n.max())
    .cumsum()
    .cumsum()
    .eq(1)
    .any(axis='columns')
)
has_row_max2.head()

INSTNM
Alabama A & M University               False
University of Alabama at Birmingham    False
Amridge University                     False
University of Alabama in Huntsville    False
Alabama State University               False
dtype: bool

In [13]:
# Count the True values.
has_row_max2.sum()

16

In [14]:
# Boolean indexing keeps the flagged rows; their index holds the institution names.
idxmax_cols = has_row_max2[has_row_max2].index
idxmax_cols

Index(['Thunderbird School of Global Management',
       'Southwest University of Visual Arts-Tucson', 'ABC Beauty College Inc',
       'Velvatex College of Beauty Culture',
       'California Institute of Technology',
       'Le Cordon Bleu College of Culinary Arts-San Francisco',
       'MTI Business College Inc', 'Dongguk University-Los Angeles',
       'Mr Leon's School of Hair Design-Moscow',
       'Haskell Indian Nations University', 'LIU Brentwood',
       'Medical College of Wisconsin', 'Palau Community College',
       'California University of Management and Sciences',
       'Cosmopolitan Beauty and Tech School', 'University of Phoenix-Arizona'],
      dtype='object', name='INSTNM')

In [15]:
# Compare with the labels returned by the idxmax method (as sets, ignoring order).
set(college_n.idxmax().unique()) == set(idxmax_cols)

True

In [16]:
# Timing comparison: the built-in idxmax.
%timeit college_n.idxmax().values

2.36 ms ± 155 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [18]:
# Timing comparison: the hand-built method chain.
%timeit college_n.eq(college_n.max()) \
                        .cumsum()\
                        .cumsum()\
                        .eq(1)\
                        .any(axis='columns')\
                        [lambda x:x].index

6.4 ms ± 207 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# 8.找到最常见的最大值

In [19]:
# Load college and keep only the undergraduate race-percentage columns
# (those whose name contains 'UGDS_').
pd.options.display.max_rows = 40
college = pd.read_csv('data/college.csv',index_col='INSTNM')
college_ugds = college.filter(like='UGDS_')
college_ugds.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [20]:
# idxmax(axis='columns') returns, for each row, the name of the column with
# the highest percentage.
highest_percentage_race = college_ugds.idxmax(axis='columns')
highest_percentage_race.head()

INSTNM
Alabama A & M University               UGDS_BLACK
University of Alabama at Birmingham    UGDS_WHITE
Amridge University                     UGDS_BLACK
University of Alabama in Huntsville    UGDS_WHITE
Alabama State University               UGDS_BLACK
dtype: object

In [21]:
# value_counts(normalize=True) shows how those per-row maxima are distributed.
highest_percentage_race.value_counts(normalize=True)

UGDS_WHITE    0.670352
UGDS_BLACK    0.151586
UGDS_HISP     0.129473
UGDS_UNKN     0.023422
UGDS_ASIAN    0.012074
UGDS_AIAN     0.006110
UGDS_NRA      0.004073
UGDS_NHPI     0.001746
UGDS_2MOR     0.001164
dtype: float64

In [23]:
# For the schools where UGDS_BLACK is the largest group, find how the
# second-largest group is distributed: drop UGDS_BLACK and repeat idxmax.
college_black = (
    college_ugds[highest_percentage_race == 'UGDS_BLACK']
    .drop('UGDS_BLACK', axis='columns')
)
college_black.idxmax(axis='columns').value_counts(normalize=True)

UGDS_WHITE    0.661228
UGDS_HISP     0.230326
UGDS_UNKN     0.071977
UGDS_NRA      0.018234
UGDS_ASIAN    0.009597
UGDS_2MOR     0.006718
UGDS_NHPI     0.000960
UGDS_AIAN     0.000960
dtype: float64