# 索引对齐

## 检查索引

In [1]:
import pandas as pd 
import numpy as np 


In [None]:
# Load the college data and pull out its column labels (a pandas Index)
college = pd.read_csv("college.csv")
columns = college.columns
columns

Index(['instnm', 'city', 'stabbr', 'hbcu', 'menonly', 'womenonly', 'relaffil',
       'satvrmid', 'satmtmid', 'distanceonly', 'ugds', 'ugds_white',
       'ugds_black', 'ugds_hisp', 'ugds_asian', 'ugds_aian', 'ugds_nhpi',
       'ugds_2mor', 'ugds_nra', 'ugds_unkn', 'pptug_ef', 'curroper', 'pctpell',
       'pctfloan', 'ug25abv', 'md_earn_wne_p10', 'grad_debt_mdn_supp'],
      dtype='object')

In [None]:
# Use the values attribute to reach the underlying NumPy array
columns.values

array(['instnm', 'city', 'stabbr', 'hbcu', 'menonly', 'womenonly',
       'relaffil', 'satvrmid', 'satmtmid', 'distanceonly', 'ugds',
       'ugds_white', 'ugds_black', 'ugds_hisp', 'ugds_asian', 'ugds_aian',
       'ugds_nhpi', 'ugds_2mor', 'ugds_nra', 'ugds_unkn', 'pptug_ef',
       'curroper', 'pctpell', 'pctfloan', 'ug25abv', 'md_earn_wne_p10',
       'grad_debt_mdn_supp'], dtype=object)

In [6]:
# Select the sixth label (position 5, zero-based)
columns[5]

'womenonly'

In [7]:
# Select the 2nd, 9th and 11th labels by passing a list of positions
columns[[1, 8, 10]]

Index(['city', 'satmtmid', 'ugds'], dtype='object')

In [8]:
# Slice with negative positions, counted from the end
columns[-7:-4]

Index(['pptug_ef', 'curroper', 'pctpell'], dtype='object')

In [9]:
# An Index shares many methods with Series and DataFrame
columns.min(), columns.max(), columns.isnull().sum()

('city', 'womenonly', 0)

In [10]:
# Adding a string to an Index appends it to every label and returns a new Index
columns + 'A'

Index(['instnmA', 'cityA', 'stabbrA', 'hbcuA', 'menonlyA', 'womenonlyA',
       'relaffilA', 'satvrmidA', 'satmtmidA', 'distanceonlyA', 'ugdsA',
       'ugds_whiteA', 'ugds_blackA', 'ugds_hispA', 'ugds_asianA', 'ugds_aianA',
       'ugds_nhpiA', 'ugds_2morA', 'ugds_nraA', 'ugds_unknA', 'pptug_efA',
       'curroperA', 'pctpellA', 'pctfloanA', 'ug25abvA', 'md_earn_wne_p10A',
       'grad_debt_mdn_suppA'],
      dtype='object')

In [11]:
# Comparison operators also work on an Index and yield a boolean array
columns > 'G'

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

In [13]:
# An Index is immutable, so assigning to one of its positions raises TypeError.
# The error is the point of this cell; catch and print it so that
# "Restart Kernel -> Run All" is not aborted here.
try:
    columns[1] = 'A'
except TypeError as err:
    print(f'{type(err).__name__}: {err}')

TypeError: Index does not support mutable operations

In [18]:
# Slice the Index: first four labels
c1 = columns[:4]
c1

Index(['instnm', 'city', 'stabbr', 'hbcu'], dtype='object')

In [19]:
# A second slice that overlaps c1 at positions 2 and 3
c2 = columns[2:5]
c2

Index(['stabbr', 'hbcu', 'menonly'], dtype='object')

In [20]:
# Set union of the two Index objects
c1.union(c2)

Index(['city', 'hbcu', 'instnm', 'menonly', 'stabbr'], dtype='object')

In [21]:
# The `|` operator used to be shorthand for a set union on Index objects, but
# that meaning was deprecated (FutureWarning) and, from pandas 2.0, `|` is an
# element-wise logical operation instead. Call the set method explicitly.
c1.union(c2)

  c1 | c2


Index(['city', 'hbcu', 'instnm', 'menonly', 'stabbr'], dtype='object')

In [22]:
# Symmetric difference: labels that appear in exactly one of the two Index objects
c1.symmetric_difference(c2)

Index(['city', 'instnm', 'menonly'], dtype='object')

In [23]:
# The `^` operator used to be shorthand for a set symmetric difference on Index
# objects, but that meaning was deprecated (FutureWarning) and, from pandas 2.0,
# `^` is an element-wise logical operation. Call the set method explicitly.
c1.symmetric_difference(c2)

  c1 ^ c2


Index(['city', 'instnm', 'menonly'], dtype='object')

## 求笛卡尔积
- 笛卡尔积：两个集合中所有可能的有序对组合

In [28]:
# range(4) returns a range object: a lazy, immutable *sequence* (an iterable,
# not an iterator). It stores only start/stop/step, produces its items on
# demand, and can be indexed and iterated as many times as needed.
# np.arange(4) returns a NumPy array that actually holds the values, so it can
# be sliced and used directly in vectorised arithmetic.
print(range(4))
print(np.arange(4))
# Series with a repeated index label ('a' appears three times)
s1 = pd.Series(index=list('aaab'), data=np.arange(4))
s1

range(0, 4)
[0 1 2 3]


a    0
a    1
a    2
b    3
dtype: int32

In [29]:
# Build the first of two Series whose indexes differ but share some labels
s1 = pd.Series(np.arange(4), index=['a', 'a', 'a', 'b'])
s1

a    0
a    1
a    2
b    3
dtype: int32

In [30]:
# Second Series: labels 'a' and 'b' repeat, and 'c' does not occur in s1
s2 = pd.Series(np.arange(6), index=['c', 'a', 'b', 'a', 'b', 'b'])
s2

c    0
a    1
b    2
a    3
b    4
b    5
dtype: int32

In [None]:
# Add the two Series; the repeated labels produce a Cartesian product
'''
s1 + s2 adds values after aligning them on index labels:

- pandas matches elements by index label before adding
- a label that is missing from one of the Series gives NaN
- because labels repeat, every element under a label in one Series is paired
  with every element under the same label in the other, i.e. a
  Cartesian-product style alignment

Example:

label 'a' has three values in s1: [0, 1, 2]
label 'a' has two values in s2: [1, 3]
so for 'a', s1 + s2 forms every possible pairing (3 x 2 = 6 results)

likewise label 'b' has 1 value and 3 values, which gives 3 results

label 'c' exists only in s2 and has no partner in s1, so its result is NaN
'''
s1 + s2

a    1.0
a    3.0
a    2.0
a    4.0
a    3.0
a    5.0
b    5.0
b    7.0
b    8.0
c    NaN
dtype: float64

In [32]:
# When both indexes hold the same labels in the same order, no Cartesian
# product is formed: the values are paired position by position.
s1 = pd.Series(np.arange(5), index=list('aaabb'))
s2 = pd.Series(np.arange(5), index=list('aaabb'))
s1 + s2

a    0
a    2
a    4
b    6
b    8
dtype: int32

In [33]:
# Same labels but in a different order: the Cartesian product comes back
s1 = pd.Series(np.arange(5), index=list('aaabb'))
s2 = pd.Series(np.arange(5), index=list('bbaaa'))
s1 + s2

a    2
a    3
a    4
a    3
a    4
a    5
a    4
a    5
a    6
b    3
b    4
b    4
b    5
dtype: int32

## 索引爆炸

In [2]:
# Load the employee data using the 'race' column as the row index
employee = pd.read_csv('employee.csv', index_col='race')
employee.head()

Unnamed: 0_level_0,dept,title,hire_date,salary,sex
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
White,Police,POLICE SERGEANT,2001-12-03,87545.38,Male
Hispanic,Other,ASSISTANT CITY ATTORNEY II,2010-11-15,82182.0,Male
Black,Houston Public Works,SENIOR SLUDGE PROCESSOR,2006-01-09,49275.0,Male
Hispanic,Police,SENIOR POLICE OFFICER,1997-05-27,75942.1,Male
White,Police,SENIOR POLICE OFFICER,2006-01-23,69355.26,Male


In [3]:
# Select 'salary' twice and check whether the two names refer to the same object
# NOTE(review): the result depends on whether pandas hands back the same
# column object for repeated selection -- confirm on the pandas version in use.
salary1 = employee['salary']
salary2 = employee['salary']
salary1 is salary2


True

In [4]:
# Both names pointed at one object, so changing one would change the other.
# Use copy() to get two independent Series.
salary1 = employee['salary'].copy()
salary2 = employee['salary'].copy()
salary1 is salary2


False

In [7]:
# Sort one of the two by its index so their row orders can be compared
salary1 = salary1.sort_index()
salary1.head()

race
Asian    37960.00
Asian    68116.62
Asian    72200.00
Asian    68116.62
Asian    42474.00
Name: salary, dtype: float64

In [8]:
# salary2 was not sorted, so it keeps the original row order
salary2.head()

race
White       87545.38
Hispanic    82182.00
Black       49275.00
Hispanic    75942.10
White       69355.26
Name: salary, dtype: float64

In [9]:
# Add the two Series (same labels, different order)
salary_add = salary1 + salary2
salary_add.head()

race
Asian    106076.62
Asian     74506.00
Asian     79960.00
Asian    154529.00
Asian     83751.20
Name: salary, dtype: float64

In [10]:
# Also add salary1 to itself, then compare the lengths of the inputs and of both sums
salary_add1 = salary1 + salary1
len(salary1), len(salary2), len(salary_add), len(salary_add1)

(24308, 24308, 175738920, 24308)

In [11]:
# Verify the number of values in salary_add: the Cartesian product is formed
# per shared index label, so the total should equal the sum of the squared
# label counts. Start by counting each label (NaN included).
index_vc = salary1.index.value_counts(dropna=False)
index_vc

race
Black              8661
White              7779
Hispanic           6148
Asian              1547
Native American     146
NaN                  27
Name: count, dtype: int64

In [12]:
# Square each label count and add them up
index_vc.pow(2).sum()

np.int64(175738920)

## 用不等索引填充数值

In [13]:
# Load the three season files, each indexed by playerID
baseball_14, baseball_15, baseball_16 = (
    pd.read_csv(path, index_col='playerID')
    for path in ('baseball14.csv', 'baseball15.csv', 'baseball16.csv')
)

baseball_14.head()

Unnamed: 0_level_0,yearID,stint,teamID,lgID,G,AB,R,H,2B,3B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
altuvjo01,2014,1,HOU,AL,158,660,85,225,47,3,...,59.0,56.0,9.0,36,53.0,7.0,5.0,1.0,5.0,20.0
cartech02,2014,1,HOU,AL,145,507,68,115,21,1,...,88.0,5.0,2.0,56,182.0,6.0,5.0,0.0,4.0,12.0
castrja01,2014,1,HOU,AL,126,465,43,103,21,2,...,56.0,1.0,0.0,34,151.0,1.0,9.0,1.0,3.0,11.0
corpoca01,2014,1,HOU,AL,55,170,22,40,6,0,...,19.0,0.0,0.0,14,37.0,0.0,3.0,1.0,2.0,3.0
dominma01,2014,1,HOU,AL,157,564,51,121,17,0,...,57.0,0.0,1.0,29,125.0,2.0,5.0,2.0,7.0,23.0


In [14]:
# Use Index.difference to find the labels that are in baseball_14 but no
# longer in baseball_15 (baseball_16 is checked in the next cell)
baseball_14.index.difference(baseball_15.index)

Index(['corpoca01', 'dominma01', 'fowlede01', 'grossro01', 'guzmaje01',
       'hoeslj01', 'krausma01', 'preslal01', 'singljo02'],
      dtype='object', name='playerID')

In [15]:
# Labels that are in baseball_14 but not in baseball_16
baseball_14.index.difference(baseball_16.index)

Index(['cartech02', 'corpoca01', 'dominma01', 'fowlede01', 'grossro01',
       'guzmaje01', 'hoeslj01', 'krausma01', 'preslal01', 'singljo02',
       'villajo01'],
      dtype='object', name='playerID')

In [16]:
# Pull the 'H' column out of each season's frame
hits_14, hits_15, hits_16 = (
    season['H'] for season in (baseball_14, baseball_15, baseball_16)
)

hits_14.head()

playerID
altuvjo01    225
cartech02    115
castrja01    103
corpoca01     40
dominma01    121
Name: H, dtype: int64

In [18]:
# First rows of the 2015 'H' column, for comparison with hits_14
hits_15.head()

playerID
altuvjo01    200
cartech02     78
castrja01     71
congeha01     46
correca01    108
Name: H, dtype: int64

In [17]:
# Plain + aligns on playerID; a label missing from either Series gives NaN
(hits_14 + hits_15).head()

playerID
altuvjo01    425.0
cartech02    193.0
castrja01    174.0
congeha01      NaN
corpoca01      NaN
Name: H, dtype: float64

In [19]:
# Use the add method with fill_value to avoid producing missing values
hits_14.add(hits_15, fill_value=0).head()

playerID
altuvjo01    425.0
cartech02    193.0
castrja01    174.0
congeha01     46.0
corpoca01     40.0
Name: H, dtype: float64

In [20]:
# Chain a second add to include the 2016 values as well
hits_total = hits_14.add(hits_15, fill_value=0).add(hits_16, fill_value=0)
hits_total.head()

playerID
altuvjo01    641.0
bregmal01     53.0
cartech02    193.0
castrja01    243.0
congeha01     46.0
Name: H, dtype: float64

In [21]:
# Check whether the result contains any missing values
hits_total.hasnans

False

In [22]:
# If a label is missing in *both* Series, the sum stays missing even when
# fill_value is given.
s = pd.Series([np.nan, 3, np.nan, 1], index=list('abcd'))
s1 = pd.Series([np.nan, 6, 10], index=list('abc'))
s.add(s1, fill_value=5)

a     NaN
b     9.0
c    15.0
d     6.0
dtype: float64

In [23]:
# The same addition with the operands swapped
s1.add(s, fill_value=5)

a     NaN
b     9.0
c    15.0
d     6.0
dtype: float64

In [27]:
# Whenever a row label or a column label fails to align, the result is missing.
# Pick column subsets that only partly overlap.
df_14 = baseball_14[['G','AB', 'R', 'H']]
df_15 = baseball_15[['AB', 'R', 'H', 'HR']]

In [26]:
# Preview the 2014 subset (columns G, AB, R, H)
df_14.head()

Unnamed: 0_level_0,G,AB,R,H
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altuvjo01,158,660,85,225
cartech02,145,507,68,115
castrja01,126,465,43,103
corpoca01,55,170,22,40
dominma01,157,564,51,121


In [25]:
# Preview the 2015 subset (columns AB, R, H, HR)
df_15.head()

Unnamed: 0_level_0,AB,R,H,HR
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
altuvjo01,638,86,200,15
cartech02,391,50,78,24
castrja01,337,38,71,11
congeha01,201,25,46,11
correca01,387,52,108,22


In [28]:
# Add the frames with + and highlight the missing cells in yellow
(df_14 + df_15).head(10).style.highlight_null('yellow')

Unnamed: 0_level_0,AB,G,H,HR,R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,1298.0,,425.0,,171.0
cartech02,898.0,,193.0,,118.0
castrja01,802.0,,174.0,,81.0
congeha01,,,,,
corpoca01,,,,,
correca01,,,,,
dominma01,,,,,
fowlede01,,,,,
gattiev01,,,,,
gomezca01,,,,,


In [29]:
# Even with fill_value=0 some cells stay missing, because certain row/column
# combinations do not exist in either input
df_14.add(df_15, fill_value=0).head(10).style.highlight_null('yellow')

Unnamed: 0_level_0,AB,G,H,HR,R
playerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
altuvjo01,1298.0,158.0,425.0,15.0,171.0
cartech02,898.0,145.0,193.0,24.0,118.0
castrja01,802.0,126.0,174.0,11.0,81.0
congeha01,201.0,,46.0,11.0,25.0
corpoca01,170.0,55.0,40.0,,22.0
correca01,387.0,,108.0,22.0,52.0
dominma01,564.0,157.0,121.0,,51.0
fowlede01,434.0,116.0,120.0,,61.0
gattiev01,566.0,,139.0,27.0,66.0
gomezca01,149.0,,36.0,4.0,19.0


## 从不同的dataframe追加列

In [30]:
# Keep the first row of every department after sorting salaries from high to
# low, i.e. the highest-paid row per department.
employee = pd.read_csv('employee.csv')
dept_sal = (
    employee[['dept', 'salary']]
    .sort_values(by=['dept', 'salary'], ascending=[True, False])
)
# drop_duplicates removes repeated rows; subset restricts the comparison to
# the 'dept' column only.
max_dept_sal = dept_sal.drop_duplicates(subset=['dept'])
max_dept_sal.head()

Unnamed: 0,dept,salary
1732,Fire,342784.0
8405,Health & Human Services,186685.0
3897,Houston Airport System,275000.0
10704,Houston Public Works,275000.0
7564,Library,170000.0


In [31]:
# Index both frames by 'dept', then add the column: the assigned Series is
# aligned to employee's rows by department label.
max_dept_sal = max_dept_sal.set_index(keys='dept')
employee = employee.set_index(keys='dept')
employee['max_dept_salary'] = max_dept_sal['salary']

In [32]:
# Show at most 6 columns when a frame is displayed
pd.set_option('display.max_columns', 6)

In [34]:
# Preview employee with the new max_dept_salary column
employee.head()

Unnamed: 0_level_0,title,hire_date,salary,sex,race,max_dept_salary
dept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Police,POLICE SERGEANT,2001-12-03,87545.38,Male,White,280000.0
Other,ASSISTANT CITY ATTORNEY II,2010-11-15,82182.0,Male,Hispanic,275000.0
Houston Public Works,SENIOR SLUDGE PROCESSOR,2006-01-09,49275.0,Male,Black,275000.0
Police,SENIOR POLICE OFFICER,1997-05-27,75942.1,Male,Hispanic,280000.0
Police,SENIOR POLICE OFFICER,2006-01-23,69355.26,Male,White,280000.0


In [35]:
# Sanity check: select any row whose salary exceeds its department maximum
employee.query('salary > max_dept_salary')

Unnamed: 0_level_0,title,hire_date,salary,sex,race,max_dept_salary
dept,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [39]:
# Draw 10 rows from dept_sal without replacement (sample's default, so a row
# is picked at most once) and index them by department.
# The seed value 1234 is arbitrary; passing it as random_state keeps the draw
# reproducible without reseeding NumPy's global random generator.
random_salary = dept_sal.sample(n=10, random_state=1234).set_index('dept')
random_salary

Unnamed: 0_level_0,salary
dept,Unnamed: 1_level_1
Fire,51194.0
Health & Human Services,44283.0
Other,37398.0
Solid Waste Management,39874.0
Police,98117.24
Parks & Recreation,40685.0
Solid Waste Management,32760.0
Other,59999.94
Fire,66522.56
Fire,40170.26


In [40]:
# random_salary has repeated 'dept' labels, so one employee label would have
# to map to several random_salary rows and the assignment raises ValueError.
# The error is the point of this cell; catch and print it so that
# "Restart Kernel -> Run All" is not aborted here.
try:
    employee['random_salary'] = random_salary['salary']
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

ValueError: cannot reindex on an axis with duplicate labels

In [41]:
# Assign only the first three department maxima to a new max_salary2 column
employee['max_salary2'] = max_dept_sal['salary'].iloc[:3]

In [43]:
# Count how many rows received each of the assigned values
employee.max_salary2.value_counts()

max_salary2
342784.0    4376
186685.0    1353
275000.0    1216
Name: count, dtype: int64

In [45]:
# Only three departments were filled in, so every other department is missing;
# the mean of isnull() is the share of missing rows
employee.max_salary2.isnull().mean()

np.float64(0.7142915912456804)

## 高亮每列的最大值

In [49]:
# Show at most 8 rows when a frame or Series is displayed
pd.set_option('display.max_rows', 8)

In [51]:
# Reload the college data with the 'instnm' column as the row index and
# inspect the column dtypes
college = pd.read_csv('college.csv', index_col='instnm')
college.dtypes

city                   object
stabbr                 object
hbcu                  float64
menonly               float64
                       ...   
pctfloan              float64
ug25abv               float64
md_earn_wne_p10        object
grad_debt_mdn_supp     object
Length: 26, dtype: object

In [56]:
# grad_debt_mdn_supp and md_earn_wne_p10 have object dtype; inspecting a value
# shows that they hold strings
college.md_earn_wne_p10.iloc[0]

'30300'

In [55]:
# Same check for the first value of grad_debt_mdn_supp
college.grad_debt_mdn_supp.iloc[0]

'33888'

In [57]:
# Sort in descending order to surface the non-numeric entries
college.md_earn_wne_p10.sort_values(ascending=False).head()

instnm
Sharon Regional Health System School of Nursing          PrivacySuppressed
P&A Scholars Beauty School                               PrivacySuppressed
Fairview Beauty Academy                                  PrivacySuppressed
Rabbi Jacob Joseph School                                PrivacySuppressed
Acupuncture and Integrative Medicine College-Berkeley    PrivacySuppressed
Name: md_earn_wne_p10, dtype: object

In [58]:
# to_numeric() force-converts a column; errors='coerce' turns every value that
# cannot be parsed into NaN. Apply it to both object-typed columns at once.
cols = ['md_earn_wne_p10', 'grad_debt_mdn_supp']
college[cols] = college[cols].apply(pd.to_numeric, errors='coerce')

college[cols].dtypes

md_earn_wne_p10       float64
grad_debt_mdn_supp    float64
dtype: object

In [59]:
# Keep only the numeric columns via select_dtypes
college_n = college.select_dtypes(include='number')
college_n.head()

Unnamed: 0_level_0,hbcu,menonly,womenonly,...,ug25abv,md_earn_wne_p10,grad_debt_mdn_supp
instnm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Alabama A & M University,1.0,0.0,0.0,...,0.1049,30300.0,33888.0
University of Alabama at Birmingham,0.0,0.0,0.0,...,0.2422,39700.0,21941.5
Amridge University,0.0,0.0,0.0,...,0.854,40100.0,23370.0
University of Alabama in Huntsville,0.0,0.0,0.0,...,0.264,45500.0,24097.0
Alabama State University,1.0,0.0,0.0,...,0.127,26600.0,33118.5


In [60]:
# Some columns hold only two distinct values; use nunique() to flag them
criteria = college_n.nunique() == 2
criteria.head()

hbcu          True
menonly       True
womenonly     True
relaffil      True
satvrmid     False
dtype: bool

In [61]:
# Filter the boolean Series by itself and take the surviving labels, giving
# the list of two-valued (binary) columns
binary_cols = criteria[criteria].index.tolist()
binary_cols

['hbcu', 'menonly', 'womenonly', 'relaffil', 'distanceonly', 'curroper']

In [None]:
# Remove the binary columns with drop
college_n2 = college_n.drop(columns=binary_cols)
college_n2.head()