# pandas入门

## 1、pandas数据结构介绍

### 1.1、Series

In [2]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame

In [12]:
# A Series is a one-dimensional, array-like object that carries a label
# (its index) for every value, which makes it behave much like a dict.
obj = pd.Series([4, 7, -5, 3])

# .values exposes the data as a numpy.ndarray; because no index was given,
# pandas supplies a RangeIndex equivalent to range(4).
print(obj, obj.values, type(obj.values), obj.index, type(obj.index), sep='\n')

# Takeaway: .values returns the data (an ndarray) and .index the labels.
# This note is a real comment on purpose: a bare string literal as the last
# expression of a cell would be echoed as the cell's Out value.

0    4
1    7
2   -5
3    3
dtype: int64
[ 4  7 -5  3]
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=4, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>


In [42]:
# The index can also be supplied explicitly; its type is then a plain Index
# rather than a RangeIndex.
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
print(obj2, obj2.index, type(obj2.index), sep='\n')
print('\n\n')
print(obj2['a'], obj2['c'])

# Assigning through [:] overwrites every value in place and keeps the labels.
obj2[:] = 10
print(obj2)

# To replace all values at once the [:] selector is required ...
obj2[:] = np.arange(4)
# ... whereas a plain assignment only binds a name: obj3 is a bare ndarray.
# (The former `obj3=obj2` was a dead store, overwritten on the same line.)
obj3 = np.arange(4)
print(obj2, obj3)

d    4
b    7
a   -5
c    3
dtype: int64
Index(['d', 'b', 'a', 'c'], dtype='object')
<class 'pandas.core.indexes.base.Index'>



-5 3
d    10
b    10
a    10
c    10
dtype: int64
d    0
b    1
a    2
c    3
dtype: int64 [0 1 2 3]


In [8]:
# Boolean indexing, scalar multiplication and NumPy ufuncs all work on a
# Series and preserve the index labels.
# The Series is rebuilt here so the cell does not depend on execution order:
# the cell above overwrites obj2 in place, so on a fresh top-to-bottom run the
# values would otherwise be 0..3 instead of the ones this demo is about.
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
print(obj2[obj2 > 2], obj2 * 2, np.exp(obj2), sep='\n\n')

d    4
b    7
c    3
dtype: int64

d     8
b    14
a   -10
c     6
dtype: int64

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [112]:
# A dict converts directly into a Series: the keys become the index labels.
dict1 = {'Ohio': 3500, 'Texas': 7100, 'Oregon': 1600, 'Uyh': 500}
obj3 = pd.Series(dict1)
print(obj3, end='\n\n')
print(dict1, end='\n\n')

# Passing an index fixes which labels appear and in what order.
# 'California' has no entry in the dict, so its value is NaN; 'Uyh' is not in
# the requested index, so it is left out of the result.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(dict1, index=states)
print(obj4)

# Missing data is detected with pd.isnull / pd.notnull; the same check is
# available both as a top-level function and as a Series method.
print('\n')
print(pd.isnull(obj4))
print(obj4.isnull())

Ohio      3500
Texas     7100
Oregon    1600
Uyh        500
dtype: int64

{'Ohio': 3500, 'Texas': 7100, 'Oregon': 1600, 'Uyh': 500}

California       NaN
Ohio          3500.0
Oregon        1600.0
Texas         7100.0
dtype: float64


California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool


In [113]:
# Both a Series and its index carry a `name` attribute.
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)

# The index of a Series can be replaced wholesale by assignment ...
obj4.index = ['a', 'b', 'c', 'd']
print(obj4)
index = obj4.index
print(index)

# ... but an Index object itself is immutable: item assignment raises
# TypeError. The error is the point of the demo, so it is caught and shown
# instead of being left to abort a "Restart & Run All".
try:
    index[1] = 'c'
except TypeError as err:
    print('TypeError:', err)

state
California       NaN
Ohio          3500.0
Oregon        1600.0
Texas         7100.0
Name: population, dtype: float64
a       NaN
b    3500.0
c    1600.0
d    7100.0
Name: population, dtype: float64
Index(['a', 'b', 'c', 'd'], dtype='object')


TypeError: Index does not support mutable operations

### 1.2、DataFrame

In [9]:
# A DataFrame is a rectangular table with an ordered collection of columns;
# it can be thought of as a dict of Series that all share one index.
# Here it is built from a dict of equal-length lists.
data = {
    'state': ['a', 'b', 'c', 'd', 'e', 'f'],
    'year': [2000, 2001, 2002, 2003, 2004, 2005],
    'pop': [1.5, 1.7, 3.6, 2.4, 1.4, 4.5],
}
frame = pd.DataFrame(data)
print(data)
print(frame)

# head() keeps only the first five rows.
print()
print(frame.head())

# `columns` fixes the column order and `index` the row labels. A requested
# column that is absent from the dict ('debt') is filled with NaN.
print()
row_labels = ['one', 'two', 'three', 'four', 'five', 'six']
frame1 = pd.DataFrame(data,
                      columns=['year', 'state', 'pop', 'debt'],
                      index=row_labels)
print(frame1)

# A column is retrieved by key or by attribute; attribute access only works
# when the column name is a valid Python identifier.
print(frame1['state'])
print(frame1['year'])
# A row is retrieved by label through .loc.
print(frame1.loc['three'])

# Columns can be overwritten by assignment.
print()
frame1['debt'] = 16                # a scalar is broadcast to every row
print(frame1)
frame1['debt'] = np.arange(6)      # an array must match the number of rows
print(frame1)

# Assigning a Series aligns on the index: 'seven' does not exist in frame1
# and is dropped, while rows missing from the Series become NaN.
val = pd.Series([-1, 1.5, 1.7, 8], index=['two', 'four', 'five', 'seven'])
frame1['debt'] = val
print(frame1)
frame1['op'] = val                 # assigning to an unknown column creates it
print(frame1)

'''
DataFrame可以用pd.DataFrame()进行初始化，其中可以传入多种数据类型
'''
# pd.DataFrame() accepts many input types; this one is a 2-D ndarray.
frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                     index=list('acd'),
                     columns=['Ohio', 'Texas', 'California'])
print('..........\n', frame)

{'state': ['a', 'b', 'c', 'd', 'e', 'f'], 'year': [2000, 2001, 2002, 2003, 2004, 2005], 'pop': [1.5, 1.7, 3.6, 2.4, 1.4, 4.5]}
  state  year  pop
0     a  2000  1.5
1     b  2001  1.7
2     c  2002  3.6
3     d  2003  2.4
4     e  2004  1.4
5     f  2005  4.5

  state  year  pop
0     a  2000  1.5
1     b  2001  1.7
2     c  2002  3.6
3     d  2003  2.4
4     e  2004  1.4

       year state  pop debt
one    2000     a  1.5  NaN
two    2001     b  1.7  NaN
three  2002     c  3.6  NaN
four   2003     d  2.4  NaN
five   2004     e  1.4  NaN
six    2005     f  4.5  NaN
one      a
two      b
three    c
four     d
five     e
six      f
Name: state, dtype: object
one      2000
two      2001
three    2002
four     2003
five     2004
six      2005
Name: year, dtype: int64
year     2002
state       c
pop       3.6
debt      NaN
Name: three, dtype: object

       year state  pop  debt
one    2000     a  1.5    16
two    2001     b  1.7    16
three  2002     c  3.6    16
four   2003     d  2.4    

In [53]:
# The `del` keyword also removes a column from a DataFrame, in place.
# frame1 (with its 'op' column) comes from the DataFrame-construction cell
# above. NOTE(review): the cell is not idempotent - re-running it after 'op'
# is gone would presumably fail on the `del`.
print(frame1)
del frame1['op']
print(frame1)

       year state  pop  debt   op
one    2000     a  1.5   NaN  NaN
two    2001     b  1.7  -1.0 -1.0
three  2002     c  3.6   NaN  NaN
four   2003     d  2.4   1.5  1.5
five   2004     e  1.4   1.7  1.7
six    2005     f  4.5   NaN  NaN
       year state  pop  debt
one    2000     a  1.5   NaN
two    2001     b  1.7  -1.0
three  2002     c  3.6   NaN
four   2003     d  2.4   1.5
five   2004     e  1.4   1.7
six    2005     f  4.5   NaN


In [36]:
# A dict of dicts also converts to a DataFrame: the outer keys become the
# columns and the inner keys the row labels.
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
print(frame3)
print(frame3.T)            # .T swaps rows and columns

print('2')
# An explicit index selects the rows; 2003 has no data and becomes NaN.
frame3 = pd.DataFrame(pop, index=[2001, 2002, 2003])
print(frame3)

# A dict of Series works as well; the Series are aligned on their index.
print('3')
pdata = {'Ohio': frame3['Ohio'][:-1], 'Nevada': frame3['Nevada']}
frame4 = pd.DataFrame(pdata)
print(frame4)

print('4')
# Overwrite the 'Ohio' column by assigning to the frame itself. The earlier
# version wrote through a selected column (`ob1 = frame4['Ohio']; ob1[:] = ...`)
# and relied on that selection being a view; with pandas Copy-on-Write the
# parent frame would no longer change and later cells would show stale values.
# float64 keeps the dtype the column already had.
frame4['Ohio'] = np.arange(3, dtype='float64')
ob1 = frame4['Ohio']
print(ob1, frame4, sep='\n')

print('5')
# This only sets an ordinary Python attribute on the object: it is not part
# of the data and pandas never displays it.
frame4.name = '1'
print(frame4.name)

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5
        2001  2002  2000
Nevada   2.4   2.9   NaN
Ohio     1.7   3.6   1.5
2
      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2003     NaN   NaN
3
      Ohio  Nevada
2001   1.7     2.4
2002   3.6     2.9
2003   NaN     NaN
4
2001    0.0
2002    1.0
2003    2.0
Name: Ohio, dtype: float64
      Ohio  Nevada
2001   0.0     2.4
2002   1.0     2.9
2003   2.0     NaN
5
1


In [107]:
# Bare last expression: the notebook renders frame4 as a rich table.
frame4

Unnamed: 0,Ohio,Nevada
2001,0.0,2.4
2002,1.0,2.9
2003,2.0,


In [109]:
# Both axes of a DataFrame carry a name: one for the row index and one for
# the column index. They show up as header labels when the frame is displayed.
frame4.index.name='year'
frame4.columns.name='state'
frame4

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,0.0,2.4
2002,1.0,2.9
2003,2.0,


In [110]:
# .values returns the contents of the DataFrame as a two-dimensional ndarray.
frame4.values

array([[0. , 2.4],
       [1. , 2.9],
       [2. , nan]])

### 1.3、索引对象

In [117]:
# An "index object" is the type that holds the axis labels of a Series or
# DataFrame. It supports positional access and slicing like a sequence.
index = frame4.index
print(index, index[0:2], sep='\n')

# Index objects are immutable: item assignment raises TypeError. The error is
# the demonstration, so it is caught and printed rather than left to stop a
# "Restart & Run All".
try:
    index[2] = 'c'
except TypeError as err:
    print('TypeError:', err)

Index([2001, 2002, 2003], dtype='int64', name='year')
Index([2001, 2002], dtype='int64', name='year')


TypeError: Index does not support mutable operations

In [119]:
# Membership tests work on an index object much as they do on a set, but
# unlike a set an Index may contain duplicate labels.
print('c'in index)

# An Index built from repeated labels (it is only constructed here, not shown).
label=pd.Index(['foo','foo','bar','bar'])

False


## 2、基本功能

### 2.1、重建索引

In [120]:
# reindex builds a NEW object whose labels follow the requested index.
# Existing values are rearranged to match; a label with no value ('e') is NaN.
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj2 = obj.reindex(list('abcde'))
print(obj)
print(obj2)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [7]:
# The `method` argument of reindex fills the gaps that reindexing creates.
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
print()

# 'ffill' propagates the last seen value forward into the new labels.
re_obj3 = obj3.reindex(range(6), method='ffill')
print(re_obj3)

0      blue
2    purple
4    yellow
dtype: object

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object


In [22]:
'''
对DataFrame使用reindex方法，可以改变行、列索引。
仅传入一个序列时，就只会重建行索引
'''
# reindex on a DataFrame can change the row index, the column index, or both.
# When a single sequence is passed positionally, only the rows are reindexed.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('acd'),
    columns=['Ohio', 'Texas', 'California'],
)
frame2 = frame.reindex(list('abcd'))          # new row 'b' is all NaN
states = ['Texas', 'Utah', 'California']
frame3 = frame.reindex(columns=states)        # new column 'Utah' is all NaN
print(frame)
print(frame2)
print(frame3)


# Label-based selection with .loc picks rows and columns in one step.
frame4 = frame2.loc[list('abcd'), ['Texas', 'California']]
print(frame4)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8
   Texas  California
a    1.0         2.0
b    NaN         NaN
c    4.0         5.0
d    7.0         8.0


### 2.2、删除行或列

In [27]:
# drop removes one or more entries from an axis and returns a new object.

# On a Series it removes index labels.
obj = pd.Series(np.arange(5), index=list('abcde'))
obj1 = obj.drop('c')        # the entry labelled 'c' is gone; obj is unchanged
print(obj)
print(obj1)

a    0
b    1
c    2
d    3
e    4
dtype: int32
a    0
b    1
d    3
e    4
dtype: int32


In [34]:
# drop works on either axis of a DataFrame.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# Several rows can be removed in one call.
data1 = data.drop(index=['Colorado', 'Ohio'])
# Columns are removed by naming the column axis (same as axis='columns' / axis=1).
data2 = data.drop(columns='two')
print(data)
print(data1)
print(data2)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15
          one  three  four
Ohio        0      2     3
Colorado    4      6     7
Utah        8     10    11
New York   12     14    15


In [35]:
# drop can also modify the original object directly instead of returning a
# new one: with inplace=True the call returns None, which is what data1 is
# bound to and what the second printed value shows.
# NOTE(review): this mutates `data` from the previous cell, so the cell is
# presumably not safe to run twice once 'Ohio' has been removed.
data1=data.drop('Ohio',inplace=True)      # inplace=True is all that is needed
print(data,data1,sep='\n\n')

          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15

None


### 2.3、索引、选择与过滤

In [2]:
# Series indexing resembles NumPy array indexing, but a Series can be indexed
# by label as well as by integer position.
obj = pd.Series(np.arange(4), index=['a', 'b', 'c', 'd'])

# Positional access goes through .iloc. Plain obj[2] / obj[[1, 3]] on a
# string-labelled Series relies on a positional fallback that pandas has
# deprecated, so the explicit accessor is used instead.
print('用整数索引', obj.iloc[2], obj.iloc[2:4], obj.iloc[[1, 3]], sep='\n', end='\n\n')
print('用index索引', obj['b'], obj[['b', 'c']], sep='\n', end='\n\n')
print('也支持bool索引', obj[obj < 2], sep='\n', end='\n\n')
# Unlike ordinary Python slices, a label slice includes its end point.
print('也支持索引切片', obj['b':'c'], sep='\n', end='\n\n')

# Writing through a label slice. NOTE(review): whether this reaches `obj`
# depends on the slice being a view; under pandas Copy-on-Write it presumably
# leaves `obj` untouched - confirm against the pandas version in use.
obj1 = obj['b':'c']
obj1[:] = 5
print(obj)

# Selecting with a list of labels produces a new object, so this write never
# changes `obj`.
obj1 = obj[['b', 'c']]
obj1[:] = 10
print(obj)

用整数索引
2
c    2
d    3
dtype: int32
b    1
d    3
dtype: int32

用index索引
1
b    1
c    2
dtype: int32

也支持bool索引
a    0
b    1
dtype: int32

也支持索引切片
b    1
c    2
dtype: int32

a    0
b    5
c    5
d    3
dtype: int32
a    0
b    5
c    5
d    3
dtype: int32


In [6]:
# Indexing a DataFrame with a name or a list of names selects columns.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print('对列索引', data['two'], data[['three', 'one']], sep='\n', end='\n\n')

# Slices and boolean masks select rows instead.
# data['three'] > 5 is a boolean Series; passing it back to data[...] keeps
# the rows where it is True. data < 5 is a boolean DataFrame; indexing with it
# keeps matching cells and turns the rest into NaN.
three_above_5 = data['three'] > 5
print('对行索引', data[:2], data[three_above_5], data[data < 5], sep='\n', end='\n\n')
print(three_above_5)

对列索引
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12

对行索引
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Ohio      0.0  1.0    2.0   3.0
Colorado  4.0  NaN    NaN   NaN
Utah      NaN  NaN    NaN   NaN
New York  NaN  NaN    NaN   NaN

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool


#### 2.3.1、使用loc和iloc选择数据

In [29]:
# .loc and .iloc index a DataFrame with NumPy-like (row, column) notation:
# .loc takes axis labels, .iloc takes integer positions.
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
print('原数据：', data, sep='\n', end='\n\n')
print('使用loc',
      data.loc['Colorado', ['two', 'three']],
      data.loc[:'Utah', 'two'],
      sep='\n', end='\n\n')
print('使用iloc',
      data.iloc[2, [3, 0, 1]],
      data.iloc[[1, 2], [3, 0, 1]],
      data.iloc[:, :3][data.three > 5],
      sep='\n', end='\n\n')
# Selecting a single row or a single column of a DataFrame yields a Series.
# (Written as a comment: the bare string that used to end this cell was
# echoed as the cell's Out value.)


原数据：
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15

使用loc
two      5
three    6
Name: Colorado, dtype: int32
Ohio        1
Colorado    5
Utah        9
Name: two, dtype: int32

使用iloc
four    11
one      8
two      9
Name: Utah, dtype: int32
          four  one  two
Colorado     7    4    5
Utah        11    8    9
          one  two  three
Colorado    4    5      6
Utah        8    9     10
New York   12   13     14



'\nDataFrame中选出的一列或一行都会变成Series\n'

In [30]:
# Experiment: which selections from a DataFrame share memory with it (views)
# and which are independent copies, probed by writing through the selection
# and re-printing the parent frame.
# NOTE(review): view-versus-copy behaviour is a pandas implementation detail;
# the results presumably differ between pandas versions and under
# Copy-on-Write, so treat the observations below as version-specific.
data=pd.DataFrame(np.arange(16).reshape((4,4)),
                 index=['Ohio','Colorado','Utah','New York'],
                 columns=['one','two','three','four'])
data1=data.copy()                                    # independent snapshot; a plain data1=data would only bind a second name to the same object
# One row restricted to a list of columns, then overwritten.
op=data.loc['Colorado',['two','three']]
op[:]=5
print(data)
# One whole column, then overwritten.
op=data.loc[:,'two']
op[:]=5
print(data)
# A label slice of several columns, then overwritten.
op=data.loc[:,'two':'four']
op[:]=10
print(data)                                          # in the recorded run only the single-column write reached `data`; the multi-column selection behaved as a copy


print('\n\n\n')
# Continue from the untouched snapshot; `data` and `data1` now name the same object.
data=data1
print(data)
# Several rows picked with a list of labels, then overwritten.
op=data.loc[['Ohio','Colorado'],:]
op[:]=5
print(data)
# A single row picked by label, then overwritten.
op=data1.loc['Colorado',:]
op[:]=10
print(data1)                                          # in the recorded run the list-of-rows write did not reach the frame, but the single-row write did


print('\n\n\n')
data=pd.DataFrame(np.arange(16).reshape((4,4)),
                 index=['Ohio','Colorado','Utah','New York'],
                 columns=['one','two','three','four'])
# Plain assignment does not copy: data1 is another name for the same DataFrame,
# so the column written through data1 is what the print of `data` reads.
data1=data
data1['two']=10
print(data)                                           # same object under two names, not a copy

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Ohio        0    5      2     3
Colorado    4    5      6     7
Utah        8    5     10    11
New York   12    5     14    15
          one  two  three  four
Ohio        0    5      2     3
Colorado    4    5      6     7
Utah        8    5     10    11
New York   12    5     14    15




          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
          one  two  three  four
Ohio        0    1      2     3
Colorado   10   10     10    10
Utah        8    9     10    11
New York   12   13     14    15




          one  two  three  four


### 2.4、整数索引

In [34]:
# With the default integer index, ser[-1] is looked up as the LABEL -1, which
# does not exist, so it raises KeyError rather than returning the last item.
# For unambiguous access use .loc (labels) or .iloc (positions).
ser = pd.Series(np.arange(3))

# The KeyError is the demonstration; catch and show it so the notebook still
# runs from top to bottom.
try:
    print(ser[-1])
except KeyError as err:
    print('KeyError:', err)

KeyError: -1

In [33]:
# With a non-integer index there is no label ambiguity, but plain ser2[-1]
# relies on a positional fallback that pandas has deprecated. .iloc states the
# positional intent explicitly and returns the same last element.
ser2 = pd.Series(np.arange(3), index=['a', 'b', 'c'])
print(ser2.iloc[-1])

2


### 2.5、算术和数据对齐

In [37]:
# Adding two pandas objects aligns them on their labels: values at shared
# labels are summed, and a label missing from either side yields NaN.
s1 = pd.Series(np.arange(4), index=list('acde'))
s2 = pd.Series(np.arange(5), index=list('acefg'))
print(s1.add(s2))

'''
在DataFrame中，行和列上都会对齐
'''
# For DataFrames the alignment happens on both the rows and the columns.
data1 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=['b', 'c', 'd'],
)
data2 = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
print(data1.add(data2))

a    0.0
c    2.0
d    NaN
e    5.0
f    NaN
g    NaN
dtype: float64
            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


#### 2.5.1、使用填充值的算术

In [45]:
# The fill_value argument of the arithmetic methods substitutes a value for a
# label that exists in one operand but not in the other, instead of giving NaN.
data1 = pd.DataFrame(np.arange(12).reshape(3, 4), columns=['a', 'b', 'c', 'd'])
data2 = pd.DataFrame(np.arange(20).reshape(4, 5), columns=['a', 'b', 'c', 'd', 'e'])
data2.loc[1, 'b'] = np.nan

# Shown in order: both inputs, data1 + data2, data1 again (the operation does
# not modify its operands), data2 + data1, and data2 again.
for shown in (
    data1,
    data2,
    data1.add(data2, fill_value=0),
    data1,
    data2.add(data1, fill_value=0),
    data2,
):
    print(shown)

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
    a     b   c   d   e
0   0   1.0   2   3   4
1   5   NaN   7   8   9
2  10  11.0  12  13  14
3  15  16.0  17  18  19
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0
   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0
    a     b   c   d   e
0   0   1.0   2   3   4
1   5   NaN   7   8   9
2  10  11.0  12  13  14
3  15  16.0  17  18  19


In [47]:
# reindex accepts fill_value too: columns that are new to data1 are filled
# with 0 rather than NaN. reindex returns a new object; data1 is unchanged.
data3 = data1.reindex(columns=data2.columns, fill_value=0)
print(data1, data3, sep='\n')

   a  b   c   d
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
   a  b   c   d  e
0  0  1   2   3  0
1  4  5   6   7  0
2  8  9  10  11  0


In [49]:
# The flexible arithmetic methods include add, sub, div, floordiv, mul and pow.
print(data1.sub(2))
print(data1.rsub(2))      # each method has an 'r' counterpart with the operands swapped: here 2 - data1

   a  b  c  d
0 -2 -1  0  1
1  2  3  4  5
2  6  7  8  9
   a  b  c  d
0  2  1  0 -1
1 -2 -3 -4 -5
2 -6 -7 -8 -9


#### 2.5.2、DataFrame和Series间的操作

In [9]:
# Operations between a DataFrame and a Series work like operations between
# NumPy arrays of different dimensions (broadcasting).
arr = np.arange(12).reshape(3, 4)
arr1 = arr[0]
# The first row is subtracted from every row of the 2-D array.
print(np.subtract(arr, arr1))

[[0 0 0 0]
 [4 4 4 4]
 [8 8 8 8]]
[0 4 8]


In [11]:
# By default a Series is matched against the COLUMNS of the DataFrame and the
# operation is repeated for every row.
data = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=['b', 'd', 'e'],
)
series = data.iloc[0]
print(data)
print(series)
print(data - series)

# To match on the row index instead (broadcasting across the columns), use
# the arithmetic method and name the axis; 'index' and 0 mean the same thing.
series = data['d']
for row_axis in ('index', 0):
    print(data.sub(series, axis=row_axis))

        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11
b    0
d    1
e    2
Name: Utah, dtype: int32
        b  d  e
Utah    0  0  0
Ohio    3  3  3
Texas   6  6  6
Oregon  9  9  9
        b  d  e
Utah   -1  0  1
Ohio   -1  0  1
Texas  -1  0  1
Oregon -1  0  1
        b  d  e
Utah   -1  0  1
Ohio   -1  0  1
Texas  -1  0  1
Oregon -1  0  1


### 2.6、函数应用和映射

In [14]:
# NumPy ufuncs (element-wise array functions) also work on pandas objects,
# e.g. np.abs.
# The random values come from a seeded generator so that the printed numbers
# are the same on every run of the notebook.
rng = np.random.default_rng(42)
data = pd.DataFrame(rng.standard_normal((4, 3)),
                    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
                    columns=list('bde'))
print(data)
data1 = np.abs(data)
print(data1)

               b         d         e
Utah    1.341858 -1.074797 -0.443409
Ohio    0.842533 -0.993740 -0.168209
Texas  -1.287755  0.925004  1.242492
Oregon -1.519845  0.497270 -0.524634
               b         d         e
Utah    1.341858  1.074797  0.443409
Ohio    0.842533  0.993740  0.168209
Texas   1.287755  0.925004  1.242492
Oregon  1.519845  0.497270  0.524634


In [16]:
# DataFrame.apply runs a function on each one-dimensional slice of the frame
# (every column, or every row).
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()

print(data.apply(f))            # default: f receives each column
print(data.apply(f, axis=1))    # axis=1 ('columns'): f receives each row

b    2.861703
d    1.999801
e    1.767126
dtype: float64
Utah      2.416655
Ohio      1.836273
Texas     2.530247
Oregon    2.017116
dtype: float64


In [17]:
# The applied function does not have to return a scalar: it may return a
# Series, in which case the results are assembled into a DataFrame.
def f(x):
    """Return the minimum and maximum of ``x`` as a Series labelled 'min' and 'max'."""
    return pd.Series({'min': x.min(), 'max': x.max()})

print(data.apply(f))

            b         d         e
min -1.519845 -1.074797 -0.524634
max  1.341858  0.925004  1.242492


In [25]:
# A function can also be applied to every single element of a DataFrame.
format1 = lambda x: '%.2f' % x                # old-style string formatting: two decimals

# DataFrame.applymap is deprecated in favour of DataFrame.map; use the new
# name when the installed pandas has it and fall back to applymap otherwise.
elementwise = data.map if hasattr(data, 'map') else data.applymap
print(elementwise(format1))

# Series.map is the element-wise counterpart for a Series.
series = data.iloc[0]
print(series.map(format1))

            b      d      e
Utah     1.34  -1.07  -0.44
Ohio     0.84  -0.99  -0.17
Texas   -1.29   0.93   1.24
Oregon  -1.52   0.50  -0.52
b     1.34
d    -1.07
e    -0.44
Name: Utah, dtype: object


### 2.7、排序与排名

In [6]:
# sort_index sorts by the row or column labels and returns a new, sorted
# object; the original is left as it was.
obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])
print(obj)
print(obj.sort_index())
print()

# On a DataFrame the axis to sort along can be chosen.
data = pd.DataFrame(
    np.arange(8).reshape(2, 4),
    index=['three', 'one'],
    columns=['d', 'a', 'b', 'c'],
)
print(data)
print(data.sort_index())                   # default: sort the row labels
print(data.sort_index(axis='columns'))     # sort the column labels
print(data.sort_index(ascending=False))    # descending order of the row labels

d    0
a    1
b    2
c    3
dtype: int64
a    1
b    2
c    3
d    0
dtype: int64

       d  a  b  c
three  0  1  2  3
one    4  5  6  7
       d  a  b  c
one    4  5  6  7
three  0  1  2  3
       a  b  c  d
three  1  2  3  0
one    5  6  7  4
       d  a  b  c
three  0  1  2  3
one    4  5  6  7


In [12]:
# To order a Series by its values rather than by its labels, use sort_values.
obj = pd.Series([4, 2, 5, 9, -1])
print(obj)
descending = obj.sort_values(ascending=False)
print(descending)

# By default missing values (NaN) are placed at the end of the result.
obj = pd.Series([4, 6, 4, 6, 3, 7, np.nan, 8, np.nan])
print(obj.sort_values())

0    4
1    2
2    5
3    9
4   -1
dtype: int64
3    9
2    5
0    4
1    2
4   -1
dtype: int64
4    3.0
0    4.0
2    4.0
1    6.0
3    6.0
5    7.0
7    8.0
6    NaN
8    NaN
dtype: float64


In [18]:
# sort_values works on a DataFrame as well; `by` names one or more columns
# (or, with axis=1, row labels) to use as sort keys.
data = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(data)

by_b = data.sort_values(by='b')               # order the rows by column 'b'
print(by_b)
by_a_then_b = data.sort_values(by=['a', 'b']) # by 'a' first, ties broken by 'b'
print(by_a_then_b)
by_row_0 = data.sort_values(by=0, axis=1)     # order the COLUMNS by the values in row 0
print(by_row_0)

   b  a
0  4  0
1  7  1
2 -3  0
3  2  1
   b  a
2 -3  0
3  2  1
0  4  0
1  7  1
   b  a
2 -3  0
0  4  0
3  2  1
1  7  1
   a  b
0  0  4
1  1  7
2  0 -3
3  1  2


In [24]:
# rank assigns a rank to every value of a Series or DataFrame.
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
# Ranks run from smallest to largest: position 0 gets 6.5 and position 1 gets
# 1.0. Equal values share the mean of the ranks they span by default.
print(obj.rank())
# method='first': among equal values the one seen first gets the lower rank.
print(obj.rank(method='first'))
# Ranking can also run from largest to smallest.
print(obj.rank(method='first', ascending=False))

'''对于DataFrame也同样适用'''
# The same applies to a DataFrame.
data = pd.DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})
print(data)
print(data.rank(axis='index'))                       # rank within each column, ascending
print(data.rank(axis='columns', ascending=False))    # rank within each row, descending

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64
0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64
0    1.0
1    7.0
2    2.0
3    3.0
4    5.0
5    6.0
6    4.0
dtype: float64
     b  a    c
0  4.3  0 -2.0
1  7.0  1  5.0
2 -3.0  0  8.0
3  2.0  1 -2.5
     b    a    c
0  3.0  1.5  2.0
1  4.0  3.5  3.0
2  1.0  1.5  4.0
3  2.0  3.5  1.0
     b    a    c
0  1.0  2.0  3.0
1  1.0  3.0  2.0
2  3.0  2.0  1.0
3  1.0  2.0  3.0


### 2.8、含有重复标签的轴索引

In [30]:
# Index labels need not be unique. is_unique reports whether they are, and
# selecting a duplicated label returns every matching entry as a Series.
obj = pd.Series(np.arange(4), index=['a', 'b', 'c', 'c'])
labels_unique = obj.index.is_unique
print(labels_unique)
print(obj['c'])

False
c    2
c    3
dtype: int32


## 3、描述性统计的概述与计算

In [39]:
# pandas objects come with a set of common mathematical and statistical
# methods (reductions and summaries).
data = pd.DataFrame(
    [[1.4, np.nan],
     [7.1, -4.5],
     [np.nan, np.nan],
     [0.75, -1.3]],
    index=list('abcd'),
    columns=['one', 'two'],
)
print(data)
print(data.sum())                   # default axis 0: one total per column
print(data.sum(axis='columns'))     # one total per row; NaN is skipped by default
print(data.sum(skipna=False))       # skipna=False lets NaN propagate into the result

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
one    9.25
two   -5.80
dtype: float64
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
one   NaN
two   NaN
dtype: float64


In [41]:
# Some methods, such as idxmin and idxmax, return indirect statistics: the
# index label at which the minimum / maximum occurs rather than the value.
# NOTE(review): row 'c' of `data` is entirely NaN; the recorded output shows
# NaN for it, but newer pandas versions presumably warn or raise for all-NA
# input - confirm against the pandas version in use.
print(data.idxmin())
print(data.idxmin(axis=1))

one    d
two    b
dtype: object
a    one
b    two
c    NaN
d    two
dtype: object


In [42]:
# describe produces several summary statistics in a single call.
print(data.describe())

            one       two
count  3.000000  2.000000
mean   3.083333 -2.900000
std    3.493685  2.262742
min    0.750000 -4.500000
25%    1.075000 -3.700000
50%    1.400000 -2.900000
75%    4.250000 -2.100000
max    7.100000 -1.300000


### 3.1、相关性和协方差

In [2]:
# Download stock-market data for the correlation examples that follow.
# This cell needs network access and the third-party packages
# pandas_datareader and yfinance.
import pandas_datareader.data as web
import yfinance as yf
# NOTE(review): presumably this patches pandas_datareader so that
# get_data_yahoo is served by yfinance - confirm it still exists in the
# installed yfinance version.
yf.pdr_override()
# One DataFrame of daily quotes per ticker, for these four companies.
all_data={ticker:web.get_data_yahoo(ticker,start='2023-05-08',end='2023-05-18')
         for ticker in ['AAPL','IBM','MSFT','GOOG']}
# Keep only the 'Adj Close' column of every ticker: one column per company.
price=pd.DataFrame({ticker:data['Adj Close']
                   for ticker,data in all_data.items()})

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [3]:
# all_data is a dict; each key (a ticker) maps to one DataFrame of quotes.
print(all_data)

{'AAPL':                   Open        High         Low       Close   Adj Close  \
Date                                                                     
2023-05-08  172.479996  173.850006  172.110001  173.500000  173.026688   
2023-05-09  173.050003  173.539993  171.600006  171.770004  171.301422   
2023-05-10  173.020004  174.029999  171.899994  173.559998  173.086533   
2023-05-11  173.850006  174.589996  172.169998  173.750000  173.276016   
2023-05-12  173.619995  174.059998  171.000000  172.570007  172.337280   
2023-05-15  173.160004  173.210007  171.470001  172.070007  171.837967   
2023-05-16  171.990005  173.139999  171.800003  172.070007  171.837967   
2023-05-17  171.710007  172.929993  170.419998  172.690002  172.457123   

              Volume  
Date                  
2023-05-08  55962800  
2023-05-09  45326900  
2023-05-10  53724500  
2023-05-11  49514700  
2023-05-12  45497800  
2023-05-15  37266700  
2023-05-16  42110300  
2023-05-17  57951600  , 'IBM':             

In [4]:
# One 'Adj Close' column per ticker, indexed by date.
print(price)

                  AAPL         IBM        MSFT        GOOG
Date                                                      
2023-05-08  173.026688  120.354980  307.326050  108.239998
2023-05-09  171.301422  119.791466  305.683136  107.940002
2023-05-10  173.086533  120.631790  310.970367  112.279999
2023-05-11  173.276016  119.524536  308.779785  116.900002
2023-05-12  172.337280  121.442467  307.644714  117.919998
2023-05-15  171.837967  121.956551  308.132599  116.959999
2023-05-16  171.837967  122.055412  310.402802  120.089996
2023-05-17  172.457123  124.279816  313.336609  121.480003


In [5]:
# Change of every row relative to the row before it, as a fraction (not
# multiplied by 100); the first row has no predecessor and is NaN.
price.pct_change()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-05-08,,,,
2023-05-09,-0.009971,-0.004682,-0.005346,-0.002772
2023-05-10,0.010421,0.007015,0.017296,0.040207
2023-05-11,0.001095,-0.009179,-0.007044,0.041147
2023-05-12,-0.005418,0.016046,-0.003676,0.008725
2023-05-15,-0.002897,0.004233,0.001586,-0.008141
2023-05-16,0.0,0.000811,0.007368,0.026761
2023-05-17,0.003603,0.018225,0.009452,0.011575


In [6]:
# Show the last five rows.
price.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-05-11,173.276016,119.524536,308.779785,116.900002
2023-05-12,172.33728,121.442467,307.644714,117.919998
2023-05-15,171.837967,121.956551,308.132599,116.959999
2023-05-16,171.837967,122.055412,310.402802,120.089996
2023-05-17,172.457123,124.279816,313.336609,121.480003


In [9]:
# Series.corr computes the correlation between two Series and Series.cov the
# corresponding covariance.
returns=price.pct_change()
print(returns['MSFT'].corr(returns['IBM']))
print(returns['MSFT'].cov(returns['IBM']))

0.45241451269853156
4.098258674134573e-05


In [10]:
# On a DataFrame, corr and cov return the full matrix over all column pairs.
print(returns.corr())
print(returns.cov())

          AAPL       IBM      MSFT      GOOG
AAPL  1.000000  0.228347  0.804885  0.724082
IBM   0.228347  1.000000  0.452415 -0.241846
MSFT  0.804885  0.452415  1.000000  0.332007
GOOG  0.724082 -0.241846  0.332007  1.000000
          AAPL       IBM      MSFT      GOOG
AAPL  0.000043  0.000015  0.000047  0.000094
IBM   0.000015  0.000102  0.000041 -0.000048
MSFT  0.000047  0.000041  0.000080  0.000059
GOOG  0.000094 -0.000048  0.000059  0.000390


In [14]:
# DataFrame.corrwith correlates the columns (or rows) of a DataFrame with
# another Series or DataFrame.
print(returns.corrwith(returns.IBM))           # Series argument: every column is correlated with that Series
print(returns.corrwith(returns))               # DataFrame argument: columns are paired up by name and correlated
print(returns.corrwith(returns,axis=1))        # axis=1 does the same row by row

AAPL    0.228347
IBM     1.000000
MSFT    0.452415
GOOG   -0.241846
dtype: float64
AAPL    1.0
IBM     1.0
MSFT    1.0
GOOG    1.0
dtype: float64
Date
2023-05-08    NaN
2023-05-09    1.0
2023-05-10    1.0
2023-05-11    1.0
2023-05-12    1.0
2023-05-15    1.0
2023-05-16    1.0
2023-05-17    1.0
dtype: float64


### 3.2、唯一值、计数和成员属性

In [15]:
# unique returns the distinct values of a Series as an array, in the order
# in which they first appear.
obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
print(obj)
print(uniques)

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object
['c' 'a' 'd' 'b']


In [19]:
# value_counts tallies how often each value occurs; the result is ordered by
# count, largest first.
print(obj.value_counts())

# For a plain array or sequence, wrap it in a Series first. The top-level
# pd.value_counts function used here before is deprecated in recent pandas.
print(pd.Series(obj.to_numpy()).value_counts())

c    3
a    3
b    2
d    1
Name: count, dtype: int64
c    3
a    3
b    2
d    1
Name: count, dtype: int64


In [24]:
# isin tests, element by element, whether the values of a Series or DataFrame
# are contained in a given collection.
mask=obj.isin(['b','c'])                       # boolean Series, one entry per element of obj
print(mask,type(mask))
# On a DataFrame the result is a boolean DataFrame of the same shape; the
# collection here holds the single value at row 0, column 3 of `price`.
mask=price.isin([price.iloc[0,3]])
print(mask,type(mask))

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool <class 'pandas.core.series.Series'>
             AAPL    IBM   MSFT   GOOG
Date                                  
2023-05-08  False  False  False   True
2023-05-09  False  False  False  False
2023-05-10  False  False  False  False
2023-05-11  False  False  False  False
2023-05-12  False  False  False  False
2023-05-15  False  False  False  False
2023-05-16  False  False  False  False
2023-05-17  False  False  False  False <class 'pandas.core.frame.DataFrame'>


In [30]:
# Index.get_indexer converts values into integer positions within an Index.
to_match = pd.Series(list('cabbcadett'))
unique_vals = pd.Series(list('cba'))

# Every value of to_match is looked up among unique_vals and replaced by the
# position at which it is found there; -1 marks a value that is not present.
lookup = pd.Index(unique_vals)
lookup.get_indexer(to_match)

array([ 0,  2,  1,  1,  0,  2, -1, -1, -1, -1], dtype=int64)

In [34]:
# Build a histogram of a DataFrame: count how often each value occurs in
# every column.
data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],
                     'Qu2': [2, 3, 1, 2, 3],
                     'Qu3': [1, 5, 2, 4, 4]})

# pd.Series.value_counts is applied to each column. The top-level
# pd.value_counts function used here before is deprecated in recent pandas.
counts = data.apply(pd.Series.value_counts)
print(counts)
# A value that never occurs in a column shows up as NaN; fillna(0) turns
# those gaps into zero counts.
print(counts.fillna(0))

   Qu1  Qu2  Qu3
1  1.0  1.0  1.0
2  NaN  2.0  1.0
3  2.0  2.0  NaN
4  2.0  NaN  2.0
5  NaN  NaN  1.0
   Qu1  Qu2  Qu3
1  1.0  1.0  1.0
2  0.0  2.0  1.0
3  2.0  2.0  0.0
4  2.0  0.0  2.0
5  0.0  0.0  1.0
