In [1]:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Options:	'all', 'last', 'last_expr', 'none', 'last_expr_or_assign'
# Default:	'last_expr'

# Series

In [11]:
# Build a Series from a plain list; no index is given, so pandas supplies a default one.
obj = Series([4,7,-5,3])
obj

pandas.core.series.Series

In [4]:
obj.values  # the underlying data
obj.index   # the index labels (a default RangeIndex here)

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Same data as before, this time labelled with an explicit string index.
obj2 = Series(data=[4, 7, -5, 3], index=['b', 'd', 'a', 'c'])
obj2.index

Index(['b', 'd', 'a', 'c'], dtype='object')

In [14]:
type(obj2['a']) # selecting a single label yields a NumPy scalar
obj2[['b','d']] # selecting a list of labels yields a two-element Series

b    4
d    7
dtype: int64

In [16]:
obj20 = obj2 > 0 # element-wise comparison yields a boolean Series, much like a NumPy array
obj20

b     True
d     True
a    False
c     True
dtype: bool

In [17]:
obj2[obj20] # boolean mask: keeps only the entries where obj20 is True

b    4
d    7
c    3
dtype: int64

In [21]:
# NOTE(review): this exponentiates the boolean mask obj20 (True -> e, False -> 1),
# which matches the recorded float16 result. The original note described e raised
# to the power of obj2, so np.exp(obj2) was probably intended -- confirm.
np.exp(obj20)

b    2.71875
d    2.71875
a    1.00000
c    2.71875
dtype: float16

In [23]:
'e' in obj2 # membership on a Series is tested against its index labels

False

In [33]:
dict1 = {'a':1000,'c':200,'f':-10,'e':0}
dict1 # Q: how to auto-display dict1 as if it were the last line, without print()? A: ast_node_interactivity = 'all' (set in the first cell)
# Series(dict1) # original note: the resulting Series has a fixed element order because it is sorted by index by default, even though dict entries are unordered (version-dependent -- verify)

{'a': 1000, 'c': 200, 'e': 0, 'f': -10}

In [35]:
states = ['c','a','r','e']
obj4 = Series(dict1,index = states) # the index follows the order given in states; a label absent from dict1 ('r') becomes NaN
obj4

c     200.0
a    1000.0
r       NaN
e       0.0
dtype: float64

In [39]:
obj4.isnull() # or equivalently:
pd.isnull(obj4) # the inverse is:
obj4.notnull() # or equivalently:
pd.notnull(obj4)

c     True
a     True
r    False
e     True
dtype: bool

In [47]:
obj4 + obj2 # operands are aligned on index labels and the result comes back sorted by label; labels present on one side only give NaN

a    995.0
b      NaN
c    203.0
d      NaN
e      NaN
r      NaN
dtype: float64

In [49]:
obj4.name = 'population'
obj4.index.name = 'states'
obj4

states
c     200.0
a    1000.0
r       NaN
e       0.0
Name: population, dtype: float64

# DataFrame

## 基本

In [53]:
# Column-oriented input: each key is a column name, each list holds that column's values.
data = {'state':['ohio','ohio','nevada'],
        'year':[2001,2001,2000],
        'pop':[1.5,1.6,1.7]}
data # original note: in principle the dict entries are unordered, yet the Jupyter output shows them in a fixed order

{'pop': [1.5, 1.6, 1.7],
 'state': ['ohio', 'ohio', 'nevada'],
 'year': [2001, 2001, 2000]}

In [78]:
df1 = DataFrame(data)
df1.index.name = 'id' # naming the index works the same way as for a Series
df1.name = 'American' # NOTE: DataFrame has no `name` property; this only sets a plain Python attribute, which is why nothing is displayed
df1 # the columns come out in a fixed order, matching the order in which data was displayed above

Unnamed: 0_level_0,pop,state,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1.5,ohio,2001
1,1.6,ohio,2001
2,1.7,nevada,2000


In [54]:
# columns= fixes the column order, index= supplies the row labels.
df2 = DataFrame(data,columns = ['year','pop','state'],
               index = ['one','two','there'],
                )  
df2 # avoid print() here: the plain-text table is much uglier than the rich display

Unnamed: 0,year,pop,state
one,2001,1.5,ohio
two,2001,1.6,ohio
there,2000,1.7,nevada


In [55]:
df2['pop'] # attribute access also works for columns:
df2.year # but not df2.pop, because DataFrame.pop is a built-in method (it removes a column and returns the removed data)

one      1.5
two      1.6
there    1.7
Name: pop, dtype: float64

one      2001
two      2001
there    2000
Name: year, dtype: int64

In [94]:
df2.pop('year') # removes column 'year' from df2 in place and returns it

one      2001
two      2001
there    2000
Name: year, dtype: int64

In [95]:
df2

Unnamed: 0,pop,state
one,1.5,ohio
two,1.6,ohio
there,1.7,nevada


In [97]:
df2['id'] = np.arange(len(df2)) # assigning to a new key appends an 'id' column -- very convenient
df2

Unnamed: 0,pop,state,id
one,1.5,ohio,0
two,1.6,ohio,1
there,1.7,nevada,2


In [102]:
# `+` binds tighter than `>`, so this stores (pop + id) > 1.6 as a boolean column.
df2['test'] = df2['pop'] + df2['id'] > 1.6
del df2['test'] # del removes the column again
df2

Unnamed: 0,pop,state,id
one,1.5,ohio,0
two,1.6,ohio,1
there,1.7,nevada,2


In [105]:
# A dict of dicts: outer keys become columns, inner keys become row labels.
# A label missing from one inner dict (2003 / 2004) shows up as NaN in that column.
# (Original note: this suddenly feels like MongoDB rather than a relational table.)
data2 = {
    'nevada': {2001: 2.4, 2002: 1.3, 2003: 1.9},
    'steven': {2001: 2.1, 2002: 1.4, 2004: 2.0},
}
df3 = DataFrame(data=data2)
df3

Unnamed: 0,nevada,steven
2001,2.4,2.1
2002,1.3,1.4
2003,1.9,
2004,,2.0


In [113]:
df3.transpose() # unlike ndarray.transpose, no axes argument is needed; equivalent shorthand:
df3.T

Unnamed: 0,2001,2002,2003,2004
nevada,2.4,1.3,1.9,
steven,2.1,1.4,,2.0


In [116]:
# An explicit index selects which row labels are taken from the inner dicts (2004 is left out).
df3 = DataFrame(data2,index = [2001,2002,2003])
df3

Unnamed: 0,nevada,steven
2001,2.4,2.1
2002,1.3,1.4
2003,1.9,


In [126]:
# A dict of Series also works; [:2] keeps the first two rows, [:-2] everything but the last two.
data3 = {'ohio':df3['nevada'][:2],
         'steve':df3['steven'][:-2]}
df4 = DataFrame(data3)
df4.index.name = 'year';df4.columns.name = 'state' # both axes can be named
df4

state,ohio,steve
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,2.1
2002,1.3,


In [133]:
df2.values

array([[2001, 1.5, 'ohio'],
       [2001, 1.6, 'ohio'],
       [2000, 1.7, 'nevada']], dtype=object)

## index objects

In [136]:
obj = Series(range(3),index = ['a','b','c'])
idex1 = obj.index # row/column indexes behave much like sets, so operations on them (whose operands must also be Index objects) become easy to reason about

In [150]:
labels = pd.Index(range(3))
obj5 = Series([1.3,1.5,1.9],index = labels)
obj5.index is labels # `is` tests object identity (same Index object); it is NOT an element-wise comparison
(obj5.index == labels).all() # `==` compares element-wise and yields a boolean array; .all() reduces it to a single bool

True

In [152]:
'20' in idex1

False

In [166]:
idex2 = idex1.append(pd.Index(['d'])) # append returns a new Index instead of modifying idex1
idex1 is idex2

False

In [168]:
obj5.reindex([0,'b','c']) # labels that do not exist in obj5's index ('b', 'c') come back as NaN

0    1.3
b    NaN
c    NaN
dtype: float64

In [51]:
df7 = DataFrame(np.arange(9).reshape((3,3)),
                index = ['a','c','ohio'],
                columns = ['ohio','texas','california'])
# reindex builds a new object and leaves df7 itself unchanged.
# Labels missing from df7 are filled with fill_value.
# `limit` is only meaningful together with a fill method ('ffill' propagates the
# previous valid row forward, 'bfill' the next one backward); passing it without a
# method is rejected by pandas, so it is not passed here. `copy = True` was the
# default and is omitted as well.
df7.reindex(index = ['a','ohio','c','d','e','f','g','h','i'],
            columns = ['ohio','california','texas'],
            #method = 'bfill', # cannot be used when both rows and columns need newly filled labels
            fill_value = np.nan)

Unnamed: 0,ohio,california,texas
a,0.0,2.0,1.0
ohio,6.0,8.0,7.0
c,3.0,5.0,4.0
d,,,
e,,,
f,,,
g,,,
h,,,
i,,,


In [21]:
df7.index = pd.Index(['a','c','ohio']) # an Index cannot be edited one element at a time like a list; to change it, replace the whole Index

In [217]:
obj6 = Series(np.arange(5),index = ['a','b','c','d','e'])
new_obj = obj6.pop('c') # operates in place: removes label 'c' from obj6 and returns the removed value
new_obj

2

In [216]:
obj6.drop('d',inplace = True) # removes the entry in place like pop above, but only because inplace=True is passed; drop defaults to inplace=False

In [213]:
obj6

a    0
b    1
e    4
dtype: int32

In [194]:
df7.pop('ohio') # works on columns: removes column 'ohio' and returns the removed data

a       0
c       3
ohio    6
Name: ohio, dtype: int32

In [59]:
df7.drop(labels = ['ohio'],axis = 1) # axis=1 drops the *column* 'ohio' and returns a new frame (not in place by default); the result looks like df7 after the pop above
                          # pass inplace = True to modify df7 itself
                          # without an axis argument, drop works on rows (axis=0)
# DataFrame.drop(labels=None, axis=0, level=None, inplace=False, errors='raise')  

Unnamed: 0,texas,california
a,1,2
c,4,5
ohio,7,8


## pandas.style

In [2]:
# 4x7 frame of random samples; no seed is set, so the values differ on every run.
df = DataFrame(np.random.randn(28).reshape(4,7))
df

Unnamed: 0,0,1,2,3,4,5,6
0,0.153375,1.150032,-0.127145,0.782154,2.134165,1.389647,0.318714
1,0.447143,0.225145,0.535606,1.892001,0.091651,0.013733,0.686879
2,0.487633,0.757918,1.962323,-1.269642,-0.447111,0.944311,-1.523954
3,0.135916,2.824728,0.698221,-0.18122,-1.983121,0.547549,-0.084368


In [3]:
def show_color(val):
    """Return a CSS colour declaration for a single cell value.

    Negative values map to red; everything else (zero included) maps to green.
    """
    shade = 'green'
    if val < 0:
        shade = 'red'
    return 'color:{}'.format(shade)

In [4]:
df.style.applymap(show_color) # apply the colour rule above to every cell of the table

Unnamed: 0,0,1,2,3,4,5,6
0,0.153375,1.15003,-0.127145,0.782154,2.13417,1.38965,0.318714
1,0.447143,0.225145,0.535606,1.892,0.0916509,0.0137334,0.686879
2,0.487633,0.757918,1.96232,-1.26964,-0.447111,0.944311,-1.52395
3,0.135916,2.82473,0.698221,-0.18122,-1.98312,0.547549,-0.0843683


In [52]:
# NOTE(review): the 'row_N' column labels only exist after the add_prefix('row_') cell
# further down (this cell has the higher execution count) -- run that cell first.
# subset restricts the styling; pd.IndexSlice[rows, columns] picks the cells.
df.style.applymap( show_color,subset = pd.IndexSlice[[0,1],['row_1','row_3']] )
df.style.applymap( show_color, subset = pd.IndexSlice[[3,2],'row_4'] )

Unnamed: 0,row_0,row_1,row_2,row_3,row_4,row_5,row_6
0,0.153375,1.15003,-0.127145,0.782154,2.13417,1.38965,0.318714
1,0.447143,0.225145,0.535606,1.892,0.0916509,0.0137334,0.686879
2,0.487633,0.757918,1.96232,-1.26964,-0.447111,0.944311,-1.52395
3,0.135916,2.82473,0.698221,-0.18122,-1.98312,0.547549,-0.0843683


Unnamed: 0,row_0,row_1,row_2,row_3,row_4,row_5,row_6
0,0.153375,1.15003,-0.127145,0.782154,2.13417,1.38965,0.318714
1,0.447143,0.225145,0.535606,1.892,0.0916509,0.0137334,0.686879
2,0.487633,0.757918,1.96232,-1.26964,-0.447111,0.944311,-1.52395
3,0.135916,2.82473,0.698221,-0.18122,-1.98312,0.547549,-0.0843683


In [25]:
# NOTE(review): df is rebound to the prefixed frame, so re-running this cell prefixes the labels again.
df = df.add_prefix('row_') 
df.style.applymap( show_color, subset = ['row_0','row_1'] ) # a plain list of labels as subset selects whole columns

Unnamed: 0,row_0,row_1,row_2,row_3,row_4,row_5,row_6
0,0.153375,1.15003,-0.127145,0.782154,2.13417,1.38965,0.318714
1,0.447143,0.225145,0.535606,1.892,0.0916509,0.0137334,0.686879
2,0.487633,0.757918,1.96232,-1.26964,-0.447111,0.944311,-1.52395
3,0.135916,2.82473,0.698221,-0.18122,-1.98312,0.547549,-0.0843683


In [30]:
# Highlight the minimum value(s) of each column
def show_min_column(val):
    """Return one CSS string per element of ``val``.

    Elements equal to ``val.min()`` get a red background; all others get an
    empty string (no styling).
    """
    is_lowest = (val == val.min())
    styles = []
    for flag in is_lowest:
        styles.append('background-color:red' if flag else '')
    return styles

df.style.apply(show_min_column)

Unnamed: 0,row_0,row_1,row_2,row_3,row_4,row_5,row_6
0,0.153375,1.15003,-0.127145,0.782154,2.13417,1.38965,0.318714
1,0.447143,0.225145,0.535606,1.892,0.0916509,0.0137334,0.686879
2,0.487633,0.757918,1.96232,-1.26964,-0.447111,0.944311,-1.52395
3,0.135916,2.82473,0.698221,-0.18122,-1.98312,0.547549,-0.0843683


In [42]:
# There is also a built-in way to highlight each column's minimum:
df.style.highlight_min(axis = 0 ,color = 'red') 

Unnamed: 0,row_0,row_1,row_2,row_3,row_4,row_5,row_6
0,0.153375,1.15003,-0.127145,0.782154,2.13417,1.38965,0.318714
1,0.447143,0.225145,0.535606,1.892,0.0916509,0.0137334,0.686879
2,0.487633,0.757918,1.96232,-1.26964,-0.447111,0.944311,-1.52395
3,0.135916,2.82473,0.698221,-0.18122,-1.98312,0.547549,-0.0843683


In [34]:
# Combine both forms: colour the text of every cell, then apply the minimum highlight to rows 2 and 3 only
df.style.applymap(show_color).apply(show_min_column,subset = pd.IndexSlice[[2,3],:] )  

Unnamed: 0,row_0,row_1,row_2,row_3,row_4,row_5,row_6
0,0.153375,1.15003,-0.127145,0.782154,2.13417,1.38965,0.318714
1,0.447143,0.225145,0.535606,1.892,0.0916509,0.0137334,0.686879
2,0.487633,0.757918,1.96232,-1.26964,-0.447111,0.944311,-1.52395
3,0.135916,2.82473,0.698221,-0.18122,-1.98312,0.547549,-0.0843683


In [39]:
df.style.bar(color = 'blue') # in-cell bar chart; original note: within each column the minimum gets an empty bar and the maximum a full one

Unnamed: 0,row_0,row_1,row_2,row_3,row_4,row_5,row_6
0,0.153375,1.15003,-0.127145,0.782154,2.13417,1.38965,0.318714
1,0.447143,0.225145,0.535606,1.892,0.0916509,0.0137334,0.686879
2,0.487633,0.757918,1.96232,-1.26964,-0.447111,0.944311,-1.52395
3,0.135916,2.82473,0.698221,-0.18122,-1.98312,0.547549,-0.0843683


### operation

In [9]:
# np.arange is used much like the built-in range
s1 = Series(np.arange(5), index=list('abcde'))
s2 = Series(np.arange(10, 15), index=list('abcdf'))
s1
s2
s1 + s2                      # operator form; equivalent method calls follow
s1.add(s2, fill_value=0)     # method form, here with a fill value for missing labels
s1.radd(s2)                  # reversed-operand method form

a    0
b    1
c    2
d    3
e    4
dtype: int32

a    10
b    11
c    12
d    13
f    14
dtype: int32

a    10.0
b    12.0
c    14.0
d    16.0
e     NaN
f     NaN
dtype: float64

a    10.0
b    12.0
c    14.0
d    16.0
e     4.0
f    14.0
dtype: float64

a    10.0
b    12.0
c    14.0
d    16.0
e     NaN
f     NaN
dtype: float64

In [63]:
s2.add(3) # the scalar is added to every element; the reverse spelling 3.add(s2) cannot be used

a    13
b    14
c    15
d    16
f    17
dtype: int32

In [30]:
# Two frames whose row and column labels only partly overlap.
df10 = DataFrame(
    np.arange(12).reshape(3, 4),
    index=['a', 'b', 'c'],
    columns=['ohio', 'texas', 'colorado', 'oregan'],
)
df11 = DataFrame(
    np.arange(9.).reshape(3, 3),
    index=['a', 'b', 'd'],
    columns=['ohio', 'texas', 'utah'],
)
df10
df11
# Addition aligns on both axes; any cell missing from either frame is NaN.
df11 + df10

Unnamed: 0,ohio,texas,colorado,oregan
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11


Unnamed: 0,ohio,texas,utah
a,0.0,1.0,2.0
b,3.0,4.0,5.0
d,6.0,7.0,8.0


Unnamed: 0,colorado,ohio,oregan,texas,utah
a,,0.0,,2.0,
b,,7.0,,9.0,
c,,,,,
d,,,,,


In [71]:
df11.loc['a','ohio'] = np.nan # overwrite a single cell, addressed by row and column label
df11

Unnamed: 0,ohio,texas,utah
a,,1.0,2.0
b,3.0,4.0,5.0
d,6.0,7.0,8.0


In [73]:
df10.add(df11,fill_value = 0) # a value missing on one side is treated as 0; cells missing on both sides stay NaN

Unnamed: 0,colorado,ohio,oregan,texas,utah
a,2.0,0.0,3.0,2.0,2.0
b,6.0,7.0,7.0,9.0,5.0
c,10.0,8.0,11.0,9.0,
d,,6.0,,7.0,8.0


In [5]:
df10 / 1 # equivalent to
df10.div(1) # also equivalent to
df12 = DataFrame(np.ones_like(df10),index = df10.index,
                columns = df10.columns)
df12.rdiv(df10) # rdiv reverses the operands, i.e. df10 / df12
# other arithmetic methods: add sub floordiv mul pow

Unnamed: 0,ohio,texas,colorado,oregan
a,0.0,1.0,2.0,3.0
b,4.0,5.0,6.0,7.0
c,8.0,9.0,10.0,11.0


Unnamed: 0,ohio,texas,colorado,oregan
a,0.0,1.0,2.0,3.0
b,4.0,5.0,6.0,7.0
c,8.0,9.0,10.0,11.0


Unnamed: 0,ohio,texas,colorado,oregan
a,0.0,1.0,2.0,3.0
b,4.0,5.0,6.0,7.0
c,8.0,9.0,10.0,11.0


In [14]:
df10.add(s1,axis = 0) # axis=0 matches s1's labels against the row index; original note: s1.add(df10) cannot be used instead

Unnamed: 0,ohio,texas,colorado,oregan
a,0.0,1.0,2.0,3.0
b,5.0,6.0,7.0,8.0
c,10.0,11.0,12.0,13.0
d,,,,
e,,,,


In [13]:
df10.add(s1,axis = 'index') # same as axis=0; by default DataFrame + Series matches on axis='columns'

Unnamed: 0,ohio,texas,colorado,oregan
a,0.0,1.0,2.0,3.0
b,5.0,6.0,7.0,8.0
c,10.0,11.0,12.0,13.0
d,,,,
e,,,,


In [19]:
df10.drop('a',axis = 'index',inplace = True)

In [22]:
df10.drop('c') # returns a new frame without row 'c' (only the row index contains 'c' here); what if rows and columns share a label?
df10.columns = list('abcd') # relabel the columns so that 'b' and 'c' exist in both rows and columns

Unnamed: 0,ohio,texas,colorado,oregan
b,4,5,6,7


In [23]:
df10

Unnamed: 0,a,b,c,d
b,4,5,6,7
c,8,9,10,11


In [24]:
df10.drop('b',inplace = True) # when the label exists in both rows and columns, drop uses the row index by default

In [26]:
df10.drop('c',inplace = True)

In [38]:
df10[:1] # slicing with [] selects rows by default
df10.loc[:,:'texas'] # .loc label slices include the end label ('texas')

Unnamed: 0,ohio,texas,colorado,oregan
a,0,1,2,3


Unnamed: 0,ohio,texas
a,0,1
b,4,5
c,8,9


In [46]:
df10.at['a','texas'] # how does .at differ from .loc? (.at is the accessor for exactly one row/column label pair)
df10.loc['a','texas'] # 
df10.at['a','texas'] == df10.loc['a','texas'] # when a single element is selected, both return the same value

1

1

True

In [50]:
df10.drop('ohio',inplace = True,axis = 'columns') # drop targets the row index by default, hence the explicit axis='columns'

In [52]:
df10.columns = list('abc')

In [56]:
df10.loc['a']

a    1
b    2
c    3
Name: a, dtype: int32

In [59]:
df10

Unnamed: 0,a,b,c
a,1,2,3
b,5,6,7
c,9,10,11


In [72]:
# NOTE(review): relies on the row label 'ohio', which is only assigned by the `df10.index = [...]` cell below.
df10.sub(df10.loc['ohio'],axis = 1) # axis=1 (the default) matches the Series labels against the columns; labels that do not match are unioned

Unnamed: 0,a,b,c
ohio,0,0,0
colifonia,4,4,4
texas,8,8,8


In [65]:
df10.index = ['ohio','colifonia','texas']

In [71]:
df10.sub(df10.loc['ohio'])

Unnamed: 0,a,b,c
ohio,0,0,0
colifonia,4,4,4
texas,8,8,8


In [73]:
# 4x3 frame of random samples (no seed set); note that the name df12 was already used in an earlier cell.
df12 = DataFrame(np.random.randn(4,3),
                 index = list('abcd'),
                columns = ['ohio','texas','califnia'])
df12

Unnamed: 0,ohio,texas,califnia
a,0.861856,0.345762,-0.463403
b,-0.23942,1.270962,1.61821
c,-0.778999,-1.478413,-1.161025
d,-0.229906,1.121958,0.027531


In [74]:
np.abs(df12)

Unnamed: 0,ohio,texas,califnia
a,0.861856,0.345762,0.463403
b,0.23942,1.270962,1.61821
c,0.778999,1.478413,1.161025
d,0.229906,1.121958,0.027531


In [78]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()
df12.apply(f,axis = 1) # axis=1 calls f once per row; the default is axis=0 (once per column)

a    1.325259
b    1.857630
c    0.699414
d    1.351864
dtype: float64

In [84]:
def f(x):
    """Return the maximum and minimum of ``x`` as a Series labelled 'max' and 'min'."""
    extremes = [x.max(), x.min()]
    return Series(extremes, index=['max', 'min'])
df12.apply(f,axis = 1) 

Unnamed: 0,max,min
a,0.861856,-0.463403
b,1.61821,-0.23942
c,-0.778999,-1.478413
d,1.121958,-0.229906


In [81]:
def f(x):
    """Format ``x`` as a string with two decimal places."""
    return '{0:.2f}'.format(x)
df12.applymap(f)

Unnamed: 0,ohio,texas,califnia
a,0.86,0.35,-0.46
b,-0.24,1.27,1.62
c,-0.78,-1.48,-1.16
d,-0.23,1.12,0.03


In [83]:
df12.loc['a'].map(f)

ohio         0.86
texas        0.35
califnia    -0.46
Name: a, dtype: object

In [87]:
df12.sort_index(axis = 1,ascending=False)

Unnamed: 0,texas,ohio,califnia
a,0.345762,0.861856,-0.463403
b,1.270962,-0.23942,1.61821
c,-1.478413,-0.778999,-1.161025
d,1.121958,-0.229906,0.027531


In [95]:
df12.texas.sort_index(ascending=False).rank(method = 'first') # ranking of the values (1 = smallest); method='first' ranks tied values in order of appearance

d    3.0
c    1.0
b    4.0
a    2.0
Name: texas, dtype: float64

In [97]:
df12.rank(axis = 1)

Unnamed: 0,ohio,texas,califnia
a,3.0,2.0,1.0
b,1.0,2.0,3.0
c,3.0,1.0,2.0
d,1.0,3.0,2.0


In [99]:
df12.index = list('abc') + ['c']

In [102]:
df12.loc['c'] # with duplicate index labels, every matching row is returned

Unnamed: 0,ohio,texas,califnia
c,-0.778999,-1.478413,-1.161025
c,-0.229906,1.121958,0.027531


In [105]:
df12.index.is_unique # the is_unique attribute is available on both the row index and the column index

False

In [115]:
# 4x2 frame with a single missing value (column 'one', row 'c').
df13 = DataFrame(
    [[1, 2], [3, 4], [np.nan, 5], [6, 7]],
    index=pd.Index(['a', 'b', 'c', 'd']),
    columns=['one', 'two'],
)
df13

Unnamed: 0,one,two
a,1.0,2
b,3.0,4
c,,5
d,6.0,7


In [117]:
df13.sum(axis = 1)

a     3.0
b     7.0
c     5.0
d    13.0
dtype: float64

In [118]:
df13.sum(axis=0,skipna = False)

one     NaN
two    18.0
dtype: float64

In [123]:
df13.idxmin(axis = 1) # column label of the *minimum* value in each row (idxmax locates the maximum)

a    one
b    one
c    two
d    one
dtype: object

In [128]:
df13.pct_change() # fractional change of each value relative to the previous row

Unnamed: 0,one,two
a,,
b,2.0,1.0
c,,0.25
d,1.0,0.4


In [136]:
df13.corrwith(df13.two)

one    1.0
two    1.0
dtype: float64

In [140]:
df12.ohio.unique()

array([ 0.86185634, -0.2394198 , -0.77899893, -0.22990649])

In [150]:
df12.ohio.value_counts(sort = True) # count how often each value occurs; without this method one would have to write:
df12.groupby('ohio',as_index = True ).count()[['texas']] # far less simple than the line above

-0.239420    1
-0.778999    1
 0.861856    1
-0.229906    1
Name: ohio, dtype: int64

Unnamed: 0_level_0,texas
ohio,Unnamed: 1_level_1
-0.778999,1
-0.23942,1
-0.229906,1
0.861856,1


In [152]:
df12.index.isin(['a','b'])

array([ True,  True, False, False], dtype=bool)

In [162]:
pd.Index(['a','b']).get_indexer(Series(['a','a','b','v']))

array([ 0,  0,  1, -1], dtype=int64)

In [165]:
df12

Unnamed: 0,ohio,texas,califnia
a,0.861856,0.345762,-0.463403
b,-0.23942,1.270962,1.61821
c,-0.778999,-1.478413,-1.161025
c,-0.229906,1.121958,0.027531


In [166]:
# value_counts works on a single Series, so apply() is used to run it on every column;
# values that never occur in a column come back as NaN and are replaced by 0.
# pd.Series.value_counts is used because the top-level pd.value_counts helper is deprecated.
df12.apply(pd.Series.value_counts).fillna(0)

Unnamed: 0,ohio,texas,califnia
-1.478413,0.0,1.0,0.0
-1.161025,0.0,0.0,1.0
-0.778999,1.0,0.0,0.0
-0.463403,0.0,0.0,1.0
-0.23942,1.0,0.0,0.0
-0.229906,1.0,0.0,0.0
0.027531,0.0,0.0,1.0
0.345762,0.0,1.0,0.0
0.861856,1.0,0.0,0.0
1.121958,0.0,1.0,0.0
