In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small integer Series whose labels are explicit, deliberately unsorted strings.
obj = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))

In [3]:
# Show the label index attached to the Series.
obj.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [4]:
# Overwrite, in place, the value stored under the existing label 'd'.
obj.loc['d'] = 6
# Passing a list of labels returns a Series in the requested order.
obj.loc[['c', 'a', 'd']]

c    3
a   -5
d    6
dtype: int64

In [5]:
# Column-oriented records: each key becomes a column of equal length.
# NOTE(review): 'Navada' looks like a typo for 'Nevada'; it is kept verbatim
# because the recorded cell outputs contain it -- fix it and re-run together.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Navada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
# `columns` fixes the column order; 'debt' is not a key of `data`, so that
# column is created entirely of missing values.
frame = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'state', 'pop', 'debt'],
)
frame

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Navada,2.9,


In [6]:
# Column access: the bracket form is equivalent to the attribute form frame.year.
# (Not displayed -- a cell only renders its last expression.)
frame['year']
# Row access by label; for this frame it is the same row as frame.iloc[2].
frame.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [7]:
# Fill the 'debt' column with the floats 0.0 .. 4.0 (one value per row).
frame['debt'] = np.arange(5, dtype=float)
# Add a boolean column flagging the rows whose state is exactly 'Ohio'.
frame['eastern'] = frame['state'].eq('Ohio')
frame
# Remove the helper column again; `del` mutates the frame in place.
del frame['eastern']
frame

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Navada,2.9,4.0


##### index object

In [8]:
# Three consecutive integers labelled 'a', 'b', 'c'.
obj = pd.Series(data=range(3), index=list('abc'))
obj

a    0
b    1
c    2
dtype: int32

In [9]:
# Index objects are immutable: their labels cannot be changed in place.
index = obj.index
# Rebind the name to a standalone Index holding the integers 0, 1, 2.
index = pd.Index(np.arange(0, 3))
index

Int64Index([0, 1, 2], dtype='int64')

In [10]:
'three' in frame.index # check whether the label is present in the index

True

In [11]:
# reindex: conform the Series to a new set of labels.
# 'd' does not exist in obj, so it is filled with 0 instead of NaN.
obj.reindex(index=list('abcd'), fill_value=0)

a    0
b    1
c    2
d    0
dtype: int32

In [12]:
# Colour names stored at the even positions 0, 2 and 4 only.
obj3 = pd.Series(
    data=['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)
# Forward fill: every new label takes the last preceding known value.
obj3.reindex(index=range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [13]:
# 3x3 grid of the integers 0..8; note the row labels skip 'b'.
frame = pd.DataFrame(
    np.arange(9).reshape((3, 3)),
    columns=['Ohio', 'Texas', 'California'],
    index=list('acd'),
)

In [14]:
states = ['Texas', 'Utah', 'California']
(
    frame
    # Rows first: the new label 'b' is forward-filled from the row before it.
    .reindex(index=list('abcd'), method='ffill')
    # Then columns: 'Utah' is not in frame, so it comes back as all-missing.
    .reindex(columns=states)
)

Unnamed: 0,Texas,Utah,California
a,1,,2
b,1,,2
c,4,,5
d,7,,8


In [15]:
# Select rows 'a'..'d' of the 'Texas' column. The label 'b' is not in frame's
# index, and list-based .loc lookups containing a missing label raise KeyError
# in pandas >= 1.0. reindex is the supported way to request labels that may be
# absent: the missing row is inserted and filled with NaN.
frame.reindex(index=['a', 'b', 'c', 'd'], columns=['Texas'])

Unnamed: 0,Texas
a,1.0
b,
c,4.0
d,7.0


In [16]:
# Floats 0.0 .. 4.0 labelled 'a' .. 'e'.
obj = pd.Series(np.arange(5, dtype=float), index=list('abcde'))
# drop returns a new Series without the given label; obj itself is unchanged.
new_obj = obj.drop(labels='c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [18]:
# 4x4 grid of the integers 0..15 with named rows and columns.
data = pd.DataFrame(
    np.arange(16).reshape((4, 4)),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)
# Both calls return new frames and leave `data` untouched.
data.drop(index=['Colorado'])   # remove a row (axis 0); result is not displayed
data.drop(columns=['one'])      # remove a column (axis 1); shown as the cell output

Unnamed: 0,two,three,four
Ohio,1,2,3
Colorado,5,6,7
Utah,9,10,11
New York,13,14,15


In [30]:
data.loc[:, ['two', 'one']]     # select columns, in the order listed
data.iloc[:2]                   # select the first two rows by position
data.loc[data['three'].gt(5)]   # keep the rows where the boolean mask is True

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [40]:
# .loc selects by label; .iloc selects by integer position.
data.loc[['Colorado', 'Utah'], ['three', 'two']]
# .iloc needs positions, so the column labels are translated to their locations first.
data.iloc[[0, 2], [data.columns.get_loc(name) for name in ('three', 'two')]]

Unnamed: 0,three,two
Ohio,2,1
Utah,10,9


In [44]:
# Function application and mapping.
# NOTE(review): the generator is not seeded, so re-running this cell yields
# values different from the recorded output.
frame = pd.DataFrame(
    np.random.standard_normal((4, 3)),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
frame
# Element-wise absolute value; returns a new frame and leaves `frame` as is.
frame.abs()

Unnamed: 0,b,d,e
Utah,0.089913,0.71832,1.776517
Ohio,0.865483,0.260199,0.461091
Texas,0.263431,0.403069,1.295188
Oregon,0.305062,0.40925,1.177781


In [50]:
# Display the current contents of frame.
frame

Unnamed: 0,b,d,e
Utah,0.089913,-0.71832,1.776517
Ohio,-0.865483,0.260199,-0.461091
Texas,-0.263431,-0.403069,-1.295188
Oregon,0.305062,-0.40925,-1.177781


In [49]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()


# axis=1 applies f to each row; axis=0 would apply it to each column.
frame.apply(f, axis=1)

Utah      2.494837
Ohio      1.125682
Texas     1.031756
Oregon    1.482843
dtype: float64

In [52]:
def f(x):
    """Return the smallest and largest value of ``x`` as a two-element Series.

    The result is labelled 'min' and 'max', in that order.
    """
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
# Apply f to each column (apply's default axis); because f returns a Series,
# the result is a DataFrame with one 'min' row and one 'max' row.
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.865483,-0.71832,-1.295188
max,0.305062,0.260199,1.776517


In [54]:
# Sorting by label.
obj = pd.Series(data=np.arange(4), index=list('dace'))
# Returns a new Series ordered by label (ascending); obj keeps its order.
obj.sort_index(ascending=True)

a    1
c    2
d    0
e    3
dtype: int32

In [56]:
frame = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    columns=['d', 'a', 'b', 'c'],
    index=['three', 'one'],
)
frame.sort_index(axis=0)   # sort by row labels (axis 0, the default)
frame.sort_index(axis=1)   # sort by column labels (axis 1)
# Both calls also accept ascending=True/False to choose the direction.

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [60]:
# Order a Series by its values.
# np.nan is used rather than the alias np.NaN, which was removed in NumPy 2.0
# and raises AttributeError there.
obj = pd.Series([4, np.nan, -7, 3, 2])
# Missing values are placed at the end of the sorted result by default.
obj.sort_values()

2   -7.0
4    2.0
3    3.0
0    4.0
1    NaN
dtype: float64

In [64]:
frame = pd.DataFrame(dict(b=[4, 7, -3, 2], a=[0, 1, 0, 1]))
# Sort by column 'a' first; ties in 'a' are broken by column 'b'.
frame.sort_values(by=['a', 'b'])

Unnamed: 0,a,b
2,0,-3
0,0,4
3,1,2
1,1,7


In [68]:
# Ranking.
obj = pd.Series(data=[7, -5, 7, 4, 2, 0, 4])
# With method='max', tied values all receive the highest rank of their group.
obj.rank(method='max')
# Tie-breaking methods: 'average' (mean rank), 'min' (lowest rank),
# 'max' (highest rank), 'first' (order of appearance).

0    7.0
1    1.0
2    7.0
3    5.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [76]:
# Indexes with duplicate labels.
obj = pd.Series(data=range(5), index=list('aabbc'))
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=list('aabb'),
)
# Selecting a repeated label returns every matching row, not a single row.
frame.loc['a']

Unnamed: 0,0,1,2
a,0,1,2
a,3,4,5


In [82]:
# Statistics and summaries.
# np.nan is used rather than the alias np.NaN, which was removed in NumPy 2.0
# and raises AttributeError there.
df = pd.DataFrame(
    [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [84]:
# Column totals (axis 0): one value per column. Missing values are skipped.
print(df.sum(axis='index'))
# Row totals (axis 1): one value per row; an all-missing row sums to 0.
df.sum(axis='columns')

one    9.25
two   -5.80
dtype: float64


a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [88]:
# Row means with skipna=False: any missing value in a row makes its mean NaN.
print(df.mean(axis='columns', skipna=False))
# Row means with skipna=True: missing values are ignored; an all-missing row stays NaN.
print(df.mean(axis='columns', skipna=True))

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64
a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64


In [92]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of df.
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3
