# 6. pandas入门

In [1]:
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

In [2]:
# Dump a handful of class-level attributes, one per line, to see how the
# Series / DataFrame classes describe themselves.
for described in (
    Series().__class__,
    Series.__doc__,
    Series.__bases__,
    Series.__name__,
    DataFrame.__module__,
    Series.__dict__,
):
    print(described)

<class 'pandas.core.series.Series'>

    One-dimensional ndarray with axis labels (including time series).

    Labels need not be unique but must be a hashable type. The object
    supports both integer- and label-based indexing and provides a host of
    methods for performing operations involving the index. Statistical
    methods from ndarray have been overridden to automatically exclude
    missing data (currently represented as NaN).

    Operations between Series (+, -, /, *, **) align values based on their
    associated index values-- they need not be the same length. The result
    index will be the sorted union of the two indexes.

    Parameters
    ----------
    data : array-like, dict, or scalar value
        Contains data stored in Series
    index : array-like or Index (1d)
        Values must be hashable and have the same length as `data`.
        Non-unique index values are allowed. Will default to
        RangeIndex(len(data)) if not provided. If both a dict and ind

In [13]:
type(Series())

pandas.core.series.Series

## 6.1 Series

- ### list to Series

In [20]:
obj = pd.Series([4,-7,8,9])
obj

0    4
1   -7
2    8
3    9
dtype: int64

In [21]:
obj.values

array([ 4, -7,  8,  9])

In [22]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [25]:
obj2=Series([4,5,0,-4],index=['d','a','b','fg'])
obj2

d     4
a     5
b     0
fg   -4
dtype: int64

In [26]:
obj2['a']

5

In [27]:
obj2[obj2>0]

d    4
a    5
dtype: int64

In [29]:
'a' in obj2

True

In [30]:
'c' in obj2

False

- ### dict to Series

In [32]:
sd = {"Ohio":3500,"Texas":7100,"Oregon":1600,"Utah":5100}
obj3=Series(sd)
obj3

Ohio      3500
Oregon    1600
Texas     7100
Utah      5100
dtype: int64

In [33]:
# Use a list rather than a set for the index labels: a set has no defined
# order, so the row order of the result would vary between runs.
# Labels that are missing from `sd` come back as NaN.
states=["Ohio","New York","California"]
obj4=Series(sd,index=states)
obj4

Ohio          3500.0
New York         NaN
California       NaN
dtype: float64

In [36]:
pd.isnull(obj4)

Ohio          False
New York       True
California     True
dtype: bool

In [38]:
pd.notnull(obj4)

Ohio           True
New York      False
California    False
dtype: bool

In [39]:
obj3+obj4

California       NaN
New York         NaN
Ohio          7000.0
Oregon           NaN
Texas            NaN
Utah             NaN
dtype: float64

In [40]:
(obj3+obj4).index

Index(['California', 'New York', 'Ohio', 'Oregon', 'Texas', 'Utah'], dtype='object')

## 6.2 DataFrame

- ### dict to DataFrame

In [101]:
# Column-oriented input: every key becomes a column and every list holds
# that column's values (all lists have the same length).
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9],
}
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [102]:
frame.index

RangeIndex(start=0, stop=5, step=1)

In [103]:
frame2=DataFrame(data,columns=['state','pop','year'])
frame2

Unnamed: 0,state,pop,year
0,Ohio,1.5,2000
1,Ohio,1.7,2001
2,Ohio,3.6,2002
3,Nevada,2.4,2001
4,Nevada,2.9,2002


In [104]:
frame3=DataFrame(data,index=['one','two','three','four','five'])
frame3

Unnamed: 0,pop,state,year
one,1.5,Ohio,2000
two,1.7,Ohio,2001
three,3.6,Ohio,2002
four,2.4,Nevada,2001
five,2.9,Nevada,2002


In [119]:
frame3.columns

Index(['pop', 'state', 'year', 'debt'], dtype='object')

In [120]:
frame3.values

array([[1.5, 'Ohio', 2000, 0],
       [1.7, 'Ohio', 2001, 1],
       [3.6, 'Ohio', 2002, 2],
       [2.4, 'Nevada', 2001, 3],
       [2.9, 'Nevada', 2002, 4]], dtype=object)

In [106]:
frame3['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [107]:
frame3.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [108]:
frame3['debt']=12
frame3

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,12
two,1.7,Ohio,2001,12
three,3.6,Ohio,2002,12
four,2.4,Nevada,2001,12
five,2.9,Nevada,2002,12


In [109]:
frame3['debt']=range(5)
frame3

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,0
two,1.7,Ohio,2001,1
three,3.6,Ohio,2002,2
four,2.4,Nevada,2001,3
five,2.9,Nevada,2002,4


In [117]:
# Select the row at integer position 3. `.iloc` is the explicit positional
# indexer; the old `.ix` indexer is no longer available in current pandas.
frame3.iloc[3]

pop         2.4
state    Nevada
year       2001
debt          3
Name: four, dtype: object

In [111]:
# Select the row whose index label is 'four'. `.loc` is the explicit
# label-based indexer; the old `.ix` indexer is no longer available in
# current pandas.
frame3.loc['four']

pop         2.4
state    Nevada
year       2001
debt          3
Name: four, dtype: object

In [112]:
frame3['eastern']=frame3.state=='Ohio'
frame3

Unnamed: 0,pop,state,year,debt,eastern
one,1.5,Ohio,2000,0,True
two,1.7,Ohio,2001,1,True
three,3.6,Ohio,2002,2,True
four,2.4,Nevada,2001,3,False
five,2.9,Nevada,2002,4,False


In [114]:
# Remove the temporary boolean column in place. `del` works on a DataFrame
# column the same way it does on a dict key.
del frame3['eastern']

In [115]:
frame3

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,0
two,1.7,Ohio,2001,1
three,3.6,Ohio,2002,2
four,2.4,Nevada,2001,3
five,2.9,Nevada,2002,4


In [116]:
frame3.T

Unnamed: 0,one,two,three,four,five
pop,1.5,1.7,3.6,2.4,2.9
state,Ohio,Ohio,Ohio,Nevada,Nevada
year,2000,2001,2002,2001,2002
debt,0,1,2,3,4


- ### index对象

In [161]:
ind = frame3.index
ind

Index(['one', 'two', 'three', 'four', 'five'], dtype='object')

In [162]:
ind[:3]

Index(['one', 'two', 'three'], dtype='object')

In [163]:
# An Index is immutable: existing entries cannot be reassigned.
# The failure is the point of this cell, so catch it and show the message
# instead of letting the exception abort a full "Run All".
try:
    ind[3]='三'
except TypeError as err:
    print("TypeError:", err)

TypeError: Index does not support mutable operations

In [164]:
frame3.reindex(['two','three','four','one','five','six','seven'])

Unnamed: 0,pop,state,year,debt
two,1.7,Ohio,2001.0,1.0
three,3.6,Ohio,2002.0,2.0
four,2.4,Nevada,2001.0,3.0
one,1.5,Ohio,2000.0,0.0
five,2.9,Nevada,2002.0,4.0
six,,,,
seven,,,,


In [165]:
frame3.reindex(['two','three','four','one','five','six','seven'],fill_value=100)

Unnamed: 0,pop,state,year,debt
two,1.7,Ohio,2001,1
three,3.6,Ohio,2002,2
four,2.4,Nevada,2001,3
one,1.5,Ohio,2000,0
five,2.9,Nevada,2002,4
six,100.0,100,100,100
seven,100.0,100,100,100


In [166]:
# reindex() returned new objects above; the original frame is unchanged.
frame3

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,0
two,1.7,Ohio,2001,1
three,3.6,Ohio,2002,2
four,2.4,Nevada,2001,3
five,2.9,Nevada,2002,4


In [167]:
# frame3.drop(['six'])

In [168]:
frame3

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,0
two,1.7,Ohio,2001,1
three,3.6,Ohio,2002,2
four,2.4,Nevada,2001,3
five,2.9,Nevada,2002,4


In [169]:
frame3[frame3['pop']>2]

Unnamed: 0,pop,state,year,debt
three,3.6,Ohio,2002,2
four,2.4,Nevada,2001,3
five,2.9,Nevada,2002,4


In [170]:
frame3[1:4]

Unnamed: 0,pop,state,year,debt
two,1.7,Ohio,2001,1
three,3.6,Ohio,2002,2
four,2.4,Nevada,2001,3


In [171]:
frame3['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object

In [172]:
frame3<10

Unnamed: 0,pop,state,year,debt
one,True,True,False,True
two,True,True,False,True
three,True,True,False,True
four,True,True,False,True
five,True,True,False,True


In [174]:
frame3

Unnamed: 0,pop,state,year,debt
one,1.5,Ohio,2000,0
two,1.7,Ohio,2001,1
three,3.6,Ohio,2002,2
four,2.4,Nevada,2001,3
five,2.9,Nevada,2002,4


In [177]:
# 'Ohio' is a value in the `state` column, not a row label (the row labels
# are 'one' .. 'five'), so asking for it by label produces one all-NaN row.
# reindex() states that "conform to these labels" behaviour explicitly; the
# old `.ix` indexer is no longer available in current pandas.
# To pick the Ohio rows instead, filter with frame3[frame3['state'] == 'Ohio'].
frame3.reindex(index=['Ohio'], columns=['state', 'year', 'debt'])

Unnamed: 0,state,year,debt
Ohio,,,


## 算术运算和数据对齐

In [3]:
# Two Series whose labels only partly overlap.
s1 = Series(
    [7.3, -2.5, 3.4, 1.5],
    index=list('acde'),
)
s2 = Series(
    [-2.1, 3.6, -1.5, 4, 3.1],
    index=list('acfeg'),
)
# Addition aligns on labels; a label present on only one side yields NaN.
s1 + s2

a    5.2
c    1.1
d    NaN
e    5.5
f    NaN
g    NaN
dtype: float64

In [10]:
# Two frames that share only some row labels and some column labels.
d1 = DataFrame(
    np.arange(9.).reshape((3, 3)),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=list('bcd'),
)
d2 = DataFrame(
    np.arange(12.).reshape((4, 3)),
    index=['Ohio', 'Texas', 'Utah', 'Oregon'],
    columns=list('bde'),
)
# Addition aligns on both axes; cells missing from either operand become NaN.
d1 + d2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,0.0,,3.0,
Oregon,,,,
Texas,6.0,,9.0,
Utah,,,,


In [11]:
d1.add(d2,fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,0.0,1.0,3.0,2.0
Oregon,9.0,,10.0,11.0
Texas,6.0,4.0,9.0,5.0
Utah,6.0,,7.0,8.0


In [12]:
d2.add(d1,fill_value=0)

Unnamed: 0,b,c,d,e
Colorado,6.0,7.0,8.0,
Ohio,0.0,1.0,3.0,2.0
Oregon,9.0,,10.0,11.0
Texas,6.0,4.0,9.0,5.0
Utah,6.0,,7.0,8.0


In [18]:
# d1.reindex(index=['Ohio','Texas','Utah','Oregon'])
d1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


d2

### 函数应用和映射

In [22]:
# 4x3 frame of standard-normal draws. The generator is not seeded here, so
# the concrete numbers differ on every run.
f = DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
f

Unnamed: 0,b,d,e
Utah,-0.206276,-1.151213,0.960068
Ohio,0.454822,-0.23014,-1.10862
Texas,1.405977,-0.619003,0.352228
Oregon,-1.269022,-0.511869,-0.106956


In [25]:
def foo(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is anything exposing ``.max()`` and ``.min()``; when passed to
    ``DataFrame.apply`` it receives one column (or one row with ``axis=1``)
    at a time.
    """
    return x.max() - x.min()
f.apply(foo)

b    2.674999
d    0.921073
e    2.068688
dtype: float64

In [26]:
f.apply(foo,axis=1)

Utah      2.111281
Ohio      1.563442
Texas     2.024981
Oregon    1.162066
dtype: float64

In [27]:
def fun(x):
    """Summarise ``x`` as a two-entry Series labelled 'min' and 'max'."""
    labels = ['min', 'max']
    extremes = [x.min(), x.max()]
    return Series(extremes, index=labels)
f.apply(fun)

Unnamed: 0,b,d,e
min,-1.269022,-1.151213,-1.10862
max,1.405977,-0.23014,0.960068


### 排序和排名

In [35]:
obj = Series(range(4),index=['d','c','a','b'])
obj

d    0
c    1
a    2
b    3
dtype: int64

In [29]:
obj.sort_index()

a    2
b    3
c    1
d    0
dtype: int64

### 注意排名rank和排序的不同

#### 不区分数值出现的顺序

In [43]:
# The first element used to be written `7.-5`, which Python parses as
# 7.0 - 5 (i.e. 2.0), almost certainly a missing comma. It is spelled out
# as 2.0 here so the data visibly matches the ranks shown for this cell.
obj = Series([2.0, 7, 4, 2, 0, 4])
# Default method='average': tied values share the mean of their ranks.
obj.rank()

0    2.5
1    6.0
2    4.5
3    2.5
4    1.0
5    4.5
dtype: float64

#### 区分数值出现的先后顺序

In [44]:
obj.rank(method='first')

0    2.0
1    6.0
2    4.0
3    3.0
4    1.0
5    5.0
dtype: float64

In [46]:
frm = DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.5]})
frm

Unnamed: 0,a,b,c
0,0,4.3,-2.0
1,1,7.0,5.0
2,0,-3.0,8.0
3,1,2.0,-2.5


In [51]:
print(frm.rank())
frm.rank(axis=0)

     a    b    c
0  1.5  3.0  2.0
1  3.5  4.0  3.0
2  1.5  1.0  4.0
3  3.5  2.0  1.0


Unnamed: 0,a,b,c
0,1.5,3.0,2.0
1,3.5,4.0,3.0
2,1.5,1.0,4.0
3,3.5,2.0,1.0


In [48]:
frm.rank(axis=1)

Unnamed: 0,a,b,c
0,2.0,3.0,1.0
1,1.0,3.0,2.0
2,2.0,1.0,3.0
3,2.0,3.0,1.0


### 带有重复值的索引

In [52]:
df = DataFrame(np.random.randn(4,3),index=['a','a','b','b'])
df

Unnamed: 0,0,1,2
a,-1.316786,-0.083821,1.605374
a,1.211833,-0.865484,0.231857
b,-1.129528,-1.109946,1.532517
b,0.49222,1.245905,-1.536293


In [54]:
# Label-based lookup on a non-unique index returns every matching row.
# `.loc` is the explicit label indexer; the old `.ix` indexer is no longer
# available in current pandas.
df.loc['b']

Unnamed: 0,0,1,2
b,-1.129528,-1.109946,1.532517
b,0.49222,1.245905,-1.536293


In [55]:
df.index.is_unique

False

### 汇总计算和描述统计

In [58]:
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [59]:
df.sum()

one    9.25
two   -5.80
dtype: float64

#### axis : 操作轴，DataFrame的行用0, 列用1

In [65]:
df.sum(axis=1)

a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64

In [66]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


### 处理缺失数据

#### *np.nan和python内置的None都会被当做NA来处理*

In [68]:
data = Series(["lakers","clippers","spurs",np.nan,"rockets",None])
data

0      lakers
1    clippers
2       spurs
3         NaN
4     rockets
5        None
dtype: object

In [69]:
data.isnull()

0    False
1    False
2    False
3     True
4    False
5     True
dtype: bool

In [70]:
data.notnull()

0     True
1     True
2     True
3    False
4     True
5    False
dtype: bool

#### 滤除缺失的数据

In [71]:
data.dropna()

0      lakers
1    clippers
2       spurs
4     rockets
dtype: object

In [72]:
data[data.notnull()]

0      lakers
1    clippers
2       spurs
4     rockets
dtype: object

In [73]:
frm = DataFrame([[1,6.5,3],[1,np.nan,np.nan],[np.nan,np.nan,np.nan],[np.nan,6.5,3]])
frm

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [77]:
frm.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [78]:
frm.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [79]:
frm.dropna(axis=0,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


### 填充缺失的数据

In [80]:
frm.fillna(0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,0.0,0.0
2,0.0,0.0,0.0
3,0.0,6.5,3.0


### 针对不同的列进行填充不同的值

In [85]:
frm.fillna({1:100,2:-1})

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,100.0,-1.0
2,,100.0,-1.0
3,,6.5,3.0


### 层次化索引

In [87]:
data = Series(np.random.randn(10),index=[['a','a','a','b','b','b','c','c','d','d'],[1,2,3,1,2,3,1,2,1,2]])
data

a  1   -0.355396
   2   -0.835153
   3   -1.382004
b  1   -0.253211
   2    0.450039
   3   -0.333865
c  1    1.003190
   2   -0.297199
d  1   -0.693626
   2   -1.191112
dtype: float64

In [88]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 0, 1]])

In [89]:
data['a']

1   -0.355396
2   -0.835153
3   -1.382004
dtype: float64

In [90]:
data['b':'c']

b  1   -0.253211
   2    0.450039
   3   -0.333865
c  1    1.003190
   2   -0.297199
dtype: float64

In [91]:
data[:,2]

a   -0.835153
b    0.450039
c   -0.297199
d   -1.191112
dtype: float64

In [92]:
data.unstack()

Unnamed: 0,1,2,3
a,-0.355396,-0.835153,-1.382004
b,-0.253211,0.450039,-0.333865
c,1.00319,-0.297199,
d,-0.693626,-1.191112,


In [95]:
data.unstack().stack()

a  1   -0.355396
   2   -0.835153
   3   -1.382004
b  1   -0.253211
   2    0.450039
   3   -0.333865
c  1    1.003190
   2   -0.297199
d  1   -0.693626
   2   -1.191112
dtype: float64

In [103]:
frm = DataFrame(np.arange(12).reshape(4,3),index=[['a','a','b','b'],[1,2,1,2]],columns=[['Ohio','Ohio','Colorado'],['Green','Red','Green']])
frm

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [106]:
frm.index.names=['key1','key2']
frm.columns.names=['state','color']

In [107]:
frm

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [108]:
frm.swaplevel('key1','key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11
