### pandas
pandas 含有使数据分析工作变得高效的高级数据结构和操作工具。

pandas 基于 numpy 来进行构建

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

### Series 类型说明
Series 是一个类似于一维数组的对象，由一组数据以及一组与之相关的数据标签（即索引）组成；仅由一组数据即可产生最简单的 Series。

In [5]:
# Simplest Series: only values are given, so pandas supplies a default 0..n-1 index.
obj = Series([1, 2, 3, 4, 5])
# Show the Series itself, then its raw values array, then its index (one per line).
print(obj, obj.values, obj.index, sep='\n')

0    1
1    2
2    3
3    4
4    5
dtype: int64
[1 2 3 4 5]
RangeIndex(start=0, stop=5, step=1)


In [7]:
# Custom index: attach explicit integer labels 1..4 instead of the default 0..3.
obj = Series(list('abce'), index=[1, 2, 3, 4])
print(obj)

1    a
2    b
3    c
4    e
dtype: object


In [8]:
# Look up by index *label* 2 (not position 2); .loc makes the label-based intent explicit.
obj.loc[2]

'b'

In [11]:
# A Series can be built from a dict: the keys become the index labels.
data = dict(a=1000, b=2000, c=3000)
obj = Series(data)
print(data)
# Passing index= keeps only the listed keys, in the given order.
keys = ['a', 'c']
obj_1 = Series(data, index=keys)
print(obj_1)

{'a': 1000, 'b': 2000, 'c': 3000}
a    1000
c    3000
dtype: int64


In [14]:
# Missing data: a None value in the source dict shows up as NaN in the Series.
data = dict(a=None, b=20000, c=30000)
obj = Series(data)
print(obj)

a        NaN
b    20000.0
c    30000.0
dtype: float64


In [15]:
# Boolean mask: True where the value is missing.
obj.isnull()

a     True
b    False
c    False
dtype: bool

In [16]:
# Boolean mask: True where the value is present.
obj.notnull()

a    False
b     True
c     True
dtype: bool

In [17]:
# Both the Series and its index can carry a name; both appear in the printed output.
data = {
    'LiLe': None,
    'HanMeimei': 25,
    'Toni': None,
    'Jack': 50,
}
obj = Series(data, name='NameAndAge')
obj.index.name = 'xingme'
print(obj)

xingme
LiLe          NaN
HanMeimei    25.0
Toni          NaN
Jack         50.0
Name: NameAndAge, dtype: float64


### DataFrame 类型
DataFrame 是一个表格型的数据结构，它含有一组有序的列，每列可以是不同的值类型：数值、字符串、布尔值都可以

DataFrame 本身有行索引，也有列索引

DataFrame 也可以理解成由 series 组成的字典

In [19]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a column.
data = {
    '60年代': ['狗子', '嘎子', '秀儿'],
    '70年代': ['卫国', '建国', '爱国'],
    '80年代': ['李烈', '韩寒', '张伟'],
}
frame_data = DataFrame(data)
# Print the whole frame, then a single column (which comes back as a Series).
print(frame_data, frame_data['70年代'], sep='\n')

  60年代 70年代 80年代
0   狗子   卫国   李烈
1   嘎子   建国   韩寒
2   秀儿   爱国   张伟
0    卫国
1    建国
2    爱国
Name: 70年代, dtype: object


In [21]:
# Six consecutive daily timestamps starting 2019-03-01; used below as a row index.
# (numpy is imported in the notebook's top import cell rather than mid-notebook.)
dates = pd.date_range('20190301', periods=6)
print(dates)

DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
               '2019-03-05', '2019-03-06'],
              dtype='datetime64[ns]', freq='D')


In [25]:
# Seed the generator so the 6x4 frame of uniform [0, 1) values is identical on
# every Restart & Run All; the unseeded original changed on each run.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.random((6, 4)), index=dates, columns=list('ABCD'))
print(df)

                   A         B         C         D
2019-03-01  0.354000  0.388653  0.051867  0.085335
2019-03-02  0.827737  0.621688  0.150804  0.594892
2019-03-03  0.795124  0.358267  0.549414  0.613351
2019-03-04  0.051236  0.963592  0.462968  0.656992
2019-03-05  0.346791  0.689398  0.873752  0.170278
2019-03-06  0.761649  0.872547  0.306120  0.448941


In [26]:
# Transpose: columns A-D become rows, and the dates become columns.
df.transpose()

Unnamed: 0,2019-03-01,2019-03-02,2019-03-03,2019-03-04,2019-03-05,2019-03-06
A,0.354,0.827737,0.795124,0.051236,0.346791,0.761649
B,0.388653,0.621688,0.358267,0.963592,0.689398,0.872547
C,0.051867,0.150804,0.549414,0.462968,0.873752,0.30612
D,0.085335,0.594892,0.613351,0.656992,0.170278,0.448941


In [29]:
# Label-based row slice; with labels, both endpoints are included.
df.loc['20190301':'20190303']

Unnamed: 0,A,B,C,D
2019-03-01,0.354,0.388653,0.051867,0.085335
2019-03-02,0.827737,0.621688,0.150804,0.594892
2019-03-03,0.795124,0.358267,0.549414,0.613351


In [32]:
# Filter rows (by date-label slice) and columns (by name list) in one .loc call.
df.loc['20190301':'20190303', list('AB')]

Unnamed: 0,A,B
2019-03-01,0.354,0.388653
2019-03-02,0.827737,0.621688
2019-03-03,0.795124,0.358267


In [33]:
# Scalar lookup by (row label, column label): first date, column 'A'.
df.at[dates[0], 'A']

0.35399956361772134

In [34]:
# First two rows.
df.head(n=2)

Unnamed: 0,A,B,C,D
2019-03-01,0.354,0.388653,0.051867,0.085335
2019-03-02,0.827737,0.621688,0.150804,0.594892


In [35]:
# Last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2019-03-04,0.051236,0.963592,0.462968,0.656992
2019-03-05,0.346791,0.689398,0.873752,0.170278
2019-03-06,0.761649,0.872547,0.30612,0.448941


### DataFrame 构造函数能够接受哪些数据类型
1. 二维 numpy array
2. 由数组，列表或者元组组成的字典
3. 由 series 组成的字典
4. 由字典组成的字典
5. 由列表或元组组成的列表
6. 另一个 DataFrame
 

### pandas 重新索引

In [36]:
# Reindexing to a longer label list keeps existing values and inserts NaN
# for labels ('e', 'f') that the original Series does not have.
obj = Series([4.5, 9.8, -1.2], index=list('abc'))
job_1 = obj.reindex(list('abcef'))
print(obj, job_1, sep='\n')

a    4.5
b    9.8
c   -1.2
dtype: float64
a    4.5
b    9.8
c   -1.2
e    NaN
f    NaN
dtype: float64


In [37]:
# Same reindex, but new labels are filled with 0 instead of NaN.
job_1 = obj.reindex(index=list('abcef'), fill_value=0)
print(job_1)

a    4.5
b    9.8
c   -1.2
e    0.0
f    0.0
dtype: float64


In [40]:
# Reindex onto 0..5 with forward fill: each gap takes the last preceding value.
# (method='backfill' would instead fill each gap from the next following value.)
obj = Series([4.5, 9.8, -1.2], index=[0, 2, 4])
o = obj.reindex(index=range(6), method='ffill')
print(obj, o, sep='\n')

0    4.5
2    9.8
4   -1.2
dtype: float64
0    4.5
1    4.5
2    9.8
3    9.8
4   -1.2
5   -1.2
dtype: float64


### 算术运算和数据对齐
pandas 的一个重要功能，就是可以对不同索引的对象进行算术运算。在将对象相加时，如果存在不同的索引，则结果的索引就是这些索引的并集；不重叠的索引处结果为 NaN。

In [42]:
# Two Series whose indexes overlap on a-d; only d2 has label 'e'.
d1 = Series([1.3, 2.5, -1.3, 2.3], index=list('abcd'))
d2 = Series([2.1, 1.4, -1.5, -1.3, 2.0], index=list('abcde'))

# Addition aligns on labels; 'e' is missing from d1, so the sum there is NaN.
print(d1 + d2)

a    3.4
b    3.9
c   -2.8
d    1.0
e    NaN
dtype: float64


In [43]:
# 3x3 frame holding 0..8, columns a-c, rows labelled 1-3.
df1 = DataFrame(
    np.arange(9).reshape(3, 3),
    index=[1, 2, 3],
    columns=list('abc'),
)
print(df1)

   a  b  c
1  0  1  2
2  3  4  5
3  6  7  8


In [44]:
# 4x3 frame holding 0..11, columns c-e, rows labelled 1-4.
df2 = DataFrame(
    np.arange(12).reshape(4, 3),
    index=[1, 2, 3, 4],
    columns=list('cde'),
)
print(df2)

   c   d   e
1  0   1   2
2  3   4   5
3  6   7   8
4  9  10  11


In [45]:
# Adding frames aligns on both row and column labels; any cell missing from
# either operand comes out as NaN.
df1 + df2

Unnamed: 0,a,b,c,d,e
1,,,2.0,,
2,,,8.0,,
3,,,14.0,,
4,,,,,


In [46]:
# fill_value=0 substitutes 0 for a cell missing from only one operand.
# Note: a cell missing from both operands still comes out as NaN.
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
1,0.0,1.0,2.0,1.0,2.0
2,3.0,4.0,8.0,4.0,5.0
3,6.0,7.0,14.0,7.0,8.0
4,,,9.0,10.0,11.0


### DataFrame 和 Series 的运算

In [50]:
# 4x3 frame holding 0..11, plus its first row (label 1) pulled out as a Series.
frame = DataFrame(
    np.arange(12).reshape(4, 3),
    index=[1, 2, 3, 4],
    columns=list('bde'),
)
series = frame.loc[1]
print(frame, series, sep='\n')

   b   d   e
1  0   1   2
2  3   4   5
3  6   7   8
4  9  10  11
b    0
d    1
e    2
Name: 1, dtype: int32


In [51]:
# The Series is matched against the frame's columns and subtracted from every
# row (broadcast down the rows).
frame - series

Unnamed: 0,b,d,e
1,0,0,0
2,3,3,3
3,6,6,6
4,9,9,9


In [52]:
# When the Series index and the frame's columns differ, the result uses the
# union of the labels; labels present on only one side become NaN columns.
series = Series(range(3), index=['b', 'e', 'f'])
frame + series

Unnamed: 0,b,d,e,f
1,0.0,,3.0,
2,3.0,,6.0,
3,6.0,,9.0,
4,9.0,,12.0,


### 排序
根据条件对数据进行排序

In [53]:
# Series with a deliberately unsorted index, used by the sorting examples.
obj = Series(range(4), index=list('deab'))
print(obj)

d    0
e    1
a    2
b    3
dtype: int64


In [54]:
# Sort by index label, ascending.
obj.sort_index(ascending=True)

a    2
b    3
d    0
e    1
dtype: int64

In [55]:
# Sort by value, ascending.
obj.sort_values(ascending=True)

d    0
e    1
a    2
b    3
dtype: int64

In [56]:
# A DataFrame can be sorted by the labels of either axis; build one whose
# row and column labels are both out of order.
frame = DataFrame(
    np.arange(8).reshape(2, 4),
    index=['two', 'one'],
    columns=list('cdea'),
)
frame

Unnamed: 0,c,d,e,a
two,0,1,2,3
one,4,5,6,7


In [57]:
# Sort rows by their index labels (axis 0 is the default).
frame.sort_index(axis=0)

Unnamed: 0,c,d,e,a
one,4,5,6,7
two,0,1,2,3


In [59]:
# Choose the axis explicitly: here the columns are sorted by their labels.
frame.sort_index(axis='columns')

Unnamed: 0,a,c,d,e
two,3,0,1,2
one,7,4,5,6


In [60]:
# Two-column frame (columns 'b' then 'a') for the sort-by-value example.
frame = DataFrame(dict(b=[4, 7, 2, -1], a=[0, 4, 2, 0]))
frame

Unnamed: 0,b,a
0,4,0
1,7,4
2,2,2
3,-1,0


In [61]:
# Sort rows by the values in column 'b', ascending.
frame.sort_values('b')

Unnamed: 0,b,a
3,-1,0
2,2,2
0,4,0
1,7,4
