In [2]:
# Check that pandas is installed correctly

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Smoke test: reaching this line means the imports above succeeded.
print('Hello, Pandas')

Hello, Pandas


In [4]:
# Build a Series from a plain list of values; pandas supplies a default
# integer index (0..5). One entry is np.nan, i.e. a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [5]:
# Create a DataFrame from a NumPy array, with a datetime index and labelled columns.
dates = pd.date_range('20190228', periods=7)
print(dates)
print('--'*16)
# Draw the sample data from a dedicated, seeded generator so that re-running the
# notebook from a fresh kernel reproduces exactly the same values. Using a local
# RandomState also leaves NumPy's global random state untouched.
df = pd.DataFrame(np.random.RandomState(0).randn(7, 4), index=dates, columns=list('ABCD'))
print(df)

DatetimeIndex(['2019-02-28', '2019-03-01', '2019-03-02', '2019-03-03',
               '2019-03-04', '2019-03-05', '2019-03-06'],
              dtype='datetime64[ns]', freq='D')
--------------------------------
                   A         B         C         D
2019-02-28  0.815418 -0.042083 -0.456778  2.637277
2019-03-01 -1.098334  0.432953 -1.272166 -0.962234
2019-03-02  1.521279 -0.021197  0.460703 -0.708133
2019-03-03 -0.591527 -0.730397  1.375019 -1.326105
2019-03-04 -0.990563 -0.805685 -1.383599  0.716953
2019-03-05  1.223609  0.016651 -0.893842  0.498036
2019-03-06 -0.183597  0.058861 -0.629665 -0.213048


In [6]:
# Build a DataFrame from a dict: each key becomes a column, and scalar values
# are repeated for every row of the array-like entries.
df2 = pd.DataFrame(data={
    'A': 1,
    'B': pd.Timestamp('20190228'),
    'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
    'D': np.array([3, 3, 3, 3], dtype='int32'),
    'E': pd.Categorical(['test', 'train'] * 2),
    'F': 'foo'
})
print(df2)

   A          B    C  D      E    F
0  1 2019-02-28  1.0  3   test  foo
1  1 2019-02-28  1.0  3  train  foo
2  1 2019-02-28  1.0  3   test  foo
3  1 2019-02-28  1.0  3  train  foo


In [11]:
# View the top and bottom rows of the frame.
# Note: this cell rebinds `dates` and `df` to a new 7-day frame starting 2019-03-01.
dates = pd.date_range('20190301', periods=7)
# Seeded, local generator: the displayed rows are identical on every re-run and
# NumPy's global random state is not modified.
df = pd.DataFrame(np.random.RandomState(1).randn(7, 4), index=dates, columns=list('ABCD'))
# head() shows the first five rows by default.
print(df.head())
print('-------'* 10)
# tail(3) shows the last three rows.
print(df.tail(3))

                   A         B         C         D
2019-03-01 -0.050891  0.326553 -1.027658 -1.701585
2019-03-02  1.665027  0.114310  1.913054  0.245307
2019-03-03 -1.320102  0.778767  0.122822 -0.645810
2019-03-04 -0.031670 -1.851832  1.673014 -1.303776
2019-03-05 -0.909918  0.826651  1.454990 -1.489953
----------------------------------------------------------------------
                   A         B         C         D
2019-03-05 -0.909918  0.826651  1.454990 -1.489953
2019-03-06 -1.288405  0.188955 -1.569860  0.232230
2019-03-07 -1.059343  0.225782 -1.271175  0.699152


In [16]:
# Show the row index, the column labels, and the underlying NumPy values of df.
print('index is: ', df.index)
print('columns is: ', df.columns)
print('values is: ', df.values)

index is:  DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
               '2019-03-05', '2019-03-06', '2019-03-07'],
              dtype='datetime64[ns]', freq='D')
columns is:  Index(['A', 'B', 'C', 'D'], dtype='object')
values is:  [[-0.05089093  0.32655282 -1.02765771 -1.70158548]
 [ 1.66502684  0.11431047  1.91305357  0.24530716]
 [-1.32010173  0.77876663  0.12282168 -0.64580982]
 [-0.03166997 -1.85183213  1.67301365 -1.30377563]
 [-0.9099176   0.82665136  1.45499033 -1.48995275]
 [-1.28840519  0.18895466 -1.56986014  0.23222955]
 [-1.05934328  0.22578234 -1.27117476  0.69915207]]


In [17]:
# describe() gives a quick statistical summary of each column
# (count, mean, std, min, quartiles, max).
df.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,-0.4279,0.087027,0.185027,-0.566348
std,1.068256,0.901155,1.499312,0.965222
min,-1.320102,-1.851832,-1.56986,-1.701585
25%,-1.173874,0.151633,-1.149416,-1.396864
50%,-0.909918,0.225782,0.122822,-0.64581
75%,-0.04128,0.55266,1.564002,0.238768
max,1.665027,0.826651,1.913054,0.699152


In [18]:
# Transpose the frame: rows become columns and columns become rows.
df.T

Unnamed: 0,2019-03-01 00:00:00,2019-03-02 00:00:00,2019-03-03 00:00:00,2019-03-04 00:00:00,2019-03-05 00:00:00,2019-03-06 00:00:00,2019-03-07 00:00:00
A,-0.050891,1.665027,-1.320102,-0.03167,-0.909918,-1.288405,-1.059343
B,0.326553,0.11431,0.778767,-1.851832,0.826651,0.188955,0.225782
C,-1.027658,1.913054,0.122822,1.673014,1.45499,-1.56986,-1.271175
D,-1.701585,0.245307,-0.64581,-1.303776,-1.489953,0.23223,0.699152


In [21]:
# Sort by axis labels. axis=1 sorts the column labels;
# ascending=False gives descending order, ascending=True gives ascending order.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2019-03-01,-1.701585,-1.027658,0.326553,-0.050891
2019-03-02,0.245307,1.913054,0.11431,1.665027
2019-03-03,-0.64581,0.122822,0.778767,-1.320102
2019-03-04,-1.303776,1.673014,-1.851832,-0.03167
2019-03-05,-1.489953,1.45499,0.826651,-0.909918
2019-03-06,0.23223,-1.56986,0.188955,-1.288405
2019-03-07,0.699152,-1.271175,0.225782,-1.059343


In [26]:
# Sort rows by the values in column 'B'.
# sort_values() sorts in ascending order by default; pass ascending=False
# to sort in descending order.
print(df.sort_values(by='B'))
print(df.sort_values(by='B', ascending=False))

                   A         B         C         D
2019-03-04 -0.031670 -1.851832  1.673014 -1.303776
2019-03-02  1.665027  0.114310  1.913054  0.245307
2019-03-06 -1.288405  0.188955 -1.569860  0.232230
2019-03-07 -1.059343  0.225782 -1.271175  0.699152
2019-03-01 -0.050891  0.326553 -1.027658 -1.701585
2019-03-03 -1.320102  0.778767  0.122822 -0.645810
2019-03-05 -0.909918  0.826651  1.454990 -1.489953
                   A         B         C         D
2019-03-05 -0.909918  0.826651  1.454990 -1.489953
2019-03-03 -1.320102  0.778767  0.122822 -0.645810
2019-03-01 -0.050891  0.326553 -1.027658 -1.701585
2019-03-07 -1.059343  0.225782 -1.271175  0.699152
2019-03-06 -1.288405  0.188955 -1.569860  0.232230
2019-03-02  1.665027  0.114310  1.913054  0.245307
2019-03-04 -0.031670 -1.851832  1.673014 -1.303776


In [33]:
# Select a single column; the result is a Series.
df['A']

2019-03-01   -0.050891
2019-03-02    1.665027
2019-03-03   -1.320102
2019-03-04   -0.031670
2019-03-05   -0.909918
2019-03-06   -1.288405
2019-03-07   -1.059343
Freq: D, Name: A, dtype: float64

In [34]:
# Slice rows by position: [0:3] selects the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2019-03-01,-0.050891,0.326553,-1.027658,-1.701585
2019-03-02,1.665027,0.11431,1.913054,0.245307
2019-03-03,-1.320102,0.778767,0.122822,-0.64581


In [36]:
# Slice rows by date label; both endpoints of the range are included.
print('========指定选择日期===========')
df['20190301':'20190305']



Unnamed: 0,A,B,C,D
2019-03-01,-0.050891,0.326553,-1.027658,-1.701585
2019-03-02,1.665027,0.11431,1.913054,0.245307
2019-03-03,-1.320102,0.778767,0.122822,-0.64581
2019-03-04,-0.03167,-1.851832,1.673014,-1.303776
2019-03-05,-0.909918,0.826651,1.45499,-1.489953


In [39]:
# Get a cross-section (a single row) by label.
# Note: this cell rebinds `dates` and `df` to a new 6-day frame starting 2019-03-01.
dates = pd.date_range('20190301', periods=6)
# Seeded, local generator: the values shown here and in the cells that reuse
# this frame are identical on every re-run, and NumPy's global random state
# is not modified.
df = pd.DataFrame(np.random.RandomState(2).randn(6, 4), index=dates, columns=list('ABCD'))
# Selecting one row label with .loc returns that row as a Series.
print(df.loc[dates[0]])

A   -1.076931
B    0.134197
C   -1.154182
D    0.722826
Name: 2019-03-01 00:00:00, dtype: float64


In [41]:
# Select on both axes by label: all rows, columns 'A' and 'B'.
print(df.loc[:, ['A', 'B']])

                   A         B
2019-03-01 -1.076931  0.134197
2019-03-02 -2.044523 -0.464087
2019-03-03 -0.132776 -1.306859
2019-03-04  2.270239  1.676939
2019-03-05 -0.500713  1.367007
2019-03-06 -0.259914 -1.137223


In [43]:
# Label slicing with .loc: both endpoints of the slice are included.
print(df.loc['20190303':'20190305', ['A', 'B']])

                   A         B
2019-03-03 -0.132776 -1.306859
2019-03-04  2.270239  1.676939
2019-03-05 -0.500713  1.367007


In [45]:
# Passing a single row label instead of a slice reduces the dimensionality
# of the result: a Series is returned rather than a DataFrame.
print(df.loc['20190303', ['A', 'B']])

A   -0.132776
B   -1.306859
Name: 2019-03-03 00:00:00, dtype: float64


In [51]:
# Get a single scalar value by row label and column label.
print(df.loc[dates[0], 'A'])

-1.0769309251955075


In [52]:
# .at is the fast accessor for a single scalar; it returns the same value
# as df.loc[dates[0], 'A'].
print(df.at[dates[0], 'A'])

-1.0769309251955075


In [60]:
# Select by position: a single integer picks one row (returned as a Series).
print(df.iloc[3])
# Integer slices on both axes (end positions are excluded).
print(df.iloc[3:5, 0:2])
# Lists of integer positions for rows and columns.
print(df.iloc[[1, 2, 4], [0, 2]])
# Slice rows explicitly, keeping all columns.
print(df.iloc[1:3, :])
# Slice columns explicitly, keeping all rows.
print(df.iloc[:, 1:3])
# Get a single value by integer position.
print(df.iloc[1, 1])
# Fast scalar access by integer position; same value as df.iloc[1, 1].
print(df.iat[1, 1])

A    2.270239
B    1.676939
C   -0.397652
D   -1.084032
Name: 2019-03-04 00:00:00, dtype: float64
                   A         B
2019-03-04  2.270239  1.676939
2019-03-05 -0.500713  1.367007
                   A         C
2019-03-02 -2.044523  2.055828
2019-03-03 -0.132776  0.916044
2019-03-05 -0.500713  0.813726
                   A         B         C         D
2019-03-02 -2.044523 -0.464087  2.055828  0.285524
2019-03-03 -0.132776 -1.306859  0.916044  2.807670
                   B         C
2019-03-01  0.134197 -1.154182
2019-03-02 -0.464087  2.055828
2019-03-03 -1.306859  0.916044
2019-03-04  1.676939 -0.397652
2019-03-05  1.367007  0.813726
2019-03-06 -1.137223  1.075026
-0.46408683956811647
-0.46408683956811647


In [62]:
# Boolean indexing.
# Use the values of a single column to select rows: keep rows where A > 0.
print(df[df.A > 0])

                   A         B         C         D
2019-03-04  2.270239  1.676939 -0.397652 -1.084032


In [64]:
# Apply a boolean condition to the whole DataFrame: values that satisfy it
# are kept, all others are shown as NaN.
print(df[df > 0])

                   A         B         C         D
2019-03-01       NaN  0.134197       NaN  0.722826
2019-03-02       NaN       NaN  2.055828  0.285524
2019-03-03       NaN       NaN  0.916044  2.807670
2019-03-04  2.270239  1.676939       NaN       NaN
2019-03-05       NaN  1.367007  0.813726  0.450990
2019-03-06       NaN       NaN  1.075026  0.597659


In [68]:
# Filter rows with isin().
# Work on a new frame that carries an extra label column 'E'; df itself is
# left unchanged.
df2 = df.assign(E=['one', 'one', 'two', 'three', 'four', 'three'])
print(df2)
print('============= start to filter =====================')
# Keep only the rows whose 'E' label is one of the listed values.
print(df2.loc[df2['E'].isin(['two', 'four'])])

                   A         B         C         D      E
2019-03-01 -1.076931  0.134197 -1.154182  0.722826    one
2019-03-02 -2.044523 -0.464087  2.055828  0.285524    one
2019-03-03 -0.132776 -1.306859  0.916044  2.807670    two
2019-03-04  2.270239  1.676939 -0.397652 -1.084032  three
2019-03-05 -0.500713  1.367007  0.813726  0.450990   four
2019-03-06 -0.259914 -1.137223  1.075026  0.597659  three
                   A         B         C        D     E
2019-03-03 -0.132776 -1.306859  0.916044  2.80767   two
2019-03-05 -0.500713  1.367007  0.813726  0.45099  four
