In [34]:
# 重采样及频率转换
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse
from pandas import DataFrame,Series
from pandas.tseries.offsets import Day,Hour,Minute,MonthEnd,Second

In [5]:
rng = pd.date_range('1/1/2000',periods=100,freq='D')
ts = Series(np.random.randn(100),index=rng)
ts.head()

2000-01-01    0.486906
2000-01-02    1.517094
2000-01-03   -0.277287
2000-01-04   -0.124305
2000-01-05   -0.019601
Freq: D, dtype: float64

In [None]:
# resample方法的参数
# rule 使用什么标准来采样，比如'M','D','B'等
# how 已经被淘汰  原来的how='mean'写成resample(...).mean()
# axis 应该是对DataFrame来说的，行操作还是列操作，默认是行操作
# fill_method 已经被淘汰
# closed 在降采样中，各时间段的哪一端是闭合的，'right'，或'left'
# label 在降采样中，如何设置聚合值的标签，'right,或'left'
#       例如，9:30到9::35之间的5分钟会被标记为9:00或9:35
# conversion 在重采样时期时，将低频转换到高频采样所采用的约定,
#            默认'start'，亦可'end'
# loffset  面元标签的时间校正值。比如'-1s'或Second(-1)用于将聚合
#          标签调早1秒
# base 对于5分钟频率，基础可能范围从0到4，默认值是0
# on    针对DataFrame中列操作时，指定哪一列来采样
# level 针对多维Index，指定使用哪一层的index来采样

In [13]:
# 降采样
rng = pd.date_range('1/1/2010',periods=15,freq='T')
ts = Series(range(15),index=rng)
ts.head()

2010-01-01 00:00:00    0
2010-01-01 00:01:00    1
2010-01-01 00:02:00    2
2010-01-01 00:03:00    3
2010-01-01 00:04:00    4
Freq: T, dtype: int32

In [16]:
ts.resample('3T').sum().head()

2010-01-01 00:00:00     3
2010-01-01 00:03:00    12
2010-01-01 00:06:00    21
2010-01-01 00:09:00    30
2010-01-01 00:12:00    39
Freq: 3T, dtype: int32

In [18]:
# closed 为left，
# 3分钟一个区间，00-01-02-03 为3T
# left 闭区间是左，则[00,03),sum求和时，取0+1+2=3
# 由于label默认是left，所以标签取00
ts.resample('3T',closed='left').sum()

2010-01-01 00:00:00     3
2010-01-01 00:03:00    12
2010-01-01 00:06:00    21
2010-01-01 00:09:00    30
2010-01-01 00:12:00    39
Freq: 3T, dtype: int32

In [19]:
# closed为right，
# 3分钟一个区间，00-01-02-03 为3T
# right闭区间是右，则(00,03]，sum求和时，取1+2+3=6
# 由于label默认是left，所以标签取00
ts.resample('3T',closed='right').sum()

2009-12-31 23:57:00     0
2010-01-01 00:00:00     6
2010-01-01 00:03:00    15
2010-01-01 00:06:00    24
2010-01-01 00:09:00    33
2010-01-01 00:12:00    27
Freq: 3T, dtype: int32

In [21]:
# label为right
# 3分钟一个区间，00-01-02-03为3T
# closed默认为left，所以是[00,03),sum求和时，取0+1+2=3
# 由于label是right，所以标签取03
ts.resample('3T',label='right').sum()#closed 默认为left

2010-01-01 00:03:00     3
2010-01-01 00:06:00    12
2010-01-01 00:09:00    21
2010-01-01 00:12:00    30
2010-01-01 00:15:00    39
Freq: 3T, dtype: int32

In [22]:
#label为left  默认为left
# 3分钟一个区间，00-01-02-03为3T
# closed默认为left，所以是[00,03),sum求和时，取0+1+2=3
# 由于labelleft，所以标签取00
ts.resample('3T',label='left').sum()# closed默认为left

2010-01-01 00:00:00     3
2010-01-01 00:03:00    12
2010-01-01 00:06:00    21
2010-01-01 00:09:00    30
2010-01-01 00:12:00    39
Freq: 3T, dtype: int32

In [23]:
# closed=left label=left
# 偏移量是1s，所以面元标签+1s
# 只改变计算后的标签
ts.resample(Minute(3),loffset='1s').sum()

2010-01-01 00:00:01     3
2010-01-01 00:03:01    12
2010-01-01 00:06:01    21
2010-01-01 00:09:01    30
2010-01-01 00:12:01    39
Freq: 3T, dtype: int32

In [25]:
ts.resample(Minute(3),base=0).sum()

2010-01-01 00:00:00     3
2010-01-01 00:03:00    12
2010-01-01 00:06:00    21
2010-01-01 00:09:00    30
2010-01-01 00:12:00    39
Freq: 3T, dtype: int32

In [27]:
# 类似于loffset，但是base可以理解为同时改变
# 面元标签和sum的元素，这里base是2，则计算时
# 00-01-02-03 间隔3T，移动base(=2),为
# 02-03-04-05 间隔3T，计算sum=2+3+4=9
# 由于closed和label默认为left
ts.resample(Minute(3),base=2).sum()

2009-12-31 23:59:00     1
2010-01-01 00:02:00     9
2010-01-01 00:05:00    18
2010-01-01 00:08:00    27
2010-01-01 00:11:00    36
2010-01-01 00:14:00    14
Freq: 3T, dtype: int32

'''
resample(rule, how=None, axis=0, fill_method=None, closed=None, label=None, convention='start', kind=None, loffset=None, limit=None, base=0, on=None, level=None)
'''

In [35]:
# 重采样30s
ts.resample(Second(30)).asfreq()[0:5]

2010-01-01 00:00:00    0.0
2010-01-01 00:00:30    NaN
2010-01-01 00:01:00    1.0
2010-01-01 00:01:30    NaN
2010-01-01 00:02:00    2.0
Freq: 30S, dtype: float64

In [36]:
# 填充NaN,前向填充类似于ffill
ts.resample(Second(30)).pad()[0:5]

2010-01-01 00:00:00    0
2010-01-01 00:00:30    0
2010-01-01 00:01:00    1
2010-01-01 00:01:30    1
2010-01-01 00:02:00    2
Freq: 30S, dtype: int32

In [37]:
# 填充NaN，后向填充，类似于bfill
ts.resample('30S').bfill()[0:5]

2010-01-01 00:00:00    0
2010-01-01 00:00:30    1
2010-01-01 00:01:00    1
2010-01-01 00:01:30    2
2010-01-01 00:02:00    2
Freq: 30S, dtype: int32

In [38]:
# 通过apply运行一个自定义函数
def custom_resample(array_like):
    return np.sum(array_like)+5
ts.resample(Minute(3)).apply(custom_resample)

2010-01-01 00:00:00     8
2010-01-01 00:03:00    17
2010-01-01 00:06:00    26
2010-01-01 00:09:00    35
2010-01-01 00:12:00    44
Freq: 3T, dtype: int32

In [40]:
df = DataFrame(data=9*[range(4)],columns=['a','b','c','d'])
df['time'] = pd.date_range('1/1/2000',periods=9,freq='T')
df.head()

Unnamed: 0,a,b,c,d,time
0,0,1,2,3,2000-01-01 00:00:00
1,0,1,2,3,2000-01-01 00:01:00
2,0,1,2,3,2000-01-01 00:02:00
3,0,1,2,3,2000-01-01 00:03:00
4,0,1,2,3,2000-01-01 00:04:00


In [41]:
df.resample('3T',on='time').sum()

Unnamed: 0_level_0,a,b,c,d
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01 00:00:00,0,3,6,9
2000-01-01 00:03:00,0,3,6,9
2000-01-01 00:06:00,0,3,6,9


In [43]:
time = pd.date_range('1/1/2000',periods=5,freq='T')
df = DataFrame(data=10*[range(4)],
              columns=['a','b','c','d'],
              index=pd.MultiIndex.from_product(
              [time,[1,2]]))
df

Unnamed: 0,Unnamed: 1,a,b,c,d
2000-01-01 00:00:00,1,0,1,2,3
2000-01-01 00:00:00,2,0,1,2,3
2000-01-01 00:01:00,1,0,1,2,3
2000-01-01 00:01:00,2,0,1,2,3
2000-01-01 00:02:00,1,0,1,2,3
2000-01-01 00:02:00,2,0,1,2,3
2000-01-01 00:03:00,1,0,1,2,3
2000-01-01 00:03:00,2,0,1,2,3
2000-01-01 00:04:00,1,0,1,2,3
2000-01-01 00:04:00,2,0,1,2,3


In [48]:
df.resample('2T',level=0).sum()

Unnamed: 0,a,b,c,d
2000-01-01 00:00:00,0,4,8,12
2000-01-01 00:02:00,0,4,8,12
2000-01-01 00:04:00,0,2,4,6


In [49]:
# OHLC重采样 Open/High/Low/Close

In [50]:
rng = pd.date_range('1/1/2000',periods=15,freq='T')
ts = Series(range(15),index=rng)
ts.head()

2000-01-01 00:00:00    0
2000-01-01 00:01:00    1
2000-01-01 00:02:00    2
2000-01-01 00:03:00    3
2000-01-01 00:04:00    4
Freq: T, dtype: int32

In [51]:
# 其实就是5分钟k线图
ts.resample('5T').ohlc() # 开盘，最高，最低，收盘

Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,0,4,0,4
2000-01-01 00:05:00,5,9,5,9
2000-01-01 00:10:00,10,14,10,14


In [52]:
# 通过groupby进行重采样
rng = pd.date_range('1/1/2000',periods=100,freq='D')
ts = Series(range(100),index=rng)
ts.groupby(lambda x:x.month).mean()

1    15
2    45
3    75
4    95
dtype: int32

In [53]:
ts.groupby(lambda x:x.weekday).mean()# 0是周日

0    47.5
1    48.5
2    49.5
3    50.5
4    51.5
5    49.0
6    50.0
dtype: float64

In [54]:
# 升采样和插值
frame = DataFrame(np.random.randn(2,4),
                 index=pd.date_range('1/1/2000',periods=2,freq='W-WED'),
                 columns=['Colorado','Texas','New York','Ohio'])
frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.152852,-0.778139,0.088707,-0.442574
2000-01-12,-0.985766,-1.891469,-1.081687,-0.18528


In [55]:
frame.resample('D').pad()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.152852,-0.778139,0.088707,-0.442574
2000-01-06,-0.152852,-0.778139,0.088707,-0.442574
2000-01-07,-0.152852,-0.778139,0.088707,-0.442574
2000-01-08,-0.152852,-0.778139,0.088707,-0.442574
2000-01-09,-0.152852,-0.778139,0.088707,-0.442574
2000-01-10,-0.152852,-0.778139,0.088707,-0.442574
2000-01-11,-0.152852,-0.778139,0.088707,-0.442574
2000-01-12,-0.985766,-1.891469,-1.081687,-0.18528


In [58]:
for i in frame.resample('D').iteritems():
    print(i)

('Colorado', 2000-01-05   -0.152852
2000-01-06         NaN
2000-01-07         NaN
2000-01-08         NaN
2000-01-09         NaN
2000-01-10         NaN
2000-01-11         NaN
2000-01-12   -0.985766
Freq: D, Name: Colorado, dtype: float64)
('Texas', 2000-01-05   -0.778139
2000-01-06         NaN
2000-01-07         NaN
2000-01-08         NaN
2000-01-09         NaN
2000-01-10         NaN
2000-01-11         NaN
2000-01-12   -1.891469
Freq: D, Name: Texas, dtype: float64)
('New York', 2000-01-05    0.088707
2000-01-06         NaN
2000-01-07         NaN
2000-01-08         NaN
2000-01-09         NaN
2000-01-10         NaN
2000-01-11         NaN
2000-01-12   -1.081687
Freq: D, Name: New York, dtype: float64)
('Ohio', 2000-01-05   -0.442574
2000-01-06         NaN
2000-01-07         NaN
2000-01-08         NaN
2000-01-09         NaN
2000-01-10         NaN
2000-01-11         NaN
2000-01-12   -0.185280
Freq: D, Name: Ohio, dtype: float64)


.resample() is now a deferred operation
You called iteritems(...) on this deferred object which materialized it into a dataframe
by implicitly taking the mean.  Use .resample(...).mean() instead
  """Entry point for launching an IPython kernel.


In [59]:
frame.resample('D').bfill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-0.152852,-0.778139,0.088707,-0.442574
2000-01-06,-0.985766,-1.891469,-1.081687,-0.18528
2000-01-07,-0.985766,-1.891469,-1.081687,-0.18528
2000-01-08,-0.985766,-1.891469,-1.081687,-0.18528
2000-01-09,-0.985766,-1.891469,-1.081687,-0.18528
2000-01-10,-0.985766,-1.891469,-1.081687,-0.18528
2000-01-11,-0.985766,-1.891469,-1.081687,-0.18528
2000-01-12,-0.985766,-1.891469,-1.081687,-0.18528


In [63]:
# 通过时期进行重采样

In [64]:
frame = DataFrame(np.random.randn(24,4),
                 index=pd.period_range('1-2000','12-2001',freq='M'),
                 columns=['Colorado','Texas','New York','Ohio'])
frame.head()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01,-1.346188,1.258305,-0.810552,0.171215
2000-02,-0.250168,0.28315,1.324118,0.112565
2000-03,0.571713,-1.033826,1.057647,-1.301723
2000-04,-1.036244,0.594786,0.598773,-0.391418
2000-05,-0.425644,-0.241858,-0.16224,-0.027402


In [65]:
frame.index

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06',
             '2000-07', '2000-08', '2000-09', '2000-10', '2000-11', '2000-12',
             '2001-01', '2001-02', '2001-03', '2001-04', '2001-05', '2001-06',
             '2001-07', '2001-08', '2001-09', '2001-10', '2001-11', '2001-12'],
            dtype='period[M]', freq='M')

In [67]:
annual_frame = frame.resample('A-DEC').mean()
annual_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000,-0.214155,0.385625,0.243442,-0.519978
2001,-0.111339,-0.128711,0.128725,0.259194


In [69]:
annual_frame.index.start_time

DatetimeIndex(['2000-01-01', '2001-01-01'], dtype='datetime64[ns]', freq=None)

In [70]:
annual_frame.index.end_time

DatetimeIndex(['2000-12-31', '2001-12-31'], dtype='datetime64[ns]', freq=None)

In [71]:
frame.resample('Q-DEC').mean()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q1,-0.341548,0.169209,0.523737,-0.339314
2000Q2,-0.935079,0.248873,-0.354851,0.202453
2000Q3,0.557422,0.132509,0.431256,-2.016974
2000Q4,-0.137415,0.99191,0.373626,0.073924
2001Q1,0.080577,0.104695,-0.565977,0.6155
2001Q2,-0.507448,0.089342,0.929165,0.465289
2001Q3,-0.359228,-0.142184,0.027795,-0.413357
2001Q4,0.340743,-0.566699,0.123916,0.369342


In [72]:
quarter_frame = frame.resample('Q-DEC').mean()
quarter_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q1,-0.341548,0.169209,0.523737,-0.339314
2000Q2,-0.935079,0.248873,-0.354851,0.202453
2000Q3,0.557422,0.132509,0.431256,-2.016974
2000Q4,-0.137415,0.99191,0.373626,0.073924
2001Q1,0.080577,0.104695,-0.565977,0.6155
2001Q2,-0.507448,0.089342,0.929165,0.465289
2001Q3,-0.359228,-0.142184,0.027795,-0.413357
2001Q4,0.340743,-0.566699,0.123916,0.369342


In [81]:
quarter_frame = frame.resample('Q-NOV').mean()
quarter_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q1,-0.798178,0.770727,0.256783,0.14189
2000Q2,-0.296725,-0.226966,0.49806,-0.573514
2000Q3,0.013683,0.074909,-0.349821,-1.031029
2000Q4,0.214659,0.751026,0.759573,-0.770669
2001Q1,0.024377,0.672133,-0.396729,0.826657
2001Q2,-0.617604,0.244367,0.603845,0.161183
2001Q3,-0.077651,-0.685773,-0.028136,-0.132432
2001Q4,-0.244487,-0.399375,0.267899,0.608595
2002Q1,0.641671,0.250547,-0.11163,-0.679564


In [83]:
# 一些讲解：
# - 在降采样中，目标频率必须是源频率的子时期（subperiod）。降采样 <-> 高频到低频
# - 在升采样中，目标频率必须是源频率的超时期（superperiod）。升采样 <-> 低频到高频
# - 如果不满足这些条件，就会引发异常。这主要影响的是按季、年、周计算的频率。
#   例如，由Q-MAR定义的时间区间只能升采样为A-MAR、A-JUN、A-SEP、A-DEC等。(有问题自己慢慢琢磨吧...)