In [2]:
# 日期的范围、频率以及移动
import numpy as np
import pandas as pd
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse
from pandas import DataFrame,Series
from pandas.tseries.offsets import Day, Hour, Minute, MonthEnd

In [3]:
dates = [datetime(2011,1,2),
        datetime(2011, 1, 5),
        datetime(2011, 1, 7),
        datetime(2011, 1, 8),
        datetime(2011, 1, 10),
        datetime(2011, 1, 12)]
ts = Series(np.random.randn(6),index=dates)

In [4]:
ts

2011-01-02    0.824993
2011-01-05   -0.799037
2011-01-07    2.276828
2011-01-08   -0.556551
2011-01-10    0.407476
2011-01-12   -0.173712
dtype: float64

In [5]:
# pandas 中的时间序列一般被默认为不规则的，即没有固定的频率。但
# 处于分析的需要，我们通过插值的方式将序列转换为具有固定频率的格式。
# 一种快捷方式是使用resample(rule)方法
# 按日期重新采样，填充进去的日期设置为NaN
for k,v in ts.resample('D').iteritems():
    print(k,v)

2011-01-02 00:00:00 0.8249931033519954
2011-01-03 00:00:00 nan
2011-01-04 00:00:00 nan
2011-01-05 00:00:00 -0.7990372503931042
2011-01-06 00:00:00 nan
2011-01-07 00:00:00 2.2768275488528564
2011-01-08 00:00:00 -0.5565513827737169
2011-01-09 00:00:00 nan
2011-01-10 00:00:00 0.407476143529384
2011-01-11 00:00:00 nan
2011-01-12 00:00:00 -0.17371180963563043


.resample() is now a deferred operation
You called iteritems(...) on this deferred object which materialized it into a series
by implicitly taking the mean.  Use .resample(...).mean() instead
  """


In [6]:
# 生成日期范围
index = pd.date_range('4/1/2012','5/1/2012')
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01'],
              dtype='datetime64[ns]', freq='D')

In [7]:
pd.date_range(start='4/1/2012',periods=20) # 20天

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

In [8]:
pd.date_range(end='4/1/2012',periods=20) # 前20天

DatetimeIndex(['2012-03-13', '2012-03-14', '2012-03-15', '2012-03-16',
               '2012-03-17', '2012-03-18', '2012-03-19', '2012-03-20',
               '2012-03-21', '2012-03-22', '2012-03-23', '2012-03-24',
               '2012-03-25', '2012-03-26', '2012-03-27', '2012-03-28',
               '2012-03-29', '2012-03-30', '2012-03-31', '2012-04-01'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# BM = business end of month 结果调整到月底
# 只取 月底（工作日）的日期作为Index
pd.date_range('1/1/2000','12/1/2000',freq='BM')

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

In [10]:
pd.date_range('5/2/2012 12:56:31',periods=5)# 时分秒保留

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# 时分被和谐
pd.date_range('5/2/2012 12:56:31',periods=5,normalize=True)

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

In [13]:
# 频率和日期偏移量
hour = Hour()
four_hours = Hour(4)

In [14]:
# 以4小时为间隔单位，两边闭区间。
# 如果右面是1/4/2000 00::00,那么符合生成条件
pd.date_range('1/1/2000','1/3/2000 23:59',freq='4h')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [15]:
Hour(2)+Minute(30)

<150 * Minutes>

In [16]:
pd.date_range('1/1/2000',periods=10,freq='1h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

In [17]:
# 时间序列基础频率
# D：    每日
# B：    每工作日
# H：    每小时
# T/min:  每分钟
# S：    每秒
# L/ms:   每毫秒
# U：     每微妙
# M:     每月最后一个日历日
# BM：    每月最后一个工作日
# MS：    每月第一个日历日
# BNS：   每月第一个工作日
# W-MON： 每月从指定的星期几开始算起
# WOM-1MON： 产生每月的第一、第二、第x周的周几。WOM-3FRI表示每月第三个星期五。
# Q-JAN：   对于以指定月份结束的年度，每季度最后一月的最后一个日历日。
# BQ-JAN：  对于以指定月份结束的年度，每季度最后一月的最后一个工作日。
# QS-JAN：  对于以指定月份结束的年度，每季度最后一月的第一个日历日。
# BQS-JAN： 对于以指定月份结束的年度，每季度最后一月的第一个工作日。
# A-JAN：   每年指定月份的最后一个日历日 
# BA-JAN：  每年指定月份的最后一个工作日
# AS-JAN：  每年指定月份的第一个日历日
# BAS-JAN： 每年指定月份的第一个工作日

In [19]:
# WOM 日期（Week Of Month）
# 每个月的第2个周五
rng = pd.date_range('1/1/2010',periods=10,freq='WOM-2FRI')
rng

DatetimeIndex(['2010-01-08', '2010-02-12', '2010-03-12', '2010-04-09',
               '2010-05-14', '2010-06-11', '2010-07-09', '2010-08-13',
               '2010-09-10', '2010-10-08'],
              dtype='datetime64[ns]', freq='WOM-2FRI')

In [20]:
# 移动（超前和滞后）数据
ts = Series(np.random.randn(4),
           index=pd.date_range('1/1/2000',periods=4,freq='M'))
ts

2000-01-31   -0.794998
2000-02-29    0.561038
2000-03-31    0.341638
2000-04-30    1.176280
Freq: M, dtype: float64

In [21]:
ts.shift(1)

2000-01-31         NaN
2000-02-29   -0.794998
2000-03-31    0.561038
2000-04-30    0.341638
Freq: M, dtype: float64

In [22]:
ts.shift(-1)

2000-01-31    0.561038
2000-02-29    0.341638
2000-03-31    1.176280
2000-04-30         NaN
Freq: M, dtype: float64

In [23]:
# 相对于前一条记录的变化率，比如股票涨跌幅
ts / ts.shift(1) - 1 

2000-01-31         NaN
2000-02-29   -1.705710
2000-03-31   -0.391061
2000-04-30    2.443060
Freq: M, dtype: float64

In [24]:
# 指定freq后调整key，加2个月
ts.shift(2,freq='M')

2000-03-31   -0.794998
2000-04-30    0.561038
2000-05-31    0.341638
2000-06-30    1.176280
Freq: M, dtype: float64

In [27]:
ts.shift(-2,freq='D')

2000-01-29   -0.794998
2000-02-27    0.561038
2000-03-29    0.341638
2000-04-28    1.176280
dtype: float64

In [28]:
ts.shift(1,freq='3D')# 3D等价于3天

2000-02-03   -0.794998
2000-03-03    0.561038
2000-04-03    0.341638
2000-05-03    1.176280
dtype: float64

In [29]:
# 通过偏移量对日期进行位移
now = datetime(2011,11,17)
now + 3 * Day()

Timestamp('2011-11-20 00:00:00')

In [30]:
now + MonthEnd() # 移动到月底

Timestamp('2011-11-30 00:00:00')

In [31]:
now + MonthEnd(2)# 推到下个月月底

Timestamp('2011-12-31 00:00:00')

In [32]:
offset = MonthEnd()
print(offset.rollforward(now))# 等价now+MonthEnd()
print(offset.rollback(now))# 移动到now之前那个月的月底

2011-11-30 00:00:00
2011-10-31 00:00:00


In [39]:
ts = Series(np.random.randn(20),
           index=pd.date_range('1/15/2000',periods=20,freq='4d'))
# 根据rollforward的结果分组并统计
ts.groupby(offset.rollforward).mean()

2000-01-31    0.078277
2000-02-29   -0.398599
2000-03-31    0.193669
dtype: float64

In [43]:
for k,v in ts.resample('M').iteritems():
    print(k,v)

2000-01-31 00:00:00 0.07827706938626075
2000-02-29 00:00:00 -0.39859918281126155
2000-03-31 00:00:00 0.19366898996947116


.resample() is now a deferred operation
You called iteritems(...) on this deferred object which materialized it into a series
by implicitly taking the mean.  Use .resample(...).mean() instead
  """Entry point for launching an IPython kernel.


In [48]:
ts.loc['1/1/2000':'2/1/2000'].mean()

0.07827706938626075

In [49]:
ts.resample('M').mean()

2000-01-31    0.078277
2000-02-29   -0.398599
2000-03-31    0.193669
Freq: M, dtype: float64