In [1]:
# 时间序列
import pandas as pd
import numpy as np

In [2]:
# The Python standard library includes data types for date and time data, as well as
# calendar-related functionality. The datetime, time, and calendar modules are the main
# ones used; datetime.datetime is the most commonly used type.
from datetime import datetime
now = datetime.now()
now

datetime.datetime(2019, 7, 17, 16, 48, 10, 339300)

In [3]:
# The individual calendar fields are available as attributes of the datetime.
now.year, now.month, now.day

(2019, 7, 17)

In [4]:
# datetime stores both the date and the time down to the microsecond. Subtracting one
# datetime from another yields a timedelta, which represents the difference between them.
delta = (
    datetime(year=2011, month=1, day=7)
    - datetime(year=2008, month=6, day=24, hour=8, minute=15)
)
delta

datetime.timedelta(days=926, seconds=56700)

In [5]:
# Whole-day component of the difference.
delta.days

926

In [6]:
# Seconds component of the difference (the remainder beyond the whole days).
delta.seconds

56700

In [7]:
# Adding (or subtracting) a timedelta, or a multiple of one, to a datetime produces a
# new, shifted datetime. timedelta's leading positional arguments are days, seconds,
# and microseconds, in that order.
from datetime import timedelta
start = datetime(year=2011, month=1, day=7)
start + timedelta(days=12)

datetime.datetime(2011, 1, 19, 0, 0)

In [8]:
# Subtract twice a 12-day timedelta from start.
start - 2*timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

In [9]:
# Converting between strings and datetime
# datetime objects and pandas Timestamp objects can be formatted as strings using str
# or the strftime method (which takes a format specification).
stamp = datetime(2011, 1, 3)
str(stamp)

'2011-01-03 00:00:00'

In [10]:
# Format the date as year-month-day using strftime format codes.
stamp.strftime('%Y-%m-%d')

'2011-01-03'

In [11]:
# datetime.strptime uses the same format codes to convert a string into a datetime.
value = '2011-01-03'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [12]:
# Parse several month/day/year strings with one shared format specification.
datestrs = ['7/6/2011', '8/6/2011']
[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

In [13]:
# datetime.strptime is the best way to parse a date with a known format. However, it
# is tedious to write a format spec every time, especially for common date formats.
# In that case you can use parser.parse from the third-party dateutil package
# (installed automatically alongside pandas).
from dateutil.parser import parse
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [14]:
# dateutil can parse almost any date representation a human can understand.
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [15]:
# A space-separated year, month, day, and time is parsed as well.
parse('2019 1 1 14:00')

datetime.datetime(2019, 1, 1, 14, 0)

In [16]:
# In many international formats the day commonly appears before the month; pass
# dayfirst=True to have the string interpreted that way.
parse('6/12/2011', dayfirst=True)

datetime.datetime(2011, 12, 6, 0, 0)

In [17]:
# pandas is generally oriented toward working with arrays of dates, whether they are
# used as an axis index or as a column in a DataFrame. The to_datetime method parses
# many different kinds of date representations; standard date formats such as
# ISO 8601 are parsed very quickly.
datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

In [18]:
# It also handles values that should be considered missing (None, empty string, etc.).
idx = pd.to_datetime(datestrs + [None])
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [19]:
# The entry parsed from None is NaT.
idx[2]

NaT

In [20]:
# pd.isnull flags the NaT entry as missing.
pd.isnull(idx)

array([False, False,  True])

In [21]:
# NaT(Not a Time)是pandas中时间戳数据的null值

In [22]:
# Time series basics
# The most basic kind of time series in pandas is a Series indexed by timestamps.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]
ts = pd.Series(data=np.random.randn(len(dates)), index=dates)
ts

2011-01-02    0.846129
2011-01-05   -1.057333
2011-01-07   -0.944450
2011-01-08   -0.033905
2011-01-10   -1.212500
2011-01-12   -0.590229
dtype: float64

In [23]:
# These datetime objects are actually stored in a DatetimeIndex.
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [24]:
# Like other Series, arithmetic between differently indexed time series automatically
# aligns on the dates; labels missing from one operand produce NaN.
ts + ts[::2]   # ts[::2] selects every second element of ts (positions 0, 2, 4)

2011-01-02    1.692258
2011-01-05         NaN
2011-01-07   -1.888899
2011-01-08         NaN
2011-01-10   -2.425000
2011-01-12         NaN
dtype: float64

In [25]:
# pandas stores timestamps using NumPy's datetime64 data type at nanosecond resolution.
ts.index.dtype

dtype('<M8[ns]')

In [26]:
# Scalar values from a DatetimeIndex are pandas Timestamp objects.
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

In [27]:
# 只要有需要，Timestamp可以随时自动转换为datetime对象。此外，它还可以存储频率信息，且
# 知道如何执行时区转换以及其他操作

In [28]:
# Indexing, selection, subsetting
# Time series behave like any other pandas.Series when selecting data by label.
stamp = ts.index[2]
stamp

Timestamp('2011-01-07 00:00:00')

In [29]:
# Look up the value stored under that timestamp label.
ts[stamp]

-0.9444495568847658

In [30]:
# As a convenience, you can also pass a string that is interpretable as a date.
ts['1/10/2011']

-1.2125000188151884

In [31]:
# A compact YYYYMMDD string selects the same entry.
ts['20110110']

-1.2125000188151884

In [32]:
# For longer time series, passing just a year or a year and month is enough to select
# a slice of the data.
longer_ts = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
)
longer_ts

2000-01-01    1.583483
2000-01-02    0.478582
2000-01-03   -1.582142
2000-01-04    0.803781
2000-01-05    0.514653
2000-01-06    1.291837
2000-01-07   -0.738696
2000-01-08    0.791480
2000-01-09    0.144927
2000-01-10    0.679811
2000-01-11   -0.372335
2000-01-12   -1.824795
2000-01-13   -0.127114
2000-01-14    0.054765
2000-01-15   -0.773040
2000-01-16    0.660628
2000-01-17   -1.344360
2000-01-18   -0.400187
2000-01-19   -0.160639
2000-01-20   -0.229783
2000-01-21    0.467884
2000-01-22    0.804798
2000-01-23   -1.274523
2000-01-24   -2.028847
2000-01-25    0.717671
2000-01-26    0.714249
2000-01-27    0.677222
2000-01-28   -1.615000
2000-01-29   -0.762789
2000-01-30   -0.091569
                ...   
2002-08-28   -0.730591
2002-08-29    0.048752
2002-08-30   -0.413107
2002-08-31    0.805626
2002-09-01    0.284892
2002-09-02   -0.041211
2002-09-03    0.894723
2002-09-04    1.038219
2002-09-05   -1.803686
2002-09-06    0.403729
2002-09-07   -0.562830
2002-09-08   -0.124560
2002-09-09 

In [33]:
# Select every observation from the year 2001.
longer_ts['2001']

2001-01-01    0.153150
2001-01-02    0.500436
2001-01-03    0.384464
2001-01-04   -0.756742
2001-01-05    0.728235
2001-01-06   -0.076287
2001-01-07    0.229172
2001-01-08   -1.425770
2001-01-09    1.064483
2001-01-10   -0.906105
2001-01-11   -0.634001
2001-01-12   -0.141718
2001-01-13    0.248312
2001-01-14   -1.769439
2001-01-15   -0.671160
2001-01-16   -1.544510
2001-01-17    1.679581
2001-01-18    0.225321
2001-01-19    1.027120
2001-01-20   -1.204753
2001-01-21   -0.232669
2001-01-22   -0.360576
2001-01-23    0.272264
2001-01-24   -0.722944
2001-01-25   -0.260394
2001-01-26    0.425863
2001-01-27    0.322668
2001-01-28    1.142819
2001-01-29    0.806131
2001-01-30   -0.333780
                ...   
2001-12-02   -0.751153
2001-12-03    0.395142
2001-12-04    1.376700
2001-12-05   -2.696009
2001-12-06    0.211603
2001-12-07    0.583017
2001-12-08   -1.912898
2001-12-09    1.284650
2001-12-10   -0.397858
2001-12-11    0.895207
2001-12-12    0.517974
2001-12-13    1.182084
2001-12-14 

In [34]:
# Here the string '2001' is interpreted as a year and selects that time period.
# Specifying the month works the same way.
longer_ts['2001-05']

2001-05-01   -0.489859
2001-05-02   -0.176150
2001-05-03    1.366656
2001-05-04    0.738752
2001-05-05    0.135589
2001-05-06    0.198006
2001-05-07    1.225382
2001-05-08    1.869997
2001-05-09    1.819084
2001-05-10   -0.767762
2001-05-11   -1.611401
2001-05-12    1.569330
2001-05-13   -1.040537
2001-05-14    0.803239
2001-05-15    0.916445
2001-05-16   -0.558189
2001-05-17    0.682303
2001-05-18   -0.438286
2001-05-19   -0.942583
2001-05-20   -0.785728
2001-05-21   -0.972179
2001-05-22    0.183154
2001-05-23    0.406681
2001-05-24    0.473486
2001-05-25    0.062740
2001-05-26    0.891382
2001-05-27    0.278929
2001-05-28    0.351553
2001-05-29   -1.777798
2001-05-30    0.160391
2001-05-31   -0.009250
Freq: D, dtype: float64

In [35]:
# Slicing with datetime objects works as well.
ts[datetime(2011, 1, 7):]

2011-01-07   -0.944450
2011-01-08   -0.033905
2011-01-10   -1.212500
2011-01-12   -0.590229
dtype: float64

In [36]:
# Because most time series data is ordered chronologically, you can slice with
# timestamps that are not contained in the time series to perform a range query.
ts

2011-01-02    0.846129
2011-01-05   -1.057333
2011-01-07   -0.944450
2011-01-08   -0.033905
2011-01-10   -1.212500
2011-01-12   -0.590229
dtype: float64

In [37]:
# Neither bound is a label in ts; the slice returns the entries that fall between them.
ts['1/6/2011': '1/11/2011']

2011-01-07   -0.944450
2011-01-08   -0.033905
2011-01-10   -1.212500
dtype: float64

In [38]:
# As before, you can pass a string date, a datetime, or a Timestamp. The original note
# for this cell states that slicing this way produces a view on the source time
# series, like slicing NumPy arrays: no data is copied and modifications to the slice
# are reflected in the original data.
# NOTE(review): whether writes to a slice propagate depends on the pandas version and
# its copy-on-write setting -- confirm before relying on it.
# There is also an equivalent instance method, truncate, that slices a Series between
# two dates.
ts.truncate(after='1/9/2011')

2011-01-02    0.846129
2011-01-05   -1.057333
2011-01-07   -0.944450
2011-01-08   -0.033905
dtype: float64

In [39]:
# The same selection operations also work on a DataFrame, indexing on its rows.
dates = pd.date_range(start='1/1/2000', periods=100, freq='W-WED')
long_df = pd.DataFrame(
    data=np.random.randn(100, 4),
    index=dates,
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
)
long_df.loc['5-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,-0.315456,-0.799577,-0.250605,-0.51806
2001-05-09,-0.802989,-0.066074,0.877158,-0.854961
2001-05-16,-0.222926,-0.893812,-0.612403,0.434553
2001-05-23,0.595706,1.226469,0.512978,0.834379
2001-05-30,-0.585919,0.405033,1.421442,0.664006


In [40]:
# Print the built-in documentation for pd.date_range (start, end, periods, freq, ...).
help(pd.date_range)

Help on function date_range in module pandas.core.indexes.datetimes:

date_range(start=None, end=None, periods=None, freq=None, tz=None, normalize=False, name=None, closed=None, **kwargs)
    Return a fixed frequency DatetimeIndex.
    
    Parameters
    ----------
    start : str or datetime-like, optional
        Left bound for generating dates.
    end : str or datetime-like, optional
        Right bound for generating dates.
    periods : integer, optional
        Number of periods to generate.
    freq : str or DateOffset, default 'D'
        Frequency strings can have multiples, e.g. '5H'. See
        :ref:`here <timeseries.offset_aliases>` for a list of
        frequency aliases.
    tz : str or tzinfo, optional
        Time zone name for returning localized DatetimeIndex, for example
        'Asia/Hong_Kong'. By default, the resulting DatetimeIndex is
        timezone-naive.
    normalize : bool, default False
        Normalize start/end dates to midnight before generating dat

In [41]:
# Time series with duplicate indices
# In some applications, multiple observations may fall on the same timestamp.
dates = pd.DatetimeIndex(
    ['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000', '1/3/2000']
)
dup_ts = pd.Series(data=np.arange(5), index=dates)

In [42]:
# Display the series; the label 2000-01-02 appears three times.
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [43]:
# Checking the index's is_unique attribute tells us whether its labels are unique.
dup_ts.index.is_unique

False

In [44]:
# Indexing into this time series produces either a scalar value or a slice, depending
# on whether the selected timestamp is duplicated. 2000-01-03 is not duplicated.
dup_ts['1/3/2000']

4

In [45]:
# 2000-01-02 is duplicated, so this returns all matching entries as a Series.
dup_ts['1/2/2000']

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [46]:
# Suppose you want to aggregate data that has non-unique timestamps. One way to do
# this is to use groupby and pass level=0 (group on the index itself).
grouped = dup_ts.groupby(level=0)
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [47]:
# Count how many observations share each timestamp.
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64