In [43]:
%pylab inline
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

Populating the interactive namespace from numpy and matplotlib


# 金融和经济数据应用
## 数据规整化方面的话题

### 时间序列以及截面对齐
“数据对齐”（data alignment）：两个相关的时间序列的索引可能没有很好地对齐，或者两个DataFrame对象可能含有不匹配的列或行。  
在MATLAB/R中，用户要花费大量精力将数据规整化为完全对齐的形式；而pandas可以在算术运算中自动对齐数据。

In [44]:
# Load the stock price table; column 0 of the CSV is parsed as dates and used as the index.
df = pd.read_csv(
    'old-file/ch11/stock_px.csv',
    index_col=[0],
    parse_dates=[0],
)
# Preview the first ten rows.
df.head(10)

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990-02-01,4.98,7.86,2.87,16.79,4.27,0.51,6.04,328.79,6.12
1990-02-02,5.04,8.0,2.87,16.89,4.37,0.51,6.09,330.92,6.24
1990-02-05,5.07,8.18,2.87,17.32,4.34,0.51,6.05,331.85,6.25
1990-02-06,5.01,8.12,2.88,17.56,4.32,0.51,6.15,329.66,6.23
1990-02-07,5.04,7.77,2.91,17.93,4.38,0.51,6.17,333.75,6.33
1990-02-08,5.04,7.71,2.92,17.86,4.46,0.51,6.22,332.96,6.35
1990-02-09,5.06,8.0,2.94,17.82,4.49,0.52,6.24,333.62,6.37
1990-02-12,4.96,7.94,2.89,17.58,4.46,0.52,6.23,330.08,6.22
1990-02-13,4.91,8.06,2.88,17.95,4.43,0.52,6.09,331.02,6.23
1990-02-14,4.94,8.0,2.89,18.04,4.47,0.52,6.1,332.01,6.2


In [45]:
# Restrict the price table to four tickers and preview the first rows.
prices = df.loc[:, ['AAPL', 'JNJ', 'SPX', 'XOM']]
prices.head()

Unnamed: 0,AAPL,JNJ,SPX,XOM
1990-02-01,7.86,4.27,328.79,6.12
1990-02-02,8.0,4.37,330.92,6.24
1990-02-05,8.18,4.34,331.85,6.25
1990-02-06,8.12,4.32,329.66,6.23
1990-02-07,7.77,4.38,333.75,6.33


In [46]:
# NOTE(review): this rebinds `df` from the price table to the volume table, so
# re-running the earlier `prices = df[...]` cell afterwards would read volume data.
df = pd.read_csv(
    'old-file/ch11/volume.csv',
    index_col=[0],
    parse_dates=[0],
)
# Volume is only selected for three tickers (no SPX column here).
volume = df.loc[:, ['AAPL', 'JNJ', 'XOM']]
volume.head()

Unnamed: 0,AAPL,JNJ,XOM
1990-02-01,4193200.0,5942400.0,2916400.0
1990-02-02,4248800.0,4732800.0,4250000.0
1990-02-05,3653200.0,3950400.0,5880800.0
1990-02-06,2640000.0,3761600.0,4750800.0
1990-02-07,11180800.0,5458400.0,4124800.0


In [47]:
# Element-wise product; pandas aligns both operands on index and columns first,
# so SPX (selected in prices but not in volume) comes out as NaN.
prices * volume

Unnamed: 0,AAPL,JNJ,SPX,XOM
1990-02-01,3.295855e+07,2.537405e+07,,1.784837e+07
1990-02-02,3.399040e+07,2.068234e+07,,2.652000e+07
1990-02-05,2.988318e+07,1.714474e+07,,3.675500e+07
1990-02-06,2.143680e+07,1.625011e+07,,2.959748e+07
1990-02-07,8.687482e+07,2.390779e+07,,2.610998e+07
1990-02-08,5.150280e+07,3.866642e+07,,3.588512e+07
1990-02-09,4.803520e+07,1.471642e+07,,2.156118e+07
1990-02-12,2.140306e+07,1.143901e+07,,1.678156e+07
1990-02-13,2.944802e+07,1.610394e+07,,2.220870e+07
1990-02-14,2.758400e+07,1.656761e+07,,1.754600e+07


In [48]:
# Volume-weighted average price per column: sum(price * volume) / sum(volume).
vwap = prices.mul(volume).sum().div(volume.sum())
vwap

AAPL    81.246271
JNJ     40.576111
SPX           NaN
XOM     50.520303
dtype: float64

In [49]:
# Remove the NaN entry (SPX, which has no matching volume column).
vwap.dropna()

AAPL    81.246271
JNJ     40.576111
XOM     50.520303
dtype: float64

In [50]:
# To discard non-matching labels explicitly, use DataFrame.align: it returns a
# tuple containing the reindexed versions of both objects.
prices.align(volume, join='inner')

(              AAPL    JNJ    XOM
 1990-02-01    7.86   4.27   6.12
 1990-02-02    8.00   4.37   6.24
 1990-02-05    8.18   4.34   6.25
 1990-02-06    8.12   4.32   6.23
 1990-02-07    7.77   4.38   6.33
 1990-02-08    7.71   4.46   6.35
 1990-02-09    8.00   4.49   6.37
 1990-02-12    7.94   4.46   6.22
 1990-02-13    8.06   4.43   6.23
 1990-02-14    8.00   4.47   6.20
 1990-02-15    8.00   4.54   6.40
 1990-02-16    7.91   4.47   6.33
 1990-02-20    7.85   4.39   6.25
 1990-02-21    7.97   4.33   6.28
 1990-02-22    7.73   4.28   6.22
 1990-02-23    7.79   4.20   6.22
 1990-02-26    7.97   4.30   6.37
 1990-02-27    7.85   4.30   6.38
 1990-02-28    7.97   4.32   6.20
 1990-03-01    8.03   4.35   6.13
 1990-03-02    7.91   4.43   6.17
 1990-03-05    8.08   4.36   6.10
 1990-03-06    8.26   4.45   6.22
 1990-03-07    8.29   4.45   6.13
 1990-03-08    8.61   4.50   6.17
 1990-03-09    8.64   4.48   6.12
 1990-03-12    8.58   4.46   6.13
 1990-03-13    8.64   4.38   6.02
 1990-03-14   

In [51]:
# Build a DataFrame from several Series whose indexes only partly overlap;
# labels missing from a given Series show up as NaN in that column.
s1 = Series(range(3), index=list('abc'))
s2 = Series(range(4), index=list('dbce'))
s3 = Series(range(3), index=list('fac'))

DataFrame({'one': s1, 'two': s2, 'three': s3})

Unnamed: 0,one,three,two
a,0.0,1.0,
b,1.0,,1.0
c,2.0,2.0,2.0
d,,,0.0
e,,,3.0
f,,0.0,


In [52]:
# Passing an explicit index keeps only those labels, in the order given.
DataFrame(
    {'one': s1, 'two': s2, 'three': s3},
    index=['f', 'a', 'c', 'e'],
)

Unnamed: 0,one,three,two
f,,0.0,
a,0.0,1.0,
c,2.0,2.0,2.0
e,,,3.0


### 频率不同的时间序列的运算
频率转换和重对齐的两大主要工具是resample和reindex方法。  
resample用于将数据转换到固定频率，而reindex则用于使数据符合一个新索引。  
它们都支持插值（如前向填充）逻辑。

In [53]:
# Three weekly observations stamped on Wednesdays. The values are arbitrary demo
# data; a locally seeded RandomState makes them identical on every
# Restart & Run All without touching the global NumPy random state.
ts1 = Series(np.random.RandomState(0).randn(3),
             index=pd.date_range('2012-6-12', periods=3, freq='W-WED'))
ts1

2012-06-13   -0.512860
2012-06-20    2.356966
2012-06-27   -0.532520
Freq: W-WED, dtype: float64

In [54]:
# Resample to business-day frequency; days with no observation come out as NaN.
ts1.resample('B').mean()

2012-06-13   -0.512860
2012-06-14         NaN
2012-06-15         NaN
2012-06-18         NaN
2012-06-19         NaN
2012-06-20    2.356966
2012-06-21         NaN
2012-06-22         NaN
2012-06-25         NaN
2012-06-26         NaN
2012-06-27   -0.532520
Freq: B, dtype: float64

In [55]:
# Same business-day resampling, but forward-fill so each day carries the most
# recent Wednesday value instead of NaN.
ts1.resample('B').ffill()

2012-06-13   -0.512860
2012-06-14   -0.512860
2012-06-15   -0.512860
2012-06-18   -0.512860
2012-06-19   -0.512860
2012-06-20    2.356966
2012-06-21    2.356966
2012-06-22    2.356966
2012-06-25    2.356966
2012-06-26    2.356966
2012-06-27   -0.532520
Freq: B, dtype: float64

In [56]:
# An irregularly spaced series. The values are arbitrary demo data; a locally
# seeded RandomState keeps them reproducible across kernel restarts without
# altering the global NumPy random state.
dates = pd.DatetimeIndex(['2012-6-12', '2012-6-17', '2012-6-18',
                          '2012-6-21', '2012-6-22', '2012-6-29'])
ts2 = Series(np.random.RandomState(1).randn(6), index=dates)
ts2

2012-06-12    0.344643
2012-06-17    1.844415
2012-06-18   -1.036623
2012-06-21   -0.516821
2012-06-22   -0.337424
2012-06-29    0.866701
dtype: float64

In [57]:
# Goal: add the values in ts1 to ts2 while keeping ts2's date index. Reindexing
# ts1 onto ts2.index with forward fill gives each ts2 date the latest ts1 value.
ts1.reindex(ts2.index, method='ffill')

2012-06-12         NaN
2012-06-17   -0.512860
2012-06-18   -0.512860
2012-06-21    2.356966
2012-06-22    2.356966
2012-06-29   -0.532520
dtype: float64

In [58]:
# Add the forward-filled ts1 values onto ts2's own dates; 2012-06-12 stays NaN
# because it precedes the first ts1 observation.
ts2.add(ts1.reindex(ts2.index, method='ffill'))

2012-06-12         NaN
2012-06-17    1.331555
2012-06-18   -1.549483
2012-06-21    1.840145
2012-06-22    2.019542
2012-06-29    0.334181
dtype: float64

### 使用Period
Period（表示时间区间）提供了另一种处理不同频率时间序列的办法，尤其适用于那些有着**特殊规范**的、以年或季度为频率的金融或经济序列。  
例如，一个公司可能会发布以6月份结尾的财年的每季度盈利报告，其频率为Q-JUN。来看两个有关GDP和通货膨胀的宏观经济时间序列：

In [59]:
# Quarterly GDP figures; 'Q-SEP' means quarterly periods of a fiscal year ending in September.
gdp = Series(
    [1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46],
    index=pd.period_range(start='1984Q2', periods=7, freq='Q-SEP'),
)
# Annual inflation figures; 'A-DEC' means annual periods ending in December.
infl = Series(
    [0.025, 0.045, 0.037, 0.04],
    index=pd.period_range(start='1982', periods=4, freq='A-DEC'),
)

In [60]:
# Display the quarterly GDP series (Q-SEP periods).
gdp

1984Q2    1.78
1984Q3    1.94
1984Q4    2.08
1985Q1    2.01
1985Q2    2.15
1985Q3    2.31
1985Q4    2.46
Freq: Q-SEP, dtype: float64

In [61]:
# Display the annual inflation series (A-DEC periods).
infl

1982    0.025
1983    0.045
1984    0.037
1985    0.040
Freq: A-DEC, dtype: float64

In [62]:
# Re-express each annual period as the Q-SEP quarter that contains the end of
# that year (how='end'), e.g. 1982 becomes 1983Q1.
infl_q = infl.asfreq('Q-SEP', how='end')

In [63]:
# Display the converted series: one value per year, now labelled with Q-SEP quarters.
infl_q

1983Q1    0.025
1984Q1    0.045
1985Q1    0.037
1986Q1    0.040
Freq: Q-SEP, dtype: float64

In [64]:
# Align inflation to the GDP quarters, forward-filling so every GDP quarter
# carries the most recent annual inflation value.
infl_q.reindex(gdp.index, method='ffill')

1984Q2    0.045
1984Q3    0.045
1984Q4    0.045
1985Q1    0.037
1985Q2    0.037
1985Q3    0.037
1985Q4    0.037
Freq: Q-SEP, dtype: float64

### 时间和“最当前”数据选取
