In [1]:
from __future__ import division
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
pd.options.display.max_rows = 12
np.set_printoptions(precision=4, suppress=True)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(12, 4))

%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Data munging topics

데이터 준비 

## Time series and cross-section alignment



In [4]:
close_px = pd.read_csv('data/ch11/stock_px.csv', parse_dates=True, index_col=0)
volume = pd.read_csv('data/ch11/volume.csv', parse_dates=True, index_col=0)
prices = close_px.loc['2011-09-05':'2011-09-14', ['AAPL', 'JNJ', 'SPX', 'XOM']]
volume = volume.loc['2011-09-05':'2011-09-12', ['AAPL', 'JNJ', 'XOM']]

In [8]:
# 거래량
volume
prices * volume

Unnamed: 0,AAPL,JNJ,XOM
2011-09-06,18173500.0,15848300.0,25416300.0
2011-09-07,12492000.0,10759700.0,23108400.0
2011-09-08,14839800.0,15551500.0,22434800.0
2011-09-09,20171900.0,17008200.0,27969100.0
2011-09-12,16697300.0,13448200.0,26205800.0


Unnamed: 0,AAPL,JNJ,SPX,XOM
2011-09-06,6901205000.0,1024434000.0,,1808370000.0
2011-09-07,4796054000.0,704007200.0,,1701934000.0
2011-09-08,5700561000.0,1010070000.0,,1633702000.0
2011-09-09,7614489000.0,1082402000.0,,1986086000.0
2011-09-12,6343972000.0,855171000.0,,1882625000.0
2011-09-13,,,,
2011-09-14,,,,


In [9]:
# 단일 거래에 대한, 거래된 한주당 평균 주가 
vwap = (prices * volume).sum() / volume.sum()
vwap

AAPL    380.655181
JNJ      64.394769
SPX            NaN
XOM      72.024288
dtype: float64

In [10]:
vwap.dropna()

AAPL    380.655181
JNJ      64.394769
XOM      72.024288
dtype: float64

In [11]:
prices.align(volume, join='inner')

(              AAPL    JNJ    XOM
 2011-09-06  379.74  64.64  71.15
 2011-09-07  383.93  65.43  73.65
 2011-09-08  384.14  64.95  72.82
 2011-09-09  377.48  63.64  71.01
 2011-09-12  379.94  63.59  71.84,
                   AAPL         JNJ         XOM
 2011-09-06  18173500.0  15848300.0  25416300.0
 2011-09-07  12492000.0  10759700.0  23108400.0
 2011-09-08  14839800.0  15551500.0  22434800.0
 2011-09-09  20171900.0  17008200.0  27969100.0
 2011-09-12  16697300.0  13448200.0  26205800.0)

In [12]:
s1 = Series(range(3), index=['a', 'b', 'c'])
s2 = Series(range(4), index=['d', 'b', 'c', 'e'])
s3 = Series(range(3), index=['f', 'a', 'c'])
DataFrame({'one': s1, 'two': s2, 'three': s3})

Unnamed: 0,one,three,two
a,0.0,1.0,
b,1.0,,1.0
c,2.0,2.0,2.0
d,,,0.0
e,,,3.0
f,,0.0,


In [13]:
DataFrame({'one': s1, 'two': s2, 'three': s3}, index=list('face'))

Unnamed: 0,one,three,two
f,,0.0,
a,0.0,1.0,
c,2.0,2.0,2.0
e,,,3.0


### Operations with time series of different frequencies

In [15]:
ts1 = Series(np.random.randn(3),
             index=pd.date_range('2012-6-13', periods=3, freq='W-WED'))
ts1

2012-06-13    1.619605
2012-06-20   -0.525436
2012-06-27   -2.095212
Freq: W-WED, dtype: float64

In [16]:
ts1.resample('B')

DatetimeIndexResampler [freq=<BusinessDay>, axis=0, closed=left, label=left, convention=start, base=0]

In [17]:
ts1.resample('B').ffill()

2012-06-13    1.619605
2012-06-14    1.619605
2012-06-15    1.619605
2012-06-18    1.619605
2012-06-19    1.619605
2012-06-20   -0.525436
2012-06-21   -0.525436
2012-06-22   -0.525436
2012-06-25   -0.525436
2012-06-26   -0.525436
2012-06-27   -2.095212
Freq: B, dtype: float64

In [19]:
dates = pd.DatetimeIndex(['2012-6-12', '2012-6-17', '2012-6-18',
                          '2012-6-21', '2012-6-22', '2012-6-29'])
ts2 = Series(np.random.randn(6), index=dates)
ts2

2012-06-12   -1.124883
2012-06-17    0.134027
2012-06-18    1.129768
2012-06-21    1.375899
2012-06-22   -0.913427
2012-06-29    1.031935
dtype: float64

In [20]:
# ts2로 다시 인덱싱 하겠다. 
ts1.reindex(ts2.index).ffill()

2012-06-12   NaN
2012-06-17   NaN
2012-06-18   NaN
2012-06-21   NaN
2012-06-22   NaN
2012-06-29   NaN
dtype: float64

In [21]:
ts2 + ts1.reindex(ts2.index).ffill()

2012-06-12   NaN
2012-06-17   NaN
2012-06-18   NaN
2012-06-21   NaN
2012-06-22   NaN
2012-06-29   NaN
dtype: float64

#### Using periods instead of timestamps

In [22]:
gdp = Series([1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46],
             index=pd.period_range('1984Q2', periods=7, freq='Q-SEP'))
infl = Series([0.025, 0.045, 0.037, 0.04],
              index=pd.period_range('1982', periods=4, freq='A-DEC'))
gdp
infl

1984Q2    1.78
1984Q3    1.94
1984Q4    2.08
1985Q1    2.01
1985Q2    2.15
1985Q3    2.31
1985Q4    2.46
Freq: Q-SEP, dtype: float64

1982    0.025
1983    0.045
1984    0.037
1985    0.040
Freq: A-DEC, dtype: float64

In [24]:
infl_q = infl.asfreq('Q-SEP', how='end')
infl_q

1983Q1    0.025
1984Q1    0.045
1985Q1    0.037
1986Q1    0.040
Freq: Q-SEP, dtype: float64

In [25]:
infl_q.reindex(gdp.index, method='ffill')
infl_q

1984Q2    0.045
1984Q3    0.045
1984Q4    0.045
1985Q1    0.037
1985Q2    0.037
1985Q3    0.037
1985Q4    0.037
Freq: Q-SEP, dtype: float64

1983Q1    0.025
1984Q1    0.045
1985Q1    0.037
1986Q1    0.040
Freq: Q-SEP, dtype: float64