In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from pandas import DataFrame,Series

  from pandas.core import datetools


In [2]:
# 分组级运算和转换
df = DataFrame({'key1':['a','a','b','b','a'],
               'key2':['one','two','one','two','one'],
               'data1':np.random.randn(5),
               'data2':np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,0.550508,1.879314,a,one
1,0.254492,0.283275,a,two
2,0.663045,0.57271,b,one
3,0.618527,-3.163678,b,two
4,0.317932,1.12048,a,one


In [3]:
k1_means = df.groupby(['key1']).mean().add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.374311,1.094357
b,0.640786,-1.295484


In [4]:
pd.merge(df,k1_means,left_on='key1',right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,0.550508,1.879314,a,one,0.374311,1.094357
1,0.254492,0.283275,a,two,0.374311,1.094357
4,0.317932,1.12048,a,one,0.374311,1.094357
2,0.663045,0.57271,b,one,0.640786,-1.295484
3,0.618527,-3.163678,b,two,0.640786,-1.295484


In [5]:
people = DataFrame(np.random.randn(5,5),
                  columns=['a','b','c','d','e'],
                  index=['Joe','Steve','Wes','Jim','Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,0.488326,-0.045259,-1.150332,-0.974425,0.670636
Steve,0.264797,-0.306168,0.415148,-0.428822,-1.431038
Wes,1.023938,2.007095,-0.548136,-1.47564,0.92903
Jim,-0.274015,-1.228677,-0.608153,1.79226,0.505814
Travis,-1.259053,0.815636,-0.207568,0.160316,0.212394


In [6]:
# 使用key来进行分组，则第一行、第三行、第五行的均值
# 第二行、第四行的均值
key = ['one','two','one','two','one'] # 每一行的名称
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,0.084404,0.925824,-0.635345,-0.76325,0.60402
two,-0.004609,-0.767423,-0.096503,0.681719,-0.462612


In [7]:
# 这行代码的意思是，使用key进行分组，计算出mean值，
# 再将计算出来的均值填充到这些行。
people.groupby(key).transform(np.mean)

Unnamed: 0,a,b,c,d,e
Joe,0.084404,0.925824,-0.635345,-0.76325,0.60402
Steve,-0.004609,-0.767423,-0.096503,0.681719,-0.462612
Wes,0.084404,0.925824,-0.635345,-0.76325,0.60402
Jim,-0.004609,-0.767423,-0.096503,0.681719,-0.462612
Travis,0.084404,0.925824,-0.635345,-0.76325,0.60402


In [8]:
# 自定义填充（转换）规则，demean
def demean(arr):
    return arr - arr.mean()
demeaned = people.groupby(key).transform(demean)
demeaned

Unnamed: 0,a,b,c,d,e
Joe,0.403922,-0.971082,-0.514987,-0.211175,0.066616
Steve,0.269406,0.461254,0.51165,-1.110541,-0.968426
Wes,0.939534,1.081271,0.087209,-0.71239,0.32501
Jim,-0.269406,-0.461254,-0.51165,1.110541,0.968426
Travis,-1.343456,-0.110188,0.427778,0.923565,-0.391626


In [9]:
# 因为前面每个值都减去了平均值，所以应该是0.
# 显示不为0是因为浮点数计算误差
demeaned.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,0.0,-3.700743e-17,-3.700743e-17,1.110223e-16,-1.850372e-17
two,0.0,0.0,0.0,0.0,0.0


In [10]:
# apply : 一般性的“拆分-应用-合并”

In [11]:
import seaborn as sns

In [12]:
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [13]:
tips['tip_pct'] = tips.tip/tips.total_bill
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [14]:
# 获取小费比例最高的n条数据
def top(df,n=5,column='tip_pct'):
    return df.sort_values(by=column)[-n:]
top(tips,n=3)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345


In [15]:
# 先按是否吸烟分组，再分别查看小费比例
tips.groupby('smoker').apply(top,n=2,column='tip_pct')

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Yes,178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
No,149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199


In [16]:
result = tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
result

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Yes,Thur,197,43.11,5.0,Female,Yes,Thur,Lunch,4,0.115982
Yes,Fri,95,40.17,4.73,Male,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Male,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Male,Yes,Sun,Dinner,3,0.077178
No,Thur,142,41.19,5.0,Male,No,Thur,Lunch,5,0.121389
No,Fri,94,22.75,3.25,Female,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,Male,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,Male,No,Sun,Dinner,6,0.103799


In [17]:
# 对于unstack的作用，如下理解
'''
Pivot a level of the (necessarily hierarchical) index labels, returning
    a DataFrame having a new level of column labels whose inner-most level
    consists of the pivoted index labels.
'''
# 根据level这个参数，将多层的index中对应的某一层index，比如level=-1，
# 表示将最内层的index作为columns的新一层，生成新的dataframe。
# 如果dataframe的index是单层的，那么生成新的Series，此时Series的
# index是多层的
result.unstack('smoker')

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,tip,tip,sex,sex,smoker,smoker,day,day,time,time,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,Yes,No,Yes,No,Yes,No,Yes,No,Yes,No,Yes,No,Yes,No,Yes,No
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Thur,142,,41.19,,5.0,,Male,,No,,Thur,,Lunch,,5.0,,0.121389
Thur,197,43.11,,5.0,,Female,,Yes,,Thur,,Lunch,,4.0,,0.115982,
Fri,94,,22.75,,3.25,,Female,,No,,Fri,,Dinner,,2.0,,0.142857
Fri,95,40.17,,4.73,,Male,,Yes,,Fri,,Dinner,,4.0,,0.11775,
Sat,170,50.81,,10.0,,Male,,Yes,,Sat,,Dinner,,3.0,,0.196812,
Sat,212,,48.33,,9.0,,Male,,No,,Sat,,Dinner,,4.0,,0.18622
Sun,156,,48.17,,5.0,,Male,,No,,Sun,,Dinner,,6.0,,0.103799
Sun,182,45.35,,3.5,,Male,,Yes,,Sun,,Dinner,,3.0,,0.077178,


In [18]:
# 对于stack的作用，理解如下：
'''
Pivot a level of the (possibly hierarchical) column labels, returning a
    DataFrame (or Series in the case of an object with a single level of
    column labels) having a hierarchical index with a new inner-most level
    of row labels.
'''
# 将多层的columns，根据level参数，选定某一层，提取出来，转换成index
# 的新一层，填充到最内层，生成多层index
result = tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
result.unstack(level=0).stack()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,sex,smoker,day,time,size,tip_pct
day,Unnamed: 1_level_1,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Thur,142,No,41.19,5.0,Male,No,Thur,Lunch,5.0,0.121389
Thur,197,Yes,43.11,5.0,Female,Yes,Thur,Lunch,4.0,0.115982
Fri,94,No,22.75,3.25,Female,No,Fri,Dinner,2.0,0.142857
Fri,95,Yes,40.17,4.73,Male,Yes,Fri,Dinner,4.0,0.11775
Sat,170,Yes,50.81,10.0,Male,Yes,Sat,Dinner,3.0,0.196812
Sat,212,No,48.33,9.0,Male,No,Sat,Dinner,4.0,0.18622
Sun,156,No,48.17,5.0,Male,No,Sun,Dinner,6.0,0.103799
Sun,182,Yes,45.35,3.5,Male,Yes,Sun,Dinner,3.0,0.077178


In [19]:
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199


In [20]:
result.unstack('smoker')

       smoker
count  Yes        93.000000
       No        151.000000
mean   Yes         0.163196
       No          0.159328
std    Yes         0.085119
       No          0.039910
min    Yes         0.035638
       No          0.056797
25%    Yes         0.106771
       No          0.136906
50%    Yes         0.153846
       No          0.155625
75%    Yes         0.195059
       No          0.185014
max    Yes         0.710345
       No          0.291990
dtype: float64

In [21]:
'''
default:axis=0
axis : {0 or 'index', 1 or 'columns'}, default 0
        * 0 or 'index': apply function to each column
        * 1 or 'columns': apply function to each row
'''
result.apply(lambda x:x.describe())

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,122.0,0.161262,0.062514,0.046217,0.121838,0.154735,0.190036,0.501167
std,41.012193,0.002735,0.031968,0.014961,0.021309,0.001258,0.007103,0.295822
min,93.0,0.159328,0.03991,0.035638,0.106771,0.153846,0.185014,0.29199
25%,107.5,0.160295,0.051212,0.040928,0.114305,0.154291,0.187525,0.396578
50%,122.0,0.161262,0.062514,0.046217,0.121838,0.154735,0.190036,0.501167
75%,136.5,0.162229,0.073817,0.051507,0.129372,0.15518,0.192547,0.605756
max,151.0,0.163196,0.085119,0.056797,0.136906,0.155625,0.195059,0.710345


In [22]:
# 禁止分组键
# 禁止构成多重索引
# 此时smoker没有作为一层index，而是作为一个columns存在
tips.groupby('smoker',group_keys=False).apply(top)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
109,14.31,4.0,Female,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Male,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Female,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Female,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Male,Yes,Sun,Dinner,2,0.710345
88,24.71,5.85,Male,No,Thur,Lunch,2,0.236746
185,20.69,5.0,Male,No,Sun,Dinner,5,0.241663
51,10.29,2.6,Female,No,Sun,Dinner,2,0.252672
149,7.51,2.0,Male,No,Thur,Lunch,2,0.266312
232,11.61,3.39,Male,No,Sat,Dinner,2,0.29199


In [23]:
# 分位数和桶分析
frame = DataFrame({'data1':np.random.randn(1000),
                  'data2':np.random.randn(1000)})
factor = pd.cut(frame.data1,4)# 切4份，等距划分 qcut是等频划分
factor[:5]# 前5个元素

0     (0.0898, 1.568]
1    (-1.388, 0.0898]
2     (0.0898, 1.568]
3    (-1.388, 0.0898]
4    (-2.872, -1.388]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-2.872, -1.388] < (-1.388, 0.0898] < (0.0898, 1.568] < (1.568, 3.046]]

In [24]:
def get_stats(group):
    return {'min':group.min(),
           'max':group.max(),
           'count':group.count(),
           'mean':group.mean()}
# 根据data1的分段，对data2进行分组
grouped = frame.data2.groupby(factor)
result = grouped.apply(get_stats)
result.unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-2.872, -1.388]",73.0,2.541382,0.031874,-1.969359
"(-1.388, 0.0898]",442.0,2.782554,0.049768,-2.72843
"(0.0898, 1.568]",437.0,3.268943,-0.065322,-3.243978
"(1.568, 3.046]",48.0,2.395414,0.037877,-1.432584


In [25]:
# False的话默认是数字编号
grouping = pd.qcut(frame.data1,10,labels=list('ABCDEFGHIJ'))
grouping.head() # 返回每个元素的区间编号

0    I
1    D
2    F
3    D
4    A
Name: data1, dtype: category
Categories (10, object): [A < B < C < D ... G < H < I < J]

In [26]:
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,100.0,2.541382,0.018834,-1.969359
B,100.0,2.782554,0.151989,-2.389493
C,100.0,2.566009,0.103267,-2.522961
D,100.0,1.987446,0.029276,-2.728307
E,100.0,2.020572,-0.054654,-2.72843
F,100.0,3.268943,-0.037178,-2.467422
G,100.0,2.608333,-0.031832,-2.578758
H,100.0,2.038952,-0.115504,-2.676116
I,100.0,2.462106,-0.02965,-3.243978
J,100.0,2.395414,-0.058582,-2.999935


In [27]:
# 示例：用特定分组的值填充缺失值

In [32]:
s = Series(np.random.randn(6))
s[::2] = np.nan # s[::2]表示每两个index取一个数（每隔一个）
s.fillna(s.mean()) # 用平均值填充缺失值

0    0.052568
1    0.743672
2    0.052568
3   -0.386571
4    0.052568
5   -0.199398
dtype: float64

In [34]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East']*4 + ['West']*4
data = Series(np.random.randn(8),index=states)
data[['New York','Oregon','Idaho']] = np.nan
data

Ohio          1.240043
New York           NaN
Vermont      -1.786674
Florida       0.145921
Oregon             NaN
Nevada       -1.173932
California    3.029959
Idaho              NaN
dtype: float64

In [35]:
data.groupby(group_key).mean()# 求非NaN值的平均

East   -0.133570
West    0.928013
dtype: float64

In [36]:
fill_mean = lambda g:g.fillna(g.mean())
#分组后用每组的平均值填充缺失值
data.groupby(group_key).apply(fill_mean)

Ohio          1.240043
New York     -0.133570
Vermont      -1.786674
Florida       0.145921
Oregon        0.928013
Nevada       -1.173932
California    3.029959
Idaho         0.928013
dtype: float64

In [37]:
fill_values = {'East':0.5,'West':-1} # 指定填充值
fill_func = lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)

Ohio          1.240043
New York      0.500000
Vermont      -1.786674
Florida       0.145921
Oregon       -1.000000
Nevada       -1.173932
California    3.029959
Idaho        -1.000000
dtype: float64

In [40]:
# 随机采样和排列
# 红桃（Hearts）、黑桃（Spades）、梅花（Clubs）、方片（Diamonds）
suits = ['H','S','C','D']
# Python3下range是生成器，必须用list显示展开
card_val = (list(range(1,11)) + [10]*3)*4
base_names = ['A'] + list(range(2,11))+['J','K','Q']
cards = []
for suit in suits:
    cards.extend(str(num)+suit for num in base_names)
deck = Series(card_val, index=cards)
deck.head()

AH    1
2H    2
3H    3
4H    4
5H    5
dtype: int64

In [41]:
# np.random.permutation(x)  将np.arange(x)产生的list随机打乱
# 如果x 为多维数组，则将数组进行复制，打乱数据返回。
def draw(deck,n=5):
    # take出Series中相应index的样本
    return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)

2S      2
10D    10
3S      3
JH     10
9D      9
dtype: int64

In [46]:
# 每种花色中随机抽取两张牌
get_suit = lambda card :card[-1]
# 默认根据索引排序，索引的最后一个字符是花色
deck.groupby(get_suit).apply(draw,n=2)

C  2C     2
   9C     9
D  6D     6
   5D     5
H  KH    10
   8H     8
S  KS    10
   JS    10
dtype: int64

In [47]:
# 效果一样,但是不用多重索引
deck.groupby(get_suit,group_keys=False).apply(draw,n=2)

AC     1
6C     6
2D     2
5D     5
QH    10
AH     1
5S     5
4S     4
dtype: int64

In [None]:
# 示例：分组加权平均数和相关系数

In [48]:
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.rand(8)})
df

Unnamed: 0,category,data,weights
0,a,0.535783,0.127268
1,a,-1.485562,0.098371
2,a,0.005903,0.995334
3,a,0.004751,0.609944
4,b,-1.56233,0.972856
5,b,-0.305661,0.20312
6,b,-0.445504,0.816564
7,b,1.480661,0.93903


In [49]:
grouped = df.groupby('category')

In [50]:
# 求加权平均，weights自动归一化处理
get_wavg = lambda g:np.average(g['data'],
                              weights=g['weights'])
grouped.apply(get_wavg)# 分组计算

category
a   -0.037781
b   -0.189456
dtype: float64

In [53]:
close_px = pd.read_csv('../dataset/stock_px.csv',
                      parse_dates=True,index_col=0)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5472 entries, 1990-02-01 to 2011-10-14
Data columns (total 9 columns):
AA      5472 non-null float64
AAPL    5472 non-null float64
GE      5472 non-null float64
IBM     5472 non-null float64
JNJ     5472 non-null float64
MSFT    5472 non-null float64
PEP     5471 non-null float64
SPX     5472 non-null float64
XOM     5472 non-null float64
dtypes: float64(9)
memory usage: 427.5 KB


In [54]:
close_px.head()

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990-02-01,4.98,7.86,2.87,16.79,4.27,0.51,6.04,328.79,6.12
1990-02-02,5.04,8.0,2.87,16.89,4.37,0.51,6.09,330.92,6.24
1990-02-05,5.07,8.18,2.87,17.32,4.34,0.51,6.05,331.85,6.25
1990-02-06,5.01,8.12,2.88,17.56,4.32,0.51,6.15,329.66,6.23
1990-02-07,5.04,7.77,2.91,17.93,4.38,0.51,6.17,333.75,6.33


In [64]:
# pct_change 计算相邻两行之间的变化率，默认axis=0，
# 也可以定义axis=1，此时计算时相邻两列之间的变化率
rets = close_px.pct_change().dropna() # 扔掉有空数据的行
spx_corr = lambda x:x.corrwith(x['SPX'])# 与SPX的相关系数
by_year = close_px.groupby(lambda x:x.year)# 指定用哪个函数去做group
by_year.apply(spx_corr)# 按照年分组并计算与SPX的相关系数

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990,0.666344,0.79092,0.936572,0.542328,0.007684,0.418844,0.131762,1.0,-0.367608
1991,0.501961,0.062452,0.857109,-0.442377,0.80452,0.696698,0.595248,1.0,0.898667
1992,0.031827,0.043334,0.777656,-0.627887,0.527574,0.460122,0.732524,1.0,0.434972
1993,0.235182,-0.738769,0.753559,0.134773,-0.030199,-0.421216,0.325393,1.0,0.541652
1994,0.369808,0.428639,0.76473,0.115749,0.13448,-0.089835,0.394886,1.0,0.469686
1995,0.841431,-0.165185,0.945882,0.72844,0.952851,0.853378,0.96618,1.0,0.953864
1996,0.510062,-0.328348,0.956553,0.89041,0.7767,0.930639,-0.061709,1.0,0.902646
1997,0.60893,0.217403,0.972252,0.947289,0.646167,0.93037,0.908041,1.0,0.934569
1998,0.219095,0.450231,0.912106,0.601698,0.503901,0.702868,0.618352,1.0,0.688878
1999,0.745858,0.67893,0.795967,0.447046,0.671333,0.698906,-0.207475,1.0,0.728155


In [65]:
by_year.apply(lambda g:g['AAPL'].corr(g['MSFT']))

1990    0.505225
1991   -0.014891
1992    0.449038
1993    0.720003
1994    0.445075
1995    0.165660
1996   -0.525291
1997    0.244986
1998    0.812362
1999    0.673524
2000    0.635638
2001    0.696534
2002    0.631651
2003    0.666938
2004    0.784780
2005    0.570353
2006    0.815130
2007    0.761417
2008    0.729151
2009    0.951722
2010   -0.342094
2011    0.307734
dtype: float64

In [66]:
# 示例：面向分组的线性回归

In [67]:
# sm的线性回归模型拟合的时候需要手动加截距
def regress(data,yvar,xvars):
    y = data[yvar]
    x = data[xvars]
    x['intercept'] = 1.
    result = sm.OLS(y,x).fit()
    return result.params

by_year.apply(regress,'AAPL',['SPX'])

Unnamed: 0,SPX,intercept
1990,0.048327,-7.257926
1991,0.005647,10.344999
1992,0.008978,9.39817
1993,-0.233618,115.425788
1994,0.04979,-14.554316
1995,-0.003646,12.050288
1996,-0.007084,10.9796
1997,0.002047,2.704009
1998,0.011162,-4.474886
1999,0.064633,-71.346362
