In [1]:
import pandas as pd
import numpy as np

In [2]:
%matplotlib inline

In [3]:
# Toy frame: two label columns to group on plus two columns of Gaussian noise.
df = pd.DataFrame(dict(
    key1=list('aabba'),
    key2=['one', 'two'] * 2 + ['one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))

In [4]:
df

Unnamed: 0,data1,data2,key1,key2
0,-1.718982,-1.870917,a,one
1,-1.478086,-1.059762,a,two
2,-1.020752,-0.913036,b,one
3,-0.375736,0.543435,b,two
4,1.99647,1.279956,a,one


In [5]:
grouped = df['data1'].groupby(df['key1'])

In [6]:
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f75a88128>

In [7]:
grouped.mean()

key1
a   -0.400199
b   -0.698244
Name: data1, dtype: float64

In [8]:
grouped.size()

key1
a    3
b    2
Name: data1, dtype: int64

In [9]:
for name, group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0 -1.718982 -1.870917    a  one
1 -1.478086 -1.059762    a  two
4  1.996470  1.279956    a  one
b
      data1     data2 key1 key2
2 -1.020752 -0.913036    b  one
3 -0.375736  0.543435    b  two


In [10]:
for (k1, k2), group  in df.groupby(['key1', 'key2']):
    print(k1, k2)
    print(group)

a one
      data1     data2 key1 key2
0 -1.718982 -1.870917    a  one
4  1.996470  1.279956    a  one
a two
      data1     data2 key1 key2
1 -1.478086 -1.059762    a  two
b one
      data1     data2 key1 key2
2 -1.020752 -0.913036    b  one
b two
      data1     data2 key1 key2
3 -0.375736  0.543435    b  two


In [11]:
pieces = dict(list(df.groupby('key1')))

In [12]:
pieces

{'a':       data1     data2 key1 key2
 0 -1.718982 -1.870917    a  one
 1 -1.478086 -1.059762    a  two
 4  1.996470  1.279956    a  one, 'b':       data1     data2 key1 key2
 2 -1.020752 -0.913036    b  one
 3 -0.375736  0.543435    b  two}

In [13]:
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,-1.020752,-0.913036,b,one
3,-0.375736,0.543435,b,two


In [14]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [15]:
# Group the columns (axis=1) by their dtype rather than grouping the rows.
# NOTE(review): DataFrame.groupby(axis=1) is deprecated in recent pandas
# releases — confirm the target pandas version before relying on this.
grouped = df.groupby(df.dtypes, axis=1)

In [16]:
for dtype, group in grouped:
    print(dtype)
    print(group)

float64
      data1     data2
0 -1.718982 -1.870917
1 -1.478086 -1.059762
2 -1.020752 -0.913036
3 -0.375736  0.543435
4  1.996470  1.279956
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one


In [17]:
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.295481
a,two,-1.059762
b,one,-0.913036
b,two,0.543435


In [18]:
# Grouping with a dict or a Series
people = pd.DataFrame(np.random.randn(5, 5), 
                      columns=['a', 'b', 'c', 'd', 'e'],
                      index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])

In [19]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.848424,-0.617921,0.471463,-0.386574,0.760802
Steve,-1.088931,-0.054358,-1.074148,-0.157982,-0.194246
Wes,0.139748,0.072812,-0.521808,-0.753134,-0.301435
Jim,1.785282,-1.486716,-0.95412,0.058197,0.683829
Travis,-0.505682,-0.528254,-1.723207,-0.999245,0.311155


In [20]:
# Blank out columns 'b' and 'c' for the row at position 2 ('Wes') so the
# aggregations below have missing values to deal with.
people.iloc[2:3, [1, 2]] = np.nan

In [21]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.848424,-0.617921,0.471463,-0.386574,0.760802
Steve,-1.088931,-0.054358,-1.074148,-0.157982,-0.194246
Wes,0.139748,,,-0.753134,-0.301435
Jim,1.785282,-1.486716,-0.95412,0.058197,0.683829
Travis,-0.505682,-0.528254,-1.723207,-0.999245,0.311155


In [22]:
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}

In [23]:
# Group the columns by colour: each column label is looked up in `mapping`.
# The extra 'f' key matches no column of `people` and simply goes unused.
# NOTE(review): groupby(axis=1) is deprecated in recent pandas releases — confirm.
by_column = people.groupby(mapping, axis=1)

In [24]:
by_column

<pandas.core.groupby.DataFrameGroupBy object at 0x7f75a512e8>

In [25]:
by_column.sum()

Unnamed: 0,blue,red
Joe,0.084889,0.991305
Steve,-1.23213,-1.337535
Wes,-0.753134,-0.161688
Jim,-0.895923,0.982395
Travis,-2.722452,-0.722782


In [26]:
map_series = pd.Series(mapping)

In [27]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [28]:
# Count the non-NA values per colour group for every person.  Grouping the
# rows of the transposed frame and transposing back gives the same table as
# the deprecated ``groupby(..., axis=1)`` form and works on current pandas.
people.T.groupby(map_series).count().T

Unnamed: 0,blue,red
Joe,2,3
Steve,2,3
Wes,1,2
Jim,2,3
Travis,2,3


In [29]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,2.773453,-2.104637,-0.482656,-1.081512,1.143195
5,-1.088931,-0.054358,-1.074148,-0.157982,-0.194246
6,-0.505682,-0.528254,-1.723207,-0.999245,0.311155


In [30]:
df

Unnamed: 0,data1,data2,key1,key2
0,-1.718982,-1.870917,a,one
1,-1.478086,-1.059762,a,two
2,-1.020752,-0.913036,b,one
3,-0.375736,0.543435,b,two
4,1.99647,1.279956,a,one


In [31]:
grouped = df.groupby('key1')

In [32]:
grouped.describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-0.400199,2.079068,-1.718982,-1.598534,-1.478086,0.259192,1.99647,3.0,-0.550241,1.636065,-1.870917,-1.46534,-1.059762,0.110097,1.279956
b,2.0,-0.698244,0.456095,-1.020752,-0.859498,-0.698244,-0.53699,-0.375736,2.0,-0.184801,1.029881,-0.913036,-0.548918,-0.184801,0.179317,0.543435


In [33]:
tips = pd.read_csv('./tips.csv')

In [34]:
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size
0,16.99,1.01,No,Sun,Dinner,2
1,10.34,1.66,No,Sun,Dinner,3
2,21.01,3.5,No,Sun,Dinner,3
3,23.68,3.31,No,Sun,Dinner,2
4,24.59,3.61,No,Sun,Dinner,4


In [35]:
tips['tip_pct'] = tips['tip'] / tips['total_bill']

In [36]:
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [37]:
grouped = tips.groupby(['day', 'smoker'])

In [38]:
grouped_pct = grouped['tip_pct']

In [39]:
grouped_pct.agg('mean')

day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64

In [40]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest."""
    highest, lowest = arr.max(), arr.min()
    return highest - lowest

In [41]:
grouped_pct.agg(['mean', 'std', peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,0.15165,0.028123,0.067349
Fri,Yes,0.174783,0.051293,0.159925
Sat,No,0.158048,0.039767,0.235193
Sat,Yes,0.147906,0.061375,0.290095
Sun,No,0.160113,0.042347,0.193226
Sun,Yes,0.18725,0.154134,0.644685
Thur,No,0.160298,0.038774,0.19335
Thur,Yes,0.163863,0.039389,0.15124


In [42]:
# Give the result columns custom names via (name, function) tuples.
# The string 'std' is used instead of np.std so that the groupby's own
# standard deviation is what gets computed, whatever the pandas version.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])

Unnamed: 0_level_0,Unnamed: 1_level_0,foo,bar
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,No,0.15165,0.028123
Fri,Yes,0.174783,0.051293
Sat,No,0.158048,0.039767
Sat,Yes,0.147906,0.061375
Sun,No,0.160113,0.042347
Sun,Yes,0.18725,0.154134
Thur,No,0.160298,0.038774
Thur,Yes,0.163863,0.039389


In [43]:
functions = ['count', 'mean', 'max']
# Select several columns with a list: indexing a GroupBy with a bare tuple
# of column names is rejected by pandas >= 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)

In [44]:
result

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,tip_pct,total_bill,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Fri,No,4,0.15165,0.187735,4,18.42,22.75
Fri,Yes,15,0.174783,0.26348,15,16.813333,40.17
Sat,No,45,0.158048,0.29199,45,19.661778,48.33
Sat,Yes,42,0.147906,0.325733,42,21.276667,50.81
Sun,No,57,0.160113,0.252672,57,20.506667,48.17
Sun,Yes,19,0.18725,0.710345,19,24.12,45.35
Thur,No,45,0.160298,0.266312,45,17.113111,41.19
Thur,Yes,17,0.163863,0.241255,17,19.190588,43.11


In [45]:
result['tip_pct']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,No,4,0.15165,0.187735
Fri,Yes,15,0.174783,0.26348
Sat,No,45,0.158048,0.29199
Sat,Yes,42,0.147906,0.325733
Sun,No,57,0.160113,0.252672
Sun,Yes,19,0.18725,0.710345
Thur,No,45,0.160298,0.266312
Thur,Yes,17,0.163863,0.241255


In [46]:
# (output column name, aggregation) pairs.  The aggregation is given as the
# string 'var' rather than np.var so the groupby's own variance is used
# consistently across pandas versions.
ftuples = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]

In [47]:
# Column selection uses a list; a bare tuple of names is rejected by pandas >= 2.0.
grouped[['tip_pct', 'total_bill']].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
day,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Fri,No,0.15165,0.000791,18.42,25.596333
Fri,Yes,0.174783,0.002631,16.813333,82.562438
Sat,No,0.158048,0.001581,19.661778,79.908965
Sat,Yes,0.147906,0.003767,21.276667,101.387535
Sun,No,0.160113,0.001793,20.506667,66.09998
Sun,Yes,0.18725,0.023757,24.12,109.046044
Thur,No,0.160298,0.001503,17.113111,59.625081
Thur,Yes,0.163863,0.001551,19.190588,69.808518


In [48]:
def top(df, n=5, columns='tip_pct'):
    """Return the ``n`` rows of ``df`` with the largest values in ``columns``.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    columns : column label, or list of labels, to sort by.

    Returns
    -------
    DataFrame with the selected rows in ascending order of ``columns``,
    so the largest value comes last.
    """
    # tail(n) rather than [-n:]: the slice [-0:] would return every row
    # when n == 0, whereas tail(0) correctly returns an empty frame.
    return df.sort_values(by=columns).tail(n)

In [49]:
top(tips, n=6)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
232,11.61,3.39,No,Sat,Dinner,2,0.29199
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [50]:
tips.groupby('smoker').apply(top)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,88,24.71,5.85,No,Thur,Lunch,2,0.236746
No,185,20.69,5.0,No,Sun,Dinner,5,0.241663
No,51,10.29,2.6,No,Sun,Dinner,2,0.252672
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
Yes,183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
Yes,67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [51]:
# Extra arguments for the applied function can be passed along through apply
tips.groupby(['smoker', 'day']).apply(top, n=5, columns='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,99,12.46,1.5,No,Fri,Dinner,2,0.120385
No,Fri,223,15.98,3.0,No,Fri,Lunch,3,0.187735
No,Fri,91,22.49,3.5,No,Fri,Dinner,2,0.155625
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,39,31.27,5.0,No,Sat,Dinner,3,0.159898
No,Sat,238,35.83,4.67,No,Sat,Dinner,3,0.130338
No,Sat,23,39.42,7.58,No,Sat,Dinner,4,0.192288
No,Sat,59,48.27,6.73,No,Sat,Dinner,4,0.139424
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,47,32.4,6.0,No,Sun,Dinner,4,0.185185


In [52]:
# Quantile and bucket analysis
frame = pd.DataFrame({'data1': np.random.randn(1000), 
                      'data2': np.random.randn(1000)})

In [53]:
# pd.cut splits the observed range of data1 into 4 equal-width intervals, so
# despite the variable name the bins do not hold equal numbers of rows.
quartiles = pd.cut(frame.data1, 4)

In [54]:
quartiles[:10]

0    (-1.839, -0.0812]
1    (-1.839, -0.0812]
2    (-1.839, -0.0812]
3    (-1.839, -0.0812]
4     (-0.0812, 1.677]
5    (-1.839, -0.0812]
6     (-0.0812, 1.677]
7    (-1.839, -0.0812]
8     (-0.0812, 1.677]
9    (-1.839, -0.0812]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.604, -1.839] < (-1.839, -0.0812] < (-0.0812, 1.677] < (1.677, 3.435]]

In [55]:
frame.data2

0     -0.681131
1     -0.809946
2     -0.349943
3     -0.767211
4      0.584327
5      1.286358
6      1.290731
7      0.706280
8      1.314760
9     -0.684999
10    -1.171250
11     0.135656
12    -0.353363
13    -1.597326
14    -0.254076
15     0.572410
16    -0.737940
17    -0.171571
18    -2.067667
19    -1.619675
20    -1.407080
21     1.639440
22     1.191207
23     0.233792
24    -0.448136
25    -1.942256
26     0.043638
27     0.121155
28    -0.083696
29    -1.858585
         ...   
970   -1.266627
971   -0.697103
972   -0.951885
973    0.130848
974    1.699867
975   -0.569121
976   -1.281661
977   -0.115508
978    1.165982
979   -1.321432
980   -0.174752
981   -0.686370
982   -0.878065
983    0.281604
984   -0.817400
985    2.614325
986    0.635698
987   -0.130848
988   -0.215064
989   -0.759778
990    0.880163
991   -2.127505
992   -0.051448
993    0.256518
994   -2.129281
995   -0.505695
996    0.858152
997   -0.506853
998   -0.764507
999    1.306591
Name: data2, Length: 1000, dtype: float64

In [56]:
def get_stats(group):
    """Summarise ``group`` as a dict holding its min, max, count and mean."""
    summary = {}
    # Each statistic is the result of calling the same-named method on the group.
    for stat in ('min', 'max', 'count', 'mean'):
        summary[stat] = getattr(group, stat)()
    return summary

In [57]:
grouped = frame.data2.groupby(quartiles)

In [58]:
grouped.apply(get_stats)

data1                   
(-3.604, -1.839]   count     43.000000
                   max        1.484941
                   mean      -0.208869
                   min       -2.114452
(-1.839, -0.0812]  count    418.000000
                   max        3.862592
                   mean      -0.064226
                   min       -3.431699
(-0.0812, 1.677]   count    496.000000
                   max        3.576528
                   mean       0.031968
                   min       -2.884519
(1.677, 3.435]     count     43.000000
                   max        1.855227
                   mean      -0.099996
                   min       -2.693255
Name: data2, dtype: float64

In [59]:
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(-3.604, -1.839]",43.0,1.484941,-0.208869,-2.114452
"(-1.839, -0.0812]",418.0,3.862592,-0.064226,-3.431699
"(-0.0812, 1.677]",496.0,3.576528,0.031968,-2.884519
"(1.677, 3.435]",43.0,1.855227,-0.099996,-2.693255


In [60]:
# Get equal-size buckets based on sample quantiles; labels=False returns
# the integer bucket number instead of interval labels.
grouping = pd.qcut(frame.data1, 10, labels=False)

In [61]:
grouping

0      1
1      1
2      1
3      0
4      9
5      3
6      6
7      1
8      7
9      4
10     0
11     5
12     6
13     8
14     7
15     7
16     5
17     5
18     0
19     8
20     6
21     4
22     8
23     1
24     8
25     5
26     8
27     3
28     1
29     5
      ..
970    2
971    9
972    4
973    4
974    0
975    3
976    2
977    7
978    5
979    0
980    5
981    9
982    9
983    7
984    4
985    6
986    6
987    0
988    4
989    5
990    3
991    1
992    4
993    1
994    2
995    4
996    6
997    0
998    6
999    8
Name: data1, Length: 1000, dtype: int64

In [62]:
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,100.0,2.052947,-0.080802,-2.114452
1,100.0,3.399737,-0.101744,-3.431699
2,100.0,3.109397,-0.083142,-2.415343
3,100.0,2.190608,-0.089804,-2.208453
4,100.0,3.862592,0.013123,-2.925311
5,100.0,1.912218,-0.077306,-2.153133
6,100.0,2.614325,0.100205,-2.884519
7,100.0,3.576528,0.153899,-2.383462
8,100.0,2.150344,-0.103507,-2.262517
9,100.0,2.736764,0.026364,-2.693255


In [63]:
# Filling in missing values
s = pd.Series(np.random.randn(6))
s[::2] = np.nan  # every other element (positions 0, 2, 4) becomes NaN

In [64]:
s

0         NaN
1    1.167332
2         NaN
3    0.536882
4         NaN
5   -0.246587
dtype: float64

In [65]:
s.fillna(s.mean()) # fill the missing values with the mean of the series

0    0.485875
1    1.167332
2    0.485875
3    0.536882
4    0.485875
5   -0.246587
dtype: float64

In [66]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada',
          'California', 'Idaho']

In [67]:
group_key = ['East'] * 4 + ['West'] * 4

In [68]:
data = pd.Series(np.random.randn(8), index=states)

In [69]:
data

Ohio          0.353320
New York     -0.545728
Vermont       0.202941
Florida       0.391805
Oregon       -0.407995
Nevada        1.087995
California   -1.465071
Idaho        -1.335564
dtype: float64

In [70]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan

In [71]:
data

Ohio          0.353320
New York     -0.545728
Vermont            NaN
Florida       0.391805
Oregon       -0.407995
Nevada             NaN
California   -1.465071
Idaho              NaN
dtype: float64

In [72]:
data.groupby(group_key).mean()

East    0.066466
West   -0.936533
dtype: float64

In [73]:
def fill_mean(g):
    """Return group ``g`` with its missing values replaced by the group's mean."""
    return g.fillna(g.mean())


# group_keys=False keeps the original flat state index on the result;
# current pandas would otherwise prepend the group key as an extra index level.
data.groupby(group_key, group_keys=False).apply(fill_mean)

Ohio          0.353320
New York     -0.545728
Vermont       0.066466
Florida       0.391805
Oregon       -0.407995
Nevada       -0.936533
California   -1.465071
Idaho        -0.936533
dtype: float64

In [74]:
# Predefined fill value for each group
fill_values = {'East': 0.5, 'West': -1}


def fill_func(g):
    """Fill NaNs in group ``g`` with the preset value stored under the group's name."""
    replacement = fill_values[g.name]
    return g.fillna(replacement)

In [75]:
# group_keys=False keeps the original flat state index on the result;
# current pandas would otherwise prepend the group key as an extra index level.
data.groupby(group_key, group_keys=False).apply(fill_func)

Ohio          0.353320
New York     -0.545728
Vermont       0.500000
Florida       0.391805
Oregon       -0.407995
Nevada       -1.000000
California   -1.465071
Idaho        -1.000000
dtype: float64

In [76]:
# Random sampling and permutation: build a 52-card deck as a Series whose
# index is the card label and whose value is the card's point value.
suits = ['H', 'S', 'C', 'D']
base_name = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# Per suit: 1..10 for 'A' and the number cards, then 10 for each of 'J', 'K', 'Q'.
card_val = (list(range(1, 11)) + [10]*3)*4

cards = []
for suit in suits:
    cards += [f"{rank}{suit}" for rank in base_name]

deck = pd.Series(card_val, index=cards)

In [77]:
deck

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
AS      1
2S      2
3S      3
4S      4
5S      5
6S      6
7S      7
8S      8
9S      9
10S    10
JS     10
KS     10
QS     10
AC      1
2C      2
3C      3
4C      4
5C      5
6C      6
7C      7
8C      8
9C      9
10C    10
JC     10
KC     10
QC     10
AD      1
2D      2
3D      3
4D      4
5D      5
6D      6
7D      7
8D      8
9D      9
10D    10
JD     10
KD     10
QD     10
dtype: int64

In [78]:
deck.sample(5)

8C     8
8S     8
KD    10
5C     5
KS    10
dtype: int64

In [79]:
# Positional access to the last card.  .iloc is explicit about position;
# plain deck[-1] on a string-labelled Series relies on a deprecated fallback.
deck.iloc[-1]

10

In [81]:
# The suit is the last character of each card label (e.g. 'H' in '10H').
get_suit = lambda card: card[-1]
# Passing a function to groupby calls it on every index label to get the group key.
rand_pick = deck.groupby(get_suit)

In [82]:
for i, j in rand_pick:
    print(i, j)

C AC      1
2C      2
3C      3
4C      4
5C      5
6C      6
7C      7
8C      8
9C      9
10C    10
JC     10
KC     10
QC     10
dtype: int64
D AD      1
2D      2
3D      3
4D      4
5D      5
6D      6
7D      7
8D      8
9D      9
10D    10
JD     10
KD     10
QD     10
dtype: int64
H AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64
S AS      1
2S      2
3S      3
4S      4
5S      5
6S      6
7S      7
8S      8
9S      9
10S    10
JS     10
KS     10
QS     10
dtype: int64


In [83]:
# Group weighted average and correlation
# NOTE: the column name 'categroy' is misspelled, but the groupby call in a
# later cell uses the same spelling, so it must be changed in both places or not at all.
df = pd.DataFrame({'categroy': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'], 
                   'data': np.random.randn(8),
                   'weights': np.random.rand(8)})

In [84]:
df

Unnamed: 0,categroy,data,weights
0,a,-0.947851,0.221532
1,a,-1.995502,0.710188
2,a,-0.272947,0.126186
3,a,-1.09553,0.346604
4,b,1.292666,0.287331
5,b,0.041631,0.988342
6,b,-0.499484,0.08445
7,b,0.517158,0.475586


In [85]:
grouped = df.groupby('categroy')
# Weighted average of 'data' within a group, using that group's 'weights' column.
get_wavg = lambda g: np.average(g['data'], weights=g['weights'])

In [86]:
grouped.apply(get_wavg)

categroy
a   -1.453403
b    0.335751
dtype: float64

In [87]:
df.pct_change??

In [88]:
# Group means by (day, smoker).  The numeric columns are listed explicitly:
# pivot_table would otherwise try to average every remaining column, and the
# text column 'time' cannot be averaged.
tips.pivot_table(index=['day', 'smoker'],
                 values=['size', 'tip', 'tip_pct', 'total_bill'])

Unnamed: 0_level_0,Unnamed: 1_level_0,size,tip,tip_pct,total_bill
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Fri,No,2.25,2.8125,0.15165,18.42
Fri,Yes,2.066667,2.714,0.174783,16.813333
Sat,No,2.555556,3.102889,0.158048,19.661778
Sat,Yes,2.47619,2.875476,0.147906,21.276667
Sun,No,2.929825,3.167895,0.160113,20.506667
Sun,Yes,2.578947,3.516842,0.18725,24.12
Thur,No,2.488889,2.673778,0.160298,17.113111
Thur,Yes,2.352941,3.03,0.163863,19.190588


In [89]:
# Mean party size and tip percentage by (time, day), split into one column per smoker value.
tips.pivot_table(values=['tip_pct', 'size'],
                 index=['time', 'day'],
                 columns='smoker')

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,tip_pct,tip_pct
Unnamed: 0_level_1,smoker,No,Yes,No,Yes
time,day,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Dinner,Fri,2.0,2.222222,0.139622,0.165347
Dinner,Sat,2.555556,2.47619,0.158048,0.147906
Dinner,Sun,2.929825,2.578947,0.160113,0.18725
Dinner,Thur,2.0,,0.159744,
Lunch,Fri,3.0,1.833333,0.187735,0.188937
Lunch,Thur,2.5,2.352941,0.160311,0.163863


In [90]:
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808
