In [2]:
import pandas as pd
import numpy as np

### EX : Filling missing vals with Group Specific Vals

In [3]:
s = pd.Series(np.random.randn(6))
s[::2] = np.nan
s

0         NaN
1    0.050272
2         NaN
3    0.196324
4         NaN
5   -1.096564
dtype: float64

In [4]:
# Fill with mean
s.fillna(s.mean())

0   -0.283323
1    0.050272
2   -0.283323
3    0.196324
4   -0.283323
5   -1.096564
dtype: float64

In [5]:
# Filling Groupwise with fillna on each group
states = ['Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = pd.Series(np.random.randn(8), index=states)
data

Ohio         -0.792919
New York     -0.023542
Vermont       0.792683
Florida       0.306636
Oregon        0.441249
Nevada       -2.649028
California   -1.846832
Idaho        -1.018477
dtype: float64

In [6]:
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -0.792919
New York     -0.023542
Vermont            NaN
Florida       0.306636
Oregon        0.441249
Nevada             NaN
California   -1.846832
Idaho              NaN
dtype: float64

In [7]:
data.groupby(group_key).mean()  

East   -0.169942
West   -0.702791
dtype: float64

In [8]:
# fill with group mean
data.groupby(group_key).apply(lambda g : g.fillna(g.mean()))

East  Ohio         -0.792919
      New York     -0.023542
      Vermont      -0.169942
      Florida       0.306636
West  Oregon        0.441249
      Nevada       -0.702791
      California   -1.846832
      Idaho        -0.702791
dtype: float64

In [9]:
# or group fill vals
# g.name gets group name
group_fills = {"East" : .69, 'West' : .20}
data.groupby(group_key).apply(lambda g : g.fillna(group_fills[g.name]))


East  Ohio         -0.792919
      New York     -0.023542
      Vermont       0.690000
      Florida       0.306636
West  Oregon        0.441249
      Nevada        0.200000
      California   -1.846832
      Idaho         0.200000
dtype: float64

### Random Sampling and Permutation

In [10]:
# Card deck
# Hearts, Spades, Clubs, Diamonds
suits = ['H', 'S', 'C', 'D']
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in ['H', 'S', 'C', 'D']:
    cards.extend(f'{num}{suit}' for num in base_names)
deck = pd.Series(card_val, index=cards)
deck[:13]

AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64

In [11]:
# Draws with sample
def draw(deck, n=5):
    return deck.sample(n)
draw(deck)

9C      9
4D      4
10S    10
4C      4
6H      6
dtype: int64

In [12]:
# Grouping based on suit , and take 2 from each group
# card -> index
get_suit = lambda card: card[-1]
deck.groupby(get_suit).apply(draw, n=2)
deck.groupby(get_suit, group_keys=False).apply(draw, n=2)

JC    10
AC     1
5D     5
QD    10
8H     8
7H     7
QS    10
6S     6
dtype: int64

### Group Weighted Average of 2 cols

In [13]:
df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',
        'b', 'b', 'b', 'b'],
        'data': np.random.randn(8),
        'weights': np.random.rand(8)})
df

Unnamed: 0,category,data,weights
0,a,-0.736951,0.406175
1,a,1.322929,0.793927
2,a,1.184981,0.791218
3,a,0.117016,0.737975
4,b,-0.257359,0.229969
5,b,-2.453783,0.216971
6,b,0.269642,0.544976
7,b,-1.090478,0.944219


In [14]:
df.groupby("category").apply(lambda g: np.average(g["data"], weights=g["weights"]))

category
a    0.650318
b   -0.761458
dtype: float64

In [16]:
# another yahoo example
close_px = pd.read_csv("../examples/stock_px.csv", parse_dates=True,index_col=0)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5472 entries, 1990-02-01 to 2011-10-14
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AA      5472 non-null   float64
 1   AAPL    5472 non-null   float64
 2   GE      5472 non-null   float64
 3   IBM     5472 non-null   float64
 4   JNJ     5472 non-null   float64
 5   MSFT    5472 non-null   float64
 6   PEP     5471 non-null   float64
 7   SPX     5472 non-null   float64
 8   XOM     5472 non-null   float64
dtypes: float64(9)
memory usage: 427.5 KB


In [17]:
close_px[-4:]

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
2011-10-11,10.3,400.29,16.14,185.0,63.96,27.0,60.95,1195.54,76.27
2011-10-12,10.05,402.19,16.4,186.12,64.33,26.96,62.7,1207.25,77.16
2011-10-13,10.1,408.43,16.22,186.82,64.23,27.18,62.36,1203.66,76.37
2011-10-14,10.26,422.0,16.6,190.53,64.72,27.27,62.24,1224.58,78.11


In [21]:
# Yearly correlations of SPX with daily returns
# my
get_year = lambda d: d.year
get_corr = lambda g : g.corrwith(g["AA"])
close_px.groupby(get_year).apply(get_corr)

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990,1.0,0.417615,0.759832,0.099264,-0.153269,0.025996,-0.083786,0.666344,-0.241196
1991,1.0,-0.006879,0.748064,0.068087,0.142455,-0.196075,0.492079,0.501961,0.313663
1992,1.0,0.004675,0.173012,0.18076,-0.602804,-0.259187,0.019471,0.031827,0.055557
1993,1.0,-0.280121,0.118154,-0.068282,-0.036481,-0.53337,0.205207,0.235182,-0.220529
1994,1.0,0.670483,0.080886,0.756488,0.801379,0.676592,-0.255782,0.369808,-0.175648
1995,1.0,0.191504,0.698998,0.822812,0.7137,0.846643,0.711679,0.841431,0.718555
1996,1.0,-0.38377,0.406672,0.491387,0.366658,0.5363,0.148958,0.510062,0.43588
1997,1.0,0.558769,0.531463,0.61405,0.166312,0.638957,0.462682,0.60893,0.660971
1998,1.0,0.034347,0.057428,0.326007,0.320909,0.096062,0.046745,0.219095,0.265577
1999,1.0,0.753175,0.68169,0.683044,0.599105,0.612914,-0.54551,0.745858,0.83045


In [24]:
# book
returns = close_px.pct_change().dropna()
# groupby year 
returns.groupby(get_year).apply(get_corr)

  returns = close_px.pct_change().dropna()


Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990,1.0,0.31416,0.363487,0.462143,0.462304,0.421272,0.381153,0.595024,0.256957
1991,1.0,0.063495,0.3347,0.270062,0.265631,0.301224,0.215612,0.453574,0.162586
1992,1.0,0.227365,0.325433,0.171885,0.043394,0.239586,0.097657,0.39818,0.06706
1993,1.0,0.055439,0.085964,0.137688,0.069351,0.043862,0.110073,0.259069,0.0999
1994,1.0,0.085557,0.21128,0.172228,0.136718,0.15818,0.150916,0.428549,0.182896
1995,1.0,0.151338,0.122291,0.099189,-0.007507,0.162678,0.015838,0.291532,0.065978
1996,1.0,0.074599,0.168978,0.087722,0.063346,0.147847,0.145953,0.292344,0.161301
1997,1.0,0.07866,0.438897,0.373673,0.387654,0.274955,0.220356,0.564427,0.407447
1998,1.0,0.150363,0.375285,0.402523,0.285444,0.342807,0.188527,0.533802,0.225385
1999,1.0,0.063477,-0.005537,-0.003721,-0.039216,0.009682,0.067101,0.099033,0.171713


In [25]:
# 2col corr
returns.groupby(get_year).apply(lambda g : g["JNJ"].corr(g["SPX"]))

1990    0.801145
1991    0.646401
1992    0.515740
1993    0.451503
1994    0.372962
1995    0.315733
1996    0.569232
1997    0.703538
1998    0.591988
1999    0.517061
2000    0.189765
2001    0.111493
2002    0.584758
2003    0.562399
2004    0.354690
2005    0.444728
2006    0.394026
2007    0.568423
2008    0.801005
2009    0.603146
2010    0.689896
2011    0.752379
dtype: float64