In [2]:
import numpy as np
import pandas as pd

## Make Some Data with Nulls

In [8]:
# make some data to play with
np.random.seed(123)
n = 5000
flavors = ['pistachio', 'neopolitan', 'bubblegum', 'rocky road', 'chubby hubby', 'moolenium crunch']
df = pd.DataFrame({
    'flavor': np.random.choice(flavors, n),
    'pints': np.random.normal(10, 2, n)
})
df.pints = np.where(
    df.flavor == 'rocky road',
    np.random.normal(15, 3, n),
    df.pints
)
df.pints = np.where(
    df.flavor == 'bubblegum',
    np.random.normal(6, 1, n),
    df.pints
)
df.groupby('flavor').mean()

Unnamed: 0_level_0,pints
flavor,Unnamed: 1_level_1
bubblegum,6.004766
chubby hubby,10.000452
moolenium crunch,9.992013
neopolitan,10.087307
pistachio,10.011487
rocky road,15.042084


In [12]:
# introduce nulls into our data, we want 20% of values in the pints column to be null
p_missing = .2

df.pints = np.where(
    np.random.uniform(size=n) < p_missing,
    np.nan,
    df.pints
)

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   flavor  5000 non-null   object 
 1   pints   4018 non-null   float64
dtypes: float64(1), object(1)
memory usage: 78.2+ KB


## Fill nulls with group averages

### Sidebar: `.transform`

In [38]:
np.random.seed(123)

simple_df = pd.DataFrame({
    'g': np.random.choice(list('abc'), 6),
    'x': np.random.randint(1, 11, 6)
})
simple_df

Unnamed: 0,g,x
0,c,7
1,b,2
2,c,4
3,c,10
4,a,7
5,c,2


In [40]:
simple_df.groupby('g').x.agg('mean')

g
a    7.00
b    2.00
c    5.75
Name: x, dtype: float64

In [41]:
simple_df.groupby('g').x.transform('mean')

0    5.75
1    2.00
2    5.75
3    5.75
4    7.00
5    5.75
Name: x, dtype: float64

### The "easy" way

In [20]:
df['flavor_pints_mean'] = df.groupby('flavor').pints.transform('mean')

In [31]:
df.sample(50, random_state=123)

Unnamed: 0,flavor,pints,flavor_pints_mean
2648,bubblegum,,5.984555
2456,rocky road,16.966692,15.122912
4557,moolenium crunch,6.406041,9.950813
4884,pistachio,,9.980376
92,bubblegum,,5.984555
4038,chubby hubby,,9.975426
30,pistachio,9.479867,9.980376
1746,chubby hubby,11.37435,9.975426
1692,chubby hubby,7.307847,9.975426
2277,chubby hubby,8.510078,9.975426


In [34]:
df['pints_imputed'] = df.pints.fillna(df.flavor_pints_mean)

In [36]:
df.sample(50, random_state=123)

Unnamed: 0,flavor,pints,flavor_pints_mean,pints_imputed
2648,bubblegum,,5.984555,5.984555
2456,rocky road,16.966692,15.122912,16.966692
4557,moolenium crunch,6.406041,9.950813,6.406041
4884,pistachio,,9.980376,9.980376
92,bubblegum,,5.984555,5.984555
4038,chubby hubby,,9.975426,9.975426
30,pistachio,9.479867,9.980376,9.479867
1746,chubby hubby,11.37435,9.975426,11.37435
1692,chubby hubby,7.307847,9.975426,7.307847
2277,chubby hubby,8.510078,9.975426,8.510078


### Accounting for Train and Test

- Fill missing values in pints with the average pint consumption for that flavor
- The averages must be calculated from the training dataset

In [42]:
import sklearn.model_selection

In [48]:
(
    'one'
    'two'
    'three'
)

'onetwothree'

In [49]:
df = df.drop(columns=['flavor_pints_mean', 'pints_imputed'])

In [50]:
train, test = sklearn.model_selection.train_test_split(df, random_state=123)
train.shape, test.shape

((3750, 2), (1250, 2))

In [51]:
train.head()

Unnamed: 0,flavor,pints
2413,pistachio,11.148437
1471,moolenium crunch,12.970877
1196,pistachio,7.973456
1509,neopolitan,8.72335
4110,neopolitan,8.979141


How? Implement our own `.transform`, but with joins.

In [60]:
flavor_pint_avgs = train.groupby('flavor', as_index=False).mean().rename(columns={'pints': 'flavor_pint_avg'})
flavor_pint_avgs

Unnamed: 0,flavor,flavor_pint_avg
0,bubblegum,5.97033
1,chubby hubby,10.008736
2,moolenium crunch,9.957481
3,neopolitan,10.036325
4,pistachio,10.074873
5,rocky road,15.139496


In [58]:
train

Unnamed: 0,flavor,pints
2413,pistachio,11.148437
1471,moolenium crunch,12.970877
1196,pistachio,7.973456
1509,neopolitan,8.723350
4110,neopolitan,8.979141
...,...,...
1593,chubby hubby,6.271008
4060,moolenium crunch,7.302858
1346,bubblegum,
3454,moolenium crunch,10.473654


In [64]:
# assign takes keyword arguments that will be turned into new columns
# the value associated with the kwargs is a function that takes in the dataframe

def fill_missing_pints(df):
    return df.pints.fillna(df.flavor_pint_avg)

(
    pd.merge(train, flavor_pint_avgs, on='flavor')
    .assign(pints_imputed=fill_missing_pints)
)

Unnamed: 0,flavor,pints,flavor_pint_avg,pints_imputed
0,pistachio,11.148437,10.074873,11.148437
1,pistachio,7.973456,10.074873,7.973456
2,pistachio,10.413805,10.074873,10.413805
3,pistachio,10.306579,10.074873,10.306579
4,pistachio,11.844363,10.074873,11.844363
...,...,...,...,...
3745,bubblegum,4.749978,5.970330,4.749978
3746,bubblegum,6.795329,5.970330,6.795329
3747,bubblegum,6.017395,5.970330,6.017395
3748,bubblegum,3.857961,5.970330,3.857961


In [66]:
flavor_pint_avgs

Unnamed: 0,flavor,flavor_pint_avg
0,bubblegum,5.97033
1,chubby hubby,10.008736
2,moolenium crunch,9.957481
3,neopolitan,10.036325
4,pistachio,10.074873
5,rocky road,15.139496


In [68]:
(
    pd.merge(test, flavor_pint_avgs, on='flavor')
    .assign(pints_imputed=fill_missing_pints)
)

Unnamed: 0,flavor,pints,flavor_pint_avg,pints_imputed
0,bubblegum,,5.970330,5.970330
1,bubblegum,,5.970330,5.970330
2,bubblegum,,5.970330,5.970330
3,bubblegum,6.922189,5.970330,6.922189
4,bubblegum,6.819485,5.970330,6.819485
...,...,...,...,...
1245,neopolitan,,10.036325,10.036325
1246,neopolitan,12.160781,10.036325,12.160781
1247,neopolitan,,10.036325,10.036325
1248,neopolitan,11.140055,10.036325,11.140055


In [77]:
train.flavor.replace(train.groupby('flavor').pints.mean().to_dict())

2413    10.074873
1471     9.957481
1196    10.074873
1509    10.036325
4110    10.036325
          ...    
1593    10.008736
4060     9.957481
1346     5.970330
3454     9.957481
3582    10.036325
Name: flavor, Length: 3750, dtype: float64

In [81]:
# using the results of the group, i.e. the average pint consumption by flavor
# to lookup flavor, effectively replacing each flavor with the average
# pint consumption for the flavor
avg_pint_consumption_by_flavor = train.groupby('flavor').pints.mean()
train['pints_imputed'] = train.pints.fillna(train.flavor.map(avg_pint_consumption_by_flavor))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [74]:
train

Unnamed: 0,flavor,pints,pints_imputed
2413,pistachio,11.148437,10.074873
1471,moolenium crunch,12.970877,9.957481
1196,pistachio,7.973456,10.074873
1509,neopolitan,8.723350,10.036325
4110,neopolitan,8.979141,10.036325
...,...,...,...
1593,chubby hubby,6.271008,10.008736
4060,moolenium crunch,7.302858,9.957481
1346,bubblegum,,5.970330
3454,moolenium crunch,10.473654,9.957481


In [99]:
def impute_by_group_agg(train: pd.DataFrame, test: pd.DataFrame, x: str, group: str, aggfunc='mean'):
    '''
    Fills missing values in column ``x`` with the average x value by ``group``.
    '''
    group_avg_lookup = train.groupby(groups)[x].agg(aggfunc)

    train[x] = train[x].fillna(train[group].map(group_avg_lookup))
    test[x] = test[x].fillna(test[group].map(group_avg_lookup))
    return train, test

In [96]:
train, test = sklearn.model_selection.train_test_split(df, random_state=123)

# groups = ['flavor']
# x -- column to impute
# x = 'pints'
# aggfunc = 'mean'

print('--- train')
print(train.isna().sum())
print('--- test')
print(test.isna().sum())

--- train
flavor      0
pints     724
dtype: int64
--- test
flavor      0
pints     258
dtype: int64


In [98]:
train, test = impute_by_group_agg(train, test, 'pints', 'flavor')

print('--- train')
print(train.isna().sum())
print('--- test')
print(test.isna().sum())

--- train
flavor    0
pints     0
dtype: int64
--- test
flavor    0
pints     0
dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
