In [None]:
# default_exp train.balance

# Balance training data

> Various methods to balance training data by target class (label) for model training

In [None]:
# hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
# export 
import pandas as pd
import numpy as np

## Downsample 

In [None]:
# export
def downsample_df(df:pd.DataFrame, y_column:str, min_size:int=None, random_state:int=115, **kwargs) -> pd.DataFrame:
    '''Balance classes of the target variable by downsampling all classes to at most "min_size" rows.
    
    Classes with "min_size" rows or fewer are not affected and will remain at their current size. If "min_size" 
    is omitted, the size of the smallest current class is taken as "min_size".
    
    Rows to remove are drawn at random within each class (seeded by "random_state"); the remaining rows keep 
    their relative order. The input dataframe is not modified and the result carries a fresh 0..n-1 index.
    Additional keyword arguments are forwarded to `DataFrame.sample`.
    '''
    # Rows are removed by index label below, so the labels must be unique: with a
    # duplicated input index, dropping one label would remove every row sharing it.
    # The result gets a fresh index anyway, so resetting here loses nothing.
    df_new = df.reset_index(drop=True)
    
    # get smallest current class if not supplied
    if min_size is None:
        min_size = df_new[y_column].value_counts().min()
    
    # collect the surplus rows of every class larger than min_size
    drop_labels = []
    for _, group in df_new.groupby(y_column):
        surplus = len(group) - min_size
        if surplus > 0:
            sampled = group.sample(surplus, random_state=random_state, **kwargs)
            drop_labels.extend(sampled.index)
    
    # boolean mask keeps the surviving rows in their original order
    keep = ~df_new.index.isin(drop_labels)
    return df_new[keep].reset_index(drop=True)

**Parameters**

- *df*: Dataframe containing column "y_column"

- *y_column*: Name of df column containing the target variable (label)

- *min_size*: Maximum number of rows kept per class. If no value is supplied, min_size will be set to the size of the smallest current class

- *random_state*: Random state for reproducibility

**Returns**

- *new_df*: Has the same structure as the input dataframe but classes were balanced by downsampling

In [None]:
# Toy dataset with imbalanced classes in "y": one row of class 0, two of class 1, three of class 2
df = pd.DataFrame({"x": ["A1", "B1", "B2", "C1", "C2", "C3"], "y": [0, 1, 1, 2, 2, 2]})
df.groupby("y").count()

Unnamed: 0_level_0,x
y,Unnamed: 1_level_1
0,1
1,2
2,3


In [None]:
# No min_size given, so every class is reduced to the size of the smallest class
new_df = downsample_df(df=df, y_column="y", random_state=115)
new_df.groupby("y").count()

Unnamed: 0_level_0,x
y,Unnamed: 1_level_1
0,1
1,1
2,1


In [None]:
# hide

# downsample unit tests
# NOTE: these use the example `df` defined in an earlier cell (class sizes 1, 2 and 3)

# min_size=0: every class is larger than 0, so all rows are dropped
new_df = downsample_df(df=df, y_column="y", min_size=0).groupby("y").count()
test_eq(list(new_df.x), [])

# default: min_size falls back to the smallest class size (1)
new_df = downsample_df(df=df, y_column="y").groupby("y").count()
test_eq(list(new_df.x), [1, 1, 1])

# partial downsample: only the class with 3 rows is larger than min_size=2
new_df = downsample_df(df=df, y_column="y", min_size=2).groupby("y").count()
test_eq(list(new_df.x), [1, 2, 2])

# no downsample: min_size equals the largest class size, nothing is dropped
new_df = downsample_df(df=df, y_column="y", min_size=3).groupby("y").count()
test_eq(list(new_df.x), [1, 2, 3])

# min_size larger than every class: nothing is dropped
new_df = downsample_df(df=df, y_column="y", min_size=4).groupby("y").count()
test_eq(list(new_df.x), [1, 2, 3])

## Upsample

In [None]:
# export

def upsample_df(df:pd.DataFrame, y_column:str, max_size:int=None, random_state:int=115, **kwargs) -> pd.DataFrame:
    '''Balance classes of the target variable by upsampling all classes to at least "max_size" rows.
       
    Classes with "max_size" rows or more are not affected and will remain at their current size. If "max_size" 
    is omitted, the size of the largest class is taken as "max_size".
    
    Missing rows are drawn at random with replacement from the class itself (seeded by "random_state") and 
    placed after the original rows. The input dataframe is not modified and the result carries a fresh 
    0..n-1 index. Additional keyword arguments are forwarded to `DataFrame.sample`.
    '''
    # get largest current class if not supplied
    if max_size is None:
        max_size = df[y_column].value_counts().max()
    
    # original rows first, then the extra rows drawn for each class smaller than max_size
    parts = [df]
    for _, group in df.groupby(y_column):
        shortfall = max_size - len(group)
        if shortfall > 0:
            parts.append(group.sample(shortfall, replace=True, random_state=random_state, **kwargs))
    
    # DataFrame.append was removed in pandas 2.0; a single concat builds the result
    # as a new frame, so the caller's dataframe is left untouched
    return pd.concat(parts, ignore_index=True)

**Parameters**

- *df*: Dataframe containing column "y_column"

- *y_column*: Name of df column containing the target variable (label)

- *max_size*: Minimum number of rows each class has after upsampling. If no value is supplied, max_size will be set to the size of the largest current class

- *random_state*: Random state for reproducibility

**Returns**

- *new_df*: Has the same structure as the input dataframe but classes were balanced by upsampling

In [None]:
# Toy dataset with imbalanced classes in "y": one row of class 0, two of class 1, three of class 2
df = pd.DataFrame({"x": ["A1", "B1", "B2", "C1", "C2", "C3"], "y": [0, 1, 1, 2, 2, 2]})
df.groupby("y").count()

Unnamed: 0_level_0,x
y,Unnamed: 1_level_1
0,1
1,2
2,3


In [None]:
# No max_size given, so every class is grown to the size of the largest class
new_df = upsample_df(df=df, y_column="y", random_state=115)
new_df.groupby("y").count()

Unnamed: 0_level_0,x
y,Unnamed: 1_level_1
0,3
1,3
2,3


In [None]:
# hide

# upsample unit tests
# NOTE: these use the example `df` defined in an earlier cell (class sizes 1, 2 and 3)

# max_size=0: no class is smaller than 0, so nothing is added
new_df = upsample_df(df=df, y_column="y", max_size=0).groupby("y").count()
test_eq(list(new_df.x), [1, 2, 3])

# default: max_size falls back to the largest class size (3)
new_df = upsample_df(df=df, y_column="y").groupby("y").count()
test_eq(list(new_df.x), [3, 3, 3])

# partial upsample: only the class with 1 row is smaller than max_size=2
new_df = upsample_df(df=df, y_column="y", max_size=2).groupby("y").count()
test_eq(list(new_df.x), [2, 2, 3])

# explicit max_size equal to the largest class size gives the same result as the default
new_df = upsample_df(df=df, y_column="y", max_size=3).groupby("y").count()
test_eq(list(new_df.x), [3, 3, 3])

# max_size larger than every class: all classes are upsampled
new_df = upsample_df(df=df, y_column="y", max_size=4).groupby("y").count()
test_eq(list(new_df.x), [4, 4, 4])

## Balance

In [None]:
# export

def balance_df(df:pd.DataFrame, y_column:str, size:int, random_state:int=115, **kwargs) -> pd.DataFrame:
    '''Balance classes of the target variable by up- or downsampling all classes to be equal to "size".
    
    Classes with fewer than "size" rows are filled up with rows drawn at random with replacement from the 
    class itself; classes with more than "size" rows have randomly chosen rows removed. Both draws are 
    seeded by "random_state". Surviving original rows keep their relative order and the added rows follow 
    after them. The input dataframe is not modified and the result carries a fresh 0..n-1 index.
    Additional keyword arguments are forwarded to `DataFrame.sample`.
    '''
    # unique positional labels make the label-based removal below safe
    df_new = df.reset_index(drop=True)
    
    extra_rows = []   # sampled rows to add for classes that are too small
    drop_labels = []  # index labels to remove for classes that are too large
    for _, group in df_new.groupby(y_column):
        diff = size - len(group)
        if diff > 0:
            extra_rows.append(group.sample(diff, replace=True, random_state=random_state, **kwargs))
        elif diff < 0:
            drop_labels.extend(group.sample(-diff, random_state=random_state, **kwargs).index)
    
    kept = df_new[~df_new.index.isin(drop_labels)]
    
    # DataFrame.append was removed in pandas 2.0; build the result with one concat
    return pd.concat([kept, *extra_rows], ignore_index=True)

**Parameters**

- *df*: Dataframe containing column "y_column"

- *y_column*: Name of df column containing the target variable (label)

- *size*: desired class size

- *random_state*: Random state for reproducibility

**Returns**

- *new_df*: Has the same structure as the input dataframe but classes were balanced to a certain size

In [None]:
# Toy dataset with imbalanced classes in "y": one row of class 0, two of class 1, three of class 2
df = pd.DataFrame({"x": ["A1", "B1", "B2", "C1", "C2", "C3"], "y": [0, 1, 1, 2, 2, 2]})
df.groupby("y").count()

Unnamed: 0_level_0,x
y,Unnamed: 1_level_1
0,1
1,2
2,3


In [None]:
# size=5 is larger than every class here, so all three classes are upsampled to 5 rows
new_df = balance_df(df=df, y_column="y", random_state=115, size=5)
new_df.groupby("y").count()

Unnamed: 0_level_0,x
y,Unnamed: 1_level_1
0,5
1,5
2,5


In [None]:
# hide

# balance unit tests
# NOTE: these use the example `df` defined in an earlier cell (class sizes 1, 2 and 3)

# size=0: every class is larger than 0, so all rows are dropped
new_df = balance_df(df=df, y_column="y", size=0).groupby("y").count()
test_eq(list(new_df.x), [])

# size=1: the two larger classes are downsampled, the 1-row class is untouched
new_df = balance_df(df=df, y_column="y", size=1).groupby("y").count()
test_eq(list(new_df.x), [1, 1, 1])

# size=3: the two smaller classes are upsampled, the 3-row class is untouched
new_df = balance_df(df=df, y_column="y", size=3).groupby("y").count()
test_eq(list(new_df.x), [3, 3, 3])

# size=4: larger than every class, so all classes are upsampled
new_df = balance_df(df=df, y_column="y", size=4).groupby("y").count()
test_eq(list(new_df.x), [4, 4, 4])