# Define Training Sets

> This notebook contains functions to help create training and test sets on the fly.

In [None]:
#| default_exp sets

In [None]:
#| export
import pandas as pd

In [None]:
# The most flexible way to set up testing sets is as a set of booleans or indices. These can then be combined.

# from dataG2F.core import get_data
# phno = get_data('phno')


## Create masks for Test Sets

In [None]:
#| export
def mask_columns(df, # A dataframe containing the column to use for the mask
                col_name = 'Hybrid', # Column containing the values in `holdouts`
                holdouts = ['M0143/LH185', 'M0003/LH185'] # A list of values to match
                ):
    """Create a dataframe containing one or more masks for a list of `holdouts`.

    The result has one boolean column per entry of `holdouts`, named after that
    entry and sharing `df`'s index. A column is True where `df[col_name]` equals
    the entry.

    A single string is treated as one holdout value rather than being iterated
    character by character. An empty `holdouts` returns a dataframe with no
    columns that keeps `df`'s index.
    """
    if isinstance(holdouts, str):
        holdouts = [holdouts]
    holdouts = list(holdouts)

    # `pd.concat` raises on an empty list, so handle "nothing held out" explicitly.
    if not holdouts:
        return pd.DataFrame(index=df.index)

    values = df.loc[:, col_name]
    out = pd.concat([values == holdout for holdout in holdouts], axis=1)
    # Every compared series still carries `col_name`; label each column by its holdout.
    out.columns = holdouts
    return out

In [None]:
from dataG2F.core import get_data

In [None]:
# Keep only the identifier columns used to demonstrate the masks.
keep_cols = ['Hybrid', 'Env', 'Year']
df = get_data('phno').loc[:, keep_cols]
df.head()

Unnamed: 0,Hybrid,Env,Year
0,M0088/LH185,DEH1_2014,2014
1,M0143/LH185,DEH1_2014,2014
2,M0003/LH185,DEH1_2014,2014
3,M0035/LH185,DEH1_2014,2014
4,M0052/LH185,DEH1_2014,2014


In [None]:
# One boolean column is produced per held-out hybrid.
hybrid_holdouts = ['M0143/LH185', 'M0003/LH185']
out = mask_columns(df, col_name = 'Hybrid', holdouts = hybrid_holdouts)
out.head()

Unnamed: 0,M0143/LH185,M0003/LH185
0,False,False
1,True,False
2,False,True
3,False,False
4,False,False


In [None]:
#| export
def mask_parents(
        df, # Dataframe containing a column with a genotype
        col_name = 'Hybrid', # The genotype column name
        holdout_parents = ['M0143'], # The genotype or genotypes that will be held out
        sep = '/' # Separator between parents. If not present (inbred genotype) that's okay.
    ):
    """Create a dataframe containing one or more masks based on a parent's genotype.

    Each value in `df[col_name]` is split at the first `sep` into at most two
    parent names. One boolean column is returned per entry of `holdout_parents`,
    named after that entry and sharing `df`'s index. A column is True where
    either parent equals the entry, ignoring case. Missing values never match.

    `df` is not modified. A single string is treated as one parent, and an empty
    `holdout_parents` returns a dataframe with no columns that keeps `df`'s index.
    """
    if isinstance(holdout_parents, str):
        holdout_parents = [holdout_parents]
    holdout_parents = list(holdout_parents)

    # `pd.concat` raises on an empty list, so handle "nothing held out" explicitly.
    if not holdout_parents:
        return pd.DataFrame(index=df.index)

    # Split into a local frame instead of writing 'F'/'M' columns onto the caller's
    # dataframe. When no value contains `sep` the split yields a single column, so
    # iterate over whatever columns exist rather than assuming there are two.
    split = df[col_name].str.split(sep, n=1, expand=True)
    parents = [split[position].str.upper() for position in split.columns]

    def _mask_parent(holdout):
        # True where any parent column equals `holdout`, compared case-insensitively.
        target = holdout.upper()
        hit = pd.Series(False, index=df.index)
        for parent in parents:
            hit = hit | (parent == target)
        return hit

    mask = pd.concat([_mask_parent(e) for e in holdout_parents], axis=1)
    # Label each column with the holdout exactly as the caller spelled it.
    mask.columns = holdout_parents
    return mask

In [None]:
# One boolean column is produced per held-out parent.
parent_holdouts = ['M0143', 'LH185']
out = mask_parents(df = df, col_name = 'Hybrid', holdout_parents = parent_holdouts, sep = '/')
out.head()

Unnamed: 0,M0143,LH185
0,False,True
1,True,True
2,False,True
3,False,True
4,False,True


To use these dataframes there are a few tricks to know.

The test set should contain _any_ of the rows that are flagged as true. To do that the steps are
1. Use the rowwise sum to get the logical OR of the columns
1. Check if the sum is above 0 to get a boolean

In [None]:
print('Testing Set')
# The row-wise sum counts the flagged columns; any count above zero is a logical OR.
flag_count = out.sum(axis=1)
(flag_count > 0).head()

Testing Set


0    True
1    True
2    True
3    True
4    True
dtype: bool

The training set should contain _nothing_ that is included in the test set. To do that the steps are:
1. Invert the mask dataframe
1. Use the rowwise product to get the logical AND of the columns. 
1. Check equivalence to 1 to get a boolean

In [None]:
print('Training Set')
# After inversion, a row-wise product of 1 means every column is True: a logical AND.
not_held_out = ~out
(not_held_out.prod(axis=1) == 1).head()

Training Set


0    False
1    False
2    False
3    False
4    False
dtype: bool

## More complex validation schemes

To get train, validate, and test sets, one option would be to get a mask, filter the dataframe, and get a new mask. This risks the indices changing if one isn't careful. Thus I recommend producing several masks on the original dataframe and then combining them to get the desired sets.

In [None]:
# Both year masks are built from the original dataframe, then collapsed to one
# boolean per row.
hits_2014 = mask_columns(df, col_name = 'Year', holdouts = [2014])
mask_2014 = hits_2014.sum(axis=1) > 0

hits_2015 = mask_columns(df, col_name = 'Year', holdouts = [2015])
mask_2015 = hits_2015.sum(axis=1) > 0

In [None]:
# Train on all but 2014 and 2015.
mask_train = ~mask_2014 & ~mask_2015
# Validate on 2015. Here `~mask_2014` isn't needed, but for more complex masks
# this pattern should be used.
mask_validate = ~mask_2014 & mask_2015
# Retrain on all but 2014.
mask_retrain = ~mask_2014
# Test on 2014.
mask_test = mask_2014

In [None]:
#| hide
import nbdev
nbdev.nbdev_export()