# Explore train/test/validate schemes and the number of observations left with each

> This notebook is now deprecated. 01.06 implements cv00 selection of test/train and validation sets.

In [None]:
import os

import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

from tqdm import tqdm

from dataG2F.qol import ensure_dir_path_exists

In [None]:
# Directory used by this notebook for cached results (see the simulation cell below).
cache_path = '../nbs_artifacts/09_explore_train_validate_test_options/'
ensure_dir_path_exists(dir_path=cache_path)

## Load phenotypic data to match

In [None]:
# Read the phenotype/genotype index table from the 05_prep_matrices artifacts.
load_from = '../nbs_artifacts/05_prep_matrices/'
phno = pd.read_csv(f'{load_from}phno_geno.csv')
phno

Unnamed: 0,Env,Year,Hybrid,Replicate,Block,Plot,Phno_Idx,Env_Idx,Geno_Idx,Yield_Mg_ha,Stand_Count_plants,Pollen_DAP_days,Silk_DAP_days,Plant_Height_cm,Ear_Height_cm,Root_Lodging_plants,Stalk_Lodging_plants,Grain_Moisture,Twt_kg_m3
0,DEH1_2014,2014,M0088/LH185,1.0,1.0,1.0,0,0,0,5.721725,56.0,63.0,67.0,213.0,79.0,0.0,0.0,20.8,706.664693
1,DEH1_2014,2014,M0143/LH185,1.0,1.0,2.0,1,0,1,11.338246,54.0,61.0,63.0,286.0,172.0,0.0,0.0,25.8,693.792841
2,DEH1_2014,2014,M0003/LH185,1.0,1.0,3.0,2,0,2,6.540810,60.0,63.0,65.0,239.0,92.0,0.0,4.0,20.8,698.941582
3,DEH1_2014,2014,M0035/LH185,1.0,1.0,4.0,3,0,3,10.366857,59.0,61.0,63.0,242.0,118.0,0.0,0.0,23.7,711.813434
4,DEH1_2014,2014,M0052/LH185,1.0,1.0,5.0,4,0,4,10.908814,58.0,63.0,65.0,211.0,92.0,0.0,0.0,19.4,743.993065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133052,WIH3_2022,2022,W10010_0337/LH244,-999.0,-999.0,-999.0,133052,235,4871,11.975018,,,,,,,,,
133053,WIH3_2022,2022,W10010_0346/LH244,-999.0,-999.0,-999.0,133053,235,4872,12.971193,,,,,,,,,
133054,WIH3_2022,2022,W10010_0358/LH244,-999.0,-999.0,-999.0,133054,235,4873,13.499769,,,,,,,,,
133055,WIH3_2022,2022,W10010_0381/LH244,-999.0,-999.0,-999.0,133055,235,4875,10.831640,,,,,,,,,


Possible testing schemes:
    


In [None]:
# n = number of non-missing Hybrid entries for each (Year, Geno_Idx) pair.
tally_df = (
    phno
    .groupby(['Year', 'Geno_Idx'])['Hybrid']
    .count()
    .reset_index(name='n')
)
tally_df

Unnamed: 0,Year,Geno_Idx,n
0,2014,0,15
1,2014,1,17
2,2014,2,18
3,2014,3,16
4,2014,4,18
...,...,...,...
8111,2022,4921,3
8112,2022,4922,5
8113,2022,4923,2
8114,2022,4924,1


In [None]:
def sim_cv(
    tally_df = None,
    draw_years = 1,
    draw_genos = 1,
    seed_val = 786342786,
    cv_type = 'cv1'
):
    """Simulate one held-out split and count observations on each side.

    Years and genotypes are drawn without replacement from the values
    present in ``tally_df``; the drawn values form the test set and the
    remaining values form the training set.

    Parameters
    ----------
    tally_df : DataFrame with columns ``Year``, ``Geno_Idx`` and ``n``.
        When omitted, the module-level ``tally_df`` is looked up at call
        time (not at definition time, so a recomputed table is picked up).
    draw_years : number of years to hold out.
    draw_genos : number of genotypes to hold out.
    seed_val : seed passed to ``np.random.seed`` before drawing. Note that
        this reseeds numpy's global legacy generator.
    cv_type : one of ``'cv1'`` (hold out genotypes), ``'cv0'`` (hold out
        years) or ``'cv00'`` (test = held-out genotypes in held-out years,
        train = remaining genotypes in remaining years).

    Returns
    -------
    dict with ``draw_years``, ``draw_genos``, ``seed_val`` and the summed
    ``n`` under ``'<cv_type>_test'`` and ``'<cv_type>_train'``.

    Raises
    ------
    ValueError
        If ``cv_type`` is not recognised, or (from ``np.random.choice``)
        if more values are requested than are available.
    NameError
        If ``tally_df`` is omitted and no module-level ``tally_df`` exists.
    """
    if tally_df is None:
        try:
            tally_df = globals()['tally_df']
        except KeyError:
            raise NameError(
                "sim_cv: no `tally_df` was passed and no global `tally_df` is defined"
            ) from None

    if cv_type not in ('cv1', 'cv0', 'cv00'):
        # Previously an unknown type returned a dict without any counts.
        raise ValueError(
            f"cv_type must be 'cv1', 'cv0' or 'cv00', got {cv_type!r}"
        )

    possible_geno = list(set(tally_df.Geno_Idx))
    possible_years = list(set(tally_df.Year))

    test_geno = []
    test_years = []

    # Years are drawn first, then genotypes, so a given seed always yields
    # the same split. Values are unique, so removing the pick is equivalent
    # to re-filtering the whole list against everything drawn so far.
    np.random.seed(seed_val)
    for _ in range(draw_years):
        pick = np.random.choice(possible_years)
        test_years.append(pick)
        possible_years.remove(pick)

    for _ in range(draw_genos):
        pick = np.random.choice(possible_geno)
        test_geno.append(pick)
        possible_geno.remove(pick)

    mask_test_geno = tally_df.Geno_Idx.isin(test_geno)
    mask_test_year = tally_df.Year.isin(test_years)

    mask_train_geno = tally_df.Geno_Idx.isin(possible_geno)
    mask_train_year = tally_df.Year.isin(possible_years)

    out = {'draw_years': draw_years,
           'draw_genos': draw_genos,
           'seed_val': seed_val}

    if cv_type == 'cv1':
        # CV1: lines don't overlap
        test_mask, train_mask = mask_test_geno, mask_train_geno
    elif cv_type == 'cv0':
        # CV0: years don't overlap
        test_mask, train_mask = mask_test_year, mask_train_year
    else:
        # CV00: years and lines don't overlap
        test_mask = mask_test_geno & mask_test_year
        train_mask = mask_train_geno & mask_train_year

    out[f'{cv_type}_test'] = np.sum(tally_df.loc[test_mask, 'n'])
    out[f'{cv_type}_train'] = np.sum(tally_df.loc[train_mask, 'n'])
    return out

In [None]:
seed_val = 786342786
np.random.seed(seed_val)

# Full grid of (n_year, n_geno) settings -- the pandas counterpart of R's expand.grid.
year_counts = pd.DataFrame({"n_year": list(np.linspace(1, 4, 4).astype(int))})
geno_counts = pd.DataFrame({"n_geno": list(np.linspace(100, 1000, 4).astype(int))})
settings_df = year_counts.merge(geno_counts, how='cross')

# One random integer per grid row (used later as the per-run seed), drawn
# one at a time so the sequence matches the seeded global generator.
rep_seeds = []
for _ in range(len(settings_df)):
    rep_seeds.append(np.random.randint(0, 1e5))
settings_df["n_rep"] = rep_seeds
settings_df

Unnamed: 0,n_year,n_geno,n_rep
0,1,100,5091
1,1,400,66458
2,1,700,89302
3,1,1000,33713
4,2,100,9218
5,2,400,66698
6,2,700,30990
7,2,1000,880
8,3,100,16542
9,3,400,34724


### CV1 (filter genotypes)

In [None]:
# temp_settings = settings_df.copy()
# # filter down to the first unique entries for the selected columns
# temp_idx = temp_settings.loc[:, ['n_geno']].drop_duplicates().index
# temp_settings = temp_settings.loc[temp_idx, ]

In [None]:
# df = []
# for i in tqdm(temp_idx):
#     df += [pd.DataFrame(
#             sim_cv(
#                 draw_years = temp_settings.loc[i, 'n_year'],
#                 draw_genos = temp_settings.loc[i, 'n_geno'],
#                 seed_val =   temp_settings.loc[i, 'n_rep'],
#                 cv_type = 'cv1'
#             ), index = [0]
#         )]
    
# df = pd.concat(df)
# df.head()

In [None]:
# df_summary = df.groupby(['draw_genos']).agg(
#     cv1_test = ('cv1_test', np.mean),
#     cv1_train = ('cv1_train', np.mean)).reset_index()
# df_summary['pr_test'] = df_summary['cv1_test']/(df_summary['cv1_test']+df_summary['cv1_train'])
# df_summary

In [None]:
# fig = go.Figure()
# fig.add_trace(go.Scatter(x = df['draw_genos'], y = df['cv1_train'], marker_color = 'blue', mode='markers'))
# fig.add_trace(go.Scatter(x = df['draw_genos'], y = df['cv1_test'],  marker_color = 'red', mode='markers'))
# fig.show()

### CV0 (filter years)

In [None]:
# temp_settings = settings_df.copy()
# # filter down to the first unique entries for the selected columns
# temp_idx = temp_settings.loc[:, ['n_year']].drop_duplicates().index
# temp_settings = temp_settings.loc[temp_idx, ]

In [None]:
# df = []
# for i in tqdm(temp_idx):
#     df += [pd.DataFrame(
#             sim_cv(
#                 draw_years = temp_settings.loc[i, 'n_year'],
#                 draw_genos = temp_settings.loc[i, 'n_geno'],
#                 seed_val =   temp_settings.loc[i, 'n_rep'],
#                 cv_type = 'cv0'
#             ), index = [0]
#         )]
    
# df = pd.concat(df)
# df.head()

In [None]:
# df_summary = df.groupby(['draw_years']).agg(
#     cv0_test = ('cv0_test', np.mean),
#     cv0_train = ('cv0_train', np.mean)).reset_index()
# df_summary['pr_test'] = df_summary['cv0_test']/(df_summary['cv0_test']+df_summary['cv0_train'])
# df_summary

In [None]:
# fig = go.Figure()
# fig.add_trace(go.Scatter(x = df['draw_years'], y = df['cv0_train'], marker_color = 'blue', mode='markers'))
# fig.add_trace(go.Scatter(x = df['draw_years'], y = df['cv0_test'],  marker_color = 'red', mode='markers'))
# fig.show()

### CV00 (filter genotypes, years)

In [None]:
# Work on a copy and keep only the first row of each (n_geno, n_year) combination.
temp_settings = settings_df.copy()
first_per_combo = temp_settings[['n_geno', 'n_year']].drop_duplicates()
temp_idx = first_per_combo.index
temp_settings = temp_settings.loc[temp_idx]

In [None]:
# Run the cv00 simulation once per settings row and cache the result on disk.
sim_res_path = os.path.join(cache_path, 'sim_res.csv')

if os.path.isfile(sim_res_path):
    df = pd.read_csv(sim_res_path)
    # Caches written earlier stored the (meaningless) index, which comes
    # back as "Unnamed: 0"-style columns; drop them so the cached frame
    # matches a freshly computed one.
    df = df.loc[:, ~df.columns.str.startswith('Unnamed')]
else:
    sim_rows = []
    for i in tqdm(temp_idx):
        sim_rows.append(pd.DataFrame(
            sim_cv(
                draw_years = temp_settings.loc[i, 'n_year'],
                draw_genos = temp_settings.loc[i, 'n_geno'],
                seed_val =   temp_settings.loc[i, 'n_rep'],
                cv_type = 'cv00'
            ), index = [0]
        ))
    # ignore_index gives rows a proper 0..n-1 index instead of all zeros.
    df = pd.concat(sim_rows, ignore_index=True)
    # index=False keeps the round trip through CSV from adding a column.
    df.to_csv(sim_res_path, index=False)

df.head()

Unnamed: 0.1,Unnamed: 0,draw_years,draw_genos,seed_val,cv00_test,cv00_train
0,0,1,100,5091,334,111837
1,0,1,400,66458,1156,111533
2,0,1,700,89302,1870,102682
3,0,1,1000,33713,4178,89137
4,0,2,100,9218,600,106063


In [None]:
# Mean test/train sizes per (draw_years, draw_genos) setting. The string
# "mean" is used instead of np.mean, which pandas warns will change meaning.
df_summary = (
    df.groupby(['draw_years', 'draw_genos'])
    .agg(
        cv00_test = ('cv00_test', 'mean'),
        cv00_train = ('cv00_train', 'mean'))
    .reset_index()
)
# Share of the retained observations that land in the test set.
df_summary['pr_test'] = df_summary['cv00_test']/(df_summary['cv00_test']+df_summary['cv00_train'])
# Show settings whose test share is strictly between 8% and 12%.
df_summary.loc[((df_summary.pr_test > 0.08)&(df_summary.pr_test < 0.12)), ]


The provided callable <function mean> is currently using SeriesGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.



Unnamed: 0,draw_years,draw_genos,cv00_test,cv00_train,pr_test
11,3,1000,8204.0,72910.0,0.101142
13,4,400,5721.0,58565.0,0.088993
14,4,700,8676.0,63867.0,0.119598


In [None]:
# Train (blue) and test (red) observation counts against the number of held-out years.
fig = go.Figure()
for count_col, colour in (('cv00_train', 'blue'), ('cv00_test', 'red')):
    fig.add_trace(go.Scatter(
        x=df['draw_years'],
        y=df[count_col],
        marker_color=colour,
        mode='markers'))
fig.show()

In [None]:
# remove, looking at testers

In [None]:
# Re-read the phenotype table for the tester exploration below.
load_from = '../nbs_artifacts/05_prep_matrices/'
phno = pd.read_csv(f'{load_from}phno_geno.csv')
phno

Unnamed: 0,Env,Year,Hybrid,Replicate,Block,Plot,Phno_Idx,Env_Idx,Geno_Idx,Yield_Mg_ha,Stand_Count_plants,Pollen_DAP_days,Silk_DAP_days,Plant_Height_cm,Ear_Height_cm,Root_Lodging_plants,Stalk_Lodging_plants,Grain_Moisture,Twt_kg_m3
0,DEH1_2014,2014,M0088/LH185,1.0,1.0,1.0,0,0,0,5.721725,56.0,63.0,67.0,213.0,79.0,0.0,0.0,20.8,706.664693
1,DEH1_2014,2014,M0143/LH185,1.0,1.0,2.0,1,0,1,11.338246,54.0,61.0,63.0,286.0,172.0,0.0,0.0,25.8,693.792841
2,DEH1_2014,2014,M0003/LH185,1.0,1.0,3.0,2,0,2,6.540810,60.0,63.0,65.0,239.0,92.0,0.0,4.0,20.8,698.941582
3,DEH1_2014,2014,M0035/LH185,1.0,1.0,4.0,3,0,3,10.366857,59.0,61.0,63.0,242.0,118.0,0.0,0.0,23.7,711.813434
4,DEH1_2014,2014,M0052/LH185,1.0,1.0,5.0,4,0,4,10.908814,58.0,63.0,65.0,211.0,92.0,0.0,0.0,19.4,743.993065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133052,WIH3_2022,2022,W10010_0337/LH244,-999.0,-999.0,-999.0,133052,235,4871,11.975018,,,,,,,,,
133053,WIH3_2022,2022,W10010_0346/LH244,-999.0,-999.0,-999.0,133053,235,4872,12.971193,,,,,,,,,
133054,WIH3_2022,2022,W10010_0358/LH244,-999.0,-999.0,-999.0,133054,235,4873,13.499769,,,,,,,,,
133055,WIH3_2022,2022,W10010_0381/LH244,-999.0,-999.0,-999.0,133055,235,4875,10.831640,,,,,,,,,


In [None]:
# phno = phno.loc[(phno.Year > 2015), ]
# phno = phno.loc[(phno.Year <= 2017), ]

In [None]:
# Preview only the identifying columns.
phno[['Env', 'Year', 'Hybrid']]

Unnamed: 0,Env,Year,Hybrid
0,DEH1_2014,2014,M0088/LH185
1,DEH1_2014,2014,M0143/LH185
2,DEH1_2014,2014,M0003/LH185
3,DEH1_2014,2014,M0035/LH185
4,DEH1_2014,2014,M0052/LH185
...,...,...,...
133052,WIH3_2022,2022,W10010_0337/LH244
133053,WIH3_2022,2022,W10010_0346/LH244
133054,WIH3_2022,2022,W10010_0358/LH244
133055,WIH3_2022,2022,W10010_0381/LH244


In [None]:
import re

# Every distinct parent name appearing on either side of a '/' in a hybrid
# name. Built with a set comprehension (sum(list_of_lists, []) is quadratic)
# and sorted so the order is the same from one session to the next.
parents = sorted({
    parent
    for hybrid in set(phno.Hybrid)
    for parent in hybrid.split('/')
})


In [None]:
# len(parents) 2203

# Number of distinct hybrid names in which the pattern 'PHZ51' matches.
sum(1 for hybrid in set(phno.Hybrid) if re.search('PHZ51', hybrid))

667

In [None]:
from collections import Counter

# For each parent, the number of distinct hybrids that list it as one of
# their '/'-separated components. A single pass over the unique hybrids
# replaces rebuilding and re-splitting the hybrid set once per parent;
# set(...) on the components counts a hybrid once even if a name repeats.
hybrids_per_parent = Counter(
    parent
    for hybrid in set(phno.Hybrid)
    for parent in set(hybrid.split('/'))
)

pcount = pd.DataFrame({
    'parent': parents,
    'count': [hybrids_per_parent[parent] for parent in parents],
})

In [None]:
from collections import Counter

# Per-year version of the parent tally: column count_<year> holds the number
# of distinct hybrids grown that year that include the parent.
# NOTE(review): only 2014-2021 are tabulated although phno also contains
# 2022 rows -- confirm that the omission is intentional.
pcount = pd.DataFrame({'parent': parents})
for year in range(2014, 2022):
    hybrids_per_parent = Counter(
        parent
        for hybrid in set(phno.loc[phno.Year == year, 'Hybrid'])
        for parent in set(hybrid.split('/'))
    )
    # Counter returns 0 for parents that do not occur in this year.
    pcount[f'count_{year}'] = [hybrids_per_parent[parent] for parent in parents]

In [None]:
# Long format: one row per (parent, year) with the hybrid count in `value`.
count_cols = [f'count_{year}' for year in range(2014, 2022)]
xx = pcount.melt(id_vars='parent', value_vars=count_cols)
# removeprefix drops exactly the leading 'count_'. str.strip('count_') treats
# its argument as a set of characters to trim from both ends and only
# happened to work because the remainder is all digits.
xx['Year'] = xx.variable.str.removeprefix('count_').astype(int)
xx

Unnamed: 0,parent,variable,value,Year
0,MP718,count_2014,0,2014
1,MBNIL_B049,count_2014,0,2014
2,PHN46,count_2014,0,2014
3,ICI_441,count_2014,0,2014
4,GEMN-0097_LH212HT_0085,count_2014,0,2014
...,...,...,...,...
17619,GEMN-0192_PHK76_0007,count_2021,0,2021
17620,GEMN-0097_PHJ89_0005,count_2021,0,2021
17621,B73_PHG39-25,count_2021,0,2021
17622,GEMN-0097_PHP02_0005,count_2021,0,2021


In [None]:
# Attach the per-(parent, Year) total of `value` as `sumval`, then keep
# only rows whose total exceeds 5.
totals = (
    xx.groupby(['parent', 'Year'])
    .agg(sumval=('value', 'sum'))
    .reset_index()
)
xx = xx.merge(totals, on=['parent', 'Year'])
xx = xx.loc[xx.sumval > 5]
xx

Unnamed: 0,parent,variable,value,Year,sumval
239,PB80,count_2014,162,2014,162
414,TX205,count_2014,7,2014,7
586,CG108,count_2014,10,2014,10
895,LH195,count_2014,108,2014,108
1210,LH198,count_2014,185,2014,185
...,...,...,...,...,...
16865,B73,count_2021,6,2021,6
17188,PHK76,count_2021,379,2021,379
17560,MO17,count_2021,7,2021,7
17609,TX779,count_2021,7,2021,7


In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return difflib's similarity ratio (0.0 to 1.0) between sequences a and b."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [None]:
import plotly.graph_objects as go

# Parents with more than 5 hybrids in a year, restricted to years before 2018.
xxplt = xx.loc[(xx.sumval > 5) & (xx.Year < 2018), ]

fig = go.Figure()

# One trace per parent. Looping over the raw `parent` column would visit a
# parent once per row it occupies and add the same trace repeatedly.
for parent_name in xxplt.parent.unique():
    parent_rows = xxplt.loc[xxplt.parent == parent_name]

    fig.add_trace(go.Scatter(
        # years are passed as strings, as in the original x values
        x=[str(year) for year in parent_rows['Year']],
        y=parent_rows['value'],
        mode='lines+markers',
        name=parent_name))

fig.show()

In [None]:
# What would it look like if we held out each of the tester lines from 19-21?

phno

Unnamed: 0,Env,Year,Hybrid,Replicate,Block,Plot,Phno_Idx,Env_Idx,Geno_Idx,Yield_Mg_ha,Stand_Count_plants,Pollen_DAP_days,Silk_DAP_days,Plant_Height_cm,Ear_Height_cm,Root_Lodging_plants,Stalk_Lodging_plants,Grain_Moisture,Twt_kg_m3
0,DEH1_2014,2014,M0088/LH185,1.0,1.0,1.0,0,0,0,5.721725,56.0,63.0,67.0,213.0,79.0,0.0,0.0,20.8,706.664693
1,DEH1_2014,2014,M0143/LH185,1.0,1.0,2.0,1,0,1,11.338246,54.0,61.0,63.0,286.0,172.0,0.0,0.0,25.8,693.792841
2,DEH1_2014,2014,M0003/LH185,1.0,1.0,3.0,2,0,2,6.540810,60.0,63.0,65.0,239.0,92.0,0.0,4.0,20.8,698.941582
3,DEH1_2014,2014,M0035/LH185,1.0,1.0,4.0,3,0,3,10.366857,59.0,61.0,63.0,242.0,118.0,0.0,0.0,23.7,711.813434
4,DEH1_2014,2014,M0052/LH185,1.0,1.0,5.0,4,0,4,10.908814,58.0,63.0,65.0,211.0,92.0,0.0,0.0,19.4,743.993065
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133052,WIH3_2022,2022,W10010_0337/LH244,-999.0,-999.0,-999.0,133052,235,4871,11.975018,,,,,,,,,
133053,WIH3_2022,2022,W10010_0346/LH244,-999.0,-999.0,-999.0,133053,235,4872,12.971193,,,,,,,,,
133054,WIH3_2022,2022,W10010_0358/LH244,-999.0,-999.0,-999.0,133054,235,4873,13.499769,,,,,,,,,
133055,WIH3_2022,2022,W10010_0381/LH244,-999.0,-999.0,-999.0,133055,235,4875,10.831640,,,,,,,,,


In [None]:
# Split each hybrid name at the first '/' into two new columns, F and M.
hybrid_parts = phno['Hybrid'].str.split('/', n=1, expand=True)
phno[['F', 'M']] = hybrid_parts
phno

Unnamed: 0,Env,Year,Hybrid,Replicate,Block,Plot,Phno_Idx,Env_Idx,Geno_Idx,Yield_Mg_ha,...,Pollen_DAP_days,Silk_DAP_days,Plant_Height_cm,Ear_Height_cm,Root_Lodging_plants,Stalk_Lodging_plants,Grain_Moisture,Twt_kg_m3,F,M
0,DEH1_2014,2014,M0088/LH185,1.0,1.0,1.0,0,0,0,5.721725,...,63.0,67.0,213.0,79.0,0.0,0.0,20.8,706.664693,M0088,LH185
1,DEH1_2014,2014,M0143/LH185,1.0,1.0,2.0,1,0,1,11.338246,...,61.0,63.0,286.0,172.0,0.0,0.0,25.8,693.792841,M0143,LH185
2,DEH1_2014,2014,M0003/LH185,1.0,1.0,3.0,2,0,2,6.540810,...,63.0,65.0,239.0,92.0,0.0,4.0,20.8,698.941582,M0003,LH185
3,DEH1_2014,2014,M0035/LH185,1.0,1.0,4.0,3,0,3,10.366857,...,61.0,63.0,242.0,118.0,0.0,0.0,23.7,711.813434,M0035,LH185
4,DEH1_2014,2014,M0052/LH185,1.0,1.0,5.0,4,0,4,10.908814,...,63.0,65.0,211.0,92.0,0.0,0.0,19.4,743.993065,M0052,LH185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133052,WIH3_2022,2022,W10010_0337/LH244,-999.0,-999.0,-999.0,133052,235,4871,11.975018,...,,,,,,,,,W10010_0337,LH244
133053,WIH3_2022,2022,W10010_0346/LH244,-999.0,-999.0,-999.0,133053,235,4872,12.971193,...,,,,,,,,,W10010_0346,LH244
133054,WIH3_2022,2022,W10010_0358/LH244,-999.0,-999.0,-999.0,133054,235,4873,13.499769,...,,,,,,,,,W10010_0358,LH244
133055,WIH3_2022,2022,W10010_0381/LH244,-999.0,-999.0,-999.0,133055,235,4875,10.831640,...,,,,,,,,,W10010_0381,LH244


In [None]:
from collections import Counter

# How many phenotype rows each parent appears in, counting the M and F
# columns together. Counter does this in one pass instead of rescanning
# the full list once per unique parent.
parents_cols = list(phno.M)+list(phno.F)
parent_tally = Counter(parents_cols)
parents = pd.DataFrame({
    'parent': list(parent_tally.keys()),
    'n': list(parent_tally.values())
})

In [None]:
# Most frequently used parents first, with a fresh 0..n-1 index.
parents = parents.sort_values(by='n', ascending=False, ignore_index=True)
parents

Unnamed: 0,parent,n
0,LH195,30351
1,PHZ51,18607
2,PHT69,12078
3,LH244,9855
4,PHP02,9650
...,...,...
2198,GEMN-0097_PHN46_0012,1
2199,GEMN-0225_PHK76_0025,1
2200,Z022E0082,1
2201,PHJ31,1


In [None]:
# parents.loc[parents.n > 1000 ]

In [None]:
e = 'LH195'

# For every parent with more than 1000 rows, count rows per year in which
# it appears as either F or M.
out_list = []
for e in list(set(parents.loc[parents.n > 1000, 'parent'])):
    involves_parent = (phno.F == e) | (phno.M == e)
    per_year = (
        phno.loc[involves_parent, ['Year', 'Hybrid']]
        .groupby('Year')
        .count()
        .reset_index()
    )
    out_list.append(per_year.assign(Query=e))




In [None]:
# Wide table: one row per queried parent, one column per year.
stacked_counts = pd.concat(out_list)
temp = stacked_counts.pivot(index='Query', columns='Year', values='Hybrid').reset_index()
temp

Year,Query,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,B14A,82.0,142.0,156.0,156.0,213.0,159.0,236.0,295.0,44.0
1,B37,156.0,150.0,154.0,60.0,205.0,261.0,234.0,305.0,45.0
2,B73,224.0,328.0,330.0,353.0,256.0,133.0,366.0,512.0,95.0
3,CG102,1108.0,180.0,81.0,32.0,,,,,
4,DK3IIH6,,,3919.0,2653.0,12.0,,,,23.0
5,F42,107.0,111.0,153.0,104.0,232.0,176.0,235.0,306.0,32.0
6,H95,154.0,155.0,203.0,139.0,204.0,210.0,234.0,301.0,45.0
7,LH185,2536.0,1031.0,447.0,561.0,10.0,,,,19.0
8,LH195,1147.0,1423.0,2185.0,1536.0,11815.0,11447.0,275.0,424.0,99.0
9,LH198,3562.0,471.0,481.0,521.0,,,,,


In [None]:
# Label rows by parent name; the Query column itself is kept.
temp = temp.set_index('Query', drop=False)

In [None]:
# Heatmap of the raw per-year counts (parents as rows, years as columns).
px.imshow(temp.drop(columns='Query'))

In [None]:
# Replace every count column with its log10 and redraw the heatmap.
non_count_labels = ('Query', 'Year')
for col in list(temp.columns):
    if col in non_count_labels:
        continue
    temp[col] = np.log10(temp[col])
px.imshow(temp.drop(columns='Query'))

In [None]:
# parents.loc[((parents.n> 300) &
#              (parents.n<400000))
#             ]

# # 0	LH195	30351
# # 1	PHZ51	18607
# # 2	PHT69	12078
# 3	LH244	9855
# # 4	PHP02	9650
# # 5	PHK76	7309
# 6	DK3IIH6	6607
# 7	PHB47	6007
# 8	LH198	5035
# 9	LH185	4604
# 10	LH82	3531
# 11	PHN82	3248

In [None]:
# test info

def phno_mask_parents(
        tParents = ('LH195', 'PHZ51')
        ):
    """Build train/test row masks over the global ``phno`` for held-out parents.

    A row is a test row when its ``F`` or ``M`` value equals any entry of
    ``tParents``; every other row is a train row, so the two masks are
    exact complements.

    Parameters
    ----------
    tParents : iterable of parent names to hold out. An empty iterable is
        allowed and puts every row in the train mask (the previous
        per-parent ``pd.concat`` raised on an empty list).

    Returns
    -------
    dict with boolean Series ``'test'`` and ``'train'``, indexed like ``phno``.
    """
    held_out = list(tParents)

    # logical or: a row is held out if either parent is in the list
    mask_test_geno = phno.loc[:, 'F'].isin(held_out) | phno.loc[:, 'M'].isin(held_out)

    # logical and: a train row must match none of the held-out parents
    mask_train_geno = ~mask_test_geno

    return({
        'test':mask_test_geno,
        'train':mask_train_geno
        })




In [None]:

# describe resulting set
def phno_mask_count(
        masks = None
        ):
    """Count train and test rows of the global ``phno`` per year.

    Parameters
    ----------
    masks : dict with boolean row masks under ``'train'`` and ``'test'``
        (as returned by ``phno_mask_parents``). When omitted, the masks for
        holding out LH195 and PHZ51 are computed at call time. Previously
        this default was evaluated once when the function was defined, so
        it went stale if ``phno`` changed and required the F/M columns to
        exist before the definition could run.

    Returns
    -------
    DataFrame with columns ``Year``, ``Train`` and ``Test``: one row per
    year, plus a final row (index 9999, ``Year == ''``) holding the column
    sums. A year present in only one mask has NaN in the other column.
    """
    if masks is None:
        masks = phno_mask_parents(tParents = ['LH195', 'PHZ51'])

    def _rows_per_year(mask, label):
        # Count rows per Year under `mask`, naming the count column `label`.
        return (
            phno.loc[mask, ['Year', 'Env']]
            .groupby('Year')
            .count()
            .reset_index()
            .rename(columns={'Env': label})
        )

    # Outer merge keeps years that occur in only one of the two masks.
    temp = _rows_per_year(masks['train'], 'Train').merge(
        _rows_per_year(masks['test'], 'Test'), how = 'outer', on = 'Year'
        )

    # Totals row; sum() skips the NaNs left by the outer merge.
    totals = pd.DataFrame({
        'Year': '',
        'Train':temp.Train.sum(),
        'Test':temp.Test.sum()
        }, index = [9999])

    return pd.concat([temp, totals])



In [None]:
# Per-year train/test sizes when LH195 and PHZ51 are held out together.
held_out_parents = ['LH195', 'PHZ51']
phno_mask_count(masks=phno_mask_parents(tParents=held_out_parents))

Unnamed: 0,Year,Train,Test
0,2014.0,10426,1151
1,2015.0,8858,4352
2,2016.0,10214,3546
3,2017.0,9311,2810
4,2018.0,6916,11825
5,2019.0,7746,11447
6,2020.0,8729,5975
7,2021.0,12072,7389
8,2022.0,10129,161
9999,,84401,48656


In [None]:
# Seeded generator used for the random parent draw below.
rng = np.random.default_rng(seed=8679)



In [None]:
# Parents that appear in more than 1000 phenotype rows.
parent_1k = parents.loc[parents.n > 1000, 'parent'].tolist()

# Hold out one of them at random and tabulate the resulting split.
drawn_parent = list(rng.choice(parent_1k, 1))
phno_mask_count(masks=phno_mask_parents(tParents=drawn_parent))



Unnamed: 0,Year,Train,Test
0,2014.0,11573,4.0
1,2015.0,10179,3031.0
2,2016.0,12345,1415.0
3,2017.0,10763,1358.0
4,2018.0,18731,10.0
5,2019.0,19193,
6,2020.0,8990,5714.0
7,2021.0,12471,6990.0
8,2022.0,10205,85.0
9999,,114450,18607.0


In [None]:

# One per-year train/test table for each frequently used parent, tagged
# with the parent that was held out.
temp_list = []
for e in parent_1k:
    single_parent_masks = phno_mask_parents(tParents=[e])
    temp_list.append(phno_mask_count(masks=single_parent_masks).assign(Parent=e))



In [None]:
# For each parent keep the row(s) whose Test count equals that parent's
# largest per-year Test count (the 9999 totals row is excluded when
# finding the maximum, but not when matching against it).
peak_rows = []
for counts in temp_list:
    peak_test = counts.loc[counts.index < 9999, 'Test'].max()
    peak_rows.append(counts.loc[counts.Test == peak_test])
temp = pd.concat(peak_rows)

temp = temp.sort_values('Test', ascending=False).reset_index(drop=True)

In [None]:
# flag documented testers
known_testers = {
    # 20-21 Testers
    'PHZ51',
    'PHP02',
    'PHK76',
    # 18-19
    'PHT69',
    'LH195',
}
temp['Known'] = ['Y' if parent in known_testers else '' for parent in temp.Parent]

# Keep parents whose peak-year test size exceeds 600, ordered by year then size.
temp = temp.loc[temp.Test > 600]
temp = temp.sort_values(['Year', 'Test']).reset_index(drop=True)
temp

Unnamed: 0,Year,Train,Test,Parent,Known
0,2014,10469,1108.0,CG102,
1,2014,9974,1603.0,PB80,
2,2014,9041,2536.0,LH185,
3,2014,8015,3562.0,LH198,
4,2015,10821,2389.0,LH82,
5,2015,9942,3268.0,PHB47,
6,2016,9841,3919.0,DK3IIH6,
7,2017,11494,627.0,PHN82,
8,2017,11318,803.0,PHW52,
9,2018,6926,11815.0,LH195,Y


In [None]:
# Totals row (index 9999) of each retained parent's table, in the order of temp.Parent.
pd.concat(
    [next(counts for counts in temp_list if counts.Parent[0] == parent)
     for parent in temp.Parent],
    axis=0).loc[9999, ]

Unnamed: 0,Year,Train,Test,Parent
9999,,131656,1401.0,CG102
9999,,131030,2027.0,PB80
9999,,128453,4604.0,LH185
9999,,128022,5035.0,LH198
9999,,129526,3531.0,LH82
9999,,127050,6007.0,PHB47
9999,,126450,6607.0,DK3IIH6
9999,,129809,3248.0,PHN82
9999,,130440,2617.0,PHW52
9999,,102706,30351.0,LH195


In [None]:
# Totals row for holding out every ordered pair (i, j) of retained parents.
# NOTE(review): pairs with j == 'CG102' are skipped, as in the original;
# the reason for excluding that one parent is not stated -- confirm.
out_list = []
for i in list(temp.Parent):
    for j in list(temp.Parent):
        if j == 'CG102':
            # Skip before counting: the result was previously computed and then discarded.
            continue

        out = phno_mask_count(masks = phno_mask_parents(tParents = [i, j]))
        out['i'] = i
        out['j'] = j
        out_list += [out.loc[9999, ]]

In [None]:
# Pairwise hold-out totals: out1[i, j] is the train total and out2[i, j]
# the test total when parents l[i] and l[j] are held out together.
l = list(temp.Parent)

matrix_shape = (len(l), len(l))
out1 = np.zeros(matrix_shape)
out2 = np.zeros(matrix_shape)

for i, first_parent in enumerate(l):
    for j, second_parent in enumerate(l):
        x = phno_mask_count(masks=phno_mask_parents(tParents=[first_parent, second_parent]))
        totals_row = x.loc[9999]
        out1[i, j] = totals_row['Train'].astype(int)
        out2[i, j] = totals_row['Test'].astype(int)

In [None]:
# Heatmap of train totals for each pair of held-out parents.
px.imshow(pd.DataFrame(out1, index=l, columns=l))

In [None]:
# Heatmap of test totals for each pair of held-out parents.
px.imshow(pd.DataFrame(out2, index=l, columns=l))

In [None]:
# Split for the single parent currently bound to `e` (set by an earlier cell's loop).
masks_for_e = phno_mask_parents(tParents=[e])
phno_mask_count(masks=masks_for_e)

Unnamed: 0,Year,Train,Test
0,2014.0,11577,
1,2015.0,13210,
2,2016.0,13760,
3,2017.0,12073,48.0
4,2018.0,18662,79.0
5,2019.0,19140,53.0
6,2020.0,14354,350.0
7,2021.0,19017,444.0
8,2022.0,10237,53.0
9999,,132030,1027.0


In [None]:
# Distribution of the retained parents' Test counts.
px.histogram(temp['Test'], nbins=100)

In [None]:
# Rows 150 through 200 of the stacked per-parent tables.
stacked_tables = pd.concat(temp_list).reset_index()
stacked_tables.loc[150:200]

Unnamed: 0,index,Year,Train,Test,Parent
150,0,2014.0,9974,1603.0,PB80
151,1,2015.0,12965,245.0,PB80
152,2,2016.0,13708,52.0,PB80
153,3,2017.0,12063,58.0,PB80
154,4,2018.0,18741,,PB80
155,5,2019.0,19193,,PB80
156,6,2020.0,14669,35.0,PB80
157,7,2021.0,19427,34.0,PB80
158,8,2022.0,10290,,PB80
159,9999,,131030,2027.0,PB80


In [None]:
# Display phno, which now carries the F and M columns added above.
phno

Unnamed: 0,Env,Year,Hybrid,Replicate,Block,Plot,Phno_Idx,Env_Idx,Geno_Idx,Yield_Mg_ha,...,Pollen_DAP_days,Silk_DAP_days,Plant_Height_cm,Ear_Height_cm,Root_Lodging_plants,Stalk_Lodging_plants,Grain_Moisture,Twt_kg_m3,F,M
0,DEH1_2014,2014,M0088/LH185,1.0,1.0,1.0,0,0,0,5.721725,...,63.0,67.0,213.0,79.0,0.0,0.0,20.8,706.664693,M0088,LH185
1,DEH1_2014,2014,M0143/LH185,1.0,1.0,2.0,1,0,1,11.338246,...,61.0,63.0,286.0,172.0,0.0,0.0,25.8,693.792841,M0143,LH185
2,DEH1_2014,2014,M0003/LH185,1.0,1.0,3.0,2,0,2,6.540810,...,63.0,65.0,239.0,92.0,0.0,4.0,20.8,698.941582,M0003,LH185
3,DEH1_2014,2014,M0035/LH185,1.0,1.0,4.0,3,0,3,10.366857,...,61.0,63.0,242.0,118.0,0.0,0.0,23.7,711.813434,M0035,LH185
4,DEH1_2014,2014,M0052/LH185,1.0,1.0,5.0,4,0,4,10.908814,...,63.0,65.0,211.0,92.0,0.0,0.0,19.4,743.993065,M0052,LH185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133052,WIH3_2022,2022,W10010_0337/LH244,-999.0,-999.0,-999.0,133052,235,4871,11.975018,...,,,,,,,,,W10010_0337,LH244
133053,WIH3_2022,2022,W10010_0346/LH244,-999.0,-999.0,-999.0,133053,235,4872,12.971193,...,,,,,,,,,W10010_0346,LH244
133054,WIH3_2022,2022,W10010_0358/LH244,-999.0,-999.0,-999.0,133054,235,4873,13.499769,...,,,,,,,,,W10010_0358,LH244
133055,WIH3_2022,2022,W10010_0381/LH244,-999.0,-999.0,-999.0,133055,235,4875,10.831640,...,,,,,,,,,W10010_0381,LH244


In [None]:
# Display the per-parent row counts table.
parents

Unnamed: 0,parent,n
0,LH195,30351
1,PHZ51,18607
2,PHT69,12078
3,LH244,9855
4,PHP02,9650
...,...,...
2198,GEMN-0097_PHN46_0012,1
2199,GEMN-0225_PHK76_0025,1
2200,Z022E0082,1
2201,PHJ31,1
