In [None]:
import os
import tqdm

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn import preprocessing # LabelEncoder
from sklearn.metrics import mean_squared_error # if squared=False; RMSE

from EnvDL.core import *
from EnvDL.dna import *


In [None]:
# Directory that receives every artifact this notebook writes.
cache_path = '../nbs_artifacts/01.03_g2fc_prep_matrices/'
# Not defined in this notebook, so presumably star-imported from EnvDL; by its
# name it should create the directory when missing -- TODO confirm.
ensure_dir_path_exists(dir_path = cache_path)

In [None]:
# Folder holding the imputed tables written by the previous notebook.
load_from = '../nbs_artifacts/01.02_g2fc_imputation/'

def _load_table(file_name):
    """Read one csv from `load_from` into a DataFrame."""
    return pd.read_csv(load_from+file_name)

# plot level metadata
meta = _load_table('meta.csv')

# phenotype, soil and weather tables
phno = _load_table('phno.csv')
soil = _load_table('soil.csv')
wthr = _load_table('wthr.csv')

# cgmv table
cgmv = _load_table('cgmv.csv')

In [None]:
# Each Env/Year/Hybrid/Replicate/Block/Plot combination may carry at most one yield record.
plot_keys = ['Env', 'Year', 'Hybrid', 'Replicate', 'Block', 'Plot']
temp = phno.groupby(plot_keys).count()
temp = temp.reset_index()
assert np.max(temp['Yield_Mg_ha']) == 1

In [None]:
# Keep only rows with an observed yield. Missing yields used to be tolerated for
# 2022; now that the 2022 data is available they are excluded as well.
mask = phno['Yield_Mg_ha'].notna()
keep_cols = ['Env', 'Year', 'Hybrid', 'Replicate', 'Block', 'Plot', 'Yield_Mg_ha']
phno = (
    phno.loc[mask, keep_cols]
        .drop_duplicates()
        .reset_index(drop = True)
        .copy()
)

# Data Prep

## Prep CVs

In [None]:
# YMat[phno.Year == 2021]

## Indexing

### Filtered Phenotype and lookup
There are huge data savings to be had from keeping only deduplicated genomic data in GPU memory.

In [None]:
# Distinct hybrid names (set order, so the preview below is not stable between sessions).
uniq_hybrids = [*set(phno['Hybrid'])]
uniq_hybrids[:3]

['M0028/LH82', 'PHN11_PHW65_0101/PHB47', 'Z035E0047/LH162']

In [None]:
# For every hybrid, record whether a snp record can be found; phenotypes without
# snp data are filtered out further down.
snps_found = []
for ith_hybrid in uniq_hybrids:
    snps_found.append(exists_geno(taxa_to_filename(taxa = ith_hybrid)))

In [None]:
# Pair each hybrid with its snp-availability flag and show the ones lacking snps.
temp = pd.DataFrame({'Hybrid': uniq_hybrids, 'SNPS': snps_found})
lacks_snps = (temp.SNPS != True)
temp.loc[lacks_snps]

Unnamed: 0,Hybrid,SNPS
22,GT603/PHB47,False
147,PHN11_PHW65_0032/PHT69,False
333,2369/LH123HT,False
380,GT603/PHZ51,False
448,LH195/PH463,False
...,...,...
4639,PHN66/LH123HT,False
4663,PHN11_PHW65_0407/PHT69,False
4777,MO44_PHW65_0349/LH195,False
4818,BGEM-0122-N/LH195,False


In [None]:
print('Genotypes flagged as missing that exist with different capitalization:')
missing_hybrids = temp.loc[(temp.SNPS != True), ['Hybrid']]
# quick check that some of these aren't just differing in capitalization.
# '/' is swapped for '__' to mimic the on-disk naming -- assumed to mirror
# taxa_to_filename; TODO confirm (e.g. whether files carry an extension).
missing_hybrids_list = [e.replace('/', '__').lower() for e in list(missing_hybrids.Hybrid)]
missing_hybrids_set = set(missing_hybrids_list)
# NOTE: str.lower must be *called*; comparing the bound method itself (as this
# cell previously did) can never match, so the check always printed [].
print(
    [e for e in [e.lower() for e in os.listdir('../data/zma/g2fc/genotypes/snps/')] if e in missing_hybrids_set]
)

Genotypes flagged as missing that exist with different capitalization:
[]


In [None]:
# Count the phenotype rows that belong to hybrids without snp data and report
# them as a share of all phenotype rows.
n_missing_obs = missing_hybrids.merge(phno, how = 'left').shape[0]
pct_missing_obs = round(100 * (n_missing_obs/phno.shape[0]), 3)
print(f'{n_missing_obs} observations ({pct_missing_obs}%) missing genotype.')

2736 observations (2.015%) missing genotype.


In [None]:
# Attach phenotype rows to the hybrids that do have snp data.
has_snps = (temp.SNPS == True)
phno_geno = pd.merge(temp.loc[has_snps], phno, how = 'left')
phno_geno

Unnamed: 0,Hybrid,SNPS,Env,Year,Replicate,Block,Plot,Yield_Mg_ha
0,M0028/LH82,True,ILH1_2015,2015,1.0,10.0,229.0,8.089230
1,M0028/LH82,True,MNH1_2015,2015,1.0,8.0,188.0,7.625307
2,M0028/LH82,True,NEH1_2015,2015,1.0,4.0,85.0,9.671387
3,M0028/LH82,True,NEH4_2015,2015,2.0,7.0,409.0,4.714949
4,M0028/LH82,True,NYH3_2015,2015,1.0,8.0,192.0,6.537947
...,...,...,...,...,...,...,...,...
133052,MOG_LH123HT-019-1-1-1-1-B/PB80,True,NCH1_2014,2014,2.0,6.0,106.0,8.801143
133053,MOG_LH123HT-019-1-1-1-1-B/PB80,True,TXH1_2014,2014,1.0,5.0,109.0,8.926460
133054,MOG_LH123HT-019-1-1-1-1-B/PB80,True,TXH1_2014,2014,2.0,5.0,139.0,8.942682
133055,MOG_LH123HT-019-1-1-1-1-B/PB80,True,TXH2_2014,2014,1.0,1.0,37.0,5.904733


In [None]:
# Rebuild phno_geno in phno's row order, keeping only observations whose hybrid has snp data.
snp_flags = phno_geno.loc[:, ['Hybrid', 'SNPS']].drop_duplicates()
phno_geno = phno.merge(snp_flags, how  = 'left')
has_snps = (phno_geno.SNPS == True)
phno_geno = phno_geno.loc[has_snps].drop(columns = ['SNPS'])
phno_geno = phno_geno.reset_index(drop=True).copy()
phno_geno

Unnamed: 0,Env,Year,Hybrid,Replicate,Block,Plot,Yield_Mg_ha
0,DEH1_2014,2014,M0088/LH185,1.0,1.0,1.0,5.721725
1,DEH1_2014,2014,M0143/LH185,1.0,1.0,2.0,11.338246
2,DEH1_2014,2014,M0003/LH185,1.0,1.0,3.0,6.540810
3,DEH1_2014,2014,M0035/LH185,1.0,1.0,4.0,10.366857
4,DEH1_2014,2014,M0052/LH185,1.0,1.0,5.0,10.908814
...,...,...,...,...,...,...,...
133052,WIH3_2022,2022,W10010_0337/LH244,-999.0,-999.0,-999.0,11.975018
133053,WIH3_2022,2022,W10010_0346/LH244,-999.0,-999.0,-999.0,12.971193
133054,WIH3_2022,2022,W10010_0358/LH244,-999.0,-999.0,-999.0,13.499769
133055,WIH3_2022,2022,W10010_0381/LH244,-999.0,-999.0,-999.0,10.831640


In [None]:
# No observation may reach the matrices without a yield value.
assert not phno_geno['Yield_Mg_ha'].isna().any()

In [None]:
# phno_geno.to_csv(cache_path+'phno_geno_filter.csv', index=False)

# Materialise the current row position as an explicit 'Phno_Idx' column.
phno_geno = phno_geno.reset_index()
phno_geno = phno_geno.rename(columns = {'index':'Phno_Idx'})

In [None]:
def _number_first_seen(frame, column, idx_name):
    """Return the unique values of `column`, numbered 0..k-1 in order of first appearance."""
    uniq = frame.loc[:, [column]].drop_duplicates().reset_index(drop = True)
    return uniq.reset_index().rename(columns = {'index':idx_name})

# Integer ids for environments and for hybrids.
Env_Idxs  = _number_first_seen(phno_geno, 'Env', 'Env_Idx')
Geno_Idxs = _number_first_seen(phno_geno, 'Hybrid', 'Geno_Idx')

In [None]:
# This is key. An index column is added explicitly so rows can be matched up
# again after jumping between differently sorted versions of the df.

with_env_idx = phno_geno.merge(Env_Idxs)
with_geno_idx = with_env_idx.merge(Geno_Idxs)
phno_geno = with_geno_idx.sort_values('Phno_Idx').reset_index(drop = True)

In [None]:
# Fix the column order: identifiers, response, plot layout, then the three index columns.
ordered_cols = ['Env', 'Year', 'Hybrid', 'Yield_Mg_ha']
ordered_cols += ['Replicate', 'Block', 'Plot']
ordered_cols += ['Phno_Idx', 'Env_Idx', 'Geno_Idx']
phno_geno = phno_geno.loc[:, ordered_cols]

phno_geno

Unnamed: 0,Env,Year,Hybrid,Yield_Mg_ha,Replicate,Block,Plot,Phno_Idx,Env_Idx,Geno_Idx
0,DEH1_2014,2014,M0088/LH185,5.721725,1.0,1.0,1.0,0,0,0
1,DEH1_2014,2014,M0143/LH185,11.338246,1.0,1.0,2.0,1,0,1
2,DEH1_2014,2014,M0003/LH185,6.540810,1.0,1.0,3.0,2,0,2
3,DEH1_2014,2014,M0035/LH185,10.366857,1.0,1.0,4.0,3,0,3
4,DEH1_2014,2014,M0052/LH185,10.908814,1.0,1.0,5.0,4,0,4
...,...,...,...,...,...,...,...,...,...,...
133052,WIH3_2022,2022,W10010_0337/LH244,11.975018,-999.0,-999.0,-999.0,133052,235,4871
133053,WIH3_2022,2022,W10010_0346/LH244,12.971193,-999.0,-999.0,-999.0,133053,235,4872
133054,WIH3_2022,2022,W10010_0358/LH244,13.499769,-999.0,-999.0,-999.0,133054,235,4873
133055,WIH3_2022,2022,W10010_0381/LH244,10.831640,-999.0,-999.0,-999.0,133055,235,4875


In [None]:
# Write the filtered, indexed phenotype table to the cache directory.
phno_geno.to_csv(path_or_buf = cache_path+'phno_geno.csv', index = False)

In [None]:
# From here on `phno` is the filtered table, so incomplete data cannot creep back in.
phno = phno_geno.copy(deep = True)

### Observation to Planting/Harvest DOY

In [None]:
# Look up planting / harvest day-of-year for every observation, in phno row order.

meta_dates = meta.loc[:, ['Env', 'Hybrid', 'Replicate', 'Block', 'Plot', 'Date_Planted','Date_Harvested']
       ].drop_duplicates()
mgmt_time = phno.merge(meta_dates, how = 'left')

# A left merge repeats a phno row whenever meta holds several distinct date
# records for the same plot keys. The saved array is later read by row
# position, so any duplication would silently misalign it with Phno_Idx.
assert mgmt_time.shape[0] == phno.shape[0], (
    'meta has multiple planting/harvest records for some plots; '
    'mgmt_time rows no longer align with phno')

# Dates are rounded to whole days before the integer cast.
mgmt_time['DOY_Planted'] = mgmt_time['Date_Planted'].round().astype(int)
mgmt_time['DOY_Harvested'] = mgmt_time['Date_Harvested'].round().astype(int)

mgmt_time = mgmt_time.loc[:, ['DOY_Planted','DOY_Harvested']]
mgmt_time

Unnamed: 0,DOY_Planted,DOY_Harvested
0,125,272
1,125,272
2,125,272
3,125,272
4,125,272
...,...,...
133052,136,303
133053,136,303
133054,136,303
133055,136,303


In [None]:
# Persist the planting/harvest table: the column labels and the values.
plant_harvest_cols = mgmt_time.columns.tolist()
PlantHarvest = np.array(mgmt_time)
np.save(cache_path+'PlantHarvestNames.npy', plant_harvest_cols)
np.save(cache_path+'PlantHarvest.npy', PlantHarvest)

### Observation to Metadata

In [None]:
# Management columns carried into the per-environment table.
mgmt_cols = [
    'Pounds_Needed_Soil_Moisture',
    'Cover_beet', 'Cover_corn', 'Cover_cotton', 'Cover_fallow',
    'Cover_lima', 'Cover_peanut', 'Cover_pumpkin', 'Cover_rye',
    'Cover_sorghum', 'Cover_soy', 'Cover_wheat',
    'Pre_Chisel', 'Pre_Cult', 'Pre_Disc', 'Pre_MinTill',
]

# One row per environment present in phno, joined to its distinct management
# records and ordered by Env_Idx.
# NOTE(review): rows only line up with Env_Idx if meta has a single distinct
# management record per Env -- confirm.
env_keys = phno.loc[:, ['Env', 'Env_Idx']].drop_duplicates()
env_mgmt = meta.loc[:, ['Env'] + mgmt_cols].drop_duplicates()
meta_small = env_keys.merge(env_mgmt, how = 'left').sort_values('Env_Idx')

meta_small

Unnamed: 0,Env,Env_Idx,Pounds_Needed_Soil_Moisture,Cover_beet,Cover_corn,Cover_cotton,Cover_fallow,Cover_lima,Cover_peanut,Cover_pumpkin,Cover_rye,Cover_sorghum,Cover_soy,Cover_wheat,Pre_Chisel,Pre_Cult,Pre_Disc,Pre_MinTill
0,DEH1_2014,0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,GAH1_2014,1,4.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,IAH1a_2014,2,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,IAH1b_2014,3,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,IAH1c_2014,4,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231,TXH2_2022,231,4.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
232,TXH3_2022,232,4.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
233,WIH1_2022,233,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
234,WIH2_2022,234,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Drop the identifier columns, then store the management values and their column names.
mgmt_only = meta_small.drop(columns = ['Env', 'Env_Idx'])
mgmtMat = np.array(mgmt_only)
np.save(cache_path+'mgmtMatNames.npy', list(mgmt_only))
np.save(cache_path+'mgmtMat.npy', mgmtMat)

### Observation to Genotype

In [None]:
# For each genotype, 'Is_Phno_Idx' is the phno_geno row label of its first occurrence.
first_geno_rows = phno_geno.loc[:, ['Geno_Idx']].drop_duplicates()
uniq_geno = first_geno_rows.reset_index().rename(columns = {'index':'Is_Phno_Idx'})

# Map every observation to its genotype and to that representative row.
obs_geno_pairs = phno_geno.loc[:, ['Phno_Idx', 'Geno_Idx']]
obs_geno_lookup = obs_geno_pairs.merge(uniq_geno)
obs_geno_lookup = obs_geno_lookup.sort_values('Phno_Idx').reset_index(drop = True).copy()
obs_geno_lookup

Unnamed: 0,Phno_Idx,Geno_Idx,Is_Phno_Idx
0,0,0,0
1,1,1,1
2,2,2,2
3,3,3,3
4,4,4,4
...,...,...,...
133052,133052,4871,123234
133053,133053,4872,123235
133054,133054,4873,123236
133055,133055,4875,123238


In [None]:
# save out obs_geno_lookup, both as an array and as a csv
geno_lookup_values = np.asarray(obs_geno_lookup)
np.save(cache_path+'obs_geno_lookup.npy', geno_lookup_values)
obs_geno_lookup.to_csv(cache_path+'obs_geno_lookup.csv', index = False)

### Observation to Environment

In [None]:
# For each environment, 'Is_Phno_Idx' is the phno_geno row label of its first occurrence.
first_env_rows = phno_geno.loc[:, ['Env_Idx']].drop_duplicates()
uniq_env = first_env_rows.reset_index().rename(columns = {'index':'Is_Phno_Idx'})

# Map every observation to its environment and to that representative row.
obs_env_pairs = phno_geno.loc[:, ['Phno_Idx', 'Env_Idx']]
obs_env_lookup = obs_env_pairs.merge(uniq_env)
obs_env_lookup = obs_env_lookup.sort_values('Phno_Idx').reset_index(drop = True).copy()
obs_env_lookup

Unnamed: 0,Phno_Idx,Env_Idx,Is_Phno_Idx
0,0,0,0
1,1,0,0
2,2,0,0
3,3,0,0
4,4,0,0
...,...,...,...
133052,133052,235,132615
133053,133053,235,132615
133054,133054,235,132615
133055,133055,235,132615


In [None]:
# save out obs_env_lookup, both as an array and as a csv
env_lookup_values = np.asarray(obs_env_lookup)
np.save(cache_path+'obs_env_lookup.npy', env_lookup_values)
obs_env_lookup.to_csv(cache_path+'obs_env_lookup.csv', index = False)

## Y

In [None]:
# Response vector: one yield value per observation, in phno_geno row order.
yield_column = phno_geno['Yield_Mg_ha']
YMat = np.array(yield_column)
np.save(cache_path+'YMat.npy', YMat)

## Genomic Data

### One Hot Encode G (Deduplicated)

In [None]:
# If phno and GMat observations are 1:1, then GMat is 135793, 2250. After deduplicating this will become 4926, 2250. That's only 3.6%!
# Doing this removes several hybrids so it's actually a little smaller 4926, 2203

In [None]:
# Peek at the first five rows of the observation -> genotype lookup.
obs_geno_lookup.iloc[:5]

Unnamed: 0,Phno_Idx,Geno_Idx,Is_Phno_Idx
0,0,0,0
1,1,1,1
2,2,2,2
3,3,3,3
4,4,4,4


In [None]:
# Keep one phno_geno row per genotype: the row on which each genotype first
# appears (its 'Is_Phno_Idx'). An earlier `temp` assignment in this cell was
# overwritten before being read and has been removed.
mask = phno_geno.index.isin(list(obs_geno_lookup['Is_Phno_Idx'].drop_duplicates()))
temp = phno_geno.loc[mask, ]
temp = temp.reset_index(drop = True) # drop so that the index in the DataFrame and ndarray match
temp

Unnamed: 0,Env,Year,Hybrid,Yield_Mg_ha,Replicate,Block,Plot,Phno_Idx,Env_Idx,Geno_Idx
0,DEH1_2014,2014,M0088/LH185,5.721725,1.0,1.0,1.0,0,0,0
1,DEH1_2014,2014,M0143/LH185,11.338246,1.0,1.0,2.0,1,0,1
2,DEH1_2014,2014,M0003/LH185,6.540810,1.0,1.0,3.0,2,0,2
3,DEH1_2014,2014,M0035/LH185,10.366857,1.0,1.0,4.0,3,0,3
4,DEH1_2014,2014,M0052/LH185,10.908814,1.0,1.0,5.0,4,0,4
...,...,...,...,...,...,...,...,...,...,...
4921,IAH2_2022,2022,GEMN-0096_LH212HT_0014/LH244,10.929844,-999.0,-999.0,-999.0,124682,217,4921
4922,IAH2_2022,2022,NC370/LH244,5.306059,-999.0,-999.0,-999.0,125036,217,4922
4923,IAH3_2022,2022,GEMN-0097_LH212HT_0046/LH244,13.179435,-999.0,-999.0,-999.0,125273,218,4923
4924,IAH4_2022,2022,GEMN-0225_PHK76_0025/LH244,11.453161,-999.0,-999.0,-999.0,126009,219,4924


In [None]:
# Split 'P0/P1' hybrid names into their two parents.
temp = pd.concat([temp, temp.Hybrid.str.split('/', expand=True)], axis=1
        ).rename(columns = {0:'P0', 1:'P1'})

# Sort the parent names so the column order of GMat is reproducible. Plain
# set iteration order for strings changes between kernel sessions, which would
# let a cached GMat.npy and a freshly written GMatNames.npy disagree.
# Missing parents (hybrid names without '/') are dropped here; such rows are
# still caught by the parent-count assertion on GMat.
# NOTE: a GMat.npy cached before this change was built with an unsorted order
# and should be deleted so it is rebuilt.
uniq_parents = sorted(set(pd.concat([temp['P0'], temp['P1']]).dropna()))

In [None]:
# Parent-count matrix: one row per unique genotype (rows of `temp`), one column
# per parent; each entry counts how often that parent occurs in the hybrid.
gmat_path = cache_path+'GMat.npy'
gmat_shape = (temp.shape[0], len(uniq_parents))

if os.path.exists(gmat_path):
    GMat = np.load(gmat_path)
    # A cache built from different inputs would otherwise be used silently.
    if GMat.shape != gmat_shape:
        raise ValueError(
            'Cached GMat has shape '+str(GMat.shape)+' but the current data implies '
            +str(gmat_shape)+'; delete '+gmat_path+' to rebuild it.')
else:
    GMat = np.zeros(gmat_shape)

    for j in tqdm.tqdm(range(len(uniq_parents))):
        for parent in ['P0', 'P1']:
            # rows whose P0 (or P1) is the j-th parent
            mask = (temp[parent] == uniq_parents[j])
            GMat[temp.index[mask], j] += 1

    np.save(gmat_path, GMat)

In [None]:
# Store the parent names that label GMat's columns.
# NOTE(review): this is rewritten on every run, even when GMat itself came from
# the cache -- confirm both were produced with the same parent order.
np.save(file = cache_path+'GMatNames.npy', arr = uniq_parents)

In [None]:
# confirm there are exactly two parents encoded for each observation
# (checking only the minimum would let rows with more than two slip through,
# e.g. when GMat was loaded from a stale cache)
parents_per_row = np.sum(GMat, axis = 1)
assert np.all(parents_per_row == 2), 'every GMat row must sum to exactly 2 parents'

### Nucleotides

In [None]:
# Assign every nucleotide code the integer channel used for one hot encoding.
# https://www.bioinformatics.org/sms/iupac.html
# 'unk' comes first, so it is given channel 0.
IUPAC = {
    code: channel
    for channel, code in enumerate(
        ['unk', 'A', 'C', 'G', 'T', 'K', 'M', 'N', 'R', 'S', 'W', 'Y'])
}
IUPAC

{'unk': 0,
 'A': 1,
 'C': 2,
 'G': 3,
 'T': 4,
 'K': 5,
 'M': 6,
 'N': 7,
 'R': 8,
 'S': 9,
 'W': 10,
 'Y': 11}

In [None]:
# Store the code -> channel mapping alongside the arrays so the one hot channels
# can be decoded later. put_cached_result is not defined in this notebook;
# presumably a project helper that pickles the object to the given path -- TODO confirm.
put_cached_result(cache_path+'ACGT_OneHot_dict.pkl', IUPAC)

In [None]:
# Build the one hot encoded nucleotide array (one entry per unique genotype)
# unless it is already cached. The cached file is deliberately not loaded here.
if os.path.exists(cache_path+'ACGT_OneHot.npy'):
#     ACGT = np.load(cache_path+'ACGT_OneHot.npy')
    pass
else:
    temp = obs_geno_lookup.drop(columns = 'Phno_Idx').drop_duplicates().reset_index(drop=True)

    # use the first Hybrid to figure out what the length of the sequence is.
    i = 0
    phno_idx = temp.loc[i, 'Is_Phno_Idx']
    ith_hybrid = phno.loc[phno_idx, 'Hybrid']
    geno_seq_len = len(get_geno(taxa_to_filename(taxa = ith_hybrid))[1:])

    # setup ndarray to hold data.
    # np.zeros, not np.ndarray: the latter hands back uninitialized memory, so
    # every position that is never set to 1 below would hold arbitrary values.
    # NOTE(review): len(IUPAC)+1 leaves a last channel that no key writes to;
    # kept so the saved array's shape is unchanged -- confirm it is intended.
    ACGT = np.zeros(shape = (temp.shape[0],        # obs
                             geno_seq_len,         # values
                             len(IUPAC.keys())+1)) # channels

    # This needs only to be run once.
    # go over observations, channels, fill in values

    for i in tqdm.tqdm(temp.index):
        phno_idx = temp.loc[i, 'Is_Phno_Idx']
        ith_hybrid = phno.loc[phno_idx, 'Hybrid']
        res = get_geno(taxa_to_filename(taxa = ith_hybrid))
        res = res[1:] # drop taxa

        # remove end of line; an array allows one vectorized comparison per code
        res = np.array([e.strip('\n') for e in res])

        for key in IUPAC.keys():
            IUPAC_mask = (res == key)
            ACGT[i,
                 IUPAC_mask,
                 IUPAC[key]] = 1

    # Swap axes to match pytorch convention
    ACGT = np.swapaxes(ACGT, 1, 2)

    np.save(cache_path+'ACGT_OneHot.npy', ACGT)

### Nucleotide percents

In [None]:
# Build the nucleotide percent array only when it is not cached yet. When the
# cache file exists nothing is computed or loaded.
if not os.path.exists(cache_path+'ACGT.npy'):
    # one row per unique genotype, with the phno row it was first seen on
    temp = (obs_geno_lookup
            .drop(columns = 'Phno_Idx')
            .drop_duplicates()
            .reset_index(drop=True))

    # the first hybrid's record (minus the leading taxa entry) sets the sequence length
    i = 0
    phno_idx = temp.loc[i, 'Is_Phno_Idx']
    ith_hybrid = phno.loc[phno_idx, 'Hybrid']
    first_record = get_geno(taxa_to_filename(taxa = ith_hybrid))
    geno_seq_len = len(first_record[1:])

    # container: (genotype, position, 4); every slot is overwritten in the loop below
    ACGT = np.empty((temp.shape[0], geno_seq_len, 4))

    for i in tqdm.tqdm(temp.index):
        phno_idx = temp.loc[i, 'Is_Phno_Idx']
        ith_hybrid = phno.loc[phno_idx, 'Hybrid']
        res = get_geno(taxa_to_filename(taxa = ith_hybrid))[1:] # drop taxa
        res = list_to_ACGT(in_seq = res)
        ACGT[i, :, :] = res

    # channels move to axis 1 to match the pytorch convention
    # (shape before the swap: genotype, position, 4)
    ACGT = np.swapaxes(ACGT, 1, 2)

    # missing values become 0
    nan_mask = np.isnan(ACGT)
    ACGT[nan_mask] = 0

    np.save(cache_path+'ACGT.npy', ACGT)

### Hilbert Nucleotide percents

In [None]:
# Build the Hilbert-curve version of the nucleotide percent array unless cached.
if os.path.exists(cache_path+'ACGT_hilb.npy'):
#     ACGT_hilb = np.load(cache_path+'ACGT_hilb.npy')
    pass
else:
    # Read the nucleotide percent array from its cache file instead of trusting
    # whatever `ACGT` is in the kernel: the percent cell skips its computation
    # when the cache exists (leaving `ACGT` undefined or still holding the one
    # hot array). The percent cell guarantees this file exists once it has run.
    # Note this holds the full array in memory.
    ACGT = np.load(cache_path+'ACGT.npy')

    ACGT_hilb = np_3d_to_hilbert( np.swapaxes(ACGT, 1, 2) ) # swap channels back to dim 2 before running

    # ACGT_hilb.shape
    # (4926, 256, 512, 4)
    # Pytorch standard has channels second
    ACGT_hilb = np.swapaxes(ACGT_hilb, 1, 3)
    ACGT_hilb = np.swapaxes(ACGT_hilb, 2, 3)

    # set missings to 0
    ACGT_hilb[np.isnan(ACGT_hilb)] = 0

    np.save(cache_path+'ACGT_hilb.npy', ACGT_hilb)

## Environmental Data

### Make Environment to Observation Lookup 

In [None]:
# Collapse the observation level lookup to one row per environment, ordered by Env_Idx.
env_first_rows = obs_env_lookup.drop(columns = 'Phno_Idx').drop_duplicates()
obs_env_lookup_small = env_first_rows.sort_values('Env_Idx').reset_index(drop = True)

obs_env_lookup_small.head()

Unnamed: 0,Env_Idx,Is_Phno_Idx
0,0,0
1,1,461
2,2,936
3,3,1785
4,4,2222


### Make S Matrix

In [None]:
# Soil covariates repeated for every observation, in phno row order.
soil_by_env = soil.drop(columns = ['Unnamed: 0', 'Year'])
SMat = phno.loc[:, ['Env']].merge(soil_by_env).drop(columns = ['Env'])

# The default inner merge drops observations whose Env has no soil record and
# repeats them when an Env has several. Rows of SMat are later picked by phno
# position (Is_Phno_Idx), so the row count must match phno exactly.
assert SMat.shape[0] == phno.shape[0], (
    'soil does not hold exactly one record per Env; SMat rows no longer align with phno')

SMatNames = list(SMat)
SMat = np.array(SMat)

In [None]:
# (observations, soil covariates)
np.shape(SMat)

(133057, 23)

In [None]:
# Zero-filled holder with one row per unique environment and one column per soil covariate (Cin).
n_unique_env = obs_env_lookup_small.shape[0]
n_soil_cols = len(SMatNames)
SMatSmall = np.zeros((n_unique_env, n_soil_cols))

In [None]:
# Copy one representative soil row per environment into SMatSmall.
# The positional index of obs_env_lookup_small is used as the Env_Idx here.
for i in obs_env_lookup_small.index:
    ii = obs_env_lookup_small.loc[(obs_env_lookup_small.Env_Idx == i), 'Is_Phno_Idx']
    # Series.item() returns the single element and raises ValueError unless
    # exactly one row matched; int(Series) is deprecated in pandas.
    ii = int(ii.item())
    SMatSmall[i, ] = SMat[ii, ]

In [None]:
# Replace the observation level matrix with the per-environment one (far fewer rows).
SMat = np.copy(SMatSmall)

### Make W Matrix

In [None]:
# Target layout for the weather array is (N, Cin, Lin) or (Cin, Lin).
# Day of year is cast to int first.
wthr['DOY'] = wthr['DOY'].astype(int)

In [None]:
# Weather variables are every column except the identifier / date columns.
weather_id_cols = ['Unnamed: 0', 'Env', 'Year', 'Date', 'DOY']
WMatNames = wthr.drop(columns = weather_id_cols).columns.tolist()

# Zero-filled holder laid out the way pytorch expects: (N, Cin, Lin)
n_unique_env = obs_env_lookup_small.shape[0]   # N
n_weather_vars = len(WMatNames)                # Cin
n_days = np.max(wthr.DOY)                      # Lin
WMat = np.zeros((n_unique_env, n_weather_vars, n_days))

In [None]:
# Fill WMat with one (variables x days) weather block per environment.
# The positional index of obs_env_lookup_small is used as the Env_Idx here.
for i in tqdm.tqdm(obs_env_lookup_small.index):
    ii = obs_env_lookup_small.loc[(obs_env_lookup_small.Env_Idx == i), 'Is_Phno_Idx']
    # Series.item() returns the single element and raises ValueError unless
    # exactly one row matched; int(Series) is deprecated in pandas.
    ii = int(ii.item())

    # name of the environment on that representative phno_geno row
    ith_Env = phno_geno.loc[(phno_geno.index == ii), 'Env']
    ith_Env = list(ith_Env)[0]

    # selected data is ordered by day and transposed to match (Cin, Lin)
    wthr_mask = (wthr.Env == ith_Env)
    WMat[i, :, :] = wthr.loc[wthr_mask,
                       ].sort_values('DOY'
                       ).drop(columns = ['Unnamed: 0', 'Env',
                                         'Year', 'Date', 'DOY']).T

100%|████████████████████████████████████████| 236/236 [00:00<00:00, 258.22it/s]


## Prep CGMV?

In [None]:
# Join the cgmv table onto the environments present in phno_geno, order by
# Env_Idx, then strip the identifier columns.
# NOTE(review): this is an inner merge, so an Env missing from cgmv would drop
# a row and shift MMat relative to Env_Idx -- confirm coverage.
env_keys = phno_geno.loc[:, ['Env', 'Env_Idx']].drop_duplicates()
cgmv_Env = env_keys.merge(cgmv).sort_values('Env_Idx')
cgmv_Env = cgmv_Env.reset_index()
cgmv_Env = cgmv_Env.drop(columns = ['index', 'Env', 'Env_Idx', 'Unnamed: 0', 'Year'])

MMatNames = cgmv_Env.columns.tolist()
MMat = np.asarray(cgmv_Env)

# Save data
This will streamline model generation. I'll just need to load these files in and can directly begin modeling.

In [None]:
# Column names for the soil (S), weather (W) and cgmv (M) matrices.
for names_file, names in [('SMatNames.npy', SMatNames),
                          ('WMatNames.npy', WMatNames),
                          ('MMatNames.npy', MMatNames)]:
    np.save(cache_path+names_file, names)

In [None]:
# The soil (S), weather (W) and cgmv (M) matrices themselves.
for matrix_file, matrix in [('SMat.npy', SMat),
                            ('WMat.npy', WMat),
                            ('MMat.npy', MMat)]:
    np.save(cache_path+matrix_file, matrix)