In [None]:
import tqdm

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn import preprocessing # LabelEncoder
from sklearn.metrics import mean_squared_error # if squared=False; RMSE

from EnvDL.core import *

In [None]:
# Directory that receives every artifact written by this notebook (see the
# np.save / to_csv calls at the end).
cache_path = '../nbs_artifacts/01.03_g2fc_prep_matrices/'
# Helper comes from the star import of EnvDL.core; presumably it creates the
# directory when it is missing -- TODO confirm against EnvDL.core.
ensure_dir_path_exists(dir_path = cache_path)

In [None]:
# Read the tables written to the 01.02 imputation artifact directory.
load_from = '../nbs_artifacts/01.02_g2fc_imputation/'

meta, phno, soil, wthr, cgmv = [
    pd.read_csv(load_from+csv_name)
    for csv_name in ['meta0.csv', 'phno0.csv', 'soil0.csv', 'wthr0.csv', 'cgmv0.csv']
]
# Intentionally disabled:
#   * casting meta['Date_Planted'] / meta['Date_Harvested'] to int
#   * loading the wide weather table ('wthrWide0.csv')

In [None]:
# Keep only observations with a recorded yield. Missing yields used to be
# tolerated for 2022; now that the 2022 data is available they are excluded.
mask = phno['Yield_Mg_ha'].notna()
phno = (
    phno
    .loc[mask, ['Env', 'Year', 'Hybrid', 'Yield_Mg_ha']]
    .reset_index(drop = True)  # fresh 0..n-1 index after dropping rows
)

# Data Prep

## Prep CVs

In [None]:
# YMat[phno.Year == 2021]

## Prep y

In [None]:
# Response vector: one yield value per retained observation (copied out of phno).
YMat = phno['Yield_Mg_ha'].to_numpy(copy = True)

## One Hot Encode G

In [None]:
# Split each hybrid name on '/' and store the two sides as columns P0 and P1.
temp = phno.loc[:, ['Env', 'Year', 'Hybrid', 'Yield_Mg_ha']]
temp = pd.concat([temp, temp.Hybrid.str.split('/', expand=True)], axis=1
        ).rename(columns = {0:'P0', 1:'P1'})

# Unique parent names across both sides of the cross.
#  * sorted: iterating a set of strings follows hash order, which changes between
#    interpreter sessions, so an unsorted list would reorder the parent columns
#    on every kernel restart.
#  * dropna: a hybrid without a '/' yields a missing P1, which is not a parent
#    and would otherwise turn the saved name list into an object array.
uniq_parents = sorted(set(pd.concat([temp['P0'], temp['P1']]).dropna()))

In [None]:
# abandoned grouping so that 'PHN11_PHW65_0107' and 'PHN11_PHW65_0260' are the same

# search_str = 'PHN11_PHW65'
# search_list = ['PH207_PHG47-17',
#  '4N506',
#  'BGEM-0157-N',
#  'PHN11_PHW65_0107',
#  'W10004_0013',
#  'PHW65_MOG_0106',
#  'PHN11_PHW65_0260',
#  'PHN11_PHW65_0276',
#  'MOG',
#  'PHN11_PHW65-0514',
#  'MO44_PHW65_0475']


# res_list = [e for e in search_list if re.match(search_str+'[-|_]\d+', e)]

# print(search_str, ':', res_list, ',')

# # uniq_parents = [e for e in uniq_parents if e not in res_list]

In [None]:
# Parent-count encoding of the genotype: one row per observation, one column per
# unique parent. A cell counts how many of the observation's two parent columns
# (P0, P1) equal that parent.
GMat = np.zeros((temp.shape[0], len(uniq_parents)))

for col, parent_name in enumerate(tqdm.tqdm(uniq_parents)):
    for side in ('P0', 'P1'):
        is_parent = (temp[side] == parent_name).to_numpy()
        # index labels are used as row positions into GMat
        GMat[temp.index[is_parent], col] += 1

In [None]:
# Sanity check: the smallest per-observation parent count must be 2, i.e. no
# observation is missing an encoded parent.
parents_per_obs = GMat.sum(axis = 1)
assert parents_per_obs.min() == 2

## Make S Matrix

In [None]:
# Attach each observation's soil covariates by environment.
# SMat must stay row-aligned with phno (and therefore with YMat and GMat), so the
# join is made explicit and checked instead of relying on the default inner merge,
# which silently drops observations whose Env has no soil row and duplicates
# observations when soil lists an Env more than once.
soil_by_env = soil.drop(columns = ['Unnamed: 0', 'Year'])

envs_without_soil = set(phno['Env']) - set(soil_by_env['Env'])
assert not envs_without_soil, f'No soil data for Env(s): {sorted(envs_without_soil)}'

SMat = phno.loc[:, ['Env']].merge(
    soil_by_env,
    on = 'Env',
    how = 'left',              # keep every observation, in phno order
    validate = 'many_to_one'   # raises if soil has more than one row per Env
).drop(columns = ['Env'])
assert SMat.shape[0] == phno.shape[0]

SMatNames = list(SMat)
SMat = np.array(SMat)

## Prep W

In [None]:
# Input: (N, Cin, Lin) or (Cin, Lin)

In [None]:
# Weather variables are every column of wthr that is not an identifier/date column.
WMatNames = wthr.drop(columns = ['Unnamed: 0', 'Env', 'Year', 'Date', 'DOY']).columns.tolist()

# Zero-filled weather tensor using the (N, Cin, Lin) axis order noted for PyTorch:
#   N   = number of observations
#   Cin = number of weather variables
#   Lin = largest day-of-year value present in wthr
WMat = np.zeros((phno.shape[0], len(WMatNames), wthr.DOY.max()))

In [None]:
# Fill WMat one environment at a time. Every observation is visited, but an
# environment's weather is written (to all of its observations at once) only the
# first time that environment is encountered.
added_envs = []
for env in tqdm.tqdm(phno['Env']):
    if env in added_envs:
        continue

    # rows of WMat belonging to this environment
    obs_idxs = phno.index[(phno['Env'] == env).to_numpy()]

    # this environment's weather, ordered by day of year, identifier columns removed
    env_wthr = wthr.loc[wthr['Env'] == env, :].sort_values('DOY')
    env_wthr = env_wthr.drop(columns = ['Unnamed: 0', 'Env', 'Year', 'Date', 'DOY'])

    # transpose so the weather variables run along axis 1 of WMat and the
    # DOY-sorted rows along axis 2
    WMat[obs_idxs, :, :] = env_wthr.T

    added_envs.append(env)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 135793/135793 [00:03<00:00, 37137.65it/s]


## Prep CGMV?

In [None]:
# Names of the cgmv columns that remain once the identifier columns are removed.
MMatNames = cgmv.drop(columns = ['Unnamed: 0', 'Env', 'Year']).columns.tolist()

In [None]:
# Zero-filled matrix: one row per observation, one column per name in MMatNames.
MMat = np.zeros((phno.shape[0], len(MMatNames)))

In [None]:
# Fill MMat one environment at a time. Every observation is visited, but an
# environment's cgmv values are written (to all of its observations at once) only
# the first time that environment is encountered.
added_envs = []
for env in tqdm.tqdm(phno['Env']):
    if env in added_envs:
        continue

    # rows of MMat belonging to this environment
    obs_idxs = phno.index[(phno['Env'] == env).to_numpy()]

    # this environment's cgmv values with the identifier columns removed
    env_cgmv = cgmv.loc[cgmv['Env'] == env, :].drop(columns = ['Unnamed: 0', 'Env', 'Year'])
    MMat[obs_idxs, :] = env_cgmv

    added_envs.append(env)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 135793/135793 [00:02<00:00, 65836.09it/s]


# Save data
This will streamline model generation. I'll just need to load these files in and can directly begin modeling.

In [None]:
# Save the name list that accompanies each matrix.
for names_file, names in [
    ('GMatNames.npy', uniq_parents),
    ('SMatNames.npy', SMatNames),
    ('WMatNames.npy', WMatNames),
    ('MMatNames.npy', MMatNames),
]:
    np.save(cache_path+names_file, names)

In [None]:
# Save the filtered phenotype table without writing its index as a column.
phno.to_csv(path_or_buf = cache_path+'phno3.csv', index = False)

In [None]:
# Save each prepared array to the cache directory.
for matrix_file, matrix in [
    ('YMat3.npy', YMat),
    ('GMat3.npy', GMat),
    ('SMat3.npy', SMat),
    ('WMat3.npy', WMat),
    ('MMat3.npy', MMat),
]:
    np.save(cache_path+matrix_file, matrix)