In [None]:
import os
import numpy as np
import pandas as pd
import torch 

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from tqdm import tqdm

import pyarrow as pa
import pyarrow.parquet as pq

In [None]:
import datetime

# Compute unix epoch to date table

def _prep_unix_epoch_to_date(max_year = 2025):
    month_abbr = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    max_day = (datetime.datetime(max_year, 1, 1, 0, 0) - datetime.datetime(1970, 1, 1, 0, 0)).days 
    unix_epoch = [i for i in range(max_day)]
    date_times = [datetime.datetime(1970, 1, 1, 0, 0) + datetime.timedelta(i) for i in range(max_day)]

    def as_SowDate(date_time):
        day = f'{date_time.day}'
        if len(day) == 1:
            return(f'0{day}-{month_abbr[date_time.month - 1]}')
        else:
            return(f'{day}-{month_abbr[date_time.month - 1]}')

    tmp = pd.DataFrame({
        'Unix': unix_epoch,
        'Year':[e.year for e in date_times],
        'Month':[e.month for e in date_times],
        'Day':[e.day for e in date_times],
        'SowDate':[as_SowDate(date_time = e) for e in date_times]
    })
    _ = tmp.loc[:, ['Unix', 'Year']].groupby(['Year']).min().reset_index().rename(columns={'Unix':'MinUnix'})
    tmp = tmp.merge(_)

    tmp['DOY'] = tmp['Unix'] - tmp['MinUnix']
    tmp = tmp.drop(columns = ['MinUnix'])
    return tmp

In [None]:
# how bad would it be if we stored a bunch of tiny parquet files? 
# saving many tiny parquet files will increase storage cost by 1.6x
# That's not great but not horrible either. We're talking about approximately 500 gb.
# (305213583096*1.6)/1000000000
# 488.34

# apsimx_sim_parquet_dir = '/home/Shared/cultivar_sim_exps'
# Result = pq.read_table(apsimx_sim_parquet_dir+'/'+'sim_1698440407_4739.parquet').to_pandas()

# -rw-rw-r-- 1 kickd newgroup 2939581425 Jun  1 01:08 cultivar_sim_exps/sim_1698440407_4739.parquet


# for i in tqdm(Result.FactorialUID.unique()):
#     table = Result.loc[(Result.FactorialUID == i), ].drop(columns = 'FactorialUID')
#     table = pa.Table.from_pandas(table)
#     pq.write_table(table, f'/home/Shared/testing_rm_after0620/{i}.parquet')

# 100%|██████████| 24025/24025 [33:58<00:00, 11.79it/s]

In [None]:
# The goal is to have a (postgres?) SQL db that we can query. To not have a delay we're going to instead load a batch into memory BUT allow for redefining this batch by swapping out the dataloader.

# Workflow:
# Define the desired data 
# Use the main tables to figure out what parquet files we need to pull from.
# Pull all the data in and represent as tensors

In [None]:
# Directory holding the per-simulation result files and the metadata tables.
apsimx_sim_parquet_dir = '/home/Shared/cultivar_sim_exps'

# peek at the first few entries
os.listdir(apsimx_sim_parquet_dir)[:3]

['sim_1698440407_4739.parquet',
 'sim_1697418008_10643.parquet',
 'sim_1697187607_79518.parquet']

In [None]:
# everything in the directory that is not a per-simulation result file
[name for name in os.listdir(apsimx_sim_parquet_dir) if not name.startswith('sim_')]

['DefaultCultivarsAll.parquet', 'Genotypes.parquet', 'Ids.parquet']

In [None]:
# metadata tables, read from the same directory as the simulation results
DefaultCultivarsAll, Genotypes, Ids = (
    pq.read_table(f'{apsimx_sim_parquet_dir}/{stem}.parquet').to_pandas()
    for stem in ('DefaultCultivarsAll', 'Genotypes', 'Ids')
)

In [None]:
# I'm setting up a class to help find the files we need to read through to build the dataset.
# This works by holding a copy of the Ids and Genotypes tables.
# We'll operate on those, filtering them down until the tables only contain the entries we want to use.
# Next we'll use a method that produces tuples of (parquet file, filtering criteria).

class data_helper():
    """Find and load the simulation results that match a filtered set of Genotypes / Ids.

    The helper keeps its own copies of the Genotypes and Ids tables. `apply_mask`
    narrows one table and propagates the selection to the other via their shared
    columns; `get_results` then reads only the result files still referenced by
    Ids and stores the combined rows in `self.results`.
    """
    def __init__(self, genotypes_path, ids_path, results_path):
        """Load both metadata tables.

        genotypes_path -- parquet file with the Genotypes table
        ids_path       -- parquet file with the Ids table
        results_path   -- directory that contains the '<File>.parquet' result files
        """
        # used later to get the results. Append / if there isn't one.
        if results_path[-1] != '/':
            results_path = results_path+'/'
        self.results_path = results_path

        Genotypes = pq.read_table(genotypes_path).to_pandas()
        # coerce None to NaN (default cultivars don't have all values specified)
        value_cols = [col for col in list(Genotypes) if col not in ['File', 'Genotype']]
        for col in value_cols:
            Genotypes[col] = Genotypes[col].astype(float)
        # keep only the genotypes that have every value specified
        mask = (Genotypes.isna().sum(axis = 1) == 0)

        self.Genotypes = Genotypes.loc[mask, ].reset_index(drop = True)
        self.Ids = pq.read_table(ids_path).to_pandas()

    def apply_mask(self, table, mask):
        """Filter `table` with `mask`, then update the other table to match.

        table -- 'Genotypes' or 'Ids': the table the boolean mask applies to
        mask  -- boolean row selector for that table

        The non-masked table is rebuilt by a left join from the unique
        (File, Genotype) pairs of the filtered table. Any other table name
        prints a message and leaves both tables untouched.
        """
        if table not in ['Genotypes', 'Ids']:
            print('table should be Genotypes or Ids')
            return

        if table == 'Genotypes':
            # apply mask to filter the table
            self.Genotypes = self.Genotypes.loc[mask, ].reset_index(drop = True)
            # left join to update the other table
            self.Ids = self.Genotypes.loc[:, ['File', 'Genotype']
                                    ].drop_duplicates(
                                    ).merge(self.Ids, how = 'left'
                                    ).reset_index(drop = True)
        else:
            self.Ids = self.Ids.loc[mask, ].reset_index(drop = True)
            self.Genotypes = self.Ids.loc[:, ['File', 'Genotype']
                                ].drop_duplicates(
                                ).merge(self.Genotypes, how = 'left'
                                ).reset_index(drop = True)

    def get_results(self, years = None, dry_run = True):
        """Read the result files referenced by the current Ids table.

        years   -- optional collection of calendar years to keep. None or an
                   empty collection keeps every year. The year table is built
                   with max_year = 2030, so years from 2030 onwards match nothing.
        dry_run -- when true, only report how many files would be read.

        On a real run the filtered rows of all files are concatenated and
        stored in `self.results`; nothing is returned.
        """
        # Now we can ask for the files that we should get
        tmp = self.Ids.loc[:, ['File', 'Genotype', 'FactorialUID']]

        get_files = tmp.File.drop_duplicates().to_list()
        print(f'{len(get_files)} files to be read.')

        if dry_run:
            print('In dry_run, reading no files')
            return

        # The set of wanted dates is the same for every file, so build it once.
        keep_dates = None
        if years is not None and len(years) > 0:
            yr = _prep_unix_epoch_to_date(max_year = 2030)
            keep_dates = yr.loc[(yr.Year.isin(years)), ['Unix']].rename(columns={'Unix':'Date'})

        res_list = []
        for file in tqdm(get_files):
            # read relative to this instance, not to whatever global happens to hold a helper
            res = pq.read_table(f'{self.results_path}{file}.parquet').to_pandas()

            # filter order established with some informal testing.
            # runtime filter years, factorials: [10, 11.3, 10.5]
            # runtime filter factorials, years: [9.0, 8.4, 9.6]

            # filter factorials
            res = tmp.loc[(tmp.File == file), ['FactorialUID']].merge(res)

            # filter years
            if keep_dates is not None:
                res = keep_dates.merge(res)

            res_list.append(res)

        # pd.concat aligns columns by name, so per-file column order does not matter;
        # it raises on an empty list, hence the explicit empty result.
        if res_list:
            self.results = pd.concat(res_list).reset_index(drop=True)
        else:
            self.results = pd.DataFrame()


# Build the helper from the metadata tables; the result files sit in the same directory.
x = data_helper(
    results_path = apsimx_sim_parquet_dir,
    genotypes_path = f'{apsimx_sim_parquet_dir}/Genotypes.parquet',
    ids_path = f'{apsimx_sim_parquet_dir}/Ids.parquet')

# restrict to simulated cultivars with the maximum MaximumGrainsPerCob
grains_per_cob = x.Genotypes['Grain.MaximumGrainsPerCob.FixedValue']
mask = grains_per_cob.eq(grains_per_cob.max())

print(f'Before Mask: Genotypes: {x.Genotypes.shape} Ids: {x.Ids.shape}')
x.apply_mask(table='Genotypes', mask=mask)
print(f'After Mask: Genotypes: {x.Genotypes.shape} Ids: {x.Ids.shape}')

Before Mask: Genotypes: (3150, 17) Ids: (2837275, 7)
After Mask: Genotypes: (5, 17) Ids: (4247, 7)


In [None]:
# Now let's select a region of the country. For this demonstration I'll filter to a
# longitude/latitude box around Columbia MO (bounds are exclusive).
longitude, latitude = x.Ids.Longitude, x.Ids.Latitude
mask = (longitude.gt(-95) & longitude.lt(-90) &
        latitude.gt(30) & latitude.lt(40))

print(f'Before Mask: Genotypes: {x.Genotypes.shape} Ids: {x.Ids.shape}')
x.apply_mask(table='Ids', mask=mask)
print(f'After Mask: Genotypes: {x.Genotypes.shape} Ids: {x.Ids.shape}')

Before Mask: Genotypes: (5, 17) Ids: (4247, 7)
After Mask: Genotypes: (4, 17) Ids: (248, 7)


In [None]:
# Genotypes still selected after the genotype mask and the region mask
x.Genotypes

Unnamed: 0,File,Genotype,Grain.MaximumGrainsPerCob.FixedValue,Grain.MaximumPotentialGrainSize.FixedValue,Phenology.FlagLeafToFlowering.Target.FixedValue,Phenology.FloweringToGrainFilling.Target.FixedValue,Phenology.GrainFilling.Target.FixedValue,Phenology.Juvenile.Target.FixedValue,Phenology.Maturing.Target.FixedValue,Phenology.MaturityToHarvestRipe.Target.FixedValue,Phenology.Photosensitive.Target.XYPairs.X__1,Phenology.Photosensitive.Target.XYPairs.X__2,Phenology.Photosensitive.Target.XYPairs.X__3,Phenology.Photosensitive.Target.XYPairs.Y__1,Phenology.Photosensitive.Target.XYPairs.Y__2,Phenology.Photosensitive.Target.XYPairs.Y__3,Rachis.DMDemands.Structural.DMDemandFunction.MaximumOrganWt.FixedValue
0,sim_1697288407_92446,Cultivar24,850.0,0.291458,97.0,161.0,702.0,282.0,36.0,30.0,0.0,12.5,24.0,0.0,0.0,360.938719,19.0
1,sim_1697115607_54721,Cultivar7,850.0,0.220163,65.0,135.0,524.0,124.0,49.0,50.0,0.0,12.5,24.0,0.0,0.0,171.941076,27.0
2,sim_1697072407_46129,Cultivar4,850.0,0.270984,95.0,147.0,620.0,232.0,41.0,42.0,0.0,12.5,24.0,0.0,0.0,444.77505,28.0
3,sim_1696626007_58404,Cultivar13,850.0,0.296901,17.0,157.0,620.0,144.0,13.0,20.0,0.0,12.5,24.0,0.0,0.0,135.615289,16.0


In [None]:
# Dry run: only reports how many files would be read.
x.get_results(years = [1990, 2000, 2010, 2020], dry_run = True) # years is optional. If an empty list is passed all years will be returned.

5 files to be read.
In dry_run, reading no files


In [None]:
# Same request with dry_run disabled: reads the files and stores the rows in x.results
x.get_results(years = [1990, 2000, 2010, 2020], dry_run = False)

5 files to be read.


100%|██████████| 5/5 [00:36<00:00,  7.29s/it]


In [None]:
# now results can be accessed: get_results(dry_run = False) stored them on the helper as x.results
x.results.shape

(3193426, 5)

## Define working set

In [None]:
# Starting with a location and year: map every unique simulated site.

import plotly.express as px

sites = Ids.loc[:, ['Longitude', 'Latitude']].drop_duplicates()
fig = px.scatter_mapbox(sites, lon = 'Longitude', lat = 'Latitude',
                        color_discrete_sequence=["fuchsia"], zoom=3, height=300)
fig.update_layout(mapbox_style="open-street-map",
                  margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
# -> File	Genotype  --Genotypes-> 
#	 FactorialUID     --Results->

# site to look up: longitude, latitude and soil index
lon, lat, soil = -76.65611874999999, 42.733264, 141

# should allow ranges, slices, or all
sow = '19-Jun'
# allow all
cultivar = 'Cultivar1'

# Every criterion must match exactly; the matching rows give the File and FactorialUID.
criteria = {
    'Longitude': lon,
    'Latitude': lat,
    'SoilIdx': soil,
    'SowDate': sow,
    'Genotype': cultivar,
}
mask = pd.Series(True, index = Ids.index)
for column, wanted in criteria.items():
    mask &= (Ids[column] == wanted)

Ids.loc[mask]

Unnamed: 0,File,Longitude,Latitude,SoilIdx,SowDate,Genotype,FactorialUID
0,sim_1697173207_62279,-76.656119,42.733264,141,19-Jun,Cultivar1,1
652550,sim_1698267607_61887,-76.656119,42.733264,141,19-Jun,Cultivar1,1
719200,sim_1698224407_24608,-76.656119,42.733264,141,19-Jun,Cultivar1,1551
825375,sim_1696770008_03244,-76.656119,42.733264,141,19-Jun,Cultivar1,13951
874201,sim_1697461207_67118,-76.656119,42.733264,141,19-Jun,Cultivar1,11627
966425,sim_1697475607_87027,-76.656119,42.733264,141,19-Jun,Cultivar1,5426
1170250,sim_1696842008_03237,-76.656119,42.733264,141,19-Jun,Cultivar1,8526
1430657,sim_1696417207_4919,-76.656119,42.733264,141,19-Jun,Cultivar1,13183
1786375,sim_1696265622_69985,-76.656119,42.733264,141,19-Jun,Cultivar1,10076
2339725,sim_1695672661_86607,-76.656119,42.733264,141,19-Jun,Cultivar1,275901


<__main__.temp>

In [None]:
# Starting with a target set of genotypes: those with the largest MaximumGrainsPerCob
grains_col = 'Grain.MaximumGrainsPerCob.FixedValue'
mask = Genotypes[grains_col].eq(Genotypes[grains_col].max())

Genotypes.loc[mask, ['File', 'Genotype']]

Unnamed: 0,File,Genotype
953,sim_1698224407_24608,Cultivar1
1001,sim_1697288407_92446,Cultivar24
1784,sim_1697115607_54721,Cultivar7
2156,sim_1697072407_46129,Cultivar4
2840,sim_1696626007_58404,Cultivar13


## Convert Small Datasets to Tensors
### `Genotypes` (Cultivar variables)

Warning! There are some NAs in `Genotypes` for entries that are not "Cultivar\d+" genotypes. These are calibrated genotypes whose default values are not clear.

In [None]:
# The text columns stay in a DataFrame (they cannot go into a numeric matrix).
id_cols = ['File', 'Genotype']
Genotypes_lookup = Genotypes[id_cols].copy()

# The remaining numeric columns are:
# ['Grain.MaximumGrainsPerCob.FixedValue',
#  'Grain.MaximumPotentialGrainSize.FixedValue',
#  'Phenology.FlagLeafToFlowering.Target.FixedValue',
#  'Phenology.FloweringToGrainFilling.Target.FixedValue',
#  'Phenology.GrainFilling.Target.FixedValue',
#  'Phenology.Juvenile.Target.FixedValue',
#  'Phenology.Maturing.Target.FixedValue',
#  'Phenology.MaturityToHarvestRipe.Target.FixedValue',
#  'Phenology.Photosensitive.Target.XYPairs.X__1',
#  'Phenology.Photosensitive.Target.XYPairs.X__2',
#  'Phenology.Photosensitive.Target.XYPairs.X__3',
#  'Phenology.Photosensitive.Target.XYPairs.Y__1',
#  'Phenology.Photosensitive.Target.XYPairs.Y__2',
#  'Phenology.Photosensitive.Target.XYPairs.Y__3',
#  'Rachis.DMDemands.Structural.DMDemandFunction.MaximumOrganWt.FixedValue']

# column names recorded before the text columns are removed
Genotypes_cols = Genotypes.columns.tolist()
# coerce None to NaN so we can convert to matrix
Genotypes = Genotypes.drop(columns=id_cols).astype(float)

# Genotypes = torch.tensor(Genotypes.to_numpy())

In [None]:
# (File, Genotype) text columns; copied from Genotypes, so the row index is shared
Genotypes_lookup

In [None]:
# Worked example: one row of Ids and one calendar year
idx_Ids, year = 1, 2000

In [None]:
# the selected Ids row as a {column: value} dict
lookup = Ids.loc[idx_Ids].to_dict()

In [None]:
# Find the genotype row that belongs to the selected Ids entry.
mask = ((Genotypes_lookup.File == lookup['File']) & (Genotypes_lookup.Genotype == lookup['Genotype']))
# should only have a single value
assert mask.sum() == 1

idx_Geno = Genotypes_lookup.loc[mask, ].index[0]

# Genotypes is a DataFrame here, so Genotypes[idx_Geno] would be a column lookup
# and raise KeyError. Select the row by label instead (Genotypes_lookup shares its index).
Genotypes.loc[idx_Geno]

In [None]:
# the Ids row selected by idx_Ids, as a dict
lookup

In [None]:
# Day-number <-> calendar-date table; with max_year = 2024 its last row is 2023-12-31
lookup_date = _prep_unix_epoch_to_date(max_year = 2024)
lookup_date.head()

In [None]:
# load one simulation result file as a DataFrame
Result = pq.read_table(f'{apsimx_sim_parquet_dir}/sim_1698440407_4739.parquet').to_pandas()

In [None]:
# lookup_* is a internally generated ref
# *_lookup is a table based on loaded apsimx data

In [None]:
# Keep the key columns in a DataFrame so rows can still be looked up by Date / FactorialUID.
Result_lookup = Result.loc[:, ['Date', 'FactorialUID']].copy()

# Date and FactorialUID are left in Result on purpose. DataFrame.drop returns a new
# frame, so calling it without assigning the result only copies the whole table.
# NOTE(review): later cells slice Result's columns positionally (1:-1), which
# presumably relies on these two columns still being present -- confirm before dropping them.

# column names, recorded before the frame is replaced by a tensor
Result_list = list(Result)
# NOTE: this rebinds Result, so the cell cannot be re-run without reloading the frame
Result = torch.tensor(Result.to_numpy())

Result_lookup.head()

In [None]:
print(Result_lookup.shape)
# attach calendar fields to every result row; Date is the only shared column
Result_lookup.merge(lookup_date.rename(columns={'Unix':'Date'}), how = 'left', on = 'Date')

In [None]:
# latest Date recorded for FactorialUID 24024
uid_rows = Result_lookup.loc[Result_lookup.FactorialUID.eq(24024)]
uid_rows.merge(lookup_date.rename(columns={'Unix':'Date'}), how = 'left', on = 'Date').Date.max()

In [None]:
# Scratch arithmetic: turns a span of days into an approximate calendar year.
# NOTE(review): 19250 is the Unix day looked up in the next cell; 5285 and 1984 look
# like a reference day / year pair -- confirm where they come from.
((19250-5285)/365)+1984

In [None]:
# calendar date of Unix day 19250
lookup_date[lookup_date.Unix.eq(19250)]

In [None]:
# the selected Ids row again, for reference while building the date filter below
lookup

In [None]:
# TODO make this valid for torch
is_year = lookup_date.Year.eq(year)
# Unix day of the sowing date within the chosen year
sow_date = lookup_date.loc[is_year & lookup_date.SowDate.eq(lookup['SowDate']), 'Unix'].values[0]

# because of how this is set up index is also valid
# first and last Unix day of the chosen year (later cells read this through `_`)
_ = lookup_date.loc[is_year, 'Unix'].agg(['min', 'max'])
_['min']

In [None]:
# rows of the chosen factorial whose Date falls inside the chosen year (both ends inclusive)
date_in_year = Result_lookup['Date'].between(_['min'], _['max'])
same_factorial = Result_lookup['FactorialUID'].eq(lookup['FactorialUID'])
mask = date_in_year & same_factorial

idx_Result = Result_lookup.index[mask]

Result[idx_Result, ].shape

In [None]:
#TODO make sure there aren't any values before the SowDate
# column names of Result, captured before it was converted to a tensor
Result_list

In [None]:
start = 129486

# the 30 rows leading up to `start`, without the first and last column,
# drawn with one row per column of the tensor
window = Result[(start-30):start, 1:-1].numpy()
px.imshow(window.T)

In [None]:
# NOTE(review): plt is imported here but not used in this cell.
import matplotlib.pyplot as plt

# heat map of the first 30 rows selected by idx_Result
px.imshow(Result[idx_Result, ].numpy()[0:30, ])

In [None]:
# empty 365 x 6 buffer
out = torch.zeros(365, 6)

# offset of the first selected result row from the first day of the year
Result_lookup.loc[mask, 'Date'].min() - _['min']

# sow_date - _['min']

In [None]:
# sanity check: Unix day 10957 is 2000-01-01
datetime.datetime(1970, 1, 1) + datetime.timedelta(days = 10957)

In [None]:


# idx_Result.to_list()
# Result

In [None]:
# column names of Genotypes, captured before File and Genotype were dropped
Genotypes_cols