# core

> This module provides a copy of the [Genomes to Fields](https://www.genomes2fields.org/) data for 2014-2021. Data is accessible through the `get_data` function. The other notebooks contain cleaning, imputation, and exploration code. <span style="color:red">_Note:_</span> The provided phenotypic data includes some phenotypes which do not have complete cases.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

Data is prepared from internal files: each file is copied into `dataG2F/dataG2F/datasets` from the directory in `nbs_artifacts` where it was saved after generation.

In [None]:
# Each entry is a [load_name, save_name] pair; every file listed here keeps
# its original name when it is copied.
expose_files = [
    [fname, fname]
    for fname in (
        'ACGT.npy',
        'ACGT_hilb.npy',
        'mgmtMatNames.npy',
        'mgmtMat.npy',
        'SMatNames.npy',
        'SMat.npy',
        'PlantHarvestNames.npy',
        'PlantHarvest.npy',
        'WMat.npy',
        'WMatNames.npy',
        'WMat_hilb.npy',
        'phno_geno.csv',
        'obs_geno_lookup.npy',
        'obs_env_lookup.npy',
    )
]

In [None]:
import os
import shutil
import pickle as pkl
import json

In [None]:
def prep_data(
        load_name,
        save_name,
        load_from = '../nbs_artifacts/05_prep_matrices/',
        save_to   = '../dataG2F/datasets/',
        force = False
        ):
    """Copy (or convert) one artifact file into the datasets directory.

    Parameters
    ----------
    load_name : str
        File name inside `load_from`.
    save_name : str
        File name to write inside `save_to`.
    load_from : str
        Source directory. A trailing path separator is optional.
    save_to : str
        Destination directory. Created if it does not exist yet.
    force : bool
        When false (default) an already existing destination file is left
        untouched; when true it is overwritten.

    Behaviour
    ---------
    * Same extension on both names: the file is copied as-is.
    * `.pkl` -> `.json`: the pickle is loaded and dumped as UTF-8 JSON.
    * Anything else: a message is printed and nothing is written.

    Returns None in every case.
    """
    # Join paths properly so directories work with or without a trailing slash.
    src_path = os.path.join(load_from, load_name)
    dst_path = os.path.join(save_to, save_name)

    # Nothing to do when the output is already there and no overwrite was requested.
    if os.path.exists(dst_path) and not force:
        return

    # Compare the text after the last dot of each name.
    load_ext = load_name.rsplit('.', 1)[-1]
    save_ext = save_name.rsplit('.', 1)[-1]

    same_format = load_ext == save_ext
    pkl_to_json = (load_ext == 'pkl') and (save_ext == 'json')

    if not (same_format or pkl_to_json):
        print(f'Unsure how to move {src_path} to {dst_path}')
        return

    # A fresh checkout may not have the destination directory yet.
    os.makedirs(os.path.dirname(dst_path) or '.', exist_ok=True)

    if same_format:
        shutil.copy(src_path, dst_path)
    else:
        # Assumes the pickle holds a JSON-serialisable object (e.g. a dictionary).
        # NOTE: pickle can execute arbitrary code; only use on trusted artifacts.
        with open(src_path, 'rb') as f:
            dat = pkl.load(f)

        # ensure_ascii=False emits raw non-ASCII characters, so pin the encoding
        # instead of relying on the platform default.
        with open(dst_path, 'w', encoding='utf-8') as f:
            json.dump(dat, f, ensure_ascii=False, indent=4)


# Copy every listed file out of the 05_prep_matrices artifacts, echoing each
# source name as it is handled.
for src_name, dst_name in expose_files:
    print(src_name)
    prep_data(
        load_name=src_name,
        save_name=dst_name,
        load_from='../nbs_artifacts/05_prep_matrices/',
        save_to='../dataG2F/datasets/',
    )



ACGT.npy
ACGT_hilb.npy
mgmtMatNames.npy
mgmtMat.npy
SMatNames.npy
SMat.npy
PlantHarvestNames.npy
PlantHarvest.npy
WMat.npy
WMatNames.npy
WMat_hilb.npy
phno_geno.csv
obs_geno_lookup.npy
obs_env_lookup.npy


In [None]:
# Second pass: the pickled files live in a different artifact directory.
# They are copied under their original names (pkl -> pkl).
expose_files = [
    [fname, fname]
    for fname in (
        'filtered_kegg_gene_entries.pkl',
        'ACGT_gene_slice_list.pkl',
    )
]

for src_name, dst_name in expose_files:
    print(src_name)
    prep_data(
        load_name=src_name,
        save_name=dst_name,
        load_from='../nbs_artifacts/07_filter_genotypes/',
        save_to='../dataG2F/datasets/',
    )

filtered_kegg_gene_entries.pkl
ACGT_gene_slice_list.pkl


## Getting Historical Data:

### Historical Weather Data

In [None]:
# Historical weather arrays; each one is copied under its original name.
expose_files = [
    [fname, fname]
    for fname in (
        'power_data.npy',
        'power_date.npy',
        'power_lats.npy',
        'power_keys.npy',
        'power_lons.npy',
    )
]

for src_name, dst_name in expose_files:
    print(src_name)
    prep_data(
        load_name=src_name,
        save_name=dst_name,
        load_from='../nbs_artifacts/06_gps_grid_nasa_power/power_data/',
        save_to='../dataG2F/datasets/',
    )


power_data.npy
power_date.npy
power_lats.npy
power_keys.npy
power_lons.npy


### Mapping of Historical Data (counties) to gps

In [None]:
# Copy the lookup that matches county-level data to GPS coordinates (one to many).
prep_data(
    'latlon_to_county.csv',
    'latlon_to_county.csv',
    load_from='../nbs_artifacts/06_gps_grid_nasa_power/',
    save_to='../dataG2F/datasets/',
)

In [None]:
import pandas as pd

In [None]:
# Publish only a subset of the historical data: the identifying columns plus
# the y value with the most non-missing entries.
hist = pd.read_csv(
    '../nbs_artifacts/10_collect_historical_data/nass_historical.csv'
)[['State', 'County', 'GRN_BUpACRE', 'Year']]
hist.to_csv('../dataG2F/datasets/nass_historical.csv', index=False)

# Create data access function

In [None]:
#| export 

import pkgutil
from io import BytesIO
# supported formats
import numpy as np
import pandas as pd
import pickle as pkl


In [None]:
#| export 

def get_data(name = '', # `name` of the data to be retrieved. If no recognized name (or '') is passed, a list of available datasets will be printed.
             **kwargs # `filename` can be used in lieu of a name
             ):
    """This is a simple function to access cleaned and imputed Genomes to Fields data. It's based on my EnvDL.dlfn.g2fc_datawrapper() class but is simpler, not containing methods for setting validation splits, scaling, etc.

    The file is read from the `datasets` directory of the `dataG2F` package and
    decoded according to its extension: `npy` gives a numpy array, `csv` a
    pandas DataFrame and `pkl` the unpickled object. Any other extension is
    returned as raw bytes.

    If `name` is not recognized and no `filename` is given, the allowed names
    are printed and `None` is returned. A `FileNotFoundError` is raised when
    the requested file cannot be loaded from the package.
    """
    # if a file name is passed in directly use it.
    if 'filename' in kwargs:
        filename = kwargs['filename']
    else:
        # defaults for quick access
        defaults_dict = {
            ## Genomic Data
            'ACGT':         'ACGT.npy',
            'ACGT_hilb':    'ACGT_hilb.npy',
            'KEGG_entries': 'filtered_kegg_gene_entries.pkl',
            'KEGG_slices':  'ACGT_gene_slice_list.pkl',

            ## Soil and Management
            'mgmtMatNames': 'mgmtMatNames.npy',
            'mgmtMat':      'mgmtMat.npy',
            'SMatNames':    'SMatNames.npy',
            'SMat':         'SMat.npy',

            ## Weather
            'PlantHarvestNames': 'PlantHarvestNames.npy',
            'PlantHarvest':      'PlantHarvest.npy',
            'WMat':              'WMat.npy',
            'WMatNames':         'WMatNames.npy',
            'WMat_hilb':         'WMat_hilb.npy',

            # Response and lookup
            'phno':            'phno_geno.csv',
            'obs_geno_lookup': 'obs_geno_lookup.npy', # Phno_Idx  Geno_Idx  Is_Phno_Idx
            'obs_env_lookup':  'obs_env_lookup.npy',  # Phno_Idx  Env_Idx   Is_Phno_Idx
            # 'YMat':            'YMat.npy'

            # Historical Weather (NASA Power)
            'power_data':'power_data.npy',
            'power_date':'power_date.npy',
            'power_lats':'power_lats.npy',
            'power_keys':'power_keys.npy',
            'power_lons':'power_lons.npy',

            # Historical Yield and Metadata
            'nass_data':'nass_historical.csv',
            'nass_latlon':'latlon_to_county.csv',
        }
        filename = ''
        if name in defaults_dict:
            filename = defaults_dict[name]
        else:
            print(f'`name` not recognized. \nUse an allowed `name` or specify the filename as a kwarg e.g. `name = \'\', filename = \'demo.txt\'`\nAllowed `name`s are:\n{list(defaults_dict.keys())}')

    # Nothing was resolved: the help text (if any) has been printed already.
    if filename == '':
        return None

    # retrieve the requested data
    raw = pkgutil.get_data('dataG2F', f'datasets/{filename}')
    if raw is None:
        # pkgutil.get_data returns None when the package cannot be located or
        # its loader cannot serve data; BytesIO(None) would silently turn that
        # into an empty buffer and a confusing decode error further down.
        raise FileNotFoundError(
            f"Could not load 'datasets/{filename}' from package 'dataG2F'. Is the package installed?"
        )

    filetype = filename.split('.')[-1]
    if filetype == 'npy':
        return np.load(BytesIO(raw))
    if filetype == 'csv':
        return pd.read_csv(BytesIO(raw))
    if filetype == 'pkl':
        # NOTE: unpickling can execute arbitrary code; only files bundled with
        # the package are read here.
        return pkl.load(BytesIO(raw))
    # Unknown extension: hand back the raw bytes unchanged.
    return raw


In [None]:
from dataG2F.core import get_data

ModuleNotFoundError: No module named 'dataG2F'

In [None]:
# An unrecognized name (here the empty string) makes get_data print the allowed names.
get_data(name = '')

`name` not recognized. 
Use an allowed `name` or specify the filename as a kwarg e.g. `name = '', filename = 'demo.txt'`
Allowed `name`s are:
['ACGT', 'ACGT_hilb', 'KEGG_entries', 'KEGG_slices', 'mgmtMatNames', 'mgmtMat', 'SMatNames', 'SMat', 'PlantHarvestNames', 'PlantHarvest', 'WMat', 'WMatNames', 'WMat_hilb', 'phno', 'obs_geno_lookup', 'obs_env_lookup']


In [None]:
# Load the 'obs_geno_lookup' array and show its last three entries.
get_data(name = 'obs_geno_lookup')[-3:]

array([[133054,   4873, 123236],
       [133055,   4875, 123238],
       [133056,   4897, 123684]])

In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the `core` module — confirm against nbdev settings).
import nbdev; nbdev.nbdev_export()