### purpose

fit trained GF models that used nuisance envs to the common gardens of the environment

### notes

The subsetfiles I create as a workaround to run MVP_03 without changing code aren't exactly like the conventional files they're meant to represent to the pipeline. Conventional subset files are individual-based. However, since I'm taking means by subpopID (e.g., for envdata), having only one entry per pop (population-based files) gives back the same thing as using the individual-based data. (mvp03 calls mvp06.get_pop_data, which calls mvp01.read_ind_data, which reads in the subsetfile.) This is all OK as long as I never use these files while assuming individual data (though it should throw an error somewhere along the line if I do try).

In [1]:
from pythonimports import *

import MVP_summary_functions as mvp
import MVP_02_fit_gradient_forests as mvp02

# record the start time; the elapsed time since t0 is reported after jobs are submitted
t0 = dt.now()

# print the date, python version, and current commits of the supporting repos (provenance)
mvp.latest_commit()
# NOTE(review): presumably displays versions of the imported packages - confirm
session_info.show()

#########################################################
Today:	April 25, 2023 - 15:48:19
python version: 3.8.5

Current commit of [1mpythonimports[0m:
[33mcommit 4ecd56c8c80ec4876790fcd425cb75b9db9a4f24[m  
Author: Brandon Lind <lind.brandon.m@gmail.com>  
Date:   Tue Mar 21 13:20:54 2023 -0400

Current commit of [94m[1mMVP_offsets[0m[0m:
[33mcommit ef86c4b94658027c580e1bbfdfc1195b0b7077e9[m  
Author: Brandon Lind <lind.brandon.m@gmail.com>  
Date:   Thu Mar 23 10:58:00 2023 -0400
#########################################################



# get dirs

In [2]:
# training shdirs - one per set of nuisance envs, all under the same run directory
nuisance_rundir = '/home/b.lind/offsets/run_20220919_nuisance'

training_dirs = {
    nuis_envs: f'{nuisance_rundir}/{nuis_envs}_0-225/gradient_forests/training/training_shfiles'
    for nuis_envs in ['ISO-PSsd', 'ISO-TSsd-PSsd']
}

In [3]:
# get unique seeds by querying the adaptive runs of each nuis_envs dir
# the seed is the first underscore-delimited field of each shfile's basename
seeds_by_dir = {
    nuis_envs: [op.basename(sh).split("_")[0] for sh in fs(d, pattern='adaptive', endswith='.sh')]
    for nuis_envs, d in training_dirs.items()
}

# seeds from the first dir are used for all dirs ...
seeds = next(iter(seeds_by_dir.values()))

# ... so verify (instead of assume) that each dir has the same seeds; otherwise
# fitting jobs would be created for seeds that have no trained model in that dir
for nuis_envs, dir_seeds in seeds_by_dir.items():
    assert set(dir_seeds) == set(seeds), f'{nuis_envs} does not have the same seeds as the first dir'

len(seeds), luni(seeds)

(225, 225)

# create dummy ind envfiles

mvp02 requires two envfiles (one for pooled, one for individual), so just create a blank individual file (blank so that any code that ever tries to use it will likely throw an error somewhere).

In [4]:
# for every pooled envfile, write a header-only "ind" envfile next to it
for nuis_envs, d in training_dirs.items():
    filedir = d.replace('_shfiles', '_files')
    assert op.exists(filedir), f'training files dir does not exist: {filedir}'

    envfiles = fs(filedir, endswith='envfile_GFready_pooled.txt')
    # without any pooled envfiles nothing would be written (and `df` below would be undefined)
    assert len(envfiles) > 0, f'no pooled envfiles found in {filedir}'

    # create a blank ind file to get past the file count assertion in mvp02
    for f in pbar(envfiles, desc=nuis_envs):
        df = pd.read_table(f, nrows=0, index_col=0)  # only read in columns so errors are produced if used

        # swap 'pooled' for 'ind' in the file name only, never in the directory part of the path
        indfile = op.join(op.dirname(f), op.basename(f).replace('pooled', 'ind'))
        df.to_csv(indfile, sep='\t', index=True, header=True)

# example of the (empty) frame written for the last envfile
df

ISO-PSsd: 100%|███████████████| 225/225 [00:05<00:00, 37.71it/s]
ISO-TSsd-PSsd: 100%|███████████████| 225/225 [00:05<00:00, 40.06it/s]


Unnamed: 0,sal_opt,temp_opt,ISO,TSsd,PSsd


# create subset files

Normally these would be individual-based, but I'm creating them as pooled since averaging temp by subpopID will give the same result.

In [5]:
# columns to include in the subsetfile, and where each one is needed:
# indID - from mvp01.read_ind_data
# subpopID - from mvp06.get_pop_data
# x, y - from mvp06.get_pop_data
# env names

In [6]:
# create a subsetfile using the rangefiles that were given to GF scripts
# rangefiles will be limited to the envdata used to train GF models
NPOPS = 100  # number of rows (one per subpopID) expected in every rangefile

for nuis_envs, d in training_dirs.items():
    filedir = d.replace('_shfiles', '_files')

    rangefiles = fs(filedir, pattern='range')

    print(nuis_envs, len(rangefiles))

    outerdir = d.split('/gradient_forests/')[0]
    slimdir = makedir(f'{outerdir}/slimdir')  # where the subsetfiles are saved

    for rangefile in pbar(rangefiles, desc=nuis_envs):

        df = pd.read_table(rangefile)
        assert len(df) == NPOPS, f'expected {NPOPS} rows in {rangefile}, found {len(df)}'

        # NOTE(review): assumes rangefile rows are ordered by subpopID (1..NPOPS) - confirm
        df['subpopID'] = range(1, NPOPS + 1)
        df['indID'] = df['subpopID']  # copying is fine if assuming pooled data

        # rename the coordinate columns by whole name; substring replacement would also
        # alter any env column whose name happens to contain 'lat' or 'lon'
        df = df.rename(columns={'lat': 'y', 'lon': 'x'})
        assert {'x', 'y'}.issubset(df.columns), f'coordinate columns missing from {rangefile}'

        # rangefile basenames begin with the seed
        seed = op.basename(rangefile).split('_')[0]
        subsetfile = f'{slimdir}/{seed}_Rout_ind_subset.txt'
        df.to_csv(subsetfile, sep='\t')  # subsetfiles are assumed to read in as delim_whitespace=True; sep='\t' works with this

ISO-PSsd 225


ISO-PSsd: 100%|███████████████| 225/225 [00:07<00:00, 29.75it/s]


ISO-TSsd-PSsd 225


ISO-TSsd-PSsd: 100%|███████████████| 225/225 [00:08<00:00, 27.14it/s]


In [7]:
# example dataframe - `df` still holds the frame from the last rangefile processed in the loop above
df

Unnamed: 0,sal_opt,temp_opt,ISO,TSsd,PSsd,y,x,subpopID,indID
0,-1.0,-1.0,0.009381,-2.245441,1.003559,1.0,1.0,1,1
1,-0.5,-1.0,0.239071,-1.967663,0.518946,1.0,2.0,2,2
2,0.0,-1.0,0.580610,-0.963343,-0.246231,1.0,3.0,3,3
3,0.5,-1.0,0.639031,-0.164371,-0.399267,1.0,4.0,4,4
4,1.0,-1.0,-0.036985,-0.291426,-0.661614,1.0,5.0,5,5
...,...,...,...,...,...,...,...,...,...
95,1.0,1.0,0.113241,-0.239327,-0.054937,10.0,6.0,96,96
96,0.5,1.0,0.881061,0.598108,1.480884,10.0,7.0,97,97
97,0.0,1.0,1.587251,1.669095,2.461321,10.0,8.0,98,98
98,-0.5,1.0,1.381814,2.462649,2.880066,10.0,9.0,99,99


# create fitting shfiles

In [8]:
# create fitting dirs - one fitting_shfiles dir per set of nuisance envs
fitting_rundir = '/home/b.lind/offsets/run_20220919_nuisance'

fitting_dirs = {}
for envs_label in ['ISO-PSsd', 'ISO-TSsd-PSsd']:
    fitting_dirs[envs_label] = makedir(
        f'{fitting_rundir}/{envs_label}_0-225/gradient_forests/fitting/fitting_shfiles'
    )

In [9]:
paramsfile = '/home/b.lind/offsets/run_20220919_0-225/slimdir/0b-final_params-20220428.txt'


def fitting_shtext(seed, job, slimdir, training_outdir, gf_parentdir):
    """Return the text of the slurm script that runs mvp02 and then mvp03 for one seed."""
    return f'''#!/bin/bash
#SBATCH --job-name={job}
#SBATCH --time=1-00:00:00
#SBATCH --ntasks=1
#SBATCH --mem=300000M
#SBATCH --output={job}_%j.out
#SBATCH --mail-user=b.lind@northeastern.edu
#SBATCH --mail-type=FAIL
#SBATCH --nodes=1
#SBATCH --cpus-per-task=7

cd /home/b.lind/code/MVP-offsets/01_src

source $HOME/.bashrc

conda activate mvp_env

python MVP_02_fit_gradient_forests.py {seed} {slimdir} {training_outdir} /home/b.lind/anaconda3/envs/r35/lib/R/bin/Rscript

python MVP_03_validate_gradient_forests.py {seed} {slimdir} {gf_parentdir}

'''


shfiles = []
for nuis_envs, fitting_dir in fitting_dirs.items():
    # training outfiles sit beside the training shfiles
    training_outdir = training_dirs[nuis_envs].replace('training_shfiles', 'training_outfiles')

    # mvp02 doesnt use slimdir (see docstring), but mvp03 does - so copy over a paramsfile
    slimdir = training_outdir.replace('gradient_forests/training/training_outfiles', 'slimdir')
    dst = op.join(slimdir, '0b-final_params-20220428.txt')
    shutil.copy(paramsfile, dst)

    # parent of the training/ and fitting/ dirs; passed to mvp03
    gf_parentdir = op.join(op.dirname(slimdir), 'gradient_forests')
    print(gf_parentdir)

    # write one slurm script per seed into this nuis_envs' fitting dir
    for seed in pbar(seeds, desc=nuis_envs):
        job = f'{seed}_{nuis_envs}_gf_fitting'
        sh = op.join(fitting_dir, f'{job}.sh')

        with open(sh, 'w') as o:
            o.write(fitting_shtext(seed, job, slimdir, training_outdir, gf_parentdir))

        shfiles.append(sh)

/home/b.lind/offsets/run_20220919_nuisance/ISO-PSsd_0-225/gradient_forests


ISO-PSsd: 100%|███████████████| 225/225 [00:03<00:00, 63.88it/s]


/home/b.lind/offsets/run_20220919_nuisance/ISO-TSsd-PSsd_0-225/gradient_forests


ISO-TSsd-PSsd: 100%|███████████████| 225/225 [00:04<00:00, 47.77it/s]


In [10]:
# submit every fitting shfile to slurm (450 = 225 seeds x 2 nuisance-env sets)
# NOTE(review): `pids` presumably holds the slurm job ids returned by sbatch - confirm
pids = sbatch(shfiles)

sbatching: 100%|███████████████| 450/450 [03:23<00:00,  2.21it/s]


In [11]:
# balance the queue: move 225 of the queued fitting jobs to the 'long' partition
Squeue(grepping='fitting').update(to_partition='long', num_jobs=225)

update: 100%|███████████████| 225/225 [00:31<00:00,  7.04it/s]


In [12]:
# time elapsed since t0 was set in the first cell
formatclock(dt.now() - t0)

'0-00:04:32'

In [13]:
# job states tallied per partition
# NOTE(review): exclude='bash' presumably drops interactive bash sessions from the tally - confirm
Squeue(exclude='bash').partitions()

{'long': Counter({'PD': 225}), 'short': Counter({'PD': 223, 'R': 2})}

In [14]:
# job states tallied across all partitions
Squeue(exclude='bash').states().counter()

Counter({'PD': 448, 'R': 2})

In [15]:
# re-check of the job states across all partitions (rerun later to monitor progress)
Squeue(exclude='bash').states().counter()

Counter({'PD': 442, 'R': 8})

In [16]:
# re-check of the job states per partition (rerun later to monitor progress)
Squeue(exclude='bash').partitions()

{'long': Counter({'PD': 219, 'R': 6}), 'short': Counter({'PD': 223, 'R': 2})}

# check on progress

In [1]:
from pythonimports import *

import MVP_summary_functions as mvp

# print the date, python version, and current commits of the supporting repos (provenance)
mvp.latest_commit()
# NOTE(review): presumably displays versions of the imported packages - confirm
session_info.show()

#########################################################
Today:	April 28, 2023 - 09:16:45
python version: 3.8.5

Current commit of [1mpythonimports[0m:
[33mcommit a8b230cefd0c1b0b00eeb77ec5faf75e44771905[m  
Author: Brandon Lind <lind.brandon.m@gmail.com>  
Date:   Wed Apr 26 11:01:24 2023 -0400

Current commit of [94m[1mMVP_offsets[0m[0m:
[33mcommit ef86c4b94658027c580e1bbfdfc1195b0b7077e9[m  
Author: Brandon Lind <lind.brandon.m@gmail.com>  
Date:   Thu Mar 23 10:58:00 2023 -0400
#########################################################



In [2]:
# fitting shdirs - these already exist, so the paths are only re-declared for this session
fitting_rundir = '/home/b.lind/offsets/run_20220919_nuisance'

fitting_dirs = {
    envs_label: f'{fitting_rundir}/{envs_label}_0-225/gradient_forests/fitting/fitting_shfiles'
    for envs_label in ['ISO-PSsd', 'ISO-TSsd-PSsd']
}

In [3]:
# connect to an already-running parallel cluster; lview is used below to retrieve seffs in parallel
# NOTE(review): presumably ipyparallel load-balanced/direct views, with the printed counts being engines - confirm
lview, dview = get_client(cluster_id='1682687722-xo2p', profile='lotterhos')

36 36


In [4]:
# for each set of nuisance envs, get the most recent seff from the .out files in its fitting shdir
seffs = {
    nuis_envs: Seffs.parallel(lview, outs=fs(shdir, endswith='.out')).most_recent()
    for nuis_envs, shdir in fitting_dirs.items()
}

[1m
Watching 386 parallel Seffs jobs ...[0m


requesting seffs: 100%|███████████████| 386/386 [00:12<00:00, 31.00it/s]
retrieving seffs: 100%|███████████████| 386/386 [00:03<00:00, 128.58it/s]


[1m
Watching 246 parallel Seffs jobs ...[0m


requesting seffs: 100%|███████████████| 246/246 [00:01<00:00, 149.55it/s] 
retrieving seffs: 100%|███████████████| 246/246 [00:01<00:00, 123.86it/s]


In [5]:
# tally the slurm job states for each set of nuisance envs
for nuis_envs in seffs:
    state_counts = seffs[nuis_envs].states.counter()
    print(nuis_envs, state_counts)

ISO-PSsd Counter({'State: COMPLETED (exit code 0)': 225})
ISO-TSsd-PSsd Counter({'State: COMPLETED (exit code 0)': 225})


In [7]:
# count the files written to each fitting_outfiles dir (sibling of the fitting_shfiles dir)
for nuis_env, shdir in fitting_dirs.items():
    outfiles = fs(shdir.replace('_shfiles', '_outfiles'))
    print(nuis_env, len(outfiles))

ISO-PSsd 135000
ISO-TSsd-PSsd 135000
