In [None]:
from datetime import datetime
import pandas as pd
import numpy as np
import os
import shutil


def get_df_from_path(path):
    """Load the CSV file at ``path`` into a DataFrame.

    The first line of the file is used as the column header.
    """
    return pd.read_csv(path, header=0)


# Datasets for which sub-sampling jobs are generated. Each entry pairs a
# dataset name with its power-transformed data loaded from CSV.
#
# Only "ioral" is currently enabled. The other datasets that were previously
# listed here (amgut1, amgut2, crohns, hmp2prot, hmp216S, baxter_crc, glne007)
# follow the same path pattern:
#   /home/da2343/cs685_fall22/data/<name>_data_power_transformed.csv
dataset_list = [
    dict(
        dataset_name="ioral",
        dataframe=get_df_from_path(
            "/home/da2343/cs685_fall22/data/ioral_data_power_transformed.csv"
        ),
    ),
]
# One parameter table per dataset is collected here. The list is (re)created
# in the same cell as the loop so that re-running the cell never appends
# duplicate tables.
params_df_list = []


def _build_sub_params_df(n_row, n_col, sub_samples):
    """Build the parameter table for one sub-sample size.

    The rows ``0 .. n_row-1`` are cut into ``n_row // sub_samples`` consecutive
    windows. Every window that spans exactly ``sub_samples`` rows is paired with
    every prediction-column index ``0 .. n_col-1``.

    Parameters
    ----------
    n_row : int
        Number of rows (samples) in the dataset.
    n_col : int
        Number of columns; each one is used as a prediction column.
    sub_samples : int
        Requested window size, ``0 < sub_samples <= n_row``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Index of Prediction Col', 'Start Index of # of Total Samples',
        'End Index of # of Total Samples' (inclusive) and '# of Total Samples',
        ordered by prediction column, then by window start.

    Raises
    ------
    ValueError
        If ``sub_samples`` is not in ``1 .. n_row``. Without this check the
        window count would be 0 and the modulo below would divide by zero.
    """
    if not 0 < sub_samples <= n_row:
        raise ValueError(
            f"sub_samples must be between 1 and n_row={n_row}, got {sub_samples}"
        )

    n_row_fold = n_row // sub_samples
    n_row_fold_vec = np.full(n_row_fold, sub_samples, dtype=int)
    # NOTE(review): the first (sub_samples % n_row_fold) windows are widened by
    # one row and are therefore dropped by the exact-size filter below. This is
    # kept as-is to preserve the generated parameter grid; confirm whether
    # spreading the remainder (n_row % sub_samples) was intended instead.
    n_row_fold_vec[:sub_samples % n_row_fold] += 1

    # Inclusive end index of each window; windows running past the data are cut.
    end_index = np.cumsum(n_row_fold_vec) - 1
    end_index = end_index[end_index < n_row]
    start_index = np.concatenate([[0], end_index[:-1] + 1])
    window_sizes = end_index - start_index + 1

    # Only a window's own (start, end) pair can have exactly `sub_samples` rows:
    # a start paired with a later window's end spans at least two windows, and
    # one paired with an earlier end has start > end. Filtering the windows
    # directly therefore yields the same rows as filtering the full
    # start x end Cartesian product, without the quadratic intermediate.
    keep = (start_index < end_index) & (window_sizes == sub_samples)
    start_index = start_index[keep]
    end_index = end_index[keep]
    window_sizes = window_sizes[keep]
    n_windows = len(start_index)

    return pd.DataFrame({
        'Index of Prediction Col': np.repeat(np.arange(n_col), n_windows),
        'Start Index of # of Total Samples': np.tile(start_index, n_col),
        'End Index of # of Total Samples': np.tile(end_index, n_col),
        '# of Total Samples': np.tile(window_sizes, n_col),
    })


for dataset in dataset_list:
    dataset_name = dataset["dataset_name"]
    df = dataset["dataframe"]
    n_row, n_col = df.shape

    # Candidate sub-sample sizes: a few fixed sizes plus the full dataset.
    # dict.fromkeys drops duplicates while keeping order (n_row may coincide
    # with a fixed size, which would otherwise duplicate every task), and sizes
    # larger than the dataset are skipped because no window of that size exists.
    sub_samples_array = [
        size for size in dict.fromkeys([10, 20, 30, 40, 50, n_row])
        if 0 < size <= n_row
    ]
    if not sub_samples_array:
        raise ValueError(f"dataset {dataset_name!r} has no rows to sub-sample")

    sub_params_df_list = []
    for sub_samples in sub_samples_array:
        sub_params_df_list.append(
            _build_sub_params_df(n_row, n_col, sub_samples)
        )

    concat_sub_params_df = pd.concat(sub_params_df_list, ignore_index=True)
    # Tag every row with the dataset it belongs to.
    concat_sub_params_df['Dataset'] = dataset_name
    params_df_list.append(concat_sub_params_df)


# Stack the per-dataset tables into a single table of job parameters.
params_concat_df = pd.concat(params_df_list, ignore_index=True)
n_tasks, ncol = params_concat_df.shape
if n_tasks == 0:
    # An empty table would lead to an invalid array range ("0--1") in the
    # generated job script, so fail before anything is written to disk.
    raise ValueError("no parameter combinations were generated; nothing to submit")

# The job directory is named after the current time (minute resolution).
date_time = datetime.now().strftime("%Y-%m-%d_%H:%M")
job_name = f"model_subsampling_{date_time}"
job_dir = "/scratch/da2343/" + job_name
results_dir = os.path.join(job_dir, "results")
# os.makedirs creates the job directory and its results sub-directory in one
# call and raises on failure, unlike a shelled-out `mkdir -p` whose exit
# status was ignored.
os.makedirs(results_dir, exist_ok=True)
params_concat_df.to_csv(os.path.join(job_dir, "params.csv"), index=False)

# Generate the SLURM array-job launcher inside the job directory.
# The array covers indices 0 .. n_tasks-1, and each array task changes into
# the job directory and runs run_one.py with its own array index as argument.
# --error and --output name the same file, so both streams of a task share
# one log.
run_one_sh = os.path.join(job_dir, "run_one.sh")
run_one_contents = f"""#!/bin/bash
#SBATCH --array=0-{n_tasks-1}
#SBATCH --time=4:00:00
#SBATCH --mem=4GB
#SBATCH --cpus-per-task=1
#SBATCH --error={job_dir}/slurm-%A_%a.out
#SBATCH --output={job_dir}/slurm-%A_%a.out
#SBATCH --job-name={job_name}
cd {job_dir}
python run_one.py $SLURM_ARRAY_TASK_ID
"""
with open(run_one_sh, mode="w") as run_one_f:
    run_one_f.write(run_one_contents)

# Copy the worker script into the job directory under the name the launcher
# script expects (run_one.py).
run_orig_py = "demo_run.py"
run_one_py = os.path.join(job_dir, "run_one.py")
shutil.copyfile(run_orig_py, run_one_py)

# Mirror the job layout (results directory + params.csv) next to the original
# script so it can be tested locally without SLURM.
orig_dir = os.path.dirname(run_orig_py)
orig_results = os.path.join(orig_dir, "results")
# os.makedirs raises on failure instead of silently ignoring a failed
# shelled-out `mkdir -p`.
os.makedirs(orig_results, exist_ok=True)
orig_csv = os.path.join(orig_dir, "params.csv")
params_concat_df.to_csv(orig_csv, index=False)

msg = f"""created params CSV files and job scripts, test with
python {run_orig_py}
SLURM_ARRAY_TASK_ID=0 bash {run_one_sh}"""
print(msg)


In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import os
import shutil
import sys
sys.path.append(os.path.abspath("/projects/genomic-ml/da2343/ml_project_1/shared"))
from constants import *

In [4]:
# Display the current dataset_list (one dict per dataset: name + DataFrame).
# NOTE(review): the saved output shows 'necromass_bacteria', which the list
# built in the first cell does not contain; presumably this name comes from
# another definition (e.g. `from constants import *`) - confirm its origin.
dataset_list

[{'dataset_name': 'necromass_bacteria',
  'dataframe':         OTU2      OTU3      OTU4     OTU28      OTU9      OTU1     OTU16  \
  0  -0.190298 -0.861911  1.424124 -0.725607 -0.549242  1.476442 -1.062128   
  1  -1.556466 -0.861911  1.403219 -0.725607 -0.549242  1.477558 -1.062128   
  2  -0.456445 -0.861911  1.450232 -0.725607 -0.549242  1.452936 -1.062128   
  3  -0.526644 -0.861911  1.424617 -0.725607 -0.549242  1.416842 -1.062128   
  4  -0.363265 -0.861911  1.434511 -0.725607 -0.549242  1.431586 -1.062128   
  ..       ...       ...       ...       ...       ...       ...       ...   
  64  1.252508  1.327935 -0.707057  1.148243  1.811761 -0.706736  0.666133   
  65 -1.556466  1.490997 -0.707057 -0.725607  1.799363 -0.706736  0.544462   
  66  0.930162 -0.861911 -0.707057  1.044864 -0.549242 -0.706736 -1.062128   
  67  1.371990  0.714844 -0.707057 -0.725607 -0.549242 -0.706736  0.304568   
  68 -0.092712  0.943718 -0.707057 -0.725607  1.759270 -0.706736  0.794385   
  
        