In [1]:
# Set up the project directory structure

import os, json  # Import the os module to interact with the operating system

def create_directory_structure(root_dir):
    """Create the project skeleton under ``root_dir`` and report where it lives.

    Four directories are handled: ``src/Py``, ``run/Py``, ``inputs`` and
    ``outputs``. A directory that does not exist yet is created and seeded with
    its starter files. A directory that already exists is left untouched (its
    starter files are NOT written) and a notice is printed instead.

    Parameters
    ----------
    root_dir : str
        Root of the project tree. It may be relative and may end with a path
        separator; neither affects the keys of the returned dictionary.

    Returns
    -------
    dict
        Maps a short key (the directory's path components below ``root_dir``
        joined with ``_``: ``src_Py``, ``run_Py``, ``inputs``, ``outputs``) to
        the absolute path of that directory, whether it was created by this
        call or existed beforehand.
    """
    # Path components below root_dir -> {file name: initial file content}
    layout = {
        ("src", "Py"): {  # Source directory with initial files
            '__init__.py': "# Sample package initialization\n",
            'libs.py': "# Add common libraries here to be loaded for all analysis\n",
            'func.py': "# Add custom functions here to be loaded for all analysis\n"
        },
        ("run", "Py"): {  # Run directory with a main script
            'dodo.py': "#!/usr/bin/env python3"
        },
        ("inputs",): {},  # Inputs directory (no initial files)
        ("outputs",): {}  # Outputs directory (no initial files)
    }

    created_directories = {}

    for parts, files in layout.items():
        directory = os.path.join(root_dir, *parts)

        # Build the key from the components themselves rather than by string
        # surgery on the joined path: that approach broke for roots with a
        # trailing separator, for an empty root, and for non-'/' separators.
        created_directories["_".join(parts)] = os.path.abspath(directory)

        if os.path.exists(directory):
            print(f"Directory {directory} already exists. No files were created.")
            continue

        os.makedirs(directory)
        # Seed the freshly created directory with its starter files
        for file_name, content in files.items():
            with open(os.path.join(directory, file_name), 'w') as f:
                f.write(content)

    return created_directories

# Project root; every directory of the skeleton is created beneath it.
root_dir = "/proj"

# Short key -> absolute path for each directory of the skeleton.
directory_structure = create_directory_structure(root_dir=root_dir)

def write_json(data, path, verbose = False):
    """Serialise ``data`` as JSON and write it to ``path``.

    Parameters
    ----------
    data : object
        Anything ``json.dumps`` can encode.
    path : str
        Destination file; overwritten if it exists.
    verbose : bool, optional
        When true, print ``Done`` after the file has been written.

    Returns
    -------
    None

    Raises
    ------
    TypeError, ValueError
        Propagated from ``json.dumps`` when ``data`` cannot be encoded.
    """
    # Encode before opening the target: if `data` is not serialisable the
    # exception is raised while an existing file at `path` is still intact,
    # instead of leaving it truncated or half-written.
    payload = json.dumps(data)
    with open(path, "w", encoding="utf-8") as fp:
        fp.write(payload)
    if verbose:
        print("Done")

# Persist the key -> absolute path mapping of the project directories.
core_paths_file = "/proj/inputs/core_paths.json"
write_json(data=directory_structure, path=core_paths_file)

Directory /proj/src/Py already exists. No files were created.
Directory /proj/run/Py already exists. No files were created.
Directory /proj/inputs already exists. No files were created.
Directory /proj/outputs already exists. No files were created.


In [2]:
# Define a cv file
import pandas as pd
# Load the phenotype table and pin the dtype of every column used below
# in a single pass.
p_data = pd.read_feather(
    "/proj/ext_dir/data_ravi/14_10_2024/example_pheno.feather"
).astype(
    {
        "trait": "str",
        "genotype": "str",
        "blups": "float64",
        "std.error": "float64",
        "dataset": "str",
        "type": "str",
    }
)
# p_data.dtypes  # uncomment to check the column data types

# Check overlap: print every test genotype that also occurs in the training
# split. Nothing printed means the two splits share no genotype.
test = p_data.loc[p_data["type"] == "test", "genotype"].to_list()
train = p_data.loc[p_data["type"] == "train", "genotype"].to_list()

# Set lookup avoids rescanning the whole training list for every test entry;
# a plain loop replaces the list comprehension that was used only for its
# print side effect.
train_lookup = set(train)
for genotype_id in test:
    if genotype_id in train_lookup:
        print(genotype_id)

# CV file: a single run that maps each split name to the row labels of the
# phenotype records belonging to it.
cv_acr_1 = {
    "run_1": {
        split: p_data.loc[p_data["type"] == split].index.to_list()
        for split in ("test", "train")
    }
}

# Write as json
write_json(cv_acr_1, "/proj/inputs/acr_cv.json")  # saves the CV split indices

In [3]:
# Data preprocessing
# Marker table, one row per genotype, labelled by the genotype name.
g_data = pd.read_feather("/proj/ext_dir/data_ravi/14_10_2024/example_geno.feather")
g_data = g_data.set_index('genotype')
all_geno = train + test

# Check calls for heterozygotes
#for col in g_data:
#    if(int(1) in g_data[col].unique()):
#        print(col)

# Check if all phenotype records have associated genotypic data.
# The direction that matters downstream is "phenotyped genotype without a
# marker row": the genotype matrix is later assembled by looking up every
# phenotype record in g_data. The earlier version of this check only looked
# at the opposite direction, so both are reported here. Nothing printed means
# the two tables cover exactly the same genotypes.
geno_ids = set(g_data.index)
pheno_ids = set(all_geno)
for genotype_id in dict.fromkeys(all_geno):  # each genotype once, first-seen order
    if genotype_id not in geno_ids:
        print(f"phenotyped but not genotyped: {genotype_id}")
for genotype_id in g_data.index.to_list():
    if genotype_id not in pheno_ids:
        print(f"genotyped but not phenotyped: {genotype_id}")

# scale data
from sklearn.preprocessing import MinMaxScaler
import numpy as np

def scale_data(to_transform, pd_cols, pd_index, feature_range = (0, 1)):
    """Min-max scale ``to_transform`` and return it as a labelled DataFrame.

    A fresh ``MinMaxScaler`` is fitted on ``to_transform`` and applied to it in
    the same step.

    Parameters
    ----------
    to_transform : array-like, 2-D
        Values to scale; passed straight to ``MinMaxScaler.fit_transform``.
    pd_cols : sequence
        Column labels for the returned DataFrame.
    pd_index : sequence
        Row labels for the returned DataFrame.
    feature_range : tuple, optional
        ``(min, max)`` target range handed to ``MinMaxScaler``. Defaults to
        ``(0, 1)``, the range that was previously hard-coded.

    Returns
    -------
    tuple
        ``(scaled DataFrame, fitted MinMaxScaler)``. The scaler is returned so
        the caller can keep it together with the scaled data.
    """
    scaler = MinMaxScaler(feature_range=feature_range)
    data_scaled = scaler.fit_transform(to_transform)
    data_scaled_df = pd.DataFrame(data_scaled, columns = pd_cols, index = pd_index)
    return data_scaled_df, scaler

# Scale the marker table and the BLUP column; each call also hands back the
# scaler that was fitted on that data.
# NOTE(review): both scalers are fitted on all rows, train and test together —
# confirm this is intended for the downstream evaluation.
g_data_mod, acr_g_scl = scale_data(
    to_transform=g_data,
    pd_cols=g_data.columns,
    pd_index=g_data.index,
)
blups_as_column = p_data[["blups"]].to_numpy()  # 2-D, a single column
p_data_mod, acr_p_scl = scale_data(
    to_transform=blups_as_column,
    pd_cols=["blups"],
    pd_index=p_data.index,
)

# Put raw and scaled BLUPs side by side: both frames carry a `blups` column,
# which the suffixes turn into `blups_raw` and `blups_scaled`.
acr_p = p_data.merge(
    p_data_mod,
    how='left',
    left_index=True,
    right_index=True,
    sort=False,
    suffixes=('_raw', '_scaled'),
)

# reshape geno data to fit the p_data: one marker row per phenotype record,
# in the row order of acr_p.
# A single label-based lookup replaces the per-record boolean scan of the whole
# marker index. Duplicate genotype labels are reduced to their first row
# beforehand, which is the row the scan (`.iloc[0]`) used to pick. A phenotype
# genotype without a marker row raises KeyError.
g_first_rows = g_data_mod[~g_data_mod.index.duplicated(keep="first")]
np_g_data = np.ascontiguousarray(
    g_first_rows.loc[acr_p['genotype'].to_list()].to_numpy()
)
# Add a trailing axis of length 1: (records, markers) -> (records, markers, 1)
acr_g = np_g_data.reshape(np_g_data.shape[0], np_g_data.shape[1], 1)

# check shape
# acr_g.shape

# save data
import pickle

def write_pkl(data, path, verbose = False):
    """Pickle ``data`` and write the bytes to ``path``.

    Parameters
    ----------
    data : object
        Any picklable object.
    path : str
        Destination file; overwritten if it exists.
    verbose : bool, optional
        When true, print ``Done`` after the file has been written.

    Returns
    -------
    None

    Raises
    ------
    Exception
        Whatever ``pickle.dumps`` raises when ``data`` cannot be pickled.
    """
    # Pickle before opening the target: if `data` cannot be pickled the
    # exception is raised while an existing file at `path` is still intact,
    # instead of leaving it truncated.
    payload = pickle.dumps(data)
    with open(path, "wb") as fp:
        fp.write(payload)
    if verbose:
        print("Done")
    
# Destination of every artefact produced by the preprocessing step.
out_paths = {
    'acr_g.npy': "/proj/inputs/acr_g.npy",
    'acr_p.csv': "/proj/inputs/acr_p.csv",
    'acr_g.scl': "/proj/inputs/acr_g.scl",
    'acr_p.scl': "/proj/inputs/acr_p.scl",
}

# Genotype array, phenotype table (without its index), then the two scalers.
np.save(out_paths['acr_g.npy'], acr_g)
acr_p.to_csv(out_paths['acr_p.csv'], index=False)
for scaler_key, fitted_scaler in (('acr_g.scl', acr_g_scl), ('acr_p.scl', acr_p_scl)):
    write_pkl(fitted_scaler, out_paths[scaler_key])

In [10]:
# create the run scripts
# cd /proj/run/Py
# doit

# run the analysis
# acquire a gpu cluster
# run /qg-10/data/AGR-QG/Gogna/tutorials/tut_CNN/outputs/create_slurm_scripts/acr_CNN_acr_cv/master_script.sh (created by the previous step)

# import predictions
import pandas as pd
# Predictions written by run_1 of the analysis; the `obs` and `pred` columns
# are compared below.
output = pd.read_csv("/proj/outputs/create_slurm_scripts/acr_CNN_acr_cv/run_1/pred/output.csv")

# check prediction ability: correlation between observed and predicted values.
# Built-in round() is used instead of the `.round` method because `corr`
# returns a plain float NaN when there are no aligned rows, and a Python
# float has no `.round` attribute.
round(output['obs'].corr(output['pred']), 2)

0.2