In [None]:

import pysam
import random
import re
import pandas as pd
import os


def get_train_valid_idx(sample_num, train_ratio=0.8, seed=None):
    """
    Randomly split the indices ``0 .. sample_num-1`` into a training set and a
    validation set (8:2 by default).

    Parameters
    ----------
    sample_num : int
        Number of samples to split. Any non-negative count is accepted,
        including counts smaller than 10.
    train_ratio : float, optional
        Fraction of the samples assigned to the training set (default 0.8).
        The training size is rounded down; the remainder goes to validation.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` is used so the split is
        reproducible. When None, the module-level ``random`` state is used.

    Returns
    -------
    dict
        ``{'train': [...], 'valid': [...]}`` — two disjoint index lists that
        together cover every index exactly once.

    Raises
    ------
    ValueError
        If ``train_ratio`` is outside ``[0, 1]``.
    """
    if not 0 <= train_ratio <= 1:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}")

    shuffled_idx = list(range(sample_num))
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled_idx)

    # Slice the shuffled list directly instead of chunking it into tenths:
    # chunking produced a step of 0 for fewer than 10 samples and skewed the
    # ratio whenever the count was not a multiple of 10.
    # The tiny epsilon guards against float products landing just below an integer.
    train_count = int(len(shuffled_idx) * train_ratio + 1e-9)
    return {'train': shuffled_idx[:train_count], 'valid': shuffled_idx[train_count:]}


# Tissue names; each one is used to build the FASTA file names that are read
# and the per-tissue output directory names that are written.
tissues = [
    'Skeletal_Muscle',
    'Liver',
    'Adipose',
    'Heart',
    'Spleen',
]


# Generate train set and validation set

Read the positive and negative FASTA files under the "fa_diff_len" directory, split each into training and validation sets, and save the results as CSV files under the "train_valid_csv" directory.

In [None]:
fasta_dir = "./fa_diff_len"

# Fixed seed so that re-running the cell reproduces the same split and row order.
SPLIT_SEED = 42


def _load_split_sequences(fasta_path, label):
    """
    Read every record of an indexed FASTA file and split the records into
    training and validation lists.

    Parameters
    ----------
    fasta_path : str
        Path to the FASTA file.
    label : int
        Class label attached to every sequence of this file.

    Returns
    -------
    dict
        ``{'train': [(sequence, label), ...], 'valid': [(sequence, label), ...]}``
    """
    # The context manager closes the underlying file handle; the loop below opens
    # many files, so handles must not be left open.
    with pysam.FastaFile(fasta_path) as fasta:
        names = fasta.references
        group_idx = get_train_valid_idx(len(names))
        return {
            split: [(fasta.fetch(names[i]), label) for i in group_idx[split]]
            for split in ('train', 'valid')
        }


random.seed(SPLIT_SEED)

for snp_dir in ['pig_e1','pig_e10',"pig_e1_e10",'human_promoter']:
    for extend_len in [512]:
        # File and directory names carry extend_len*2+1.
        seq_len = extend_len*2+1
        for tissue in tissues:
            positive = _load_split_sequences(
                os.path.join(fasta_dir,snp_dir,f"{tissue}_{seq_len}_positive.fa"), 1)
            negative = _load_split_sequences(
                os.path.join(fasta_dir,snp_dir,f"{tissue}_{seq_len}_negative.fa"), 0)

            # Passing `columns` to the constructor also works when a split is empty.
            # sample(frac=1) shuffles rows so positives and negatives are interleaved.
            all_seq_train = pd.DataFrame(
                positive['train']+negative['train'], columns=['sequence','label']
            ).sample(frac=1, random_state=SPLIT_SEED)
            all_seq_valid = pd.DataFrame(
                positive['valid']+negative['valid'], columns=['sequence','label']
            ).sample(frac=1, random_state=SPLIT_SEED)

            save_dir = f"./train_valid_csv/{snp_dir}_{seq_len}/{tissue}"
            os.makedirs(save_dir,exist_ok=True)
            all_seq_train.to_csv(f"{save_dir}/train.csv",index=False)
            all_seq_valid.to_csv(f"{save_dir}/dev.csv",index=False)
            # No separate held-out set exists here: test.csv is a copy of the
            # validation split (presumably because the fine-tuning script
            # expects a test.csv to be present — confirm).
            all_seq_valid.to_csv(f"{save_dir}/test.csv",index=False)

# Generate model fine-tuning script

In [None]:
def get_cmd(CUDA_VISIBLE_DEVICES,MODEL_PATH,DATA_PATH,RUN_NAME,MAX_LENGTH,LR,save_steps,OUTPUT_DIR,eval_steps,logging_steps,
            train_script="/data2/zyd_workspace/2024_12_multi_species_models/dnabert-2/DNABERT_2/finetune/train.py",
            batch_size=16,num_train_epochs=20):
    """
    Build the shell snippet that launches one fine-tuning run.

    The snippet consists of an empty line, an ``export CUDA_VISIBLE_DEVICES=...``
    line, and the ``python <train_script> ...`` invocation written on a single
    line, followed by a trailing newline. All values are interpolated verbatim,
    with no shell quoting, so paths containing spaces or shell metacharacters
    are not supported.

    Parameters
    ----------
    CUDA_VISIBLE_DEVICES : value exported as ``CUDA_VISIBLE_DEVICES``.
    MODEL_PATH : value for ``--model_name_or_path``.
    DATA_PATH : value for ``--data_path``.
    RUN_NAME : value for ``--run_name``.
    MAX_LENGTH : value for ``--model_max_length``.
    LR : value for ``--learning_rate``.
    save_steps, eval_steps, logging_steps : values for the matching options.
    OUTPUT_DIR : value for ``--output_dir``.
    train_script : str, optional
        Path of the training script to run (defaults to the previously
        hard-coded location).
    batch_size : int, optional
        Used for both ``--per_device_train_batch_size`` and
        ``--per_device_eval_batch_size`` (default 16).
    num_train_epochs : int, optional
        Value for ``--num_train_epochs`` (default 20).

    Returns
    -------
    str
        The shell snippet described above.
    """
    cli_parts = [
        f"python {train_script}",
        f"--model_name_or_path {MODEL_PATH}",
        f"--data_path {DATA_PATH}",
        "--kmer -1",
        f"--run_name {RUN_NAME}",
        f"--model_max_length {MAX_LENGTH}",
        f"--per_device_train_batch_size {batch_size}",
        f"--per_device_eval_batch_size {batch_size}",
        "--gradient_accumulation_steps 1",
        f"--learning_rate {LR}",
        f"--num_train_epochs {num_train_epochs}",
        "--fp16",
        f"--save_steps {save_steps}",
        f"--output_dir {OUTPUT_DIR}",
        "--evaluation_strategy steps",
        f"--eval_steps {eval_steps}",
        "--warmup_steps 50",
        f"--logging_steps {logging_steps}",
        "--overwrite_output_dir True",
        "--log_level info",
        "--find_unused_parameters False",
        "--save_total_limit 20",
    ]
    # The options are joined with single spaces on one line; this matches the
    # text produced by the earlier triple-quoted template, where each
    # backslash-newline pair was consumed by Python as a line continuation.
    bashFile = f"\nexport CUDA_VISIBLE_DEVICES={CUDA_VISIBLE_DEVICES}\n" + " ".join(cli_parts) + "\n"
    return bashFile

In [None]:
extend_lens = [512]
# Used only to derive the step counts below; it equals the per-device batch
# size that get_cmd writes into the command by default.
batch_size = 16
model_path = "/data2/zyd_workspace/2024_12_multi_species_models/dnabert-2/DNABERT-2-117M"
train_data_dir = "./train_valid_csv"
train_output_dir = "./train_output"

# Dataset variants per species. "<species>_<variant>_<length>" is the name of
# the data directory looked up under train_data_dir.
cre_types_by_species = {
    'human': ['promoter'],
    'pig': ['e1_e10','e1','e10'],
}

all_train_cmd = []
for species, cre_types in cre_types_by_species.items():
    for extend_len in extend_lens:
        seq_len = extend_len*2+1
        # Token budget: a quarter of the sequence length plus one. This appears
        # to follow the DNABERT-2 guidance of ~0.25 x sequence length — confirm.
        max_len = int(0.25*seq_len)+1
        cuda_device = 1 if extend_len<512 else 0
        for cre_type in cre_types:
            train_dir = f"{species}_{cre_type}_{seq_len}"
            for tissue in tissues:
                run_name = f"{species}_{cre_type}_{seq_len}_{tissue}"
                data_path = os.path.join(train_data_dir,train_dir,tissue)
                output_path = os.path.join(train_output_dir,train_dir,tissue)
                # train.csv is comma-separated, so the default delimiter is used.
                df_train_csv = pd.read_csv(os.path.join(data_path,"train.csv"))
                # Save, evaluate and log every (n_rows // batch_size) steps.
                # Clamped to at least 1 so a tiny data set never yields a
                # step interval of 0.
                save_steps = max(1, df_train_csv.shape[0]//batch_size)
                eval_steps = save_steps
                logging_steps = save_steps
                all_train_cmd.append(get_cmd(
                    CUDA_VISIBLE_DEVICES=cuda_device,
                    MODEL_PATH=model_path,
                    DATA_PATH=data_path,
                    RUN_NAME=run_name,
                    MAX_LENGTH=max_len,
                    LR=3e-5,
                    save_steps=save_steps,
                    OUTPUT_DIR=output_path,
                    eval_steps=eval_steps,
                    logging_steps=logging_steps,
                ))

In [None]:
# Write all generated fine-tuning commands into one shell script,
# separating consecutive commands with a newline.
script_text = '\n'.join(all_train_cmd)
with open("./train.sh",'w') as script_file:
    script_file.write(script_text)