In [None]:

import pandas as pd
import pysam
import random
import pandas as pd
import random
import os


def get_train_valid_idx(sample_num):
    """
    Randomly split the indices ``0 .. sample_num-1`` into a training set and a
    validation set at a ratio of roughly 8:2.

    The indices are shuffled with the module-level ``random`` generator, so the
    result is reproducible only if the caller seeds ``random`` beforehand.

    Parameters
    ----------
    sample_num : int
        Number of samples to split.

    Returns
    -------
    dict
        ``{'train': [...], 'valid': [...]}``; every index appears in exactly
        one of the two lists.
    """
    shuffled_idx = list(range(sample_num))
    random.shuffle(shuffled_idx)
    # The data is viewed as 10 equally sized groups: the first 8 go to the
    # training set, everything after them (including any remainder) to validation.
    group_size = len(shuffled_idx) // 10
    if group_size > 0:
        train_count = 8 * group_size
    else:
        # Fewer than 10 samples: a group size of 0 cannot be used as a slicing
        # step, so fall back to a direct 80% cut.
        train_count = (len(shuffled_idx) * 8) // 10
    return {'train': shuffled_idx[:train_count], 'valid': shuffled_idx[train_count:]}


# Generate train set and validation set

Build the training and validation datasets from the FASTA files in the "fa_diff_len" directory and save them as CSV files in the "train_valid_csv" directory.

In [None]:
# Input root with the per-species FASTA files and output root for the CSV datasets.
fasta_dir = "./fa_diff_len/"
train_data_dir = "./train_valid_csv"

os.makedirs(train_data_dir,exist_ok=True)
for specie in ['human','pig']:
    # Each species has its own set of element types; the extension lengths are shared.
    if specie=='pig':
        extend_lens = [256,512,1024,2048]
        cre_types = ['e1','e4e5','e10','e1_e10','e1_e4e5_e10']
    elif specie=="human":
        extend_lens = [256,512,1024,2048]
        cre_types = ['promoter','enhancer','promoter_enhancer']
    for extend_len in extend_lens:
        # Directory and FASTA file names embed extend_len*2+1
        # (presumably the full sequence length -- TODO confirm).
        seq_len = extend_len*2+1
        for cre_type in cre_types:
            train_dir = f"{cre_type}_{seq_len}"
            absolute_dir = os.path.join(train_data_dir,specie,train_dir)
            # exist_ok=True keeps the cell re-runnable after the first execution.
            os.makedirs(absolute_dir,exist_ok=True)
            for train_valid in ['train','valid']:
                output_csv = "train.csv" if train_valid=="train" else "dev.csv"
                csv_path = os.path.join(absolute_dir,output_csv)
                # Collect (sequence, label) pairs: positives are labelled 1, negatives 0.
                merge_sample = []
                for pos_neg in ['positive','negative']:
                    seq_label = 1 if pos_neg=="positive" else 0
                    fafile = os.path.join(fasta_dir,specie,f"{cre_type}_len_{seq_len}_{train_valid}_{pos_neg}.fa")
                    # Context manager closes the FASTA handle even if a fetch fails.
                    with pysam.FastaFile(fafile) as fa:
                        merge_sample.extend((fa.fetch(reference),seq_label) for reference in fa.references)
                # Shuffle so positive and negative samples are interleaved in the CSV.
                random.shuffle(merge_sample)
                df_csv = pd.DataFrame(merge_sample,columns=['sequence','label'])
                df_csv.to_csv(csv_path,sep=",",index=False)
                if train_valid=="valid":
                    # The validation split is additionally written as test.csv
                    # (presumably because the training script expects that file -- TODO confirm).
                    output_csv = "test.csv"
                    csv_path = os.path.join(absolute_dir,output_csv)
                    df_csv.to_csv(csv_path,sep=",",index=False)

# Generate model fine-tuning script

In [None]:
def get_cmd(CUDA_VISIBLE_DEVICES,MODEL_PATH,DATA_PATH,RUN_NAME,MAX_LENGTH,LR,save_steps,OUTPUT_DIR,eval_steps,logging_steps,
            batch_size=16,num_train_epochs=20,
            train_script="/data2/zyd_workspace/2024_12_multi_species_models/dnabert-2/DNABERT_2/finetune/train.py"):
    """
    Build the shell snippet that launches one fine-tuning run.

    The snippet consists of a blank line, an ``export CUDA_VISIBLE_DEVICES=...``
    line and a single-line ``python <train_script> ...`` command, terminated by
    a newline. Values are interpolated as-is (no shell quoting is applied).

    Parameters
    ----------
    CUDA_VISIBLE_DEVICES : value exported as ``CUDA_VISIBLE_DEVICES``.
    MODEL_PATH : value of ``--model_name_or_path``.
    DATA_PATH : value of ``--data_path``.
    RUN_NAME : value of ``--run_name``.
    MAX_LENGTH : value of ``--model_max_length``.
    LR : value of ``--learning_rate``.
    save_steps : value of ``--save_steps``.
    OUTPUT_DIR : value of ``--output_dir``.
    eval_steps : value of ``--eval_steps``.
    logging_steps : value of ``--logging_steps``.
    batch_size : value used for both ``--per_device_train_batch_size`` and
        ``--per_device_eval_batch_size`` (default 16).
    num_train_epochs : value of ``--num_train_epochs`` (default 20).
    train_script : path of the training script to run.

    Returns
    -------
    str
        The shell snippet described above.
    """
    cli_args = [
        f"python {train_script}",
        f"--model_name_or_path {MODEL_PATH}",
        f"--data_path {DATA_PATH}",
        "--kmer -1",
        f"--run_name {RUN_NAME}",
        f"--model_max_length {MAX_LENGTH}",
        f"--per_device_train_batch_size {batch_size}",
        f"--per_device_eval_batch_size {batch_size}",
        "--gradient_accumulation_steps 1",
        f"--learning_rate {LR}",
        f"--num_train_epochs {num_train_epochs}",
        "--fp16",
        f"--save_steps {save_steps}",
        f"--output_dir {OUTPUT_DIR}",
        "--evaluation_strategy steps",
        f"--eval_steps {eval_steps}",
        "--warmup_steps 50",
        f"--logging_steps {logging_steps}",
        "--overwrite_output_dir True",
        "--log_level info",
        "--find_unused_parameters False",
        "--save_total_limit 20",
    ]
    # All arguments are placed on one line, separated by single spaces.
    command_line = " ".join(cli_args)
    return f"\nexport CUDA_VISIBLE_DEVICES={CUDA_VISIBLE_DEVICES}\n{command_line}\n"

In [None]:

# Settings shared by every fine-tuning run generated below.
batch_size = 16
model_path = "/data2/zyd_workspace/2024_12_multi_species_models/dnabert-2/DNABERT-2-117M"
train_data_dir = "./train_valid_csv"
train_output_dir = "./train_output"
os.makedirs(train_output_dir,exist_ok=True)
# One shell snippet per (species, element type, extension length) combination.
all_train_cmd = []
for specie in ['human','pig']:
    if specie=='pig':
        extend_lens = [256,512,1024,2048]
        cre_types = ['e1','e4e5','e10','e1_e10','e1_e4e5_e10']
    elif specie=="human":
        extend_lens =[256,512,1024,2048]
        cre_types = ['promoter','enhancer','promoter_enhancer']
    for extend_len in extend_lens:
        seq_len = extend_len*2+1
        # Model max length is a quarter of the nucleotide length, plus one
        # (presumably following the tokenizer's compression ratio -- TODO confirm).
        max_len = int(0.25*seq_len)+1
        # Shorter inputs (extend_len <= 512) run on device 1, longer ones on device 0.
        cuda_device = 1 if extend_len<=512 else 0
        for cre_type in cre_types:
            train_dir = f"{cre_type}_{seq_len}"
            run_name = f"{specie}_{train_dir}"
            data_path = os.path.join(train_data_dir,specie,train_dir)
            output_path = os.path.join(train_output_dir,specie,train_dir)
            # train.csv is comma-separated; read it with the same separator it was written with.
            df_train_csv = pd.read_csv(os.path.join(data_path,"train.csv"),sep=",")
            # Save, evaluate and log once every batch_size-sized pass over the training rows;
            # clamp to 1 so datasets smaller than one batch do not yield a step interval of 0.
            steps_per_epoch = max(1,df_train_csv.shape[0]//batch_size)
            all_train_cmd.append(get_cmd(CUDA_VISIBLE_DEVICES=cuda_device,MODEL_PATH=model_path,DATA_PATH=data_path,RUN_NAME=run_name,MAX_LENGTH=max_len,LR=3e-5,save_steps=steps_per_epoch,OUTPUT_DIR=output_path,eval_steps=steps_per_epoch,logging_steps=steps_per_epoch))

In [None]:
# Write the generated commands into two shell scripts, one per GPU:
# commands that export CUDA_VISIBLE_DEVICES=0 go to train_4090_1.sh, all others to train_4090_2.sh.
# The context manager guarantees both files are closed even if a write fails.
with open("./train_4090_1.sh","w") as f_4090_1, open("./train_4090_2.sh","w") as f_4090_2:
    for train_cmd in all_train_cmd:
        if "CUDA_VISIBLE_DEVICES=0" in train_cmd:
            f_4090_1.write(f"{train_cmd}\n")
        else:
            f_4090_2.write(f"{train_cmd}\n")