In [17]:
from datasets import load_dataset
import pandas as pd

import random
import numpy as np
import torch

In [18]:
def set_seed(seed=7):
    """Seed every random number generator this notebook uses.

    Args:
        seed: Integer handed to Python's ``random``, NumPy's global RNG,
            PyTorch's default generator and PyTorch's CUDA seeding call.
    """
    # Apply the same seed to each library, in a fixed order.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # Turn off cuDNN benchmark mode and request deterministic algorithms.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(7)

In [19]:
# Hugging Face repository to pull the train split from.
# NOTE(review): cache_dir is an absolute drive-letter path; adjust per machine.
repo_id = "InstaDeepAI/nucleotide_transformer_downstream_tasks"
tis = tis_train = load_dataset(
    repo_id,
    split="train",
    cache_dir="F:/huggingface",
)

# Convert the split to pandas and keep a handle on the raw sequence column.
train_df = tis_train.to_pandas()
seqs = train_df["sequence"]

In [20]:
def mutate_sequence(seq, num_mutations):
    """Return a copy of ``seq`` with ``num_mutations`` point substitutions.

    Distinct positions are drawn with ``random.sample``; at each one the
    current character is replaced by a different base picked from A/C/G/T.

    Args:
        seq: Sequence string to mutate.
        num_mutations: Number of distinct positions to substitute.

    Returns:
        The mutated sequence as a new string of the same length.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``num_mutations``
            is negative or larger than ``len(seq)``.
    """
    alphabet = ['A', 'C', 'G', 'T']
    bases = list(seq)

    # Choose the positions to mutate (no repeats).
    positions = random.sample(range(len(bases)), num_mutations)
    for pos in positions:
        current = bases[pos]
        # Swap in any base other than the one currently at this position.
        bases[pos] = random.choice([b for b in alphabet if b != current])

    return "".join(bases)

def build_triplet_dataset(seqs, pos_range=(0,5), neg_range=(10,30)):
    """Build (anchor, positive, negative) triplets from raw sequences.

    Each input sequence is used unchanged as the anchor. The positive is the
    anchor with a mutation count drawn from ``pos_range``; the negative is the
    anchor with a mutation count drawn from ``neg_range``. Both ranges are
    inclusive ``(low, high)`` pairs passed to ``random.randint``.

    Args:
        seqs: Iterable of sequence strings.
        pos_range: Inclusive bounds for the positive's mutation count.
        neg_range: Inclusive bounds for the negative's mutation count.

    Returns:
        A DataFrame with one row per input sequence and the columns
        "anchor", "positive", "negative", "pos_mutations", "neg_mutations".
    """
    def _triplet_row(anchor):
        # Positive: few mutations.
        n_pos = random.randint(*pos_range)
        positive = mutate_sequence(anchor, n_pos)

        # Negative: many mutations.
        n_neg = random.randint(*neg_range)
        negative = mutate_sequence(anchor, n_neg)

        return {
            "anchor": anchor,
            "positive": positive,
            "negative": negative,
            "pos_mutations": n_pos,
            "neg_mutations": n_neg
        }

    return pd.DataFrame([_triplet_row(anchor) for anchor in seqs])

# Build one triplet per training sequence using the default mutation ranges,
# then print the first rows as a quick sanity check.
df = build_triplet_dataset(seqs)
print(df.head())

                                              anchor  \
0  TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGG...   
1  CAGTAGTGGCATAAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...   
2  TTTCCGATAAGCTTCAGCCCCGGCAACGCTAAAAATAGTATCATTC...   
3  CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGAAGTGTGC...   
4  TTTTTATTTAGTCGACTATAAAGGTGGAAGTCCATACTTAAGAGAT...   

                                            positive  \
0  TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGG...   
1  CAGTAGTGGCATAAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...   
2  TTTCCGATAAGCTTCAGCCCCGGCAACGCTAAAAATACTATCATTC...   
3  CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGAAGTGTGC...   
4  TTTTTATTTAGTCGACTATAAAGGTGGAAGTCCATACTTAAGAGAT...   

                                            negative  pos_mutations  \
0  TCACTTCGATTATTGAGGCCGTCTTCATTTAAGTTTAATACAATGG...              2   
1  CAGTAGTGGCATAAACCCAAGGAGCCGAGCCTGTGGTACTCCATCC...              1   
2  TTTCCGATAAGCTTCAGCCCTGGCAACGCTAAAAAAAGTCTCATTC...              5   
3  CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGA

In [16]:
# Length of the longest anchor sequence, in characters. The triplet frame
# produced by build_triplet_dataset stores the source sequence under "anchor"
# (it has no "seq" column), so that is the column measured here.
int(df["anchor"].str.len().max())

600

In [49]:
# Complement pairs for the four bases plus the IUPAC ambiguity codes
# (R<->Y, K<->M, B<->V, D<->H). S, W and N are their own complement and are
# therefore left out of the table; any other character passes through as-is.
_RC_FROM = "ACGTRYKMBVDH"
_RC_TO = "TGCAYRMKVBHD"

# Cover both cases so lowercase input is complemented instead of being
# silently reversed only.
_rc_map = str.maketrans(_RC_FROM + _RC_FROM.lower(), _RC_TO + _RC_TO.lower())

def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a nucleotide string.

    Args:
        seq: Sequence of bases; upper- or lowercase A/C/G/T and IUPAC
            ambiguity codes are complemented, other characters are kept.

    Returns:
        The complemented sequence read in the opposite direction, with the
        case of each character preserved.
    """
    return seq.translate(_rc_map)[::-1]

In [50]:
# For every sequence, draw one random number and keep either the sequence
# itself or its reverse complement (taken when the draw is above 0.5).
new_seqs = []

for seq in seqs:
    rand = random.random()
    new_seqs.append(reverse_complement(seq) if rand > 0.5 else seq)

new_seqs = pd.Series(new_seqs)

# Overwrite both columns of df: "positive" receives the (possibly
# reverse-complemented) sequences, "negative" the untouched originals.
df["positive"], df["negative"] = new_seqs, seqs

In [51]:
def make_pair(df, max_mutations=30):
    """Replace each "negative" sequence with a randomly point-mutated copy.

    For every row, between 1 and ``max_mutations`` distinct positions are
    drawn and the character at each one is substituted by a different base
    from A/T/G/C.

    Rows are processed by position and the column is written back in a single
    assignment, so the function works for any index (not only a default
    0..n-1 index) and avoids one ``.loc`` write per row.

    Args:
        df: DataFrame with a "negative" column of sequence strings. It is
            modified in place.
        max_mutations: Inclusive upper bound on substitutions per sequence.

    Returns:
        The same ``df`` object, with its "negative" column overwritten.

    Raises:
        ValueError: Propagated from ``random.sample`` when the drawn mutation
            count exceeds the length of a sequence.
    """
    nucleic_list = ['A', 'T', 'G', 'C']
    mutated = []

    for seq in df["negative"]:
        seq_len = len(seq)

        # Draw how many characters to mutate (1..max_mutations), then where.
        num_mut = random.randint(1, max_mutations)
        idx_list = random.sample(range(seq_len), num_mut)

        seq_list = list(seq)
        for idx in idx_list:
            original = seq_list[idx]
            # Substitute with any base other than the current one.
            candidates = [n for n in nucleic_list if n != original]
            seq_list[idx] = random.choice(candidates)

        mutated.append("".join(seq_list))

    # A plain list is assigned positionally, independent of the index labels.
    df["negative"] = mutated

    return df


In [52]:
# make_pair rewrites the "negative" column of df in place and returns that
# same frame, so ft_df and df refer to one object after this call.
ft_df = make_pair(df)

In [53]:
# Display the frame returned by make_pair.
ft_df

Unnamed: 0,seq,positive,negative
0,TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGG...,TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGG...,TCACTTCGATTATTGAGGCAGCCTTCATTAAAGTTTATTACAATGG...
1,CAGTAGTGGCATAAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...,CAGTAGTGGCATAAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...,CAGGAGTGGCATTAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...
2,TTTCCGATAAGCTTCAGCCCCGGCAACGCTAAAAATAGTATCATTC...,TGTTAGATACTTTGAGTATTGAAAAATTTGCCATGTATATATTTGA...,TTTCCGATAAGCTTCAGCCCCGGCAACGCTAAAAATAGTATCATTC...
3,CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGAAGTGTGC...,CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGAAGTGTGC...,CCGTTTGGAGTAATGAGCGGTTAAGCATGTTCTTGGTGAAGTGTGC...
4,TTTTTATTTAGTCGACTATAAAGGTGGAAGTCCATACTTAAGAGAT...,ATCATCAAACCTATTCTTCGAGCTTGAAGTATCACTTTGAGATTTT...,TTTTTATTTAGTCGACTATAAAGGTGGAAGTCCATACTTAAGAGTT...
...,...,...,...
461845,TCCTTTCCCAATTACATGCACAGGCAGCAAGTAAATATTAGAGCCC...,CTGTGAAAAAGGGGGCAGCCTGCTTCCCTTGGCTCAGTGTTATCAA...,TCCTTTCCCAATTACATGCACCGGCAGCAAGTAAATATTAGAGCCC...
461846,TCGCTTGAGATCACCAACCTGCGGCTGCTCTGGGCATCGCATCGGT...,TAGCCGCCGCTCGACACGGACGTCTCAATCACGAGCGCGGGGCCGA...,TCGCTTGAGAGCACCAACCGGCGGCTGCTCTGGGCATCGCATCGAT...
461847,ATCTAGAGGAAAGCATCCGTGGGAAGGGCCTCACTGAAATGGAAGA...,ATCTAGAGGAAAGCATCCGTGGGAAGGGCCTCACTGAAATGGAAGA...,ATCTAGAGGAAAGCATTCGTGGGAAGGGCCTCACTGAAATGGAAGA...
461848,CATGGGGTAGATTAAATGGACTCATGGATGTAAAGTGCTTAGCATA...,ATTTGGGGTCAGCAGTTTGAGACCACCCTGTCCAAGACTGTGAAAC...,CATGGGGTAGATTAAATGGACTCATTGATGTAAAGTGCTTAGCATA...


In [21]:
# Write df to CSV without the row index. `index` is documented as a bool, so
# pass False explicitly rather than relying on None being falsy.
df.to_csv("./Data/triplet_data.csv", index=False)

In [22]:
# Reload the saved CSV; this rebinds df to the frame parsed from disk.
df = pd.read_csv("./Data/triplet_data.csv")

In [28]:
# Preview a few triplets as plain Python lists. Only the first rows are
# converted: turning the whole frame into a list would echo every sequence
# into the cell output.
df[["anchor", "positive", "negative"]].head(3).values.tolist()

[['TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGGATATGGTATCACCAGTCTTGAACCTACAATCATCTATTTTAGGTGAGCTCGTAGGCATTATTGGAAAAGTGTTCTTTCTCTTAATAGAAGAGATTAAATACCCGATAATCACACCCAAAATTATTGTGGATGCCCAGATATCTTCTTGGTCATTGTTTTTTTTCGCTTCAATCTGTAATCTCTCTGCAAAATTTCGGGAGCCAATAGTGACAACATCGTCAATAATAAGTTTGATGGAATCGGAAAAAGATCTTAAAAATGTAAATGAGTATTTCCAAATAATGGCCAAAATGCTCTTTATATTGGAAAATAAAATAGTTGTTTCGCTCTTCGTAGTATTTAACATTTCCGTTCTTATCATTGTAAAGTCTGAGCCATATTCATATGGAAAAGTGCTTTTTAAACCTAGTTCCTCCATATTTTAGTTTTTTATCGATATTGGAAAAAAAAGAGC',
  'TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGGATATGGTATCACCAGTCTTGAACCTACAATCTTCTATTTTAGGTGAGCTCGTAGGCATTATTGGAAAAGTGTTCTTTCTCTTAATAGAAGAGATTAAATACCCGATAATCACACCCAAAATTATTGTGGATGCCCAGATATCTTCTTGGTCATTGTTTTTTTTCGCTTCAATCTGTAATCTCTCTGCAAAATTTCGGGAGCCAATAGTGACAACATCGTCAATAATAAGTTTGATGGAATCGGAAAAAGATCTTAAAAATGTAAATGAGTATTTCCAAATAATGGCCAAAATGCTCTTTATATTGGAAAATAAAATAGTTGTTTCGCTCTTCGTAGTATTTAACATTTCCGTTCTTATCATTGTAAAGTCTGAGCCATATTCATATGGAAAAGTGCTTTTTAAACCTAGTTCCTCCATATTTTAGTTTTTTATCGATATCGGAAA

In [61]:
# Transpose the first two triplets into per-role tuples. The three sequence
# columns are selected by name because unpacking into three names fails when
# the frame carries extra columns (e.g. the mutation counts). Slicing before
# the conversion keeps it to two rows instead of the whole frame.
anchors, poss, negs = zip(*df[["anchor", "positive", "negative"]].iloc[0:2].values.tolist())

In [68]:
# Show the tuple of "positive" sequences unpacked in the previous cell.
poss

('ATTGTCCTAACTCAGAGTCCTCAGCATCATCACGGATTAGAACATACTTTCCTTTCTTTTGTTCCTCTAAATCTGCTAACAACATATTCAAACATTCACAAACAGTAAGAGATAAACAGCAAATAATGGTGTCACAAGAATCATGTAAACAAAAAGCTGAGTCCAACATTCACAAGGAACTCAAATAGTAAAAGATAAAAAACAAACAATGGTGTCACAAGAGAAATGGACAATTTGTGAACATACTCGAATTCATGGCGAAAGAAGTGCAATTTTGTAGCGCCGATTAAACTTCCTCTCACCAAGTAACCTGAACATCATTCAATCAGAAAATTGTCTCAGACAAAAATGATTAAATGTAAAAAAGACAAAGCTGAGTTTTTTTTCTCAAATGTCAGCTAATAAGAATCAGGAGAAATCTCAGAAAAGCTACATTTTCCGATATCCAAAAATTATTATGATGATCCAATACAGAAGATGAAAAAAAACGAACTTAATATAAACCCTAAGAC',
 'CTCTTTCAGATCCTTATAGCTTCTATAAATATGATTGAGATTAAGCAGATGACCTTATGAACAGCTTCTGCGGGACTTGGCCTTCTTCAGTTCTGCAACTTAAACAGCTTTAGGAATAAATTTATCCTCGGCTTCTCCATCTTCATTGGACTCTCTGTGGCGCAATACTTCACCGAATATCTATTCATCTCTGGTCGTGGACCTGTCCACACTCGCACTTCTGCTGTAAGTGTTCAAGAGAAACAGAAACAATCATTTTCTTTTTGTCTTGTCCTCTTTATTGATTCGTATTTGGTGTTGCCAACGACAGTTCAACGTGATAATGCAAGTGATATTCTCTTCCGCTGCAACGGTTGGGATAATGGCAGCGTTCTTGTTGGACTGTACTCATAGCTATGGACATGCCTCGGTGAGGAGAGACAGCGGAAGACATTGGTGGGAGAAATTCAGAGTCTACCACACTGATACTCGAACAGAAGAA