In [2]:
from datasets import load_dataset
import pandas as pd

import random
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def set_seed(seed=7):
    """Seed every random number generator used in this notebook.

    The same seed is applied to Python's ``random`` module, NumPy's global
    generator, and PyTorch (CPU generator and all CUDA devices). cuDNN is then
    switched to deterministic mode with benchmark auto-tuning turned off.

    Args:
        seed: Integer seed shared by all generators. Defaults to 7.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags: deterministic mode on, benchmark auto-tuning off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed once up front so the random draws made by later cells are repeatable.
set_seed(7)

In [11]:
# Hugging Face Hub repository the data is pulled from.
repo_id = "InstaDeepAI/nucleotide_transformer_downstream_tasks"

# Load the "train" split, cached under F:/huggingface. `tis` and `tis_train`
# are two names for the same dataset object.
tis = tis_train = load_dataset(repo_id, split="train", cache_dir="F:/huggingface")

# Convert to a DataFrame and keep the "sequence" column as the working Series.
train_df = tis_train.to_pandas()
seqs = train_df["sequence"]

In [12]:
# Working frame: starts with a single "seq" column holding the raw sequences
# (it takes over the index of `seqs`).
df = pd.DataFrame({"seq": seqs})

In [13]:
# Bare expression: the notebook renders the current frame as this cell's output.
df

Unnamed: 0,seq
0,TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGG...
1,CAGTAGTGGCATAAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...
2,TTTCCGATAAGCTTCAGCCCCGGCAACGCTAAAAATAGTATCATTC...
3,CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGAAGTGTGC...
4,TTTTTATTTAGTCGACTATAAAGGTGGAAGTCCATACTTAAGAGAT...
...,...
461845,TCCTTTCCCAATTACATGCACAGGCAGCAAGTAAATATTAGAGCCC...
461846,TCGCTTGAGATCACCAACCTGCGGCTGCTCTGGGCATCGCATCGGT...
461847,ATCTAGAGGAAAGCATCCGTGGGAAGGGCCTCACTGAAATGGAAGA...
461848,CATGGGGTAGATTAAATGGACTCATGGATGTAAAGTGCTTAGCATA...


In [14]:
# Translation table mapping each base to its complement, in both upper and
# lower case so soft-masked (lower-case) input is complemented as well.
# Characters that are not listed (for example "N") pass through unchanged.
_rc_map = str.maketrans("ACGTacgt", "TGCAtgca")


def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a DNA sequence.

    Each of A/C/G/T (and a/c/g/t) is replaced by its complement with case
    preserved, then the string is reversed. Any other character is kept as-is.

    Args:
        seq: Nucleotide string.

    Returns:
        The reverse-complemented string, the same length as ``seq``.
    """
    return seq.translate(_rc_map)[::-1]

In [15]:
# Build the "positive" column: one random.random() draw is made per sequence,
# in order; a draw above 0.5 swaps the sequence for its reverse complement,
# otherwise the sequence is kept unchanged.
new_seqs = pd.Series(
    [reverse_complement(s) if random.random() > 0.5 else s for s in seqs]
)

df["positive"] = new_seqs
# "negative" starts out as the unmodified sequences.
df["negative"] = seqs

In [16]:
def make_pair(df, max_mutations=30):
    """Point-mutate every sequence in ``df["negative"]``.

    For each row a mutation count is drawn uniformly from
    ``1..min(max_mutations, len(seq))``, that many distinct positions are
    sampled, and the character at each position is replaced by a randomly
    chosen different nucleotide from A/T/G/C.

    The frame is modified in place (its "negative" column is overwritten) and
    the same object is returned. Rows are processed by position, so the frame
    may carry any index.

    Args:
        df: DataFrame with a "negative" column of sequence strings.
        max_mutations: Upper bound on substitutions per sequence. It is
            clamped to the sequence length so short sequences do not make
            ``random.sample`` fail.

    Returns:
        ``df`` itself, with the mutated "negative" column.

    Raises:
        ValueError: From ``random.randint`` if ``max_mutations`` is below 1 and
            at least one non-empty sequence is present.
    """
    nucleic_list = ['A', 'T', 'G', 'C']
    mutated = []

    # Read the column once as a plain list: this works for any index and
    # avoids a per-cell .loc lookup/assignment on every row.
    for seq in df["negative"].tolist():
        seq_len = len(seq)

        # Nothing to mutate in an empty sequence; keep it unchanged.
        if seq_len == 0:
            mutated.append(seq)
            continue

        # Randomly pick how many characters to mutate (1..max_mutations),
        # never more than the number of available positions.
        num_mut = random.randint(1, min(max_mutations, seq_len))
        idx_list = random.sample(range(seq_len), num_mut)

        seq_list = list(seq)
        for idx in idx_list:
            original = seq_list[idx]
            # Exclude the current character so every chosen position changes.
            candidates = [n for n in nucleic_list if n != original]
            seq_list[idx] = random.choice(candidates)

        mutated.append("".join(seq_list))

    # Single column assignment; still mutates the caller's frame in place.
    df["negative"] = mutated
    return df


In [17]:
# make_pair overwrites df's "negative" column and returns the same object, so
# after this call ft_df and df refer to one and the same DataFrame.
ft_df = make_pair(df)

In [18]:
# Bare expression: render the frame with its seq / positive / negative columns.
ft_df

Unnamed: 0,seq,positive,negative
0,TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGG...,TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGG...,TCACTTCGATTATTGAGGCAGCCTTCATTAAAGTTTATTACAATGG...
1,CAGTAGTGGCATAAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...,CAGTAGTGGCATAAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...,CAGGAGTGGCATTAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...
2,TTTCCGATAAGCTTCAGCCCCGGCAACGCTAAAAATAGTATCATTC...,TGTTAGATACTTTGAGTATTGAAAAATTTGCCATGTATATATTTGA...,TTTCCGATAAGCTTCAGCCCCGGCAACGCTAAAAATAGTATCATTC...
3,CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGAAGTGTGC...,CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGAAGTGTGC...,CCGTTTGGAGTAATGAGCGGTTAAGCATGTTCTTGGTGAAGTGTGC...
4,TTTTTATTTAGTCGACTATAAAGGTGGAAGTCCATACTTAAGAGAT...,ATCATCAAACCTATTCTTCGAGCTTGAAGTATCACTTTGAGATTTT...,TTTTTATTTAGTCGACTATAAAGGTGGAAGTCCATACTTAAGAGTT...
...,...,...,...
461845,TCCTTTCCCAATTACATGCACAGGCAGCAAGTAAATATTAGAGCCC...,CTGTGAAAAAGGGGGCAGCCTGCTTCCCTTGGCTCAGTGTTATCAA...,TCCTTTCCCAATTACATGCACCGGCAGCAAGTAAATATTAGAGCCC...
461846,TCGCTTGAGATCACCAACCTGCGGCTGCTCTGGGCATCGCATCGGT...,TAGCCGCCGCTCGACACGGACGTCTCAATCACGAGCGCGGGGCCGA...,TCGCTTGAGAGCACCAACCGGCGGCTGCTCTGGGCATCGCATCGAT...
461847,ATCTAGAGGAAAGCATCCGTGGGAAGGGCCTCACTGAAATGGAAGA...,ATCTAGAGGAAAGCATCCGTGGGAAGGGCCTCACTGAAATGGAAGA...,ATCTAGAGGAAAGCATTCGTGGGAAGGGCCTCACTGAAATGGAAGA...
461848,CATGGGGTAGATTAAATGGACTCATGGATGTAAAGTGCTTAGCATA...,ATTTGGGGTCAGCAGTTTGAGACCACCCTGTCCAAGACTGTGAAAC...,CATGGGGTAGATTAAATGGACTCATTGATGTAAAGTGCTTAGCATA...


In [19]:
# Write the triplet frame to CSV. index=None is passed for the index option;
# the frame read back below shows only seq/positive/negative, i.e. no index
# column was written. NOTE(review): assumes the ./Data directory already
# exists — confirm before running on a fresh checkout.
ft_df.to_csv("./Data/triplet_data.csv", index=None)

In [20]:
# Reload the saved triplets from disk. This rebinds `df`, replacing the
# in-memory frame built by the earlier cells.
df = pd.read_csv("./Data/triplet_data.csv")

In [21]:
# Preview a leading slice of the frame. .loc slices by label and includes the
# end label, so this selects labels 0 through 30000 (30001 rows on the default
# integer index produced by read_csv).
df.loc[:30000]

Unnamed: 0,seq,positive,negative
0,TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGG...,TCACTTCGATTATTGAGGCAGTCTTCATTAAAGTTTATTACAATGG...,TCACTTCGATTATTGAGGCAGCCTTCATTAAAGTTTATTACAATGG...
1,CAGTAGTGGCATAAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...,CAGTAGTGGCATAAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...,CAGGAGTGGCATTAACCCAAGGAACAGAGCCAGTGGTACTCCATCC...
2,TTTCCGATAAGCTTCAGCCCCGGCAACGCTAAAAATAGTATCATTC...,TGTTAGATACTTTGAGTATTGAAAAATTTGCCATGTATATATTTGA...,TTTCCGATAAGCTTCAGCCCCGGCAACGCTAAAAATAGTATCATTC...
3,CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGAAGTGTGC...,CCGTTTGGAGTAATGAGCGGTTAAACTTGTTCTTGGTGAAGTGTGC...,CCGTTTGGAGTAATGAGCGGTTAAGCATGTTCTTGGTGAAGTGTGC...
4,TTTTTATTTAGTCGACTATAAAGGTGGAAGTCCATACTTAAGAGAT...,ATCATCAAACCTATTCTTCGAGCTTGAAGTATCACTTTGAGATTTT...,TTTTTATTTAGTCGACTATAAAGGTGGAAGTCCATACTTAAGAGTT...
...,...,...,...
29996,GACAGTTACCGAAAGCAGCAGCAATGGAGAAGTTTGGAGAGATTGG...,GACAGTTACCGAAAGCAGCAGCAATGGAGAAGTTTGGAGAGATTGG...,GACAGTTACGGAAAGTCGCAGCAATGGAGAAGTTTGGAGAGATTGG...
29997,GAAAAGAAGATATGGTTGCCAAATAGCGATAAGCTGAACAGTTGGG...,GAAAAGAAGATATGGTTGCCAAATAGCGATAAGCTGAACAGTTGGG...,GAAAAGAAGACATGGTTGCCAAATAGCGATAAGCTGAACAGTTGGG...
29998,TATCGCAAATAAGCAGTTCCCAGCAAGCAATTAAAACGTATATAAG...,ACGAAGGACATGCCTCATGGGAGCAGACTTCTGTAGATAATGTCAA...,TATCGCAAATAAGCAGTTCCCAGCCAGCAATTAAAACGTATATAAG...
29999,TTCATCCTGAACTTGTTTGTTTCATTCATTGTTAACAACCAAGCAA...,TTCATCCTGAACTTGTTTGTTTCATTCATTGTTAACAACCAAGCAA...,TTCATCCTGAACTTGTTTGATTCATTCATTGTTAAGAACCAAGCAA...


In [61]:
# Unpack the first two rows column-wise into three parallel tuples. Columns
# are taken positionally, so df must hold exactly three columns in the order
# (seq, positive, negative). Slice with iloc *before* converting so only two
# rows are turned into Python lists rather than the whole frame.
anchors, poss, negs = zip(*df.iloc[:2].values.tolist())

In [68]:
# Bare expression: show the tuple of positive sequences unpacked above.
poss

('ATTGTCCTAACTCAGAGTCCTCAGCATCATCACGGATTAGAACATACTTTCCTTTCTTTTGTTCCTCTAAATCTGCTAACAACATATTCAAACATTCACAAACAGTAAGAGATAAACAGCAAATAATGGTGTCACAAGAATCATGTAAACAAAAAGCTGAGTCCAACATTCACAAGGAACTCAAATAGTAAAAGATAAAAAACAAACAATGGTGTCACAAGAGAAATGGACAATTTGTGAACATACTCGAATTCATGGCGAAAGAAGTGCAATTTTGTAGCGCCGATTAAACTTCCTCTCACCAAGTAACCTGAACATCATTCAATCAGAAAATTGTCTCAGACAAAAATGATTAAATGTAAAAAAGACAAAGCTGAGTTTTTTTTCTCAAATGTCAGCTAATAAGAATCAGGAGAAATCTCAGAAAAGCTACATTTTCCGATATCCAAAAATTATTATGATGATCCAATACAGAAGATGAAAAAAAACGAACTTAATATAAACCCTAAGAC',
 'CTCTTTCAGATCCTTATAGCTTCTATAAATATGATTGAGATTAAGCAGATGACCTTATGAACAGCTTCTGCGGGACTTGGCCTTCTTCAGTTCTGCAACTTAAACAGCTTTAGGAATAAATTTATCCTCGGCTTCTCCATCTTCATTGGACTCTCTGTGGCGCAATACTTCACCGAATATCTATTCATCTCTGGTCGTGGACCTGTCCACACTCGCACTTCTGCTGTAAGTGTTCAAGAGAAACAGAAACAATCATTTTCTTTTTGTCTTGTCCTCTTTATTGATTCGTATTTGGTGTTGCCAACGACAGTTCAACGTGATAATGCAAGTGATATTCTCTTCCGCTGCAACGGTTGGGATAATGGCAGCGTTCTTGTTGGACTGTACTCATAGCTATGGACATGCCTCGGTGAGGAGAGACAGCGGAAGACATTGGTGGGAGAAATTCAGAGTCTACCACACTGATACTCGAACAGAAGAA