In [1]:
from datasets import load_dataset
import pandas as pd

import random
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def set_seed(seed=7):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Also sets the two cuDNN backend flags (``deterministic`` on,
    ``benchmark`` off).

    Args:
        seed: Value handed to every seeding call. Defaults to 7.
    """
    # Same seeding calls, in the same order, expressed as a loop.
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)

    # The two flag assignments are independent of each other.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_seed(7)

In [3]:
repo_id = 'kuleshov-group/cross-species-single-nucleotide-annotation'

# Map each split name to its TSV file under the repo's TIS/ folder.
tis = load_dataset(
    repo_id,
    data_files={
        'train': 'TIS/train.tsv',
        'valid': 'TIS/valid.tsv',
        'test_rice': 'TIS/test_rice.tsv',
        'test_sorghum': 'TIS/test_sorghum.tsv',
        'test_maize': 'TIS/test_maize.tsv',
    },
)

# Take the training split, convert it to pandas and keep its "sequences" column.
tis_train = tis['train']
train_df = tis_train.to_pandas()
seqs = train_df["sequences"]

In [5]:
# Start the working frame with a single "seq" column holding the sequences;
# the "positive" and "negative" columns are attached in a later cell.
df = pd.DataFrame({"seq": seqs})

In [6]:
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,seq
0,ATTGTCCTAACTCAGAGTCCTCAGCATCATCACGGATTAGAACATA...
1,CTCTTTCAGATCCTTATAGCTTCTATAAATATGATTGAGATTAAGC...
2,TCCATAGCTCTAATTGCAACAGGCGTGGGTGCGGCTGCCGGCTTTG...
3,CCTGATATACTATCGCATTGTCACAGCTCTAGCTATGGTGGTCATT...
4,GGAAGAAAAAACTAGCCTCTCAATTCAAGGCAACATGTGGCCCGAT...
...,...
198586,TTGAGATAATCTCTGCATCTATGCTACCATTTCCAGTCAAACCATA...
198587,TGTTATAATTGTTGACGTGTTATACATATTTCACAGTTGGAGGCCT...
198588,TGGTCGAGTGTGCGATGATGAAGGGACTCGACCATGTAATCGAGTG...
198589,GGTATAATGCATGATTGACTAAGCAGACAAGTTCTGATCAAGCCAC...


In [42]:
# Translation table pairing each base with its complement. Lowercase bases are
# included so they are complemented (case preserved) instead of slipping
# through untranslated. Characters without an entry (e.g. "N") are kept as-is.
_rc_map = str.maketrans("ACGTacgt", "TGCAtgca")


def reverse_complement(seq: str) -> str:
    """Return the reverse complement of a nucleotide string.

    Each A/C/G/T (upper or lower case) is swapped for its complement and the
    result is reversed. Any other character is left unchanged but still takes
    part in the reversal.

    Args:
        seq: Nucleotide sequence.

    Returns:
        The reverse-complemented sequence, same length as ``seq``.
    """
    return seq.translate(_rc_map)[::-1]

In [43]:
# Build the "positive" column: one random draw per sequence decides whether
# the reverse complement (draw > 0.5) or the sequence itself is kept.
new_seqs = []

for seq in seqs:
    rand = random.random()
    new_seqs.append(reverse_complement(seq) if rand > 0.5 else seq)

new_seqs = pd.Series(new_seqs)

# Attach "positive" first, then "negative", which starts out as the
# unmodified sequences.
df["positive"] = new_seqs
df["negative"] = seqs

In [44]:
def make_pair(df, max_mutations=30):
    """Apply random point substitutions to every sequence in ``df["negative"]``.

    For each row, a number of distinct positions between 1 and
    ``max_mutations`` is chosen at random, and the character at each position
    is replaced by a different one of A/T/G/C. The "negative" column is
    overwritten on the DataFrame that was passed in, and that same DataFrame
    is returned.

    Args:
        df: DataFrame with a "negative" column of sequence strings.
            Modified in place.
        max_mutations: Inclusive upper bound on substitutions per sequence.
            The drawn count is capped at the sequence length.

    Returns:
        The same ``df`` object with "negative" replaced by mutated sequences.

    Raises:
        ValueError: If ``max_mutations`` is less than 1 (raised by
            ``random.randint``).
    """
    nucleic_list = ['A', 'T', 'G', 'C']
    mutated = []

    # Walk the column values instead of df.loc[i, ...] for i in range(len(df)):
    # positional integers are only valid labels on a default RangeIndex, and
    # writing one cell at a time through .loc is slow on large frames.
    for seq in df["negative"]:
        seq_list = list(seq)

        # Randomly choose how many characters to mutate (1..max_mutations),
        # then cap at the sequence length so random.sample cannot raise on a
        # sequence shorter than the draw.
        num_mut = min(random.randint(1, max_mutations), len(seq_list))

        for idx in random.sample(range(len(seq_list)), num_mut):
            original = seq_list[idx]
            # Exclude the current character so every chosen position changes.
            candidates = [n for n in nucleic_list if n != original]
            seq_list[idx] = random.choice(candidates)

        mutated.append("".join(seq_list))

    # One column assignment; still mutates the caller's frame as before.
    df["negative"] = mutated
    return df


In [45]:
# make_pair overwrites df["negative"] on the frame it receives and returns that
# same frame, so ft_df and df refer to one object after this call.
ft_df = make_pair(df)

In [7]:
# Write the frame without its row index. `index` is a boolean flag, so pass
# False explicitly rather than None.
df.to_csv("./Data/plant_nucleotide.csv", index=False)

In [48]:
# Rebinds df to the contents of fine_tuning_triplet.csv, which is a different
# file from the plant_nucleotide.csv written by the to_csv cell.
# NOTE(review): no visible cell creates fine_tuning_triplet.csv — confirm where
# it is produced before relying on Restart & Run All.
df = pd.read_csv("./Data/fine_tuning_triplet.csv")

In [69]:
# .loc slices by index label and includes the end label, so this selects every
# row up to and including label 30000.
df.loc[:30000]

Unnamed: 0,anchor,positive,negative
0,ATTGTCCTAACTCAGAGTCCTCAGCATCATCACGGATTAGAACATA...,ATTGTCCTAACTCAGAGTCCTCAGCATCATCACGGATTAGAACATA...,ATTGTCCTAACTCAGAGTCCTCAGCATCATCACGGATTAGAACATA...
1,CTCTTTCAGATCCTTATAGCTTCTATAAATATGATTGAGATTAAGC...,CTCTTTCAGATCCTTATAGCTTCTATAAATATGATTGAGATTAAGC...,CTCTTTCAGATCGTTATAGCTTCTATAAATATGATTGAGATTAAGC...
2,TCCATAGCTCTAATTGCAACAGGCGTGGGTGCGGCTGCCGGCTTTG...,ACAATGGGATGATTAAAACATACTAAAATATGGATTTTGAACAACA...,TCCCTAGCTCTAATTGCAACAGGCGTGGGTGCAGCTGCTGGCTTTG...
3,CCTGATATACTATCGCATTGTCACAGCTCTAGCTATGGTGGTCATT...,CCTGATATACTATCGCATTGTCACAGCTCTAGCTATGGTGGTCATT...,CCTGATATACTATCGTATTGTCACAGCTCGAGCTATGGTGGTCAAG...
4,GGAAGAAAAAACTAGCCTCTCAATTCAAGGCAACATGTGGCCCGAT...,ACAAATCCCTAGGTAGTCTAAAAGACTGACCAAACAAACCATACCG...,GGAAGAAAAAACTAGCCTCTCAACTCAAGGCAACATGTGGCCCGAT...
...,...,...,...
29996,TAGTAATCATTCATGTCTCTATGCCTTACTTTAGACAGTTCCATGA...,TAGTAATCATTCATGTCTCTATGCCTTACTTTAGACAGTTCCATGA...,TCGTAATCATTCCTGTCTCTATGCCTTTCTTTCGACAGTTCCATGA...
29997,GAGAAGCATGAGCTTGAAGAGCCTGTGCAGCAACAATAGCCTGCGC...,GAGAAGCATGAGCTTGAAGAGCCTGTGCAGCAACAATAGCCTGCGC...,GAGAAGCATGCGCTTGAAGAGCCTGTGCAGCAACAATAGCCGGTGC...
29998,AGCAGCCAAGAAGAGTTCTACCTGATCAGCAATATCATAAACCAAT...,AATGAGAAAAAAGCAGTTGTGGGAAAGGGGACAAGCCTAGTTATCC...,TGCAGCCAAGAAGAGTTCTACCTGCTCAGAAATATCATAAACCAAT...
29999,TAGCTCCAGTCATGTTCCCCAACGCACCCATTATACTTCCCTTCCC...,TAGCTCCAGTCATGTTCCCCAACGCACCCATTATACTTCCCTTCCC...,TAGCTCCAGTCATGTTCCCCAACGCACCCATTATACTTCCCTTCCC...


In [61]:
# Transpose the first two rows into one tuple per column. Slicing before the
# list conversion means only those two rows become Python lists, not the whole
# frame. Unpacking into three names requires df to have exactly three columns.
anchors, poss, negs = zip(*df.iloc[:2].values.tolist())

In [68]:
# Inspect the second tuple from the zip unpacking (one value per selected row).
poss

('ATTGTCCTAACTCAGAGTCCTCAGCATCATCACGGATTAGAACATACTTTCCTTTCTTTTGTTCCTCTAAATCTGCTAACAACATATTCAAACATTCACAAACAGTAAGAGATAAACAGCAAATAATGGTGTCACAAGAATCATGTAAACAAAAAGCTGAGTCCAACATTCACAAGGAACTCAAATAGTAAAAGATAAAAAACAAACAATGGTGTCACAAGAGAAATGGACAATTTGTGAACATACTCGAATTCATGGCGAAAGAAGTGCAATTTTGTAGCGCCGATTAAACTTCCTCTCACCAAGTAACCTGAACATCATTCAATCAGAAAATTGTCTCAGACAAAAATGATTAAATGTAAAAAAGACAAAGCTGAGTTTTTTTTCTCAAATGTCAGCTAATAAGAATCAGGAGAAATCTCAGAAAAGCTACATTTTCCGATATCCAAAAATTATTATGATGATCCAATACAGAAGATGAAAAAAAACGAACTTAATATAAACCCTAAGAC',
 'CTCTTTCAGATCCTTATAGCTTCTATAAATATGATTGAGATTAAGCAGATGACCTTATGAACAGCTTCTGCGGGACTTGGCCTTCTTCAGTTCTGCAACTTAAACAGCTTTAGGAATAAATTTATCCTCGGCTTCTCCATCTTCATTGGACTCTCTGTGGCGCAATACTTCACCGAATATCTATTCATCTCTGGTCGTGGACCTGTCCACACTCGCACTTCTGCTGTAAGTGTTCAAGAGAAACAGAAACAATCATTTTCTTTTTGTCTTGTCCTCTTTATTGATTCGTATTTGGTGTTGCCAACGACAGTTCAACGTGATAATGCAAGTGATATTCTCTTCCGCTGCAACGGTTGGGATAATGGCAGCGTTCTTGTTGGACTGTACTCATAGCTATGGACATGCCTCGGTGAGGAGAGACAGCGGAAGACATTGGTGGGAGAAATTCAGAGTCTACCACACTGATACTCGAACAGAAGAA