In [46]:
import pprint
from datasets import load_dataset

In [47]:
def load_and_print_example(dataset_name, task_name):
    """Load one config of a Hugging Face dataset and print a short preview.

    Prints, in order: a banner with the task name, the name and size of every
    split, the length of the first train sequence, and the first train example.

    Args:
        dataset_name: Repository id passed to ``load_dataset``.
        task_name: Config (task) name passed to ``load_dataset``.

    Returns:
        None. Everything is reported on stdout.

    Notes:
        A ``ValueError`` raised by ``load_dataset`` is reported and swallowed so
        that a bad config name does not abort the notebook. A missing or empty
        ``train`` split is reported instead of raising.
    """
    print(f"=== Loading task: {task_name} ===")
    try:
        # NOTE(security): trust_remote_code=True allows the repository's own
        # loading script to run locally; only use it with repositories you trust.
        dataset = load_dataset(dataset_name, task_name, trust_remote_code=True)
    except ValueError as e:
        print(f"Could not load config '{task_name}': {e}\n")
        return

    # Print the available splits
    for split_name in dataset:
        print("Split name:", split_name, " Size:", len(dataset[split_name]))

    # Check for the train split *before* touching it; indexing first would
    # raise a KeyError that the ValueError handler above does not catch.
    if "train" not in dataset:
        print(f"No train split found for task {task_name}.\n")
        return

    train_split = dataset["train"]
    if len(train_split) == 0:
        print(f"Train split for task {task_name} is empty.\n")
        return

    # Fetch row 0 once and reuse it, rather than reading the whole
    # 'sequence' column just to measure its first entry.
    example = train_split[0]
    print("Sequence length :", len(example["sequence"]))

    print(f"\nSample from 'train' split of {task_name}:")
    pprint.pprint(example)

In [48]:
# Hub repository that hosts every downstream task as a separate config.
dataset_name = "InstaDeepAI/nucleotide_transformer_downstream_tasks_revised"

# Config names to inspect. The order matters: later cells index this list
# positionally (tasks[0] ... tasks[17]).
tasks = [
    "H2AFZ",
    "H3K27ac",
    "splice_sites_donors",
    "splice_sites_acceptors",
    "H3K27me3",
    "H3K36me3",
    "H3K4me1",
    "splice_sites_all",
    "H3K4me2",
    "H3K4me3",
    "enhancers_types",
    "promoter_no_tata",
    "H3K9ac",
    "H3K9me3",
    "promoter_tata",
    "enhancers",
    "H4K20me1",
    "promoter_all",
]


In [49]:
# Preview the 'H2AFZ' config (tasks[0]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[0])

=== Loading task: H2AFZ ===
Split name: train  Size: 30000
Split name: test  Size: 3000
Sequence length : 1000

Sample from 'train' split of H2AFZ:
{'label': 0,
 'name': 'chr8:16774000-16775000|0',
 'sequence': 'ATATTTTTCGGTGTTTTTTTAAAATCCAGAAAAGGTTAATTGTTTTTTATATTGACCATGTTTTCTGTAAATTCTTTCATGCTTTATCTTTTTTCTTTGCATTATCCGAGTTATTTCCAGCCTGTTTGTCATTGGTTTAATTTTCAACAATGCCTACCTCACTATTTTTTATTAGTTCCAATATTATGTCTTTTTATACTCAATTCATTTCCCTAGTTCTCCCATACTATTCTCTGGTCTGTTATCTTTTCATCTTTGAGTTTTGGTTTTATATAGTCTATACACTTTATAATTTCTTTAAAAATGCAAAGCATTAGTTGTGAATGTTTTTCTTTTGTTTTCTAGGGCGTGTTCTCATTTAGAGTACTACGCATGAGAAGGTCATTTATTTCATTTTTTAAATAATCATAGGAACTTCTCATAGTTGTTATCCTGTCTCTTCTCATTTCACTCATATTAGACAAGGCAGCTCTTCTCTTTGCCCATGTGTGAGCTCTCCTTGAATCTCTTCTTATCTGCGATCTAAGAGTGTTGTCTTACCTTCAGATGGTCAACGTGAAGAATGGGTATTGATAGGTCACACTCACTCTGCTCTTTGTTTATCTGAGATTGAGAAGCTAGAGAGAGGAAGCTATGGGCAGTTAGGAAGAAGTTGGTCTCTTCTCTATTTTCCTATTGATTTTATAAAATCCTGTTCTGTATTTTGTTTTTTCTAGTGAAATGTTGACTTCTCTCCTTAATTCAGAGTGAGCTGCTGAATCTTCCCTCTACAAAATATCACTCATTCT

In [50]:
# Preview the 'H3K27ac' config (tasks[1]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[1])

=== Loading task: H3K27ac ===
Split name: train  Size: 30000
Split name: test  Size: 1616
Sequence length : 1000

Sample from 'train' split of H3K27ac:
{'label': 0,
 'name': 'chr6:161414000-161415000|0',
 'sequence': 'AGCAGCCAGGCAGAGCGGTGGGACGTGAAACTCCTCGACAGCACTGTGTCCTGGGCCGAGGTGGGGAGGGTCAGTGAGGGCAGGGTACTGGTGTCATTTTAATTCAGACTTCTCTCTTTCTTTGCTGCAAATGGCTCTCTGGAGCACAAACTCTCAACAAGGTGGAAACTCGACACTATGCTCTCAATCTCCATGCACAATTTTGCAGAAGAGGCTCTTTCCAGCATTTTCTCCCCAGATGTCAGAGCCGTGCACCCTTCTCCGGAAGGCTGGAGGGTGTTCAGCGTTGTTTGTCACAGGCAGCGGCCAGTCTCTGTGTTCTCGGGCGCTCTGTGTCAGGTCCTGGTATTTATTCCCAGATTCTGTTTAGATGCACCCGAGGCCCCAGGCAAAGGTTTCTGCCTGAGACAGTCCTTGGCCTCCAGCCACGGCTTCCGTGCCATGTCCCAAATGCAAAGTGCCACTCTCAAATGTTCTCTGATCTCCTGTTCGAAGCAAATAACAGAGTGCTAAAATCTTATTTAGAGGAGTCGTGGTAGAATTATTTATTCACTTCTAGCCTTTTGGCTGAGACTGTTTTGAAAAATAGAAAATGGCAGGTAATTCAGGTGTCACCCTTCAATGGCATGTGTTCTTCTTGATGTTTCACAGCTACTGCACATCTGATGAAGAGGATTTTTTTTAACTGAGTAAGGTCATCTTTTCCTGATGAAGCCCCAAAGTGCAAAAGGCATGAAAGGCACATTATGCTTCTCCACCAGAGGAGGGTCTCCGCAGGCAGG

In [51]:
# Preview the 'splice_sites_donors' config (tasks[2]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[2])

=== Loading task: splice_sites_donors ===
Split name: train  Size: 30000
Split name: test  Size: 3000
Sequence length : 600

Sample from 'train' split of splice_sites_donors:
{'label': 1,
 'name': 'chr17:78900668-78901268|1',
 'sequence': 'CCGGATACAGATAACCCTTGGGGGAGGTGGAAAATTGTTGCTAGACTCTAGCCTAGAGGTTTTTAACAGCTTGCACAACTGGCCAAATCTTACAAGATAAAAAAAGGTAATGGGAACACATCTAAAACTTATATGTGGGTCCCTAAATCCAACTATGCAAGTACAGAGGTGGGTGTGTGGAGCAGCACAGGGGACAAAAGACGCTGGGCCACCAGAAGGGCCAACGTGACTGTCCCCTGAGTGAACAGAAACGCCTCTAACATGCGAGCGAATGGTGCAGCCCAGCCAGCCCGACCTCACCTGGCCATGCTAGGAGGCAGTGGGCAAGCGGCAACGCTGTGGTGCCCAGGACGCCGGCAGGCACAGCGCCACGTCCACGGGCACCGCCCGTTCCCACCCTGCAGGACGCCACAGCATCAGACGAGACGGCCCGTGTGAAGCGCCGAGAACAGAGTCTGGCTTCCGGGAGGGCTCAGGAAATGCCGAGAGCTCGGCAGAGACACCCACGTGGTGCTGGATTCCACGCACGAGGAACCGACGGGCAGACGTGGAGCTAAATGATGGTCAACAGCTACGAAGTGCCTGTGCTCTAGAAATGGC'}


In [52]:
# Preview the 'splice_sites_acceptors' config (tasks[3]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[3])

=== Loading task: splice_sites_acceptors ===
Split name: train  Size: 30000
Split name: test  Size: 3000
Sequence length : 600

Sample from 'train' split of splice_sites_acceptors:
{'label': 1,
 'name': 'chr16:74439479-74440079|1',
 'sequence': 'GCCAAAGTTGGGATCCACTTGGCTGGGCACGGTGGCTCACTTGAGGTTAGGAGTTTGAGACCAGCCTGGCCAACATGGTCAAACTCTGTCTCTACTAAAAACACAAAAATTAGCTGGGCATGGTGGTGCATGCCTGTAATCCCAGCTACTTGGGAGGCCAAGGCAGGAGAATCGATTGAACCTGGGAGGCGGAGGTTGCAGTGAGCTGAGATTGCGTCACTGCACTCCACCCCGGGTGACTCTGTCTCAAAAAAAAAAAAAAAAAAAAGGGCTGGGATCCACCTACTAATTATTTTACAGTTTCCTTCTCTGCGGCAGAGTGAGAGCAACAGAAAACCCTCCACTGTTAGCTGTTAGCAGATGCCTCTCAGTGTTCTCTGCTGCTTGGAGGAGTGACCCATACACTTGTGTCCAGTGAGTGCTGCACCATGTTGGCCTCTTAAGGAAGCGCTTCGCTTCAGTGCTGTCCAGGACTTCCCACTTTCCTTTACAAGAACTGCGATTACATCAGTGGCCCCCGTTAGGGGACTGGGCTACTGTAATGTGTTTTAAAAATTAGGTGCCAAGAACATGTACAAGAATGTTCACAGCAGCATTACT'}


In [53]:
# Preview the 'H3K27me3' config (tasks[4]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[4])

=== Loading task: H3K27me3 ===
Split name: train  Size: 30000
Split name: test  Size: 3000
Sequence length : 1000

Sample from 'train' split of H3K27me3:
{'label': 1,
 'name': 'chr12:113477502-113478502|1',
 'sequence': 'TGTGCGCTGGTGAGTGCCGGCCAGCTCAGAGGTCTCTCTGGAGTGAGTGTGTGCGCTGGTGAGTGCCAGTCAGCTCAGAGGTCCCTCTGGAGCTGCAGCAGGGCGTGGGTTTCCCTCCTTCCTTACTGCAAGCCCCTCTCCTGGAATGCACCCCTCCCCTAGAGGTCCCACACCTTGGGCTATCTGGCAAGCAAAAACAGGCTACTTTCTCCTGCTGGGTGCTCCTTCCCCGGGCAAAACACACTTAGTTGCTGGGAGGGTGAAGTCTGGGGCTGATCCCTGGGGGCCAGAAGGGGGCCTGGGAGAGACAGAAGGAAAGCAGCAAGCCAGTCCTCCAACCTGTAAGGGAAACACTGACTTTTTTTTTGTTCTGAGAGGAAAGCCCTGTACTTAACACAGAGCGGCCCACTTCCCCATGGGTGAAGCTTGGATTCCAGGCTGCCAGTTGCACTGGAGGAGCAGTGGGTCCCAGCTGGGCACTGAGCACAGTCAGGCTTCTCTTCATCCTCGCATAGATGGTTGGGCCTGGAGGCTGGCCCTGGACTTTTGGGGTCTCAGGGCAGGCAGTACTTGGTCGTGAGAGGCTGTAGTCTTGCCCCTCTGGCCCCACTGAGACTATCTGATGGTAGAGGCCATGAACCAGGCCTTGGGAGCTATTTCTAAGAAGACAGCTGCAGCCCCACTCCAGCTCCTGTGCCCCAAGAAGAAGAGCTTTGCAGGGACCCCATCCTCCAAAATAGAACCCCGGACTTTCTAATGATCTCGCTCCAACCAGCAGA

In [54]:
# Preview the 'H3K36me3' config (tasks[5]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[5])

=== Loading task: H3K36me3 ===
Split name: train  Size: 30000
Split name: test  Size: 3000
Sequence length : 1000

Sample from 'train' split of H3K36me3:
{'label': 0,
 'name': 'chr7:132273000-132274000|0',
 'sequence': 'TTTTATTTAACTCTGTAACTCTCTTTCCTAAAGCCATATCTACTACCTGGTAGGTATTCATCAAATGTTTGACAAATGAATAATTGAACAAATGCATTTGCCTGTATGGAGCAATCATAAAACGGTCATGAAGATGATCCCTACATAGCCCTCATCTTCATGGAGCAGATGTCATCCACTCTGAGTGATGTCTGTGTTTCTGGAATTGTTGAAAGTGGAAGCATTGGAACCTAATGTTGGGGGAAAATATAATACACTAAGCTCACACTGGAAACTCTTTGTTTGGAACTGAAACCCTTTAATTCATGCCTCATTGGTTTTCAACACACACACACACACACGCACACACACACACACGCACGCACACGCACACACCTCCCATCACTTTGTCTCCAGAATCATATTATAGGCTATCTCAACCTTGAGAATTGCACCCAGCTTTGCTGACTGACCTCAACTATGTACTGCTCAACTTTTCTTCTGACCTTGATTCTAGAATATAAGGGAAGCTATATTGATGAACAGCAAGATCCTCACTTAGTTATCTGCTGGGGTAGTATTATGCTGCATGGTAATTCCGCTTAGAGGCAAAGAGGAGAGGGGAGGTAGATCAGGGAAGCTGATGGCCATAGTGTTGACTGAGAACCTAGGACTCCCTACTTGAGTTTCCTCTCCATAGCCACAACTTGCTATGAGACTTCAGCTTGTTTCCCCCCATTTATAAAATGGAGAGAACTCTGTAACACCAGCTCTCATCTATCTACAAAAATAAAAAATGGA

In [55]:
# Preview the 'H3K4me1' config (tasks[6]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[6])

=== Loading task: H3K4me1 ===
Split name: train  Size: 30000
Split name: test  Size: 3000
Sequence length : 1000

Sample from 'train' split of H3K4me1:
{'label': 1,
 'name': 'chr12:46537985-46538985|1',
 'sequence': 'AGAACTTCCAGAACATGACTTTTTATCATCTGAGTGCAGGACTGTGCCACCATGCAGTGGACTCAGCAAGGAAAGACTAAATCTAGTGACTGACTACTCCATTACATCTACCCATGTATTTACTTATCCCCTCTCTTGTGCCAGATGGGGTTTATGTTGGAGTTACATGATAAGCCCCTGGAGGATGGGGCAGTGTGTTGTTTACATTGGTTTCTTATAGTCCAGGGCCTTCTATATAGTAGCTGCTCAGTACAGGATGGCTGATGAGGAGTTGGCTTCTAACAGGTTTGATTCCTGCTTCTGTGCTCACAGCATGTGACCTGAGGCAAGGAGTTTAAGCCCCTTTAAATCTCAGATTTCTTACTGGGTTTCTTCCATTAAATAAATTAATGTATTTTTTAAACCCCAGTACCTAGCACCTAGTATGATTGCTACTGTCTACTATTTAATAATTGCCCACTATTATTATAATTGTCCTTAGCCTCAAGATTACAATACACTGTGAAAATTTTATTCCATTTTATCACATCTATATTGGATTACCTTATTTTTACAAAACGGGAACAAAGTACTCTAAAGAATTAACAACATACTAGATATAGCATGCATTCAAGAGAATACAAAATCATGATTATACTAGCCACAGAAATGTTGAAAGAAGGAAATGATTTTTTAGACAAGCTGACAAGTTCACTGGAGTCATACAGAGTCCCATCTTAGACTGTTAGCAATAAAAAAAATACTGCCTATTTGTATTTTTCAAAGCACTTTCACATTTCTCAA

In [56]:
# Preview the 'splice_sites_all' config (tasks[7]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[7])

=== Loading task: splice_sites_all ===
Split name: train  Size: 30000
Split name: test  Size: 3000
Sequence length : 600

Sample from 'train' split of splice_sites_all:
{'label': 1,
 'name': 'chr16:28208821-28209421|1',
 'sequence': 'GTACATGCCCTAGGCCCTGGTATATGGTGGTGGCTACTGAAAATTCACTTGTAAGATTTGTACACCCATATTCACAGCAGTATTATTCACAATAGCCAAAGGGTGGAAGCAACCCAGGTATCCACCGAAGGATGAAAGGATATATGAAATGCAGCACATACTCACAATGGAATTCAGCTTTCAAAAGGAAGGAAGCTCTGACACACGCCACCACACGGATGAAGCTTGAGAACACTGTGCCATGTGAAATAAGGCAGTCACAACAGGCCAAATACTTCATGACTCCACTTACATGAGCTACCTAAAGTAGTCAAATTCATAGAAATAGAAACTAAAGTAGGGCAGGGGAGAGACTGAAAAAAGACGGGTACAGGTTGTTTTGGGAAAAAACCTTTGGAGATATATGGTGGTGATCGTTGCACAACAATGTGAATGTTCTTAATTTCACTAAACTGTACACTTAAAAATGGTTAAAATGGCTAATTTTAATTATGTATATTTTACTACAATTAAAAAACAAAACTTAAGTATATTGAGGCCGGGCGCGATGGCTCATGCATGTAATCTCAGCACTTTGGGAGGCCGAGGCGGGTGGATCAC'}


In [57]:
# Preview the 'H3K4me2' config (tasks[8]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[8])

=== Loading task: H3K4me2 ===
Split name: train  Size: 30000
Split name: test  Size: 2138
Sequence length : 1000

Sample from 'train' split of H3K4me2:
{'label': 1,
 'name': 'chr11:57240722-57241722|1',
 'sequence': 'TTGAAGCGTCCCTGAAATCAGGTCCCTGTCTCCCTCCCCTCAGCTGTTCCCACTGGAATGCTGAGCCAGACAGGAGCCTGGAGACCCCTGTGGGAAAGGATATGGATCCATCGCTTTCATCTGCCGACCTCCAGGATGTCTGCCCTAGAGGGAGTAAAGAGCAGTTAATCTCACTAGAGTTACCAGACTCATTGAGAGGGGAGGGAATTGGGGGCCAGGGCTGGGAAGAGGCCACCGTTTGGGACCAGAGAGGGTAGAGTGCTGATAAGATCCGGCCTCCAACCAGGAGCCAGCTGTAGCCAGATGGCCTGAGTGCCCCCTGCAATGACAGCCTGAAGTGAGCAGAATTAGCCAGCTCACTCCTTATCCTGCCTGATCTGATCTGTCCCTGTTCCGCATTGCACCATTCCACACAGAAGAAAGACTGGAAAATAGGACCAGTAGCTGAAGATGAAACTTGTGTGTCCCGGGGCTCAGAAGTATATGGGTCCTGGGCCTCACAGACTAGACATATACAAGGCCTGGGACAGATATCTTCTTTTCATTTCTGCCCCCCACCCTACTTGGCACCTGGTAAAAGTCTGTTGAATTAAAGCAATAGAAACACACTCAGGAGAAGGGTGTAAGACTTGGATCTTCACCCCAGAGCTGCTCTACTGCATTAACAAGGCAACTTTTGCTAAAGTGATCCAGGAAATTCTACTCTGAGCTATCTGGCAAACCAATAGCTAAGGCAAGCCTGTTGGCCAGGGTGCCCAGAGCTGGCCTGCCCTGGAGAGGCCT

In [58]:
# Preview the 'H3K4me3' config (tasks[9]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[9])

=== Loading task: H3K4me3 ===
Split name: train  Size: 17468
Split name: test  Size: 776
Sequence length : 1000

Sample from 'train' split of H3K4me3:
{'label': 1,
 'name': 'chr10:73716-74716|1',
 'sequence': 'GCCAGTCCCCAACATCCCGACAGCCCGACGGCGTCTCCGCTTTCCTCCTCCTCTCCCGGTACCAGGGTCTCTCCCCAGAAACAAACTCGCATTCCTAACCGGCATCTTGGCCTTGCGCTGGGGGTGACCCGCCCAAGCCACCATGAAGGGACGCTCGCACAAAGTGAAGTCCACATACAAACAGCCCGACGGGAAACGCGGCCGCGCTCGCCCCGCTGCACTCGCAGCGCGGGCAGGAAGCCTTTTCCTCACTTTTGCCTCGGTGTCCCGGAGCTGCAGCAGCGTCTCCCTGTCCTCACAGCGGACGCGGCCCCAGGTGTCCCAAGCCCCGGCCCCTCCTGGGTGGGTGCTGAGGAGAGGAAGCTCCGTCCTCACGGTGGACCCCCCGTCCTCACAGTGGACCCCCCGTCCTCACAGTGGACCCCCCAGGACGCCGCCGTGCGGTTCGGACACGGTTCGCGCGCGCGCGGCCCTCCGGGTTTGGCAGGGCCGGGCGCCCCCTCGCGACAGCTCTGGGGAATCTCTGGAAATCAGCGTCTGATTTTTTCCAGGCCCTGATTTTGGAAATTTCAACTGAACTGGAGACCACCATCCCGTCCCTTGACCGGAACGCATGGAGATAGCAATCGAAGAAGATGACCAGTTCACGCCTGTAATCCCAGTACTTTGGGAGGCCGAGTCGGGCGGATCACTCGAGGTCAGGAAATCGAGACCATTCTGGCCAACATGGTAAAATCCCATCTCTACTAAAAATACAAAAATTATCTGGGCGTGTTGGCACACGCCTGTA

In [59]:
# Preview the 'enhancers_types' config (tasks[10]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[10])

=== Loading task: enhancers_types ===
Split name: train  Size: 30000
Split name: test  Size: 3000
Sequence length : 400

Sample from 'train' split of enhancers_types:
{'label': 1,
 'name': 'chr1:46339383-46339783|1',
 'sequence': 'TCCAACTCCTGACCTCAGGTGATCCACCCACCTCGGCCTCCCAAGTGCTAGGATTACAGGAATGAGCCACAGTGCCGGCCAGCCTTTTTCTTGAAAAATGAACAAAGTGAGCCTGTCACTTCAAGGAAAATAACTGACAGTATTTGTTGCCAATAAGAAAATTCCAAAATTCCACATTCAAATGCCAAATAAGAAAAAGCTTTCGGCTGGGGGCAGCGTGAGCCACCGCGCCCGGCCAAGTTTTGTATTATTTTTCTTAACTATGTCTTCTCAAATTGCGGAAGCGTTAGGCCCCACAACATTTGACATCACCAGTAGGGATCCAGTGGAGAGGAGGGACATGATCAGAAACTGTGTTTTTAAATGATCATTCAACAGCCAATTATCAGGGTGGGTAGGA'}


In [60]:
# Preview the 'promoter_no_tata' config (tasks[11]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[11])

=== Loading task: promoter_no_tata ===
Split name: train  Size: 30000
Split name: test  Size: 1372
Sequence length : 300

Sample from 'train' split of promoter_no_tata:
{'label': 1,
 'name': 'chr10:70478753-70479053|1',
 'sequence': 'GGGTCCGCCCCTCGGCGTTGGGTAGCGGGGCGCTGGGGAGCAGCGCGGCGCGCACGGGCCGGGGCGCGCAGGTCCCGTCGCCGGTGAGCACGGGCTCCCTCTCGCGTGGCCTCGCCGGGTCCGCCTGGCCTGCCCACCTCCGGAGCCACCTCTGCCCCCGCATGGGCTGGCGAAGTTGGGAGGAGCGAGCTGGAGCCAGAGCGCGCGCCGGGCGCGCCCCGTCGCTGCCTGACTCGGCGCCCGCAGTTCGGGCGCAGCACGCCGGCCGCAGGAGCACGGATGCCCCCCGGAGCCGCGGGC'}


In [61]:
# Preview the 'H3K9ac' config (tasks[12]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[12])

=== Loading task: H3K9ac ===
Split name: train  Size: 23274
Split name: test  Size: 1004
Sequence length : 1000

Sample from 'train' split of H3K9ac:
{'label': 1,
 'name': 'chr10:921513-922513|1',
 'sequence': 'AGGGGGAGGACTACCACAATGACCAACACACTTCCAAAATTCTGAACTTACAGGAAATAGTATGCCAAGAATCTGTGTGGCCTTGCCTGAAAGATAATAAAACTTGTGGTCACTCGGAATCCTAAGGCACACTGCAAAAGTTTACTCTTCACTGGAATTTCTCCAGGTCATGGAGATTTTTAAGTGCCTCAATGCTTCTACAGTTTATTTGGCAATTCTACATCCCATTTCCAGTCCTTTCATGCTTAGGTTTAACTCTCCTGTACTGAGCCCTAAAGAATAATGATTTCAGCAGAAGAGATTAAATTCATCGGTGTGTACAAAGAAATGGGAAGAGCAGGGAAAAACTGAAGGCCACGTGCACACGCAGCAACACTGGTAATACTGCTTCCCATATTCATTCTTTGGTCTTCACTGACAGCGTGAAGAAGCAACTATTGCTGATTCATCAGTTACTCATTTTCACAATGGTCAGTACTCCAAGTCAGCGGTAACCAATCTTTTTTGGCACCAGGGACTGACAGGTTTCATGCAAGACAATTTTTCCACAGACAGAGCGGAGGGGTCAGGGGGTGGGGGGTGGTGAGGAGGATGCTTTCAGGATGAAACTGTTCCACCTCAGATCATCAGGCACTAGATTCTCATAAGGAGCATGCGACCTAGATCCCTTGCATGTGCAGTGCACAGTAGGATTTGAGCTCTTATGAGAATCTAGCGCAACTGCTCATCTGAGAGGAGGTGGAGCTCAGGCAGTAATGCTCACTCACCCTGCTGCTCACCTCCTGCTGT

In [62]:
# Preview the 'H3K9me3' config (tasks[13]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[13])

=== Loading task: H3K9me3 ===
Split name: train  Size: 27438
Split name: test  Size: 850
Sequence length : 1000

Sample from 'train' split of H3K9me3:
{'label': 1,
 'name': 'chr10:564545-565545|1',
 'sequence': 'CAGAAGCTGAGCACTGTGTTCTGATCCCAGGTCTGCCGCTGGACTCTGTGGCATTTTGCCTGTCTCCTTTTGACCTCTGTTTCCTCATCTCTAACATGAGGTAAAAGAAGTTCCCTCCTTAGCGGATTTCTGTGAGGACTCAATGAGCCAATATGCCCGAAGCATTCAGCATAGCGTCCAGCAGACACCAGGCACTGCACAGAACTCAGCCTACTCGGCCGAACACACCAGCCACCTCCACCCAGTGCTCTTCCCATCTCAGAAAAAAACACATAACACGTGAGCAGAAACTGGAAAACAACAAATATGTTCCCAACCTGACTTCCTAGAAGACTCACACACACAGTAGAAATCTTTCTTCTCTGTGTTTGTCAATCACCTTTACAATAAATCCAAGCAAAACCACATTAGAACTCAGTGAGTTTAACATCTACTGGAAACCTGGTGAAGATGAAGAGTAAACACTTTTCTGGTCCCCTTTGCTACCAGCAATTTCCCTTCCTCTATTCAATGGCAAACCAGGTGCTGTAGGAGAGGGCGGTTCCATCACTCCCCTAATTGGTTAGGAAGCTTTCCATGGAGCCTGGACCACCCCTAGTCAGTATCAGGGCCTCGAGTGAGCACAGGCAAGAGAACACGTCCATTCCATCAATAGACTAAGCGTGAATGTCCTCTCTGTACAAAGCATCATGCTGGCATTTGTACATACACAGCATTTAAGAACCTTAAGAGTACCGCCCTTCCACAAGTTTAAATTCAGCTGAGGAAACCATGTGACTCCTAAAAAT

In [63]:
# Preview the 'promoter_tata' config (tasks[14]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[14])

=== Loading task: promoter_tata ===
Split name: train  Size: 5062
Split name: test  Size: 212
Sequence length : 300

Sample from 'train' split of promoter_tata:
{'label': 1,
 'name': 'chr10:1025848-1026148|1',
 'sequence': 'ATGTGGAACTCAATGTGCAAGTTGAACCAGGGCCCTTTTATACCCTGGGGATGTAGAAAGATATCAGCTAAGGCAATCTGATATTTTTGTATTCTTAACAAGAAAACACACCCCGGTCTCTCTCTATCTCTCCATCTCTTTCTCTCTATCTTTCTCCCTCTCAATTGAGGGATAATAGAAAATTGTTTTGACACAATCTCACTTTAAAAAATGCTTTTAGTTTTTAAACCTAGGGTAAGTTATTTTCTACTCTATGGCTTTCTTTTTTTTTTTTTTTTTAAGGTGATGTCTAGCTCTGTT'}


In [64]:
# Preview the 'enhancers' config (tasks[15]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[15])

=== Loading task: enhancers ===
Split name: train  Size: 30000
Split name: test  Size: 3000
Sequence length : 400

Sample from 'train' split of enhancers:
{'label': 1,
 'name': 'chr1:46339383-46339783|1',
 'sequence': 'TCCAACTCCTGACCTCAGGTGATCCACCCACCTCGGCCTCCCAAGTGCTAGGATTACAGGAATGAGCCACAGTGCCGGCCAGCCTTTTTCTTGAAAAATGAACAAAGTGAGCCTGTCACTTCAAGGAAAATAACTGACAGTATTTGTTGCCAATAAGAAAATTCCAAAATTCCACATTCAAATGCCAAATAAGAAAAAGCTTTCGGCTGGGGGCAGCGTGAGCCACCGCGCCCGGCCAAGTTTTGTATTATTTTTCTTAACTATGTCTTCTCAAATTGCGGAAGCGTTAGGCCCCACAACATTTGACATCACCAGTAGGGATCCAGTGGAGAGGAGGGACATGATCAGAAACTGTGTTTTTAAATGATCATTCAACAGCCAATTATCAGGGTGGGTAGGA'}


In [65]:
# Preview the 'H4K20me1' config (tasks[16]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[16])

=== Loading task: H4K20me1 ===
Split name: train  Size: 30000
Split name: test  Size: 2270
Sequence length : 1000

Sample from 'train' split of H4K20me1:
{'label': 1,
 'name': 'chr6:2184275-2185275|1',
 'sequence': 'AAGTGATCTCAAACCACACGTGAGACACCACTTCCACTCAACAACTGGCACTAAGAATAAAACTAAAACTCATGACCAAGTCAATTAAGCCCCTAGAAGATCTGGCCCTTGCCTACCTTCCCCCCACCTCTCTTAACGATTCTCTTCCTCATTATACTCCTGGCCTCTTTCCATCCCTTGAACACACTAACTAGGCTTTTTGCTACTTTGAGGTCTTTGCACTTCTATTTGTTCTGCCTAGAAAGGTCCTCTCTGATGACTTTAGATTTCAGTTCAAATCTCAGCTCAGAGAGGCTGCCCGGACCTACATAGCTATCATTCCCTGTCACTCTCTAGCACGGTGCTTCCCAAACTATCTGTGATGAAGGACCCATTGTTTTTCATTGTCAATCTGTCATAAAACTTTTGTAAAAATTCAATAAAAAATTTTCTTTTAAAAATGTAACTTTAAAAACATATAAAATACAAGCCCAAAATTCTCATTATCAGCATTAACAGTCAAAAAATCTATCATATTACCATAAAAGTCTCTTGTTTTCTGTGTTTATACCATTGGGAACCATTGTAATCAGCACACAACTGTCAGCAGATAAGACTGTGACATTGCTCCAGTGTGCCACCAAGCTAATTCCTTTCACAATACTTGCCACACTCTAGAGTCGCTATTCATTTATGTGTTGATCTGTACCAGACACTGTTTTAGGCACTGGAATTGAAGCGATCAACAAAACAGACAAGATCCCTGCACTCACAGGGCTCACGTTCTATGGTGGAGGCCCAGGGA

In [66]:
# Preview the 'promoter_all' config (tasks[17]): split sizes, first train sequence length, first train row.
load_and_print_example(dataset_name, tasks[17])

=== Loading task: promoter_all ===
Split name: train  Size: 30000
Split name: test  Size: 1584
Sequence length : 300

Sample from 'train' split of promoter_all:
{'label': 0,
 'name': 'chr1:87378000-87378300|0',
 'sequence': 'TCATCTTCTGTTTTTGTTGCACTGTGTTCTCAGAATGTGTTTGGTATGATTTTGGTCCTTTTGTATTTGCTAAAGATTGTTTTATGTCCAATTATGTAGTTGATTTTAGAGTGTGTGCCATGTGGCAACAAGAATGTATATTCTGTTGTTTTGGGGTGGAGATTTCCGTAGAGGTGTAAAAAATCCATTTGGTCCAATGTTGAATTCAGGTTCTGAATATCTTTGTTAACTTTTTGCCTTGATGATCTGTTTAATACTGTCTATGGAGTTTTGGTCTCCCATTATTATTTTGTGGGAGTC'}
