# Dataset Batching (CASP10, CASP12, CASP14, TS115, CB513)

In [1]:
import gzip
import numpy as np
import json
import csv
import os

# Folder holding the `<dataset>_preprocessed.csv` inputs read by process_dataset.
# The trailing '/' matters: cb513_path below is built by plain string concatenation.
directory = '../Data/Preprocessed/'
# Root folder under which process_dataset creates its per-dataset batch folders.
output_path = '../Data/Input/'
# Path of a gzipped .npy file for CB513.
# NOTE(review): not referenced by any cell visible in this notebook - confirm it is still needed.
cb513_path = directory + 'cb513+profile_split1.npy.gz'

# Batching function for all datasets

In [2]:
def process_dataset(dataset_name, batch_size=30):
    """Split a preprocessed dataset's sequences into JSON job-list batches.

    Reads ``<dataset_name>_preprocessed.csv`` from the module-level
    ``directory`` (header row skipped, sequence taken from the third column)
    and writes ``joblist_<k>.json`` files into
    ``<output_path>/<dataset_name>_batch/``. Every file holds at most
    ``batch_size`` jobs; the last file holds the remainder.

    Rows with fewer than three columns (for example blank lines) are skipped,
    and jobs are numbered consecutively over the rows that are kept.

    Args:
        dataset_name: Dataset prefix used for the CSV name, the output folder
            and the job names, e.g. ``'CASP12'``.
        batch_size: Maximum number of jobs per JSON file; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        FileNotFoundError: If the preprocessed CSV does not exist.
    """
    if batch_size < 1:
        # A non-positive size would never trigger a flush and silently dump
        # every job into a single file.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Ensure output directory exists
    batch_dir = os.path.join(output_path, f'{dataset_name}_batch/')
    os.makedirs(batch_dir, exist_ok=True)

    # Read dataset and extract sequences, skipping the header.
    # newline='' lets the csv module do its own newline handling.
    dataset_path = os.path.join(directory, f'{dataset_name}_preprocessed.csv')
    skipped = 0
    sequences = []
    with open(dataset_path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header
        for row in reader:
            if len(row) > 2:
                sequences.append(row[2])  # Sequence is in the third column
            else:
                skipped += 1  # blank or truncated row: no sequence to read

    print(f"Processing {len(sequences)} sequences from {dataset_name}...")
    if skipped:
        print(f"Skipped {skipped} rows without a sequence column")

    # Create job lists in batches
    last_index = len(sequences) - 1
    joblist, batch_counter = [], 1
    for n, seq in enumerate(sequences):
        joblist.append({
            "name": f"{dataset_name}_{n}",  # no suffix in this first edition
            "modelSeeds": [1],  # changed parameter 1: fixed seed
            "sequences": [{
                "proteinChain": {
                    "sequence": seq,
                    "count": 1,
                }
            }],
        })

        # Flush when the batch is full, or when the final sequence was added
        # so the trailing partial batch is written too.
        if len(joblist) == batch_size or n == last_index:
            filename = os.path.join(batch_dir, f'joblist_{batch_counter}.json')
            with open(filename, 'w') as f:
                json.dump(joblist, f, indent=4)
            print(f"Batch {batch_counter} created: {filename}")
            joblist, batch_counter = [], batch_counter + 1

In [3]:
# Batch the CASP10 sequences into JSON job lists of up to 30 jobs each.
process_dataset('CASP10', batch_size=30)

Processing 123 sequences from CASP10...
Batch 1 created: ../Data/Input/CASP10_batch/joblist_1.json
Batch 2 created: ../Data/Input/CASP10_batch/joblist_2.json
Batch 3 created: ../Data/Input/CASP10_batch/joblist_3.json
Batch 4 created: ../Data/Input/CASP10_batch/joblist_4.json
Batch 5 created: ../Data/Input/CASP10_batch/joblist_5.json


In [4]:
# Batch the CASP14 sequences into JSON job lists of up to 30 jobs each.
process_dataset('CASP14', batch_size=30)

Processing 10 sequences from CASP14...
Batch 1 created: ../Data/Input/CASP14_batch/joblist_1.json


In [5]:
# Batch the TS115 sequences into JSON job lists of up to 30 jobs each.
process_dataset('TS115', batch_size=30)

Processing 115 sequences from TS115...
Batch 1 created: ../Data/Input/TS115_batch/joblist_1.json
Batch 2 created: ../Data/Input/TS115_batch/joblist_2.json
Batch 3 created: ../Data/Input/TS115_batch/joblist_3.json
Batch 4 created: ../Data/Input/TS115_batch/joblist_4.json


In [6]:
# Batch the CASP12 sequences into JSON job lists of up to 30 jobs each.
process_dataset('CASP12', batch_size=30)

Processing 21 sequences from CASP12...
Batch 1 created: ../Data/Input/CASP12_batch/joblist_1.json


In [8]:
# Batch the CB513 sequences; this call uses a smaller batch size (20 jobs per file) than the other datasets.
process_dataset('CB513', batch_size=20)

Processing 514 sequences from CB513...
Batch 1 created: ../Data/Input/CB513_batch/joblist_1.json
Batch 2 created: ../Data/Input/CB513_batch/joblist_2.json
Batch 3 created: ../Data/Input/CB513_batch/joblist_3.json
Batch 4 created: ../Data/Input/CB513_batch/joblist_4.json
Batch 5 created: ../Data/Input/CB513_batch/joblist_5.json
Batch 6 created: ../Data/Input/CB513_batch/joblist_6.json
Batch 7 created: ../Data/Input/CB513_batch/joblist_7.json
Batch 8 created: ../Data/Input/CB513_batch/joblist_8.json
Batch 9 created: ../Data/Input/CB513_batch/joblist_9.json
Batch 10 created: ../Data/Input/CB513_batch/joblist_10.json
Batch 11 created: ../Data/Input/CB513_batch/joblist_11.json
Batch 12 created: ../Data/Input/CB513_batch/joblist_12.json
Batch 13 created: ../Data/Input/CB513_batch/joblist_13.json
Batch 14 created: ../Data/Input/CB513_batch/joblist_14.json
Batch 15 created: ../Data/Input/CB513_batch/joblist_15.json
Batch 16 created: ../Data/Input/CB513_batch/joblist_16.json
Batch 17 created: .

In [11]:
def process_dataset(dataset_name, batch_size=30, max_template_date="2025-02-03"):
    """Write the edited ("_2") JSON job-list batches for a dataset.

    NOTE: this definition replaces any ``process_dataset`` defined earlier in
    the notebook; cells run after it use this variant.

    Reads ``<dataset_name>_preprocessed.csv`` from the module-level
    ``directory`` (header row skipped, sequence taken from the third column)
    and writes ``joblist_<k>.json`` files into
    ``<output_path>/<dataset_name>_2_batch/``. Every file holds at most
    ``batch_size`` jobs; the last file holds the remainder. Each job is named
    ``<dataset_name>_<n>_2``, carries ``maxTemplateDate`` on its protein chain
    and is tagged with ``"dialect": "alphafoldserver"`` and ``"version": 1``.

    Rows with fewer than three columns (for example blank lines) are skipped,
    and jobs are numbered consecutively over the rows that are kept.

    Args:
        dataset_name: Dataset prefix used for the CSV name, the output folder
            and the job names, e.g. ``'CASP12'``.
        batch_size: Maximum number of jobs per JSON file; must be at least 1.
        max_template_date: String written to every chain's
            ``maxTemplateDate`` field. Defaults to the previously hard-coded
            ``"2025-02-03"``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        FileNotFoundError: If the preprocessed CSV does not exist.
    """
    if batch_size < 1:
        # A non-positive size would never trigger a flush and silently dump
        # every job into a single file.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Ensure output directory exists
    batch_dir = os.path.join(output_path, f'{dataset_name}_2_batch/')  # _2 for the edited version
    os.makedirs(batch_dir, exist_ok=True)

    # Read the original preprocessed dataset and extract sequences, skipping
    # the header. newline='' lets the csv module do its own newline handling.
    dataset_path = os.path.join(directory, f'{dataset_name}_preprocessed.csv')
    skipped = 0
    sequences = []
    with open(dataset_path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header
        for row in reader:
            if len(row) > 2:
                sequences.append(row[2])  # Sequence is in the third column
            else:
                skipped += 1  # blank or truncated row: no sequence to read

    print(f"Processing {len(sequences)} sequences from {dataset_name}...")
    if skipped:
        print(f"Skipped {skipped} rows without a sequence column")

    # Create job lists in batches
    last_index = len(sequences) - 1
    joblist, batch_counter = [], 1
    for n, seq in enumerate(sequences):
        joblist.append({
            "name": f"{dataset_name}_{n}_2",  # _2 for the edited version
            # Changed parameter 1: fixed seed.
            # NOTE(review): the AlphaFold Server input docs appear to describe
            # seeds as strings - confirm that an integer is accepted.
            "modelSeeds": [1],
            "sequences": [{
                "proteinChain": {
                    "sequence": seq,
                    "count": 1,
                    "maxTemplateDate": max_template_date,  # changed parameter 2
                }
            }],
            "dialect": "alphafoldserver",
            "version": 1,
        })

        # Flush when the batch is full, or when the final sequence was added
        # so the trailing partial batch is written too.
        if len(joblist) == batch_size or n == last_index:
            filename = os.path.join(batch_dir, f'joblist_{batch_counter}.json')
            with open(filename, 'w') as f:
                json.dump(joblist, f, indent=4)
            print(f"Batch {batch_counter} created: {filename}")
            joblist, batch_counter = [], batch_counter + 1

In [12]:
# Re-run CASP12 with the process_dataset redefined in the previous cell (the "_2" variant),
# so the job lists are written to the CASP12_2_batch/ folder.
process_dataset('CASP12', batch_size=30)

Processing 21 sequences from CASP12...
Batch 1 created: ../Data/Input/CASP12_2_batch/joblist_1.json
