# CB513 Batching

In [1]:
import gzip
import numpy as np
import json
import csv
import os

# Folder holding the preprocessed dataset files that the batching code reads.
directory = '../Data/Preprocessed/'
# Folder under which the per-dataset "<name>_batch/" job-list folders are written.
output_path = '../Data/Input/'
# Location of the CB513 archive inside the preprocessed-data folder.
cb513_path = f'{directory}cb513+profile_split1.npy.gz'

In [2]:
# with open(cb513_path, 'r') as file:
#     reader = csv.reader(file)
#     for row in reader:
#         cb513 = list(reader)

# # Extract the sequences from the cb513 dataset
# sequences = [row[0] for row in cb513]
# print(f"Number of sequences: {len(sequences)}")
# for i, seq in enumerate(sequences):
#     print(f"Sequence {i}: {seq}")

# # Function to create job lists for AlphaFold Server (20 single-sequence jobs per job-list file)
# # n from 0-513: 514 sequences
# n_end = 513
# n_start = 0

# def create_joblist_batches(n_start, n_end, batch_size=20):
#     joblist = []
#     batch_counter = 1
#     for n in range(n_start, n_end+1):
#         seq = sequences[n]
#         job = {
#             "name": f"Cb513_{n}",
#             "modelSeeds": ["1"],
#             "sequences": [
#                 {
#                     "proteinChain": {
#                         "sequence": seq,
#                         "count": 1
#                     }
#                 }
#             ]
#         }
#         joblist.append(job)
        
#         if len(joblist) == batch_size or n == n_end:
#             filename = output_path + f'cb513_batch/joblist_{batch_counter}.json'
#             with open(filename, 'w') as f:
#                 json.dump(joblist, f, indent=4)
#             print(f"Joblist batch {batch_counter} created")
#             joblist = []
#             batch_counter += 1

# create_joblist_batches(n_start, n_end)

# TS115 Batching

In [3]:
# # Read the TS115.csv file
# ts115_path = directory + 'TS115_preprocessed.csv'

# with open(ts115_path, 'r') as file:
#     reader = csv.reader(file)
#     for row in reader:
#         ts115 = list(reader)

# # Extract the sequences
# sequences = [row[0] for row in ts115]
# print(f"Number of sequences: {len(sequences)}")
# for i, seq in enumerate(sequences):
#     print(f"Sequence {i}: {seq}")

# # Function to create job lists for AlphaFold Server (30 single-sequence jobs per job-list file)
# # n from 0-114: 115 sequences
# n_end = 114
# n_start = 0

# def create_joblist_batches(n_start, n_end, batch_size=30):
#     joblist = []
#     batch_counter = 1
#     for n in range(n_start, n_end+1):
#         seq = sequences[n]
#         job = {
#             "name": f"ts115_{n}",
#             "modelSeeds": ["1"],
#             "sequences": [
#                 {
#                     "proteinChain": {
#                         "sequence": seq,
#                         "count": 1
#                     }
#                 }
#             ]
#         }
#         joblist.append(job)
        
#         if len(joblist) == batch_size or n == n_end:
#             filename = output_path + f'ts115_batch/joblist_{batch_counter}.json'
#             with open(filename, 'w') as f:
#                 json.dump(joblist, f, indent=4)
#             print(f"Joblist batch {batch_counter} created")
#             joblist = []
#             batch_counter += 1

# create_joblist_batches(n_start, n_end)

# Batching Function for all

In [4]:
def process_dataset(dataset_name, batch_size=30, input_dir=None, output_dir=None):
    """Split a preprocessed dataset into batched job-list JSON files.

    Reads ``<input_dir>/<dataset_name>_preprocessed.csv``, skips its header
    row, takes the sequence from the third column of every remaining row and
    turns each one into a single-chain job named ``<dataset_name>_<index>``.
    The jobs are written, ``batch_size`` at a time, to
    ``<output_dir>/<dataset_name>_batch/joblist_<k>.json`` (``k`` starts at 1).

    Args:
        dataset_name: Dataset identifier used for the input file name, the
            output folder name and the job names.
        batch_size: Maximum number of jobs per JSON file; must be a positive
            integer.
        input_dir: Folder containing the CSV. Defaults to the notebook-level
            ``directory`` setting.
        output_dir: Folder in which the ``<dataset_name>_batch/`` folder is
            created. Defaults to the notebook-level ``output_path`` setting.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (previously such a
            value silently produced one file containing every job).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Fall back to the notebook-wide settings only when no override is given,
    # so the function can also be used against other folders.
    source_dir = directory if input_dir is None else input_dir
    target_dir = output_path if output_dir is None else output_dir

    # Ensure output directory exists
    batch_dir = os.path.join(target_dir, f'{dataset_name}_batch/')
    os.makedirs(batch_dir, exist_ok=True)

    # Read dataset and extract sequences, skipping the header.
    # newline='' lets the csv module do its own line-ending handling.
    dataset_path = os.path.join(source_dir, f'{dataset_name}_preprocessed.csv')
    with open(dataset_path, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header
        # csv.reader yields an empty list for a blank line; skipping those
        # keeps a stray or trailing blank line from raising IndexError.
        sequences = [row[2] for row in reader if row]  # Sequence is in the third column

    print(f"Processing {len(sequences)} sequences from {dataset_name}...")

    # One job per sequence, numbered in file order.
    jobs = [
        {
            "name": f"{dataset_name}_{n}",
            "modelSeeds": ["1"],
            "sequences": [{"proteinChain": {"sequence": seq, "count": 1}}]
        }
        for n, seq in enumerate(sequences)
    ]

    # Write consecutive slices of at most batch_size jobs; the final slice
    # simply holds whatever is left. Files from earlier runs are overwritten
    # by name but never deleted.
    for batch_counter, start in enumerate(range(0, len(jobs), batch_size), start=1):
        filename = os.path.join(batch_dir, f'joblist_{batch_counter}.json')
        with open(filename, 'w') as f:
            json.dump(jobs[start:start + batch_size], f, indent=4)
        print(f"Batch {batch_counter} created: {filename}")

In [5]:
# Write the CASP10 job lists, at most 30 jobs per JSON file.
process_dataset('CASP10', batch_size=30)

Processing 123 sequences from CASP10...
Batch 1 created: ../Data/Input/CASP10_batch/joblist_1.json
Batch 2 created: ../Data/Input/CASP10_batch/joblist_2.json
Batch 3 created: ../Data/Input/CASP10_batch/joblist_3.json
Batch 4 created: ../Data/Input/CASP10_batch/joblist_4.json
Batch 5 created: ../Data/Input/CASP10_batch/joblist_5.json


In [6]:
# Write the CASP14 job lists, at most 30 jobs per JSON file.
process_dataset('CASP14', batch_size=30)

Processing 10 sequences from CASP14...
Batch 1 created: ../Data/Input/CASP14_batch/joblist_1.json


In [7]:
# Write the TS115 job lists, at most 30 jobs per JSON file.
process_dataset('TS115', batch_size=30)

Processing 115 sequences from TS115...
Batch 1 created: ../Data/Input/TS115_batch/joblist_1.json
Batch 2 created: ../Data/Input/TS115_batch/joblist_2.json
Batch 3 created: ../Data/Input/TS115_batch/joblist_3.json
Batch 4 created: ../Data/Input/TS115_batch/joblist_4.json


In [9]:
# Write the CASP12 job lists, at most 30 jobs per JSON file.
process_dataset('CASP12', batch_size=30)

Processing 21 sequences from CASP12...
Batch 1 created: ../Data/Input/CASP12_batch/joblist_1.json
