# CB513 Batching

In [None]:
import gzip
import numpy as np
import json
import csv

directory = '../Data/AlphaFoldServer/Datasets/'
cb513_path = directory + 'cb513+profile_split1.npy.gz'

# NOTE(review): the ".npy.gz" suffix suggests a gzip-compressed NumPy array,
# yet the file is parsed as CSV text below — confirm the on-disk format.
# newline='' is what the csv module expects so that it can handle line
# endings inside quoted fields itself.
with open(cb513_path, 'r', newline='') as file:
    reader = csv.reader(file)
    # Discard the first row (presumably a column header — TODO confirm).
    # Using next() with a default keeps `cb513` defined for an empty file.
    next(reader, None)
    # Blank lines come back from csv.reader as empty lists; drop them so
    # the row[0] lookup below cannot fail.
    cb513 = [row for row in reader if row]

# Extract the sequences from the cb513 dataset (first column of every row)
sequences = [row[0] for row in cb513]
print(f"Number of sequences: {len(sequences)}")
for i, seq in enumerate(sequences):
    print(f"Sequence {i}: {seq}")

# Settings for the AlphaFoldServer job lists: one job is built per sequence
# and jobs are grouped into files of up to 20 jobs (the batch_size default
# of create_joblist_batches).
# Inclusive index range 0..513, i.e. 514 sequences.
n_start, n_end = 0, 513

def create_joblist_batches(n_start, n_end, batch_size=20, *, seqs=None, out_dir=None):
    """Write AlphaFold Server job-list files for sequences n_start..n_end.

    One job named ``Cb513_<n>`` is built for every index ``n`` in the
    inclusive range, each with a single protein chain and model seed "1".
    Jobs are grouped into files ``joblist_<k>.json`` (``k`` counted from 1)
    holding ``batch_size`` jobs each; the final file holds the remainder.

    Args:
        n_start: Index of the first sequence to include.
        n_end: Index of the last sequence to include (inclusive).
        batch_size: Number of jobs written to each JSON file.
        seqs: Sequence strings to index into. When omitted, the
            module-level ``sequences`` is looked up at call time.
        out_dir: Directory that receives the JSON files. When omitted,
            the module-level ``directory`` is used.

    Raises:
        IndexError: If a non-empty range reaches outside ``seqs``. The check
            runs before any file is written, so no partial output is left.

    Note:
        File names carry no dataset prefix and files are opened in write
        mode, so an existing ``joblist_<k>.json`` in ``out_dir`` is replaced.
    """
    import os

    # Fall back to the notebook globals only when nothing was passed in, so
    # existing two-argument calls keep working.
    if seqs is None:
        seqs = sequences
    if out_dir is None:
        out_dir = directory

    # Validate up front: failing halfway would leave some batch files on
    # disk, and a negative index would silently wrap around to the end.
    if n_start <= n_end and (n_start < 0 or n_end >= len(seqs)):
        raise IndexError(
            f"range {n_start}..{n_end} is outside the {len(seqs)} available sequences"
        )

    joblist = []
    batch_counter = 1
    for n in range(n_start, n_end + 1):
        joblist.append({
            "name": f"Cb513_{n}",
            "modelSeeds": ["1"],
            "sequences": [
                {
                    "proteinChain": {
                        "sequence": seqs[n],
                        "count": 1
                    }
                }
            ]
        })

        # Flush when the batch is full, and also on the last index so the
        # trailing partial batch is not lost.
        if len(joblist) == batch_size or n == n_end:
            filename = os.path.join(out_dir, f'joblist_{batch_counter}.json')
            with open(filename, 'w') as f:
                json.dump(joblist, f, indent=4)
            print(f"Joblist batch {batch_counter} created")
            joblist = []
            batch_counter += 1

# Generate the job-list files for the configured index range.
create_joblist_batches(n_start=n_start, n_end=n_end)

# TS115 Batching

In [None]:
# Read the TS115.csv file
ts115_path = directory + 'TS115.csv'

# newline='' is what the csv module expects so that it can handle line
# endings inside quoted fields itself.
with open(ts115_path, 'r', newline='') as file:
    reader = csv.reader(file)
    # Discard the first row (presumably the column header — TODO confirm).
    # Using next() with a default keeps `ts115` defined for an empty file.
    next(reader, None)
    # Blank lines come back from csv.reader as empty lists; drop them so
    # the row[0] lookup below cannot fail.
    ts115 = [row for row in reader if row]

# Extract the sequences (first column of every row)
sequences = [row[0] for row in ts115]
print(f"Number of sequences: {len(sequences)}")
for i, seq in enumerate(sequences):
    print(f"Sequence {i}: {seq}")

# Settings for the AlphaFoldServer job lists: one job is built per sequence
# and jobs are grouped into files of up to 30 jobs (the batch_size default
# of create_joblist_batches).
# Inclusive index range 0..114, i.e. 115 sequences.
n_start, n_end = 0, 114

def create_joblist_batches(n_start, n_end, batch_size=30, *, seqs=None, out_dir=None):
    """Write AlphaFold Server job-list files for sequences n_start..n_end.

    One job named ``ts115_<n>`` is built for every index ``n`` in the
    inclusive range, each with a single protein chain and model seed "1".
    Jobs are grouped into files ``joblist_<k>.json`` (``k`` counted from 1)
    holding ``batch_size`` jobs each; the final file holds the remainder.

    Args:
        n_start: Index of the first sequence to include.
        n_end: Index of the last sequence to include (inclusive).
        batch_size: Number of jobs written to each JSON file.
        seqs: Sequence strings to index into. When omitted, the
            module-level ``sequences`` is looked up at call time.
        out_dir: Directory that receives the JSON files. When omitted,
            the module-level ``directory`` is used.

    Raises:
        IndexError: If a non-empty range reaches outside ``seqs``. The check
            runs before any file is written, so no partial output is left.

    Note:
        File names carry no dataset prefix and files are opened in write
        mode, so an existing ``joblist_<k>.json`` in ``out_dir`` is replaced.
    """
    import os

    # Fall back to the notebook globals only when nothing was passed in, so
    # existing two-argument calls keep working.
    if seqs is None:
        seqs = sequences
    if out_dir is None:
        out_dir = directory

    # Validate up front: failing halfway would leave some batch files on
    # disk, and a negative index would silently wrap around to the end.
    if n_start <= n_end and (n_start < 0 or n_end >= len(seqs)):
        raise IndexError(
            f"range {n_start}..{n_end} is outside the {len(seqs)} available sequences"
        )

    joblist = []
    batch_counter = 1
    for n in range(n_start, n_end + 1):
        joblist.append({
            "name": f"ts115_{n}",
            "modelSeeds": ["1"],
            "sequences": [
                {
                    "proteinChain": {
                        "sequence": seqs[n],
                        "count": 1
                    }
                }
            ]
        })

        # Flush when the batch is full, and also on the last index so the
        # trailing partial batch is not lost.
        if len(joblist) == batch_size or n == n_end:
            filename = os.path.join(out_dir, f'joblist_{batch_counter}.json')
            with open(filename, 'w') as f:
                json.dump(joblist, f, indent=4)
            print(f"Joblist batch {batch_counter} created")
            joblist = []
            batch_counter += 1

# Generate the job-list files for the configured index range.
create_joblist_batches(n_start=n_start, n_end=n_end)

Number of sequences: 115
Sequence 0: MTRLSEILDQMTTVLNDLKTVMDAEQQQLSVGQINGSQLQRITEEKSSLLATLDYLEQQRRLEQNAQRSANDDIAERWQAITEKTQHLRDLNQHNGWLLEGQIERNQQALEVLKPHQEPTLYGADGQTSVSHRGGKKISI
Sequence 1: MKPTYEILGQMDETFILVKDSEYLYFVDQHLLEERINYEKLKDENLACRISVKAGQKLSEEKIRELIKTWRNLENPHVCPHGRPIYYKIPLREIYEKVGRNY
Sequence 2: SGRPMPVFEDVTRALVRELNPRGDLTPLDSLIDFKHFRPFCLVLRKRKSTLFWGARYVRTDYTLLDLLEPGSSPSDLTDSGNFSFKNMLDVQVQGLVEVPKTVKVKGTAGLSQSSTLEVQTLSVAPSALENLKKERKLSADHSFLNEMRYHEKNLYVVMEAVEAKQEVTVEQTGNANAIFSLPSLALLGLQGSLNNNKAVTIPKGCVLAYRVRLLRVFLFNLWDIPYICNDSMQTFPKIRRVPCSAFISPTQMISEEPEEEKLIGEMHEDFKTLKEEVQRETQEVEKLSPVGRSSLLTSLSHLLGKKKELQDLEQKLEGALDKGQKVTLEALPKDVLLSKDAMDAILYFLGALTELTEEQLKILVKSLEKKILPVQLKLVESTLEQNFLQDKEGVFPLQPDLLSSLGEEELTLTEALVGLSGLEVQRSGPQYAWDPDTRHNLCALYAGLSLLHLLSRKSNALTYCALS
Sequence 3: MTARRFLNELADLYGVATSYTDYKGAHIEVSDDTLVKILRALGVNLDTSNLPNDDAIQRQIALFHDREFTRPLPPSVVAVEGDELVFPVHVHDGSPADVHIELEDGTQRDVSQVENWTAPREIDGIRWGEASFKIPGDLPLGWHKLHLKSNERSAECGLIITPARLSTADKYLDSPRSGVMAQIYSVRSTLSWGMGDFNDLGNLASVVAQDGAD

# CASP14 Batching