In [1]:
import os

In [2]:
from collections import defaultdict

def split_pdb_by_chains(pdb_path, output_dir):
    """
    Splits a PDB file into multiple files, each containing one chain.

    Only ``ATOM`` and ``HETATM`` records are kept; every other record is
    dropped. Each chain is written to ``<output_dir>/<stem>_<chain_id>.pdb``,
    where ``<stem>`` is the input file name up to its first ``.``.

    :param pdb_path: Path to the input PDB file.
    :param output_dir: Directory where the output files will be saved
        (it is not created here, so it must already exist).
    """
    # Derive the output stem from the argument itself rather than from a
    # notebook-level loop variable, so the function works on its own.
    stem = os.path.basename(pdb_path).split('.')[0]

    # Lines of each chain, in the order they appear in the input file
    chains = defaultdict(list)

    # Stream the file and group the coordinate records by chain
    with open(pdb_path, 'r') as pdb_file:
        for line in pdb_file:
            if line.startswith(("ATOM", "HETATM")):
                chain_id = line[21]  # Chain identifier is at index 21
                chains[chain_id].append(line)

    # Write each chain to a separate file
    for chain_id, chain_lines in chains.items():
        output_path = os.path.join(output_dir, f"{stem}_{chain_id}.pdb")
        with open(output_path, 'w') as output_file:
            output_file.writelines(chain_lines)

In [3]:
from tqdm import tqdm

# Split every file found in pdb_dir into per-chain PDB files under out_dir.
pdb_dir = '../pdb_test_2_merged'
files = os.listdir(pdb_dir)
print(len(files))

out_dir = '../pdb_chains_new_test'
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(out_dir, exist_ok=True)

# NOTE(review): the loop variable is deliberately still called `file`;
# confirm nothing else reads this global before renaming it.
for file in tqdm(files):
    split_pdb_by_chains(os.path.join(pdb_dir, file), out_dir)

358


100%|██████████| 358/358 [00:03<00:00, 94.39it/s] 


In [7]:
from tqdm import tqdm

# Split every file found in pdb_dir into per-chain PDB files under out_dir.
pdb_dir = '../data_preparation/good_positives_pdb_06may'
files = os.listdir(pdb_dir)
print(len(files))

out_dir = '../pdb_chains_new'
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(out_dir, exist_ok=True)

# NOTE(review): the loop variable is deliberately still called `file`;
# confirm nothing else reads this global before renaming it.
for file in tqdm(files):
    split_pdb_by_chains(os.path.join(pdb_dir, file), out_dir)

7193


  0%|          | 22/7193 [00:00<01:06, 107.27it/s]

100%|██████████| 7193/7193 [01:21<00:00, 87.86it/s] 


In [9]:
from tqdm import tqdm

# Split every file found in pdb_dir into per-chain PDB files under out_dir.
pdb_dir = '../data_preparation/pdb_test_06May_merged'
files = os.listdir(pdb_dir)
print(len(files))

out_dir = '../pdb_chains_new'
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(out_dir, exist_ok=True)

# NOTE(review): the loop variable is deliberately still called `file`;
# confirm nothing else reads this global before renaming it.
for file in tqdm(files):
    split_pdb_by_chains(os.path.join(pdb_dir, file), out_dir)

15385


100%|██████████| 15385/15385 [03:22<00:00, 75.96it/s] 


In [6]:
# Sanity check: number of entries in the per-chain output directory.
len(os.listdir('../pdb_chains_new_test'))

716

In [7]:
import os
import shutil

# Re-organise pair-level descriptor folders into one folder per partner.
desc_dir = '../descriptors_test_2'
des_files = os.listdir(desc_dir)

out = '../masif_descriptors_test/'
os.makedirs(out, exist_ok=True)

for file in tqdm(des_files):
    # Folder names are split on '_': field 0 is shared by both partners,
    # fields 1 and 2 distinguish the left and right partner.
    parts = file.split('_')
    left_name = parts[0] + '_' + parts[1]
    right_name = parts[0] + '_' + parts[2]

    # p1_* files go to the left partner, p2_* files to the right one.
    # shutil.copy overwrites, so a partner that occurs in several pair
    # folders ends up with the files of the last pair processed.
    for prefix, partner_name in (('p1', left_name), ('p2', right_name)):
        partner_dir = os.path.join(out, partner_name)
        os.makedirs(partner_dir, exist_ok=True)
        for kind in ('flipped', 'straight'):
            shutil.copy(
                os.path.join(desc_dir, file, f'{prefix}_desc_{kind}.npy'),
                os.path.join(partner_dir, f'desc_{kind}.npy'),
            )


100%|██████████| 351/351 [00:17<00:00, 19.57it/s]


In [16]:
import os
import shutil

# Re-organise pair-level descriptor folders into one folder per partner.
desc_dir = '../sc05/good_descriptors_negatives_06may'
des_files = os.listdir(desc_dir)

out = '../masif_descriptors'
os.makedirs(out, exist_ok=True)

for file in tqdm(des_files):
    # Folder names are split on '_': field 0 is shared by both partners,
    # fields 1 and 2 distinguish the left and right partner.
    parts = file.split('_')
    left_name = parts[0] + '_' + parts[1]
    right_name = parts[0] + '_' + parts[2]

    # p1_* files go to the left partner, p2_* files to the right one.
    # shutil.copy overwrites, so a partner that occurs in several pair
    # folders ends up with the files of the last pair processed.
    for prefix, partner_name in (('p1', left_name), ('p2', right_name)):
        partner_dir = os.path.join(out, partner_name)
        os.makedirs(partner_dir, exist_ok=True)
        for kind in ('flipped', 'straight'):
            shutil.copy(
                os.path.join(desc_dir, file, f'{prefix}_desc_{kind}.npy'),
                os.path.join(partner_dir, f'desc_{kind}.npy'),
            )


100%|██████████| 6757/6757 [05:23<00:00, 20.90it/s]


In [17]:
import os
import shutil

# Re-organise pair-level descriptor folders into one folder per partner.
desc_dir = '../sc05/good_descriptors_positives_06may'
des_files = os.listdir(desc_dir)

out = '../masif_descriptors/'
os.makedirs(out, exist_ok=True)

for file in tqdm(des_files):
    # Folder names are split on '_': field 0 is shared by both partners,
    # fields 1 and 2 distinguish the left and right partner.
    parts = file.split('_')
    left_name = parts[0] + '_' + parts[1]
    right_name = parts[0] + '_' + parts[2]

    # p1_* files go to the left partner, p2_* files to the right one.
    # shutil.copy overwrites, so a partner that occurs in several pair
    # folders ends up with the files of the last pair processed.
    for prefix, partner_name in (('p1', left_name), ('p2', right_name)):
        partner_dir = os.path.join(out, partner_name)
        os.makedirs(partner_dir, exist_ok=True)
        for kind in ('flipped', 'straight'):
            shutil.copy(
                os.path.join(desc_dir, file, f'{prefix}_desc_{kind}.npy'),
                os.path.join(partner_dir, f'desc_{kind}.npy'),
            )


100%|██████████| 7193/7193 [06:25<00:00, 18.65it/s]


In [10]:
import os
# IDs with a processed feature file: the `<id>.pt` entries of the processed
# directory. Only names that really end in '.pt' are used, so sub-directories
# and other files no longer add truncated names to the set.
pdb_files = {
    x[:-len('.pt')]
    for x in os.listdir('../masif_features/processed')
    if x.endswith('.pt')
}
# IDs with a descriptor entry. Stored as a set so the membership tests in
# generate_file are O(1) instead of a list scan per pair.
# NOTE: entries are not checked for being non-empty.
desc_files = set(os.listdir('../masif_features/processed/masif_descriptors'))

In [11]:
import numpy as np
def generate_file(first, second, label, path, pdb_ids=None, desc_ids=None, seed=None):
    """
    Save the shuffled list of usable (first, second, label) pairs as a ``.npy`` string array.

    A pair is kept only when both of its IDs are present in ``pdb_ids`` and in
    ``desc_ids``. Each kept pair becomes one row
    ``['', '', first, '', '', second, label]``; the rows are shuffled and
    written with ``np.save`` as a 2-D unicode array with 7 columns.

    :param first: Sequence of IDs of the first partner.
    :param second: Sequence of IDs of the second partner, aligned with ``first``.
    :param label: Sequence of labels, aligned with ``first``.
    :param path: Output path passed to ``np.save``.
    :param pdb_ids: Collection of IDs that have processed features; defaults to
        the notebook-level ``pdb_files``.
    :param desc_ids: Collection of IDs that have descriptors; defaults to the
        notebook-level ``desc_files``.
    :param seed: Optional seed for a reproducible shuffle; ``None`` keeps using
        the global NumPy random state, as before.
    """
    # Build sets once so every membership test is O(1), even if a list was given.
    pdb_ids = set(pdb_files if pdb_ids is None else pdb_ids)
    desc_ids = set(desc_files if desc_ids is None else desc_ids)
    usable = pdb_ids & desc_ids

    data = [
        ['', '', str(first[i]), '', '', str(second[i]), str(label[i])]
        for i in range(len(first))
        if first[i] in usable and second[i] in usable
    ]
    print(f'took {len(data)} out of {len(first)}')

    if seed is None:
        np.random.shuffle(data)
    else:
        np.random.default_rng(seed).shuffle(data)

    # '<U12' silently truncates longer strings, so widen the dtype when any
    # field needs it; the output is unchanged when everything fits in 12.
    width = max([12] + [len(field) for row in data for field in row])
    # reshape keeps the 7-column layout even when no pair survives the filter.
    data = np.array(data, dtype=f'<U{width}').reshape(-1, 7)
    np.save(path, data)

In [12]:
# !pip3 install pandas

In [19]:
import pandas as pd

# Tab-separated table with a `pair` column and a `label` column.
df = pd.read_csv('../test_final_pairs_labels.txt', sep='\t')

# Pair names are split on '_': the first partner is fields 0 and 1,
# the second partner is fields 0 and 2.
first = ['_'.join(pair.split('_')[:2]) for pair in df['pair']]
second = [pair.split('_')[0] + '_' + pair.split('_')[2] for pair in df['pair']]
label = df['label'].to_list()

In [20]:
from sklearn.model_selection import train_test_split
# Index list over all pairs (not referenced by the split below).
ids = list(range(len(first)))

# 50/50 split of the three aligned lists. In each returned pair the first
# name is the validation half and the second the test half; random_state
# pins the split.
(
    first_val, first_test,
    second_val, second_test,
    label_val, label_test,
) = train_test_split(first, second, label, test_size=0.5, random_state=42)

In [21]:
# Write the test half first, then the validation half.
generate_file(
    first=first_test,
    second=second_test,
    label=label_test,
    path='../masif_features/testset.npy',
)
generate_file(
    first=first_val,
    second=second_val,
    label=label_val,
    path='../masif_features/valset.npy',
)

took 7682 out of 7693
took 7680 out of 7692


In [22]:
import pandas as pd

# Header-less tab-separated table: column 0 is the pair name, column 1 the label.
df = pd.read_csv('../labels_goods_06May.txt', sep='\t', header=None)

# Pair names are split on '_': the first partner is fields 0 and 1,
# the second partner is fields 0 and 2.
first = ['_'.join(pair.split('_')[:2]) for pair in df[0]]
second = [pair.split('_')[0] + '_' + pair.split('_')[2] for pair in df[0]]
label = df[1].to_list()

generate_file(
    first=first,
    second=second,
    label=label,
    path='../masif_features/trainset.npy',
)

took 13911 out of 13950


In [16]:
import os
# IDs with a processed feature file: the `<id>.pt` entries of the processed
# directory. Only names that really end in '.pt' are used, so sub-directories
# and other files no longer add truncated names to the set.
pdb_files = {
    x[:-len('.pt')]
    for x in os.listdir('../masif_features/processed')
    if x.endswith('.pt')
}
# IDs with a descriptor entry. Stored as a set so the membership tests in
# generate_file are O(1) instead of a list scan per pair.
desc_files = set(os.listdir('../masif_features/processed/masif_descriptors'))

import numpy as np
def generate_file(first, second, label, path, pdb_ids=None, desc_ids=None, seed=None):
    """
    Save the shuffled list of usable (first, second, label) pairs as a ``.npy`` string array.

    A pair is kept only when both of its IDs are present in ``pdb_ids`` and in
    ``desc_ids``. Each kept pair becomes one row
    ``['', '', first, '', '', second, label]``; the rows are shuffled and
    written with ``np.save`` as a 2-D unicode array with 7 columns.

    :param first: Sequence of IDs of the first partner.
    :param second: Sequence of IDs of the second partner, aligned with ``first``.
    :param label: Sequence of labels, aligned with ``first``.
    :param path: Output path passed to ``np.save``.
    :param pdb_ids: Collection of IDs that have processed features; defaults to
        the notebook-level ``pdb_files``.
    :param desc_ids: Collection of IDs that have descriptors; defaults to the
        notebook-level ``desc_files``.
    :param seed: Optional seed for a reproducible shuffle; ``None`` keeps using
        the global NumPy random state, as before.
    """
    # Build sets once so every membership test is O(1), even if a list was given.
    pdb_ids = set(pdb_files if pdb_ids is None else pdb_ids)
    desc_ids = set(desc_files if desc_ids is None else desc_ids)
    usable = pdb_ids & desc_ids

    data = [
        ['', '', str(first[i]), '', '', str(second[i]), str(label[i])]
        for i in range(len(first))
        if first[i] in usable and second[i] in usable
    ]
    print(f'took {len(data)} out of {len(first)}')

    if seed is None:
        np.random.shuffle(data)
    else:
        np.random.default_rng(seed).shuffle(data)

    # '<U12' silently truncates longer strings, so widen the dtype when any
    # field needs it; the output is unchanged when everything fits in 12.
    width = max([12] + [len(field) for row in data for field in row])
    # reshape keeps the 7-column layout even when no pair survives the filter.
    data = np.array(data, dtype=f'<U{width}').reshape(-1, 7)
    np.save(path, data)

In [3]:
import pandas as pd

# Tab-separated table with a `pair` column and a `label` column.
df = pd.read_csv('../test_final_pairs_labels.txt', sep='\t')

# Pair names are split on '_': the first partner is fields 0 and 1,
# the second partner is fields 0 and 2.
first = ['_'.join(pair.split('_')[:2]) for pair in df['pair']]
second = [pair.split('_')[0] + '_' + pair.split('_')[2] for pair in df['pair']]
label = df['label'].to_list()

In [4]:
from sklearn.model_selection import train_test_split
# Index list over all pairs (not referenced by the split below).
ids = list(range(len(first)))

# 50/50 split of the three aligned lists. In each returned pair the first
# name is the validation half and the second the test half; random_state
# pins the split.
(
    first_val, first_test,
    second_val, second_test,
    label_val, label_test,
) = train_test_split(first, second, label, test_size=0.5, random_state=42)

In [5]:
# Reduced splits: every 20th pair of each half, test half first.
generate_file(
    first=first_test[::20],
    second=second_test[::20],
    label=label_test[::20],
    path='../masif_features/testset_small.npy',
)
generate_file(
    first=first_val[::20],
    second=second_val[::20],
    label=label_val[::20],
    path='../masif_features/valset_small.npy',
)

took 384 out of 385
took 385 out of 385


In [6]:
import pandas as pd

# Header-less tab-separated table: column 0 is the pair name, column 1 the label.
df = pd.read_csv('../labels_goods_06May.txt', sep='\t', header=None)

# Pair names are split on '_': the first partner is fields 0 and 1,
# the second partner is fields 0 and 2.
first = ['_'.join(pair.split('_')[:2]) for pair in df[0]]
second = [pair.split('_')[0] + '_' + pair.split('_')[2] for pair in df[0]]
label = df[1].to_list()

# Reduced training set: every 20th pair.
generate_file(
    first=first[::20],
    second=second[::20],
    label=label[::20],
    path='../masif_features/trainset_small.npy',
)

took 697 out of 698


In [17]:
import os
# IDs with a processed feature file: the `<id>.pt` entries of the processed
# directory. Only names that really end in '.pt' are used, so sub-directories
# and other files no longer add truncated names to the set.
pdb_files = {
    x[:-len('.pt')]
    for x in os.listdir('../masif_test/processed')
    if x.endswith('.pt')
}
# IDs with a descriptor entry. Stored as a set so the membership tests in
# generate_file are O(1) instead of a list scan per pair.
# NOTE: entries are not checked for being non-empty.
desc_files = set(os.listdir('../masif_test/processed/masif_descriptors'))

In [19]:
import pandas as pd

# Tab-separated table with a `pair` column and a `label` column.
df = pd.read_csv('../test_2_labels.txt', sep='\t')

# Pair names are split on '_': the first partner is fields 0 and 1,
# the second partner is fields 0 and 2.
first = ['_'.join(pair.split('_')[:2]) for pair in df['pair']]
second = [pair.split('_')[0] + '_' + pair.split('_')[2] for pair in df['pair']]
label = df['label'].to_list()

generate_file(
    first=first,
    second=second,
    label=label,
    path='../masif_test/testset.npy',
)

took 351 out of 351
