In [50]:
from utils.parse_pdb import align_pdb, open_pdb, PDBError, get_pdb_file
import os
import boto3
import pickle
from tqdm import tqdm
from p_tqdm import p_map
import sidechainnet as scn
import numpy as np
from rcsbsearch import TextQuery, Attr
import subprocess
from collections import defaultdict

In [64]:
# Fetch the "1a1q.pdb3.gz" entry from the "pdbsnapshots" S3 bucket.
# NOTE(review): the returned value is used as a path by the following cell;
# presumably it is a local copy under tmp_folder — confirm in utils.parse_pdb.
snapshot_bucket = boto3.resource('s3').Bucket("pdbsnapshots")
local_path = get_pdb_file(
    bucket=snapshot_bucket,
    pdb_file="1a1q.pdb3.gz",
    tmp_folder="data/tmp_pdb",
    folders=["20220103/pub/pdb/data/biounit/PDB/all/"],
)

In [66]:
# Open the file fetched above with the project helper `open_pdb`.
# NOTE(review): the result is indexed with a string key in the next cell, so it
# is presumably dict-like — confirm in utils.parse_pdb.
file = open_pdb(local_path, tmp_folder="data/tmp_pdb")

In [68]:
# Display the entry stored under "crd_raw" (rendered below as a per-atom table
# with record, residue and x/y/z coordinate columns).
file["crd_raw"]

Unnamed: 0,record_name,atom_number,blank_1,atom_name,alt_loc,residue_name,blank_2,chain_id,residue_number,insertion,...,x_coord,y_coord,z_coord,occupancy,b_factor,blank_4,segment_id,element_symbol,charge,line_idx
0,ATOM,1,,CA,,PRO,,C,2,,...,-6.152,88.613,-15.042,1.0,19.03,,,C,,77
1,ATOM,2,,CA,,ILE,,C,3,,...,-8.467,86.347,-13.044,1.0,18.81,,,C,,78
2,ATOM,3,,CA,,THR,,C,4,,...,-7.881,83.301,-10.808,1.0,15.54,,,C,,79
3,ATOM,4,,CA,,ALA,,C,5,,...,-10.303,80.548,-9.965,1.0,15.57,,,C,,80
4,ATOM,5,,CA,,TYR,,C,6,,...,-10.185,78.356,-6.890,1.0,16.14,,,C,,81
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,ATOM,174,,CA,,MET,,C,175,,...,8.022,46.746,-19.673,1.0,31.84,,,C,,250
174,ATOM,175,,CA,,GLU,,C,176,,...,6.519,43.244,-20.353,1.0,37.50,,,C,,251
175,ATOM,176,,CA,,THR,,C,177,,...,10.296,42.851,-20.812,1.0,39.60,,,C,,252
176,ATOM,177,,CA,,THR,,C,178,,...,11.885,40.690,-18.167,1.0,40.68,,,C,,253


In [69]:
# Render the processed entry "1a1q-3"; the cell output is a py3Dmol view.
# NOTE: `visualize` is defined in a cell further down this notebook, so that
# cell has to be executed before this one (a fresh Run All fails here).
visualize("1a1q-3")

<py3Dmol.view at 0x7f596fdab160>

In [61]:
def visualize(id):
    """Build a 3Dmol view for a processed entry stored as a pickle.

    Parameters
    ----------
    id : str
        Entry name; the data is read from ``./data/pdb/{id}.pickle``.

    Returns
    -------
    The object returned by ``scn.StructureBuilder(...).to_3Dmol()``.
    """
    # NOTE: pickle.load can execute arbitrary code; only open files produced
    # by this pipeline.
    with open(f"./data/pdb/{id}.pickle", "rb") as handle:
        entry = pickle.load(handle)

    # Walk the chains once, in the order of the pickled mapping, so that the
    # coordinates and the sequence stay aligned.
    chains = [entry[name] for name in entry]
    per_chain_atoms = [
        # Join backbone and side-chain coordinates along axis 1, then flatten
        # to one row of 3 values per atom.
        np.concatenate([chain["crd_bb"], chain["crd_sc"]], axis=1).reshape((-1, 3))
        for chain in chains
    ]
    full_seq = "".join(chain["seq"] for chain in chains)

    builder = scn.StructureBuilder(full_seq, np.concatenate(per_chain_atoms, 0))
    return builder.to_3Dmol()

In [33]:
def get_log_stats(log_file, verbose=True):
    """Group the ``<<<``-prefixed lines of a processing log by message.

    Every line starting with ``<<<`` is split on ``:``. The text before the
    first colon becomes the key and the stripped text after the last colon is
    appended to that key's list. All other lines are ignored.

    Parameters
    ----------
    log_file : str
        Path of the log file to read.
    verbose : bool, default True
        If true, print ``key: count`` for every key, most frequent first.

    Returns
    -------
    collections.defaultdict
        Maps each key to the list of values collected for it, in file order.
    """
    stats = defaultdict(list)
    with open(log_file, "r") as log:
        for raw in log:
            if not raw.startswith("<<<"):
                continue
            parts = raw.split(":")
            stats[parts[0]].append(parts[-1].strip())

    if verbose:
        # sorted() is stable, so equally frequent keys keep first-seen order.
        by_count = sorted(stats.items(), key=lambda item: len(item[1]), reverse=True)
        for message, entries in by_count:
            print(f'{message}: {len(entries)}')
    return stats

In [47]:
from copy import copy  # NOTE(review): not needed by this cell any more; kept in case other cells use it

# Merge the statistics of several log files into one mapping: message -> entries.
download_key = "<<< Could not download"
total_stats = defaultdict(list)
for log_file in [
    "data/logs/log_4.txt",
    "data/logs/log_6.txt",
    "data/logs/log_7.txt",
    "data/logs/log_9.txt",
]:
    stats = get_log_stats(log_file, verbose=False)

    # Download failures are keyed with the file name inside the message
    # ("<<< Could not download <file>"); fold them into a single key.
    if download_key not in stats:
        per_file_keys = [name for name in stats if name.startswith(download_key)]
        merged = []
        for name in per_file_keys:
            merged.extend(stats.pop(name))
        stats[download_key] = merged

    # This message is left out of the aggregated totals.
    stats.pop("<<< PDB file not found.", None)

    for message, entries in stats.items():
        total_stats[message].extend(entries)


In [48]:
# Print every aggregated message with its number of entries, most frequent first.
for key, value in sorted(total_stats.items(), key=lambda item: len(item[1]), reverse=True):
    print(f'{key}: {len(value)}')

<<< File already exists: 132989
<<< Removed due to redundancy: 63418
<<< Could not download: 15326
<<< Too many missing values in the middle: 12947
<<< Sequence is too short: 9824
<<< Too many missing values in total: 6657
<<< Too many missing values in the ends: 2778
<<< Unnatural amino acids found: 626
<<< Alignment issue in processing: 128
<<< Unexpected atoms (D1): 57
<<< Incorrect alignment: 43
<<< Unexpected atoms (D): 31
<<< Unexpected atoms (D2): 19
<<< Unexpected atoms (DA): 13
<<< Some chains in the PDB do not appear in the fasta file.: 10
<<< Fasta file not found.: 7
<<< Unexpected atoms (DG1): 7
<<< Unknown: 4
<<< Unexpected atoms (DE2): 4
<<< Unexpected atoms (DD21): 3
<<< Unexpected atoms (CB): 3
<<< Unexpected atoms (DA2): 2
<<< Unexpected atoms (DE): 1
<<< Unexpected atoms (DB2): 1
<<< Unexpected atoms (DG): 1


In [49]:
# Inspect the entries collected under the merged "<<< Could not download" key.
# NOTE(review): this echoes the whole list (15326 entries per the summary
# above); consider slicing it to keep the saved output small.
total_stats["<<< Could not download"]

['4v44-1',
 '4v44-3',
 '4v44-2',
 '4v44-4',
 '4yd9-1',
 '5s9l-1',
 '5s9m-1',
 '5s9n-1',
 '6e2x-1',
 '6e2z-1',
 '6e30-1',
 '6e32-1',
 '6e39-1',
 '6l9t-1',
 '6m8p-1',
 '6m8p-10',
 '6m8p-11',
 '6m8p-12',
 '6m8p-13',
 '6m8p-14',
 '6m8p-15',
 '6m8p-16',
 '6m8p-17',
 '6m8p-18',
 '6m8p-19',
 '6m8p-2',
 '6m8p-20',
 '6m8p-21',
 '6m8p-22',
 '6m8p-3',
 '6m8p-4',
 '6m8p-5',
 '6m8p-6',
 '6m8p-7',
 '6m8p-8',
 '6m8p-9',
 '6sxs-1',
 '6t2q-1',
 '6t2q-2',
 '6t2s-1',
 '6t2s-2',
 '6t2s-3',
 '6thy-1',
 '6tig-1',
 '6tid-1',
 '6tjj-1',
 '6tjj-2',
 '6tjk-1',
 '6tjk-2',
 '6tjq-1',
 '6tn1-1',
 '6tsl-1',
 '6tsn-1',
 '6tso-1',
 '6tsm-1',
 '6tsp-1',
 '6tsq-1',
 '6tsr-1',
 '6tt9-1',
 '6u2l-1',
 '6vho-1',
 '6wet-1',
 '6wet-2',
 '6weu-1',
 '6weu-2',
 '6wev-2',
 '6wev-1',
 '6wew-1',
 '6wew-2',
 '6wfj-1',
 '6wfj-2',
 '6y60-1',
 '6y60-2',
 '6y63-1',
 '6y63-2',
 '6yax-1',
 '6yh0-1',
 '6yjr-1',
 '6yjr-2',
 '6yjs-1',
 '6yjs-2',
 '6yju-1',
 '6yju-2',
 '6yjv-1',
 '6yjv-2',
 '6ytp-1',
 '6ytp-2',
 '6ytr-1',
 '6ytr-2',
 '6yut-1

In [14]:
def _store_unknown(stats, error_lines, entry_id):
    """Record one finished "<<< Unknown" entry under its collected error text."""
    error = "".join(error_lines)
    # Download failures embed the file name; collapse them into a single key.
    if error.startswith("Could not download"):
        error = "Could not download PDB"
    stats[error].append(entry_id)


def get_unknown_stats(log_file):
    """Group the ``<<< Unknown`` entries of a log by the text that follows them.

    A line starting with ``<<< Unknown`` opens an entry; its id is the stripped
    text after the last ``:``. Every following line that does not start with
    ``<<<`` is appended verbatim (newlines included) to the entry's error text.
    The entry is closed by the next ``<<<`` line or by the end of the file.
    This includes a directly following ``<<< Unknown`` line, so consecutive
    unknown entries and a trailing one are all recorded.

    Error texts starting with ``Could not download`` are stored under the
    single key ``"Could not download PDB"``.

    Parameters
    ----------
    log_file : str
        Path of the log file to read.

    Returns
    -------
    collections.defaultdict
        Maps each error text to the list of entry ids that produced it. The
        mapping is also printed as ``error: ids``, most frequent error first.
    """
    stats = defaultdict(list)
    error_lines = None  # None while no "<<< Unknown" entry is pending
    entry_id = None
    with open(log_file, "r") as f:
        for line in f:
            if line.startswith("<<<"):
                # Any "<<<" line ends the pending entry, including the start
                # of another "<<< Unknown" entry.
                if error_lines is not None:
                    _store_unknown(stats, error_lines, entry_id)
                    error_lines = None
                if line.startswith("<<< Unknown"):
                    error_lines = []
                    entry_id = line.split(":")[-1].strip()
            elif error_lines is not None:
                error_lines.append(line)
    # An entry still open at end of file has no closing "<<<" line.
    if error_lines is not None:
        _store_unknown(stats, error_lines, entry_id)

    for key, value in sorted(stats.items(), key=lambda item: len(item[1]), reverse=True):
        print(f'{key}: {value}')
    return stats

In [15]:
# Statistics for a single log file; the summary is printed because verbose
# defaults to True.
stats = get_log_stats("data/logs/log_4.txt")
# Keys of the per-file download failures, e.g. "<<< Could not download 4v44.pdb1.gz".
download = [key for key in stats if key.startswith("<<< Could not download")]

<<< File already exists: 132987
<<< Removed due to redundancy: 63418
<<< Too many missing values in the middle: 12883
<<< Sequence is too short: 9794
<<< Too many missing values in total: 6640
<<< Too many missing values in the ends: 2764
<<< Unnatural amino acids found: 625
<<< PDB file not found.: 558
<<< Alignment issue in processing: 128
<<< Unexpected atoms (D1): 57
<<< Incorrect alignment: 43
<<< Unexpected atoms (D): 31
<<< Unexpected atoms (D2): 19
<<< Unexpected atoms (DA): 13
<<< Some chains in the PDB do not appear in the fasta file.: 10
<<< Unexpected atoms (DG1): 7
<<< Fasta file not found.: 6
<<< Unexpected atoms (DE2): 4
<<< Unexpected atoms (DD21): 3
<<< Unknown: 3
<<< Unexpected atoms (CB): 3
<<< Unexpected atoms (DA2): 2
<<< Could not download 4v44.pdb1.gz: 1
<<< Could not download 4v44.pdb3.gz: 1
<<< Could not download 4v44.pdb2.gz: 1
<<< Could not download 4v44.pdb4.gz: 1
<<< Could not download 4yd9.pdb1.gz: 1
<<< Could not download 5s9l.pdb1.gz: 1
<<< Could not dow

In [16]:
# Number of distinct "<<< Could not download ..." keys found in log_4.
len(download)

15326