In [2]:
# Step up one directory before anything else runs, so the relative paths used by later
# cells (e.g. BENCHMARKS_DIR) are resolved from the parent folder. The recorded output
# shows the kernel ending up in the DS596-Project directory.
%cd ../

/Users/qiaochufeng/Documents/GitHub/DS596-Project


In [3]:
import os
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from proteinbert import load_pretrained_model


BENCHMARK_NAME = 'secondary_structure'
BENCHMARKS_DIR = 'protein_bert/protein_benchmarks'


def _read_benchmark_csv(csv_path):
    """Read one benchmark CSV, dropping rows with missing values and then exact duplicate rows."""
    frame = pd.read_csv(csv_path)
    frame = frame.dropna()
    return frame.drop_duplicates()


# Both benchmark files live side by side and differ only in the split suffix.
train_set_file_path = os.path.join(BENCHMARKS_DIR, f'{BENCHMARK_NAME}.train.csv')
test_set_file_path = os.path.join(BENCHMARKS_DIR, f'{BENCHMARK_NAME}.test.csv')

# Hold out 10% of the cleaned training records for validation; the fixed random_state
# keeps the split identical across runs.
train_set, valid_set = train_test_split(
    _read_benchmark_csv(train_set_file_path),
    test_size=0.1,
    random_state=0,
)

test_set = _read_benchmark_csv(test_set_file_path)

In [21]:
from proteinbert import OutputType, OutputSpec, FinetuningModelGenerator, load_pretrained_model, finetune, evaluate_by_len, log
from proteinbert.conv_and_global_attention_model import get_model_with_hidden_layers_as_outputs


# First argument selects per-position (sequence) output, second the output kind;
# the branches below read these back through is_seq / is_categorical / is_binary.
output_type = OutputType(True, 'categorical')

if output_type.is_categorical:
    # Build the label vocabulary over all three splits so that a label that only occurs
    # in the validation or test data still gets an output class.
    label_columns = [train_set['label'], valid_set['label'], test_set['label']]
    label_vocabulary = set()
    if output_type.is_seq:
        # Each label is treated as an iterable of per-position symbols; the vocabulary
        # is the set of individual symbols. Accumulating into one set (instead of
        # set.union(*...)) also works when a split happens to be empty.
        for column in label_columns:
            for label in column:
                label_vocabulary.update(label)
    else:
        # One label value per record.
        for column in label_columns:
            label_vocabulary.update(column.unique())
    unique_labels = sorted(label_vocabulary)
    log('%d unique labels.' % len(unique_labels))
elif output_type.is_binary:
    unique_labels = [0, 1]
else:
    unique_labels = None
OUTPUT_SPEC = OutputSpec(output_type, unique_labels)


pretrained_model_generator, input_encoder = load_pretrained_model()

model_generator = FinetuningModelGenerator(
    pretrained_model_generator,
    OUTPUT_SPEC,
    pretraining_model_manipulation_function=get_model_with_hidden_layers_as_outputs,
    dropout_rate=0.5,
)

training_callbacks = [
    # Quarter the learning rate after one epoch without improvement (never below 1e-05).
    keras.callbacks.ReduceLROnPlateau(patience=1, factor=0.25, min_lr=1e-05, verbose=1),
    # Stop after two epochs without improvement and roll back to the best weights.
    keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True),
]

# NOTE(review): the run recorded below this cell stopped with
# "AttributeError: 'numpy.float32' object has no attribute 'assign'" right after the
# "Training with frozen pretrained layers..." log line. The cause is not visible in this
# notebook; presumably a TensorFlow/Keras version mismatch with proteinbert - confirm
# and pin the environment before relying on this cell.
finetune(
    model_generator,
    input_encoder,
    OUTPUT_SPEC,
    train_set['seq'],
    train_set['label'],
    valid_set['seq'],
    valid_set['label'],
    seq_len=512,
    batch_size=32,
    max_epochs_per_stage=1,
    lr=1e-04,
    begin_with_frozen_pretrained_layers=True,
    lr_with_frozen_pretrained_layers=1e-02,
    n_final_epochs=1,
    final_seq_len=1024,
    final_lr=1e-05,
    callbacks=training_callbacks,
)

[2024_11_26-22:01:29] 3 unique lebels.
[2024_11_26-22:01:30] Training set: Filtered out 499 of 7810 (6.4%) records of lengths exceeding 510.
[2024_11_26-22:01:30] Validation set: Filtered out 71 of 868 (8.2%) records of lengths exceeding 510.
[2024_11_26-22:01:30] Training with frozen pretrained layers...




AttributeError: 'numpy.float32' object has no attribute 'assign'

In [16]:
# Clip every test sequence to its first 510 characters before encoding to length 512.
# NOTE(review): presumably the two extra positions are reserved for start/end tokens
# (the fine-tuning log filters "lengths exceeding 510" for seq_len = 512) - confirm.
test_seqs_truncated = list(map(lambda residues: residues[:510], test_set['seq']))

# Encode the clipped sequences and run them through a model built by the pretrained
# generator for the same length. Note this is the pretrained generator, not the
# fine-tuning `model_generator`.
X = input_encoder.encode_X(test_seqs_truncated, 512)
model = pretrained_model_generator.create_model(512)
y_pred = model.predict(X, batch_size=32)






In [19]:
# Shape of the first array in the prediction output; the recorded result is (434, 512, 26).
y_pred[0].shape

(434, 512, 26)

In [3]:
from Bio import SeqIO
from Bio.Blast import NCBIWWW

# Protein query submitted to NCBI's online BLAST service (blastp against the "nr" database).
sequence = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
# The returned handle is kept but not read anywhere in the visible cells.
result_handle = NCBIWWW.qblast(
    program="blastp",
    database="nr",
    sequence=sequence,
)

<_io.StringIO at 0x1115f4160>

In [None]:
# NOTE(review): `torch`, `esmfold`, `PDBFile` and `struc` are not imported or defined in
# any cell visible in this notebook; they must already exist in the kernel for this cell
# to run.
sequence = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"

# Run inference with gradient tracking disabled; the result is written verbatim to a
# .pdb file below.
with torch.no_grad():
    output = esmfold.infer_pdb(sequence)

# Persist the prediction so it can be parsed back as a PDB file.
pdb_filename = "/Users/qiaochufeng/Downloads/6kl9.pdb"
with open(pdb_filename, "w") as pdb_handle:
    pdb_handle.write(output)

pdb_file = PDBFile.read(pdb_filename)
structure = pdb_file.get_structure()

# Keep only the atoms named N, CA or C, which is what gets passed to annotate_sse.
is_backbone_atom = structure.atom_name == "N"
for backbone_name in ("CA", "C"):
    is_backbone_atom = is_backbone_atom | (structure.atom_name == backbone_name)
backbone = structure[is_backbone_atom]

secondary_structure = struc.annotate_sse(backbone)

# Collapse the annotation into one string and print it under a heading.
sse_symbols = ''.join(secondary_structure)
print("predicted structure:", sse_symbols, sep="\n")

In [5]:
def call_psipred_api(sequence: str, email: str = 'carrief0908@gmail.com', max_retries: int = 30,
                     poll_interval: float = 30, request_timeout: float = 60):
    """Submit ``sequence`` to the PSIPRED web API and return its secondary-structure string.

    The sequence is uploaded as a one-record FASTA file, the job is polled until its
    state contains "Complete", and the result file referenced by the job is downloaded
    and reduced to a single string.

    Args:
        sequence: Protein sequence to submit.
        email: Contact address sent with the submission. The default is kept for
            backward compatibility; prefer passing your own address.
        max_retries: Maximum number of status polls before giving up.
        poll_interval: Seconds to wait between status polls.
        request_timeout: Per-request timeout in seconds, so a stalled connection cannot
            hang the notebook indefinitely.

    Returns:
        The concatenation of the third whitespace-separated field of every result line
        that does not start with '#' and has more than two fields.

    Raises:
        Exception: If the submission response carries no UUID, the job reports an error
            state, the result download does not return HTTP 200, or the job does not
            complete within ``max_retries`` polls.
    """
    # Imported here so the function does not depend on another cell having run its
    # imports first.
    import json
    import time

    import requests

    psipred = "http://bioinf.cs.ucl.ac.uk/psipred/api"
    submit_url = f"{psipred}/submission"
    fasta_sequence = f">query\n{sequence}"

    payload = {'input_data': fasta_sequence}
    data = {'job': 'psipred', 'submission_name': 'test', 'email': email}
    r = requests.post(f"{submit_url}.json", data=data, files=payload, timeout=request_timeout)
    response_data = json.loads(r.text)
    print(response_data)
    if 'UUID' not in response_data:
        # A rejected submission would otherwise surface as a bare KeyError.
        raise Exception(f"PSIPRED submission failed: {r.text}")
    uuid = response_data['UUID']

    result_uri = f"{submit_url}/{uuid}"
    for _ in range(max_retries):
        r = requests.get(result_uri, headers={"Accept": "application/json"}, timeout=request_timeout)
        result_data = json.loads(r.text)
        state = result_data["state"]

        if "Complete" in state:
            # NOTE(review): the result file is picked by fixed position (index 5);
            # presumably this is the PSIPRED .ss2 output - confirm against the API.
            data_path = result_data['submissions'][0]['results'][5]['data_path']
            # The data path is appended to the API base URL (this previously referenced
            # an undefined name and raised NameError).
            response = requests.get(f"{psipred}{data_path}", timeout=request_timeout)
            if response.status_code != 200:
                raise Exception(f"Failed to get results: {response.text}")
            # Keep the third column of every non-comment line with at least three fields.
            return "".join(
                line.split()[2]
                for line in response.text.splitlines()
                if not line.startswith('#') and len(line.split()) > 2
            )

        if "Error" in state:
            # NOTE(review): assumes failed jobs report a state containing "Error";
            # if the service never does, this branch is simply never taken.
            raise Exception(f"PSIPRED job failed: {r.text}")

        time.sleep(poll_interval)

    raise Exception("Timeout waiting for PSIPRED results")

In [10]:
import time
import json
import requests

# NOTE(review): this notebook also defines call_psipred_api in an earlier cell; after
# running all cells top to bottom, this later definition is the one in effect.
def call_psipred_api(sequence: str, email: str = 'carrief0908@gmail.com', max_retries: int = 30,
                     poll_interval: float = 30, request_timeout: float = 60):
    """Submit ``sequence`` to the PSIPRED web API and return its secondary-structure string.

    The sequence is uploaded as a one-record FASTA file, the job is polled until its
    state contains "Complete", and the result file referenced by the job is downloaded
    and reduced to a single string.

    Args:
        sequence: Protein sequence to submit.
        email: Contact address sent with the submission. The default is kept for
            backward compatibility; prefer passing your own address.
        max_retries: Maximum number of status polls before giving up.
        poll_interval: Seconds to wait between status polls.
        request_timeout: Per-request timeout in seconds, so a stalled connection cannot
            hang the notebook indefinitely.

    Returns:
        The concatenation of the third whitespace-separated field of every result line
        that does not start with '#' and has more than two fields.

    Raises:
        Exception: If the submission response carries no UUID, the job reports an error
            state, the result download does not return HTTP 200, or the job does not
            complete within ``max_retries`` polls.
    """
    psipred = "http://bioinf.cs.ucl.ac.uk/psipred/api"
    submit_url = f"{psipred}/submission"
    fasta_sequence = f">query\n{sequence}"

    payload = {'input_data': fasta_sequence}
    data = {'job': 'psipred', 'submission_name': 'test', 'email': email}
    r = requests.post(f"{submit_url}.json", data=data, files=payload, timeout=request_timeout)
    response_data = json.loads(r.text)
    print(response_data)
    if 'UUID' not in response_data:
        # A rejected submission would otherwise surface as a bare KeyError.
        raise Exception(f"PSIPRED submission failed: {r.text}")
    uuid = response_data['UUID']

    # The status URL does not change between polls, so build it once.
    result_uri = f"{submit_url}/{uuid}"
    for _ in range(max_retries):
        r = requests.get(result_uri, headers={"Accept": "application/json"}, timeout=request_timeout)
        result_data = json.loads(r.text)
        state = result_data["state"]

        if "Complete" in state:
            # NOTE(review): the result file is picked by fixed position (index 5);
            # presumably this is the PSIPRED .ss2 output - confirm against the API.
            data_path = result_data['submissions'][0]['results'][5]['data_path']
            response = requests.get(f"{psipred}{data_path}", timeout=request_timeout)
            if response.status_code != 200:
                raise Exception(f"Failed to get results: {response.text}")
            # Keep the third column of every non-comment line with at least three fields.
            return "".join(
                line.split()[2]
                for line in response.text.splitlines()
                if not line.startswith('#') and len(line.split()) > 2
            )

        if "Error" in state:
            # NOTE(review): assumes failed jobs report a state containing "Error";
            # if the service never does, this branch is simply never taken.
            raise Exception(f"PSIPRED job failed: {r.text}")

        time.sleep(poll_interval)

    raise Exception("Timeout waiting for PSIPRED results")

# Query PSIPRED for the protein sequence below. The bare call on the last line makes the
# notebook display the returned secondary-structure string.
query_sequence = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
call_psipred_api(query_sequence)

{'UUID': '68045620-aced-11ef-979d-00163e100466', 'submission_name': 'test'}


'CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEEEEECCCCCCCHHHHHHHCCCCCEECCCCCCCCCCCCHHHHHCCCCCEEEEEEEEECCCEEEEEEEEEEECCEEEEEEEEEEECCCCCCCCCCCCCCCCCCCCCEEEEECCCCCCEEEEEEEEEECCCCCEEEEEEECCCCCCCCCCCCCCCCEEEEEEEEECCCCCCCCCCEEEEEEHHHCCCCCCCCCCCC'