In [28]:
import time
import json
import requests
import pandas as pd



def call_psipred_api(sequence: str, max_retries: int = 30, poll_interval: float = 30,
                     request_timeout: float = 60) -> str:
    """Submit ``sequence`` to the PSIPRED web API and return the parsed prediction.

    The sequence is wrapped in a one-record FASTA (header ``>query``) and posted
    as a ``psipred`` job. The job is then polled until its ``state`` contains
    ``"Complete"``, after which one result file is downloaded and parsed.

    Args:
        sequence: Amino-acid sequence to submit (no FASTA header).
        max_retries: Maximum number of status polls before giving up.
        poll_interval: Seconds to wait between two status polls.
        request_timeout: Per-request timeout in seconds, so that a stalled
            server cannot block the notebook kernel forever.

    Returns:
        The concatenation of the third whitespace-separated column of every
        result line that does not start with ``#`` and has more than two columns.

    Raises:
        requests.HTTPError: If the submission or a status poll returns an
            HTTP error status.
        Exception: If the result download does not return 200, if the job
            reports an error state, or if the job is not complete after
            ``max_retries`` polls.
    """
    psipred = "http://bioinf.cs.ucl.ac.uk/psipred/api"
    submit_url = f"{psipred}/submission"
    fasta_sequence = f">query\n{sequence}"

    payload = {'input_data': fasta_sequence}
    # NOTE(review): the contact e-mail is hard-coded; consider moving it to configuration.
    data = {'job': 'psipred', 'submission_name': 'test','email': 'carrief0908@gmail.com'}
    r = requests.post(f"{submit_url}.json", data=data, files=payload, timeout=request_timeout)
    # Fail with the HTTP error itself instead of a confusing KeyError on 'UUID' below.
    r.raise_for_status()
    response_data = r.json()
    print(response_data)
    uuid = response_data['UUID']

    # The status URL does not change between polls, so build it once.
    result_uri = f"{submit_url}/{uuid}"
    for attempt in range(max_retries):
        r = requests.get(result_uri, headers={"Accept": "application/json"}, timeout=request_timeout)
        r.raise_for_status()
        result_data = r.json()
        state = result_data["state"]

        if "Complete" in state:
            # NOTE(review): index 5 is presumably the per-residue prediction file
            # (the parser below expects the label in column 3) - confirm against the API.
            data_path = result_data['submissions'][0]['results'][5]['data_path']
            response = requests.get(f"{psipred}{data_path}", timeout=request_timeout)
            if response.status_code != 200:
                raise Exception(f"Failed to get results: {response.text}")
            # Keep the third column of every non-comment line that has at least three columns.
            columns_per_line = (
                line.split()
                for line in response.text.splitlines()
                if not line.startswith('#')
            )
            return "".join(cols[2] for cols in columns_per_line if len(cols) > 2)

        # NOTE(review): assumes a failed job reports a state containing "Error";
        # stopping here avoids polling a dead job until the timeout.
        if "Error" in state:
            raise Exception(f"PSIPRED job {uuid} failed with state: {state}")

        # No point in sleeping after the final poll; we are about to give up.
        if attempt < max_retries - 1:
            time.sleep(poll_interval)

    raise Exception("Timeout waiting for PSIPRED results")



# Load the split-site table, keep the two columns used below, and remove
# literal spaces from the sequences (plain-text replacement, not a regex).
split_data = (
    pd.read_csv('../data/split.csv')[['Split Site', 'Sequence']]
    .assign(Sequence=lambda frame: frame['Sequence'].str.replace(' ', '', regex=False))
)

# A 'Split Site' cell can list several sites separated by '/'; explode them so
# that every site gets its own row. The exploded Series keeps the source row
# label, which is used to look up the matching sequence for each site.
expanded_rows = split_data['Split Site'].str.split('/').explode()
expanded_data = pd.DataFrame(
    {
        'Split Site': expanded_rows,
        'Sequence': split_data.loc[expanded_rows.index, 'Sequence'].values,
    }
).reset_index(drop=True)

In [22]:
def read_seq_json(file_path: str) -> dict:
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's locale encoding, so non-ASCII content is read the same way
    on every machine.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content (whatever ``json.load`` produces for the file).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# Lookup table loaded from JSON; it is used below to map each sequence to its 'Secondary' value.
Seqs = read_seq_json(file_path='../data/seq_2_second.json')

In [29]:
# Look up each sequence in Seqs and store the result in a 'Secondary' column;
# rows whose lookup produced no value are then dropped.
expanded_data = (
    expanded_data
    .assign(Secondary=expanded_data['Sequence'].map(Seqs))
    # Remember to delete this line after switching to AlphaFold results
    .dropna(subset=["Secondary"])
)
expanded_data

Unnamed: 0,Split Site,Sequence,Secondary
0,157,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
1,158,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
2,193,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
3,194,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
4,212,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
5,213,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
6,214,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
7,215,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
8,485,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
9,507,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...


In [30]:
# Persist the expanded table (the index is written as the first, unnamed column).
expanded_data.to_csv(path_or_buf='expanded.csv')

In [None]:
# expanded_data_grouped = expanded_data.groupby(['Sequence', 'Secondary'])['Split Site'].agg(list).reset_index()

In [49]:
def process_group(group):
    """Collapse the rows of one sequence into a single row with a per-position split mask.

    Args:
        group: DataFrame with 'Split Site', 'Sequence' and 'Secondary' columns.
            Only the first row's 'Sequence' and 'Secondary' values are used.

    Returns:
        A one-row DataFrame with columns:
        'Site'      - the 1-based positions ``1..len(sequence)`` joined by commas,
        'Split'     - a list of booleans, True where the position is a split site,
        'Sequence'  - the first row's sequence,
        'Secondary' - the first row's secondary-structure value.

    Raises:
        ValueError: If a non-missing split site cannot be converted to ``int``.
    """
    sequence = group['Sequence'].iloc[0]
    all_sites = range(1, len(sequence) + 1)

    # Split sites may arrive as strings (e.g. '157'); comparing them with the
    # integer positions would never match, so cast them first. Missing values
    # are skipped because they cannot mark any position.
    split_sites = {
        int(raw_site)
        for raw_site in group['Split Site'].tolist()
        if not pd.isna(raw_site)
    }

    site = ','.join(map(str, all_sites))
    split = [position in split_sites for position in all_sites]
    return pd.DataFrame({
        'Site': [site],
        'Split': [split],
        'Sequence': [sequence],
        'Secondary': [group['Secondary'].iloc[0]]
    })
# Variant of `process_group` that casts the split sites to int so that the
# True/False marking actually matches the integer positions.
def process_group_debug(group):
    """Build a one-row frame holding the per-position split mask of one sequence.

    The first row's 'Sequence' defines the positions ``1..len(sequence)``.
    Every 'Split Site' value is converted with ``int`` and each position is
    flagged True when it is among those values.

    Returns a DataFrame with the columns 'Site' (comma-joined positions),
    'Split' (list of booleans), 'Sequence' and 'Secondary' (both taken from
    the first row of ``group``).
    """
    first_sequence = group['Sequence'].iloc[0]
    positions = range(1, len(first_sequence) + 1)

    # Integer form of the split sites, in their original order.
    wanted_positions = list(map(int, group['Split Site'].tolist()))

    # All positions rendered as one comma-separated string.
    joined_positions = ','.join(str(position) for position in positions)

    # One flag per position: is it a split site?
    flags = []
    for position in positions:
        flags.append(position in wanted_positions)

    columns = {
        'Site': [joined_positions],
        'Split': [flags],
        'Sequence': [first_sequence],
        'Secondary': [group['Secondary'].iloc[0]],
    }
    return pd.DataFrame(columns)



# Build one row per distinct sequence by applying process_group_debug to each
# group, then discard the group-derived index in favour of a fresh 0..n-1 index.
expanded_data_ = (
    expanded_data
    .groupby('Sequence')
    .apply(process_group_debug)
    .reset_index(drop=True)
)
expanded_data_

Unnamed: 0,Site,Split,Sequence,Secondary
0,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",EEDNNAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGHPYEGTQTAKL...,CCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
1,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",IEKKKSFAKGMGVKSTLVSGSKVYMTTFAEGSDARLEKIVEGDSIR...,CCHHHHHHHHCCCEEEEEECCEEEEEEECCCCCCEEEEEECCCCCC...
2,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",MHHHHHHGSLLPATHELHIFGSINSLEFDLVGRGTGNPKEGYEELH...,CCCCCCCCCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCCEEEEE...
3,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
4,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
5,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...
6,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",MSVIKPDMKIKLRMEGAVNGHPFAIEGVGLGKPFEGKQSMDLKVKE...,CCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCCEEEEEEEEEC...
7,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",MVAGHASGSPAFGTASHSNCEHEEIHLAGSIQPHGALLVVSEHDHR...,CCCCCCCCCCCCCCCCCCHHHCCCCCCCCCCCCCEEEEEEECCCCE...
8,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",MVEKFVGTWKIADSHNFGEYLKAIGAPKELSDGGDATTPTLYISQK...,CCCCCCEEEEECEECCHHHHHHHCCCCHHHHHHHHHCCCEEEEEEE...
9,"1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,1...","[False, False, False, False, False, False, Fal...",MVSELIKENMPMKLYMEGTVNNHHFKCTSEGEGKPYEGTQTMRIKV...,CCCCCCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCCEEEEEEEE...
