In [28]:
import time
import json
import requests
import pandas as pd



def _extract_third_column(text: str) -> str:
    """Join the third whitespace-separated field of every data line in ``text``.

    Lines beginning with '#' and lines with fewer than three fields are skipped.
    """
    rows = (line.split() for line in text.splitlines() if not line.startswith('#'))
    return "".join(fields[2] for fields in rows if len(fields) > 2)


def call_psipred_api(sequence: str,
                     email: str = 'carrief0908@gmail.com',
                     max_retries: int = 30,
                     poll_interval: float = 30,
                     request_timeout: float = 60):
    """Submit ``sequence`` to the PSIPRED web API and return the parsed result string.

    The sequence is wrapped in a one-record FASTA (header ``>query``) and posted as
    a 'psipred' job. The submission is then polled until its state contains
    "Complete", at which point one result file is downloaded and the third column
    of its data lines is concatenated (presumably the per-residue
    secondary-structure letter of PSIPRED's .ss2 output -- confirm).

    Args:
        sequence: Raw amino-acid sequence, without a FASTA header.
        email: Contact address sent with the submission.
        max_retries: Maximum number of status polls before giving up.
        poll_interval: Seconds to wait between two polls.
        request_timeout: Per-request timeout in seconds, so a stalled connection
            cannot block the notebook forever.

    Returns:
        The concatenated third-column values of the downloaded result file.

    Raises:
        Exception: If the server rejects the submission or a poll, if the result
            download does not return HTTP 200, or if the job is not complete
            after ``max_retries`` polls.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    psipred = "http://bioinf.cs.ucl.ac.uk/psipred/api"
    submit_url = f"{psipred}/submission"
    fasta_sequence = f">query\n{sequence}"

    payload = {'input_data': fasta_sequence}
    data = {'job': 'psipred', 'submission_name': 'test', 'email': email}
    r = requests.post(f"{submit_url}.json", data=data, files=payload,
                      timeout=request_timeout)
    # Fail with the server's own message instead of a JSON/KeyError further down.
    if not r.ok:
        raise Exception(f"PSIPRED submission failed ({r.status_code}): {r.text}")
    response_data = r.json()
    print(response_data)
    uuid = response_data['UUID']

    result_uri = f"{submit_url}/{uuid}"
    for attempt in range(max_retries):
        r = requests.get(result_uri, headers={"Accept": "application/json"},
                         timeout=request_timeout)
        if not r.ok:
            raise Exception(f"PSIPRED status poll failed ({r.status_code}): {r.text}")
        result_data = r.json()
        if "Complete" in result_data["state"]:
            # NOTE(review): assumes the sixth result entry is the file this parser
            # expects -- confirm against the API's result listing.
            data_path = result_data['submissions'][0]['results'][5]['data_path']
            response = requests.get(f"{psipred}{data_path}", timeout=request_timeout)
            if response.status_code != 200:
                raise Exception(f"Failed to get results: {response.text}")
            return _extract_third_column(response.text)
        # No point sleeping after the last attempt; go straight to the timeout error.
        if attempt < max_retries - 1:
            time.sleep(poll_interval)

    raise Exception("Timeout waiting for PSIPRED results")



# Load the split-site table, keeping only the two columns used below and
# stripping spaces from the sequences. .assign() returns a new frame, so the
# column rewrite never writes into a column-subset of the freshly read frame
# (the pattern that triggers pandas' SettingWithCopyWarning).
split_data = (
    pd.read_csv('../data/split.csv')[['Split Site', 'Sequence']]
    .assign(Sequence=lambda frame: frame['Sequence'].str.replace(' ', '', regex=False))
)

# One row per split site: an 'a/b/c' entry becomes three rows sharing one sequence.
# The exploded values are still strings; convert them before doing arithmetic.
expanded_rows = split_data['Split Site'].str.split('/').explode()
# Building from plain arrays yields a fresh 0..n-1 index directly, so no
# in-place reset_index is needed afterwards.
expanded_data = pd.DataFrame({
    'Split Site': expanded_rows.to_numpy(),
    'Sequence': split_data.loc[expanded_rows.index, 'Sequence'].to_numpy(),
})

In [22]:
def read_seq_json(file_path: str) -> dict:
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# Lookup that a later cell maps over the 'Sequence' column to fill 'Secondary'
# (presumably sequence -> secondary-structure string; verify against the JSON file).
Seqs = read_seq_json('../data/seq_2_second.json')

In [None]:
# Look up each row's sequence in Seqs to fill 'Secondary', then keep only the
# rows that got a value.
expanded_data = (
    expanded_data
    .assign(Secondary=lambda frame: frame['Sequence'].map(Seqs))
    # Remember to delete this filter after switching to AlphaFold results.
    .dropna(subset=["Secondary"])
)
expanded_data

In [30]:
# Persist the annotated table. With to_csv's default index=True the row index
# is written as an unnamed first column, which appears as an extra column
# when the file is read back with read_csv.
expanded_data.to_csv('expanded.csv')

In [None]:
# expanded_data_grouped = expanded_data.groupby(['Sequence', 'Secondary'])['Split Site'].agg(list).reset_index()

In [None]:
import pandas as pd

# Build a per-position table from the saved CSV: for every sequence, one row per
# site from 1 to its largest split site, flagged with whether that position is a
# recorded split site.
file_path = 'expanded.csv'
data = pd.read_csv(file_path)
# Work from the frame just loaded rather than from whatever `expanded_data`
# happens to be in the kernel, so this cell runs from the file alone.
unique_sequences = data['Sequence'].unique()
expanded_data_ = []

for sequence in unique_sequences:
    seq_data = data[data['Sequence'] == sequence]
    # Coerce to int: split sites produced by str.split('/') are strings, which
    # would make max() lexicographic, break `max_site + 1`, and never equal the
    # integer positions generated below. A set gives O(1) membership tests.
    split_sites = {int(site) for site in seq_data['Split Site']}
    secondary_structure = seq_data['Secondary'].iloc[0]

    max_site = max(split_sites)
    for site in range(1, max_site + 1):
        expanded_data_.append({
            'Site': site,
            'Split': site in split_sites,
            'Sequence': sequence,
            'Secondary': secondary_structure
        })

# Build the frame once from the collected records.
expanded_df = pd.DataFrame(expanded_data_)

In [None]:
# Read 'expanded.csv' (the file name used by the earlier to_csv call) into `data`.
file_path = 'expanded.csv'
data = pd.read_csv(file_path)