In [None]:
import time
import json
import requests
import pandas as pd
from sklearn.utils import resample



def _parse_ss_column(result_text: str) -> str:
    """Join the third whitespace-separated field of every non-comment line."""
    return "".join(
        fields[2]
        for fields in (
            line.split()
            for line in result_text.splitlines()
            if not line.startswith('#')
        )
        if len(fields) > 2
    )


def call_psipred_api(
    sequence: str,
    email: str = 'carrief0908@gmail.com',
    max_retries: int = 30,
    poll_interval: float = 30,
    timeout: float = 60,
) -> str:
    """Submit ``sequence`` to the PSIPRED web API and return the parsed result string.

    The sequence is wrapped in a one-record FASTA (header ``>query``), posted
    as a ``psipred`` job, and the job is polled until its ``state`` contains
    ``"Complete"``. The result file is then downloaded and the third column of
    every non-comment line is concatenated into the returned string.

    Args:
        sequence: Sequence to submit (sent verbatim after the FASTA header).
        email: Contact address sent with the submission.
            NOTE(review): the default is a personal address hard-coded in the
            notebook; consider passing it in from configuration instead.
        max_retries: Maximum number of status polls before giving up.
        poll_interval: Seconds to sleep between polls.
        timeout: Per-request timeout in seconds, so a stalled server cannot
            hang the kernel indefinitely.

    Returns:
        The concatenated third-column characters of the result file.

    Raises:
        ValueError: If ``sequence`` is empty.
        Exception: If submission, polling or the result download fails, or if
            the job is not complete after ``max_retries`` polls.
    """
    if not sequence:
        raise ValueError("sequence must be a non-empty string")

    psipred = "http://bioinf.cs.ucl.ac.uk/psipred/api"
    submit_url = f"{psipred}/submission"
    fasta_sequence = f">query\n{sequence}"

    payload = {'input_data': fasta_sequence}
    data = {'job': 'psipred', 'submission_name': 'test', 'email': email}
    r = requests.post(f"{submit_url}.json", data=data, files=payload, timeout=timeout)
    if not r.ok:
        # Fail with the server's message instead of a confusing JSON/KeyError below.
        raise Exception(f"Failed to submit job: {r.text}")
    response_data = json.loads(r.text)
    print(response_data)
    uuid = response_data['UUID']

    result_uri = f"{submit_url}/{uuid}"
    for _ in range(max_retries):
        r = requests.get(result_uri, headers={"Accept": "application/json"}, timeout=timeout)
        if not r.ok:
            raise Exception(f"Failed to poll job {uuid}: {r.text}")
        result_data = json.loads(r.text)
        if "Complete" in result_data["state"]:
            # NOTE(review): relies on the wanted result file being the sixth
            # entry of the first submission's results — confirm against the API.
            data_path = result_data['submissions'][0]['results'][5]['data_path']
            response = requests.get(f"{psipred}{data_path}", timeout=timeout)
            if response.status_code != 200:
                raise Exception(f"Failed to get results: {response.text}")
            return _parse_ss_column(response.text)
        time.sleep(poll_interval)

    raise Exception("Timeout waiting for PSIPRED results")



# Load the raw table, keep only the two columns used downstream, and remove
# every space character from the sequences.
split_data = pd.read_csv('../data/split.csv')[['Split Site', 'Sequence']]
split_data['Sequence'] = split_data['Sequence'].str.replace(' ', '', regex=False)

# A 'Split Site' cell may list several sites separated by '/'; explode them so
# each site gets its own row (the index still points at the originating row).
expanded_rows = split_data['Split Site'].str.split('/').explode()

# Pair every exploded site with the sequence of the row it came from, then
# renumber the rows from 0.
expanded_data = (
    expanded_rows
    .to_frame('Split Site')
    .assign(Sequence=split_data.loc[expanded_rows.index, 'Sequence'].values)
    .reset_index(drop=True)
)

In [22]:
def read_seq_json(file_path: str) -> dict:
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.

    Args:
        file_path: Path to a JSON file.

    Returns:
        The parsed JSON content (annotated as a dict by this notebook).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)
# Load the lookup parsed from JSON; the next cell maps each 'Sequence' value
# through it to fill the 'Secondary' column.
Seqs = read_seq_json('../data/seq_2_second.json')

In [None]:
# Look up each row's sequence in Seqs; sequences without an entry become NaN.
secondary_lookup = expanded_data['Sequence'].map(Seqs)
expanded_data['Secondary'] = secondary_lookup

# Drop rows that have no 'Secondary' value.
# Remember to delete this line after switching to the AlphaFold results.
expanded_data.dropna(subset=["Secondary"], inplace=True)
expanded_data

In [30]:
# Persist the annotated table in the working directory. The index is written
# as well (no index=False), so reading the file back yields an extra unnamed
# first column.
expanded_data.to_csv('expanded.csv')

In [None]:
# expanded_data_grouped = expanded_data.groupby(['Sequence', 'Secondary'])['Split Site'].agg(list).reset_index()

In [None]:
# Build one row per residue position (1..largest split site) for every
# sequence, flagging the positions that are recorded split sites.
#
# The table is taken from the CSV loaded here rather than from the kernel
# variable `expanded_data`, so the cell does not depend on hidden state.
file_path = 'expanded.csv'
data = pd.read_csv(file_path)
unique_sequences = data['Sequence'].unique()
expanded_data_ = []

# sort=False keeps the sequences in order of first appearance, matching
# `unique_sequences`.
for sequence, seq_data in data.groupby('Sequence', sort=False):
    # Coerce to int: sites produced by str.split('/') are strings, for which
    # max() would compare lexicographically and `max_site + 1` would fail.
    split_sites = set(seq_data['Split Site'].astype(int).tolist())
    secondary_structure = seq_data['Secondary'].iloc[0]

    # NOTE(review): positions after the last split site are not enumerated,
    # so they never appear as negative examples — confirm this is intended.
    max_site = max(split_sites)
    expanded_data_.extend(
        {
            'Site': site,
            'Split': site in split_sites,  # set lookup instead of list scan
            'Sequence': sequence,
            'Secondary': secondary_structure,
        }
        for site in range(1, max_site + 1)
    )

expanded_df = pd.DataFrame(expanded_data_)

In [3]:
# Load the per-site table from disk and preview its first rows.
# NOTE(review): nothing visible in this notebook writes
# 'expanded_transformed.csv' — presumably an export of `expanded_df` made
# elsewhere; confirm its provenance.
file_path = '../data/expanded_transformed.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Site,Split,Sequence,Secondary
0,1,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
1,2,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
2,3,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
3,4,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
4,5,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...


In [None]:
# Balance the classes by downsampling: keep every Split == True row and draw
# the same number of Split == False rows without replacement.
expanded_df = data

true_samples = expanded_df[expanded_df['Split'] == True]
false_samples = expanded_df[expanded_df['Split'] == False]

true_count = expanded_df['Split'].sum()
false_count = len(expanded_df) - true_count

false_downsampled = resample(
    false_samples, replace=False, n_samples=true_count, random_state=42
)

# Stack both classes, shuffle reproducibly, and renumber the rows.
balanced_df = (
    pd.concat([true_samples, false_downsampled])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
balanced_df


Unnamed: 0,Site,Split,Sequence,Secondary
0,159,True,MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...,CCCCCCCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEE...
1,107,False,MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...,CCCCCCCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEE...
2,79,False,MVSVIKPEMKMRYYMDGSVNGHEFTIEGEGTGRPYEGHQEMTLRVT...,CCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEEEEE...
3,253,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...
4,508,True,IEKKKSFAKGMGVKSTLVSGSKVYMTTFAEGSDARLEKIVEGDSIR...,CCHHHHHHHHCCCEEEEEECCEEEEEEECCCCCCEEEEEECCCCCC...
...,...,...,...,...
99,150,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
100,214,True,MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLK...,CCCCHHHHCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCEEEEEE...
101,101,False,MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEE...,CCCCCCHHCCCCCCEEEEEEEEEEECCEEEEEEEEEEECCCCCEEE...
102,607,True,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...


In [None]:
# Balance the classes by oversampling: keep every Split == False row and draw
# Split == True rows with replacement until both classes have the same size.
split_flags = expanded_df['Split']
true_samples = expanded_df[split_flags == True]
false_samples = expanded_df[split_flags == False]

true_oversampled = resample(
    true_samples, replace=True, n_samples=len(false_samples), random_state=42
)

# Stack both classes, shuffle reproducibly, and renumber the rows.
oversampled_df = (
    pd.concat([true_oversampled, false_samples])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
oversampled_df

Unnamed: 0,Site,Split,Sequence,Secondary
0,104,False,IEKKKSFAKGMGVKSTLVSGSKVYMTTFAEGSDARLEKIVEGDSIR...,CCHHHHHHHHCCCEEEEEECCEEEEEEECCCCCCEEEEEECCCCCC...
1,138,True,EEDNNAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGHPYEGTQTAKL...,CCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
2,172,True,MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLK...,CCCCHHHHCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCEEEEEE...
3,109,False,MSVIKPDMKIKLRMEGAVNGHPFAIEGVGLGKPFEGKQSMDLKVKE...,CCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCCEEEEEEEEEC...
4,69,False,EEDNNAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGHPYEGTQTAKL...,CCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
...,...,...,...,...
8011,4,False,MVAGHASGSPAFGTASHSNCEHEEIHLAGSIQPHGALLVVSEHDHR...,CCCCCCCCCCCCCCCCCCHHHCCCCCCCCCCCCCEEEEEEECCCCE...
8012,47,False,MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEE...,CCCCCCHHCCCCCCEEEEEEEEEEECCEEEEEEEEEEECCCCCEEE...
8013,59,True,EEDNNAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGHPYEGTQTAKL...,CCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
8014,189,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...
