In [3]:
import time
import json
import requests
import pandas as pd
from sklearn.utils import resample



def call_psipred_api(sequence: str):
    psipred = "http://bioinf.cs.ucl.ac.uk/psipred/api"
    submit_url = f"{psipred}/submission"
    fasta_sequence = f">query\n{sequence}"

    payload = {'input_data': fasta_sequence}
    data = {'job': 'psipred', 'submission_name': 'test','email': 'carrief0908@gmail.com'}
    r = requests.post(f"{submit_url}.json", data=data, files=payload)
    response_data = json.loads(r.text)
    print(response_data)
    uuid = response_data['UUID']

    retries = 0
    while retries < 30:
      result_uri = f"{submit_url}/{uuid}"
      r = requests.get(result_uri, headers={"Accept":"application/json"})
      result_data = json.loads(r.text)
      if "Complete" in result_data["state"]:
          data_path = result_data['submissions'][0]['results'][5]['data_path']
          response = requests.get(f"{psipred}{data_path}")
          if response.status_code != 200:
              raise Exception(f"Failed to get results: {response.text}")
          ss_sequence = ""
          for line in response.text.splitlines():
              if not line.startswith('#') and len(line.split()) > 2:
                  ss_sequence += line.split()[2]
          return ss_sequence
      else:
          retries += 1
          time.sleep(30)

    raise Exception("Timeout waiting for PSIPRED results")



split_data = pd.read_csv('../data/split.csv')
split_data = split_data[['Split Site', 'Sequence']]
split_data['Sequence'] = split_data['Sequence'].str.replace(' ', '', regex=False)

expanded_rows = split_data['Split Site'].str.split('/').explode()
expanded_data = pd.DataFrame({
    'Split Site': expanded_rows,
    'Sequence': split_data.loc[expanded_rows.index, 'Sequence'].values
})
expanded_data.reset_index(drop=True, inplace=True)

In [None]:
def read_seq_json(file_path: str) -> dict:
    with open(file_path, 'r') as f:
        return json.load(f)
Seqs = read_seq_json('../data/seq_2_second.json')

expanded_data['Secondary'] = expanded_data['Sequence'].map(Seqs)
expanded_data.dropna(subset=["Secondary"], inplace=True) # 换成AlphaFold结果之后记得删掉这行
expanded_data.to_csv('expanded.csv')
expanded_data

In [None]:
# expanded_data_grouped = expanded_data.groupby(['Sequence', 'Secondary'])['Split Site'].agg(list).reset_index()

In [5]:
file_path = '../data/expanded.csv'
expanded_data = pd.read_csv(file_path)
unique_sequences = expanded_data['Sequence'].unique()
expanded_data_ = []

for sequence in unique_sequences:
    
    seq_data = expanded_data[expanded_data['Sequence'] == sequence]
    split_sites = seq_data['Split Site'].tolist()
    secondary_structure = seq_data['Secondary'].iloc[0]
    
    max_site = max(split_sites)
    all_sites = list(range(1, max_site + 1))
    for site in all_sites:
        expanded_data_.append({
            'Site': site,
            'Split': site in split_sites,
            'Sequence': sequence,
            'Secondary': secondary_structure
        })

expanded_df = pd.DataFrame(expanded_data_)

In [4]:
file_path = '../data/expanded_transformed.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Site,Split,Sequence,Secondary
0,1,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
1,2,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
2,3,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
3,4,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
4,5,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...


In [5]:
expanded_df = data
true_count = expanded_df['Split'].sum()
false_count = len(expanded_df) - true_count

true_samples = expanded_df[expanded_df['Split'] == True]
false_samples = expanded_df[expanded_df['Split'] == False]

false_downsampled = resample(
    false_samples,
    replace=False,
    n_samples=true_count,
    random_state=42
)

balanced_df = pd.concat([true_samples, false_downsampled])

balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_df

Unnamed: 0,Site,Split,Sequence,Secondary
0,159,True,MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...,CCCCCCCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEE...
1,107,False,MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...,CCCCCCCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEE...
2,79,False,MVSVIKPEMKMRYYMDGSVNGHEFTIEGEGTGRPYEGHQEMTLRVT...,CCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEEEEE...
3,253,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...
4,508,True,IEKKKSFAKGMGVKSTLVSGSKVYMTTFAEGSDARLEKIVEGDSIR...,CCHHHHHHHHCCCEEEEEECCEEEEEEECCCCCCEEEEEECCCCCC...
...,...,...,...,...
99,150,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
100,214,True,MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLK...,CCCCHHHHCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCEEEEEE...
101,101,False,MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEE...,CCCCCCHHCCCCCCEEEEEEEEEEECCEEEEEEEEEEECCCCCEEE...
102,607,True,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...


In [8]:
true_samples = expanded_df[expanded_df['Split'] == True]
false_samples = expanded_df[expanded_df['Split'] == False]

true_oversampled = resample(
    true_samples,
    replace=True, 
    n_samples=len(false_samples),
    random_state=42
)

oversampled_df = pd.concat([true_oversampled, false_samples])

oversampled_df = oversampled_df.sample(frac=1, random_state=42).reset_index(drop=True)
oversampled_df

Unnamed: 0,Site,Split,Sequence,Secondary
0,104,False,IEKKKSFAKGMGVKSTLVSGSKVYMTTFAEGSDARLEKIVEGDSIR...,CCHHHHHHHHCCCEEEEEECCEEEEEEECCCCCCEEEEEECCCCCC...
1,138,True,EEDNNAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGHPYEGTQTAKL...,CCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
2,172,True,MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLK...,CCCCHHHHCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCEEEEEE...
3,109,False,MSVIKPDMKIKLRMEGAVNGHPFAIEGVGLGKPFEGKQSMDLKVKE...,CCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCCEEEEEEEEEC...
4,69,False,EEDNNAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGHPYEGTQTAKL...,CCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
...,...,...,...,...
8011,4,False,MVAGHASGSPAFGTASHSNCEHEEIHLAGSIQPHGALLVVSEHDHR...,CCCCCCCCCCCCCCCCCCHHHCCCCCCCCCCCCCEEEEEEECCCCE...
8012,47,False,MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEE...,CCCCCCHHCCCCCCEEEEEEEEEEECCEEEEEEEEEEECCCCCEEE...
8013,59,True,EEDNNAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGHPYEGTQTAKL...,CCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
8014,189,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...


In [39]:
balanced_df.to_csv("sampled.csv")

In [8]:
%pwd

'/Users/qiaochufeng/Documents/GitHub/DS596-Project/split_prediction'

### Encoding

In [8]:
%cd ../

/Users/qiaochufeng/Documents/GitHub/DS596-Project


In [9]:
import os
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from proteinbert import load_pretrained_model


BENCHMARK_NAME = 'fluorescence'
BENCHMARKS_DIR = 'protein_bert/protein_benchmarks'

train_set_file_path = os.path.join(BENCHMARKS_DIR, f'{BENCHMARK_NAME}.train.csv')
train_set = pd.read_csv(train_set_file_path).dropna().drop_duplicates()
train_set, valid_set = train_test_split(train_set, test_size=0.1, random_state=0)

test_set_file_path = os.path.join(BENCHMARKS_DIR, f'{BENCHMARK_NAME}.test.csv')
test_set = pd.read_csv(test_set_file_path).dropna().drop_duplicates()

In [10]:
pretrained_model_generator, input_encoder = load_pretrained_model()

def encode_sequences_with_merge(sequences, encoder, seq_len=512):
    encoded = encoder.encode_X(sequences, seq_len=seq_len)
    valid_encoded = encoded[0] if isinstance(encoded, list) else encoded
    return valid_encoded

In [15]:
train_features = encode_sequences_with_merge(train_set['seq'], input_encoder, seq_len=512)
valid_features = encode_sequences_with_merge(valid_set['seq'], input_encoder, seq_len=512)
test_features = encode_sequences_with_merge(test_set['seq'], input_encoder, seq_len=512)

In [16]:
train_features.shape

(19301, 512)

In [12]:
train_set['seq'].shape

(19301,)

In [13]:
balanced_df['Sequence'].shape

(104,)

In [17]:
L = balanced_df['Sequence'].unique().tolist()

In [20]:
for l in L:
    print(len(l))
print(L[3])

236
218
1228
966
238
239
238
224
225
315
239
232
239
228
218
236
139
IEKKKSFAKGMGVKSTLVSGSKVYMTTFAEGSDARLEKIVEGDSIRSVNEGEAFSAEMADKNAGYKIGNAKFSHPKGYAVVANNPLYTGPVQQDMLGLKETLEKRYFGESADGNDNICIQVIHNILDIEKILAEYITNAAYAVNNISGLDKDIIGFGKFSTVYTYDEFKDPEHHRAAFNNNDKLINAIKAQYDEFDNFLDNPRLGYFGQAFFSKEGRNYIINYGNECYDILALLSGLRHWVVHNNEEESRISRTWLYNLDKNLDNEYISTLNYLYDRITNELTNSFSKNSAANVNYIAETLGINPAEFAEQYFRFSIMKEQKNLGFNITKLREVMLDRKDMSEIRKNHKVFDSIRTKVYTMMDFVIYRYYIEEDAKVAAANKSLPDNEKSLSEKDIFVINLRGSFNDDQKDALYYDEANRIWRKLENIMHNIKEFRGNKTREYKKKDAPRLPRILPAGRDVSAFSKLMYALTMFLDGKEINDLLTTLINKFDNIQSFLKVMPLIGVNAKFVEEYAFFKDSAKIADELRLIKSFARMGEPIADARRAMYIDAIRILGTNLSYDELKALADTFSLDENGNKLKKGKHGMRNFIINNVISNKRFHYLIRYGDPAHLHEIAKNEAVVKFVLGRIADIQKKQGQNGKNQIDRYYETCIGKDKGKSVSEKVDALTKIITGMNYDQFDKKRSVIEDTGRENAEREKFKKIISLYLTVIYHILKNIVNINARYVIGFHCVERDAQLYKEKGYDINLKKLEEKGFSSVTKLCAGIDETAPDKRKDVEKEMAERAKESIDSLESANPKLYANYIKYSDEKKAEEFTRQINREKAKTALNAYLRNTKWNVIIREDLLRIDNKTCTLFRNKAVHLEVARYVHAYINDIAEVNSYFQLYHYIMQRIIMNERYEKSSGKVSEYFDAVNDEKKYNDRLLKLLCVPF

In [13]:
train_set['seq'].head()
balanced_df['Sequence'].head()

0    MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...
1    MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...
2    MVSVIKPEMKMRYYMDGSVNGHEFTIEGEGTGRPYEGHQEMTLRVT...
3    MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...
4    IEKKKSFAKGMGVKSTLVSGSKVYMTTFAEGSDARLEKIVEGDSIR...
Name: Sequence, dtype: object

In [1]:
balanced_df

NameError: name 'balanced_df' is not defined

In [14]:
encoder = input_encoder
sequences = balanced_df['Sequence'].unique()

encoded_list = {}
for s in sequences:
    encoded = encoder.encode_X(s, seq_len=512)
    encoded_list[s] = encoded[0]

In [15]:
from sklearn.preprocessing import OneHotEncoder

sampled_df = balanced_df
unique_chars = sorted(set("".join(sampled_df['Secondary'])))
char_to_index = {char: idx for idx, char in enumerate(unique_chars)}

# Function to one-hot encode a single secondary structure string
def one_hot_encode_secondary(structure, char_to_index, max_length=None):
    encoded = np.zeros((max_length, len(char_to_index)), dtype=int)
    for i, char in enumerate(structure[:max_length]):
        encoded[i, char_to_index[char]] = 1
    return encoded

# Define maximum length for encoding (uniform length)
max_length = max(len(s) for s in sampled_df['Secondary'])

# Encode the Secondary column
one_hot_encoded_secondary = sampled_df['Secondary'].apply(
    lambda x: one_hot_encode_secondary(x, char_to_index, max_length)
)

sampled_df['Encoded_Secondary'] = one_hot_encoded_secondary.apply(lambda x: x.tolist())
sampled_df

Unnamed: 0,Site,Split,Sequence,Secondary,Encoded_Secondary
0,159,True,MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...,CCCCCCCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEE...,"[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
1,107,False,MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...,CCCCCCCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEE...,"[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
2,79,False,MVSVIKPEMKMRYYMDGSVNGHEFTIEGEGTGRPYEGHQEMTLRVT...,CCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEEEEE...,"[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
3,253,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...,"[[1, 0, 0], [1, 0, 0], [0, 0, 1], [0, 0, 1], [..."
4,508,True,IEKKKSFAKGMGVKSTLVSGSKVYMTTFAEGSDARLEKIVEGDSIR...,CCHHHHHHHHCCCEEEEEECCEEEEEEECCCCCCEEEEEECCCCCC...,"[[1, 0, 0], [1, 0, 0], [0, 0, 1], [0, 0, 1], [..."
...,...,...,...,...,...
99,150,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...,"[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
100,214,True,MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLK...,CCCCHHHHCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCEEEEEE...,"[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
101,101,False,MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEE...,CCCCCCHHCCCCCCEEEEEEEEEEECCEEEEEEEEEEECCCCCEEE...,"[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
102,607,True,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...,"[[1, 0, 0], [1, 0, 0], [0, 0, 1], [0, 0, 1], [..."


In [16]:
sampled_df['Sequence'] = sampled_df['Sequence'].map(encoded_list)
sampled_df.drop(['Secondary'], axis=1, inplace=True)
sampled_df

Unnamed: 0,Site,Split,Sequence,Encoded_Secondary
0,159,True,"[[23, 10, 24, 25, 25, 25, 25, 25, 25, 25, 25, ...","[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
1,107,False,"[[23, 10, 24, 25, 25, 25, 25, 25, 25, 25, 25, ...","[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
2,79,False,"[[23, 10, 24, 25, 25, 25, 25, 25, 25, 25, 25, ...","[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
3,253,False,"[[23, 10, 24, 25, 25, 25, 25, 25, 25, 25, 25, ...","[[1, 0, 0], [1, 0, 0], [0, 0, 1], [0, 0, 1], [..."
4,508,True,"[[23, 7, 24, 25, 25, 25, 25, 25, 25, 25, 25, 2...","[[1, 0, 0], [1, 0, 0], [0, 0, 1], [0, 0, 1], [..."
...,...,...,...,...
99,150,False,"[[23, 10, 24, 25, 25, 25, 25, 25, 25, 25, 25, ...","[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
100,214,True,"[[23, 10, 24, 25, 25, 25, 25, 25, 25, 25, 25, ...","[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
101,101,False,"[[23, 10, 24, 25, 25, 25, 25, 25, 25, 25, 25, ...","[[1, 0, 0], [1, 0, 0], [1, 0, 0], [1, 0, 0], [..."
102,607,True,"[[23, 10, 24, 25, 25, 25, 25, 25, 25, 25, 25, ...","[[1, 0, 0], [1, 0, 0], [0, 0, 1], [0, 0, 1], [..."


In [49]:
sampled_df.to_csv('sampled.csv')

In [17]:
data = sampled_df
sampled_df['Encoded_Secondary'] = sampled_df['Encoded_Secondary'].apply(np.array)
print(type(sampled_df['Encoded_Secondary'].iloc[0]))  # 应该输出 <class 'numpy.ndarray'>

<class 'numpy.ndarray'>


In [18]:
sampled_df.info()
print(type(sampled_df['Sequence'][0]))
print(type(sampled_df['Encoded_Secondary'][0]))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Site               104 non-null    int64 
 1   Split              104 non-null    bool  
 2   Sequence           104 non-null    object
 3   Encoded_Secondary  104 non-null    object
dtypes: bool(1), int64(1), object(2)
memory usage: 2.7+ KB
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [19]:
print(sampled_df['Sequence'][0].shape)
print(sampled_df['Encoded_Secondary'][0].shape)

(236, 512)
(1228, 3)


In [20]:
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Flatten the features and combine them
sequence_features = np.array([seq.flatten() for seq in sampled_df['Sequence']])  # Shape: (num_samples, 236*236)
secondary_features = np.array([sec.flatten() for sec in sampled_df['Encoded_Secondary']])  # Shape: (num_samples, 1228*3)
X = np.hstack([sequence_features, secondary_features])  # Combine features

# Target variable
y = sampled_df['Split'].astype(int).values

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the MLP model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),  # Adjusted for input size
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_split=0.2, verbose=1)

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (104,) + inhomogeneous part.