In [2]:
%cd ../

import os
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from proteinbert import load_pretrained_model

/Users/qiaochufeng/Documents/GitHub/DS596-Project


In [120]:
balanced_df = pd.read_csv("unsampled.csv", index_col="Unnamed: 0")

In [121]:
balanced_df

Unnamed: 0,Site,Split,Sequence,Secondary
0,1,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
1,2,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
2,3,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
3,4,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
4,5,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
...,...,...,...,...
4055,603,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...
4056,604,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...
4057,605,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...
4058,606,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...


In [122]:
balanced_df_ = balanced_df[balanced_df['Sequence'].apply(len) < 500]
balanced_df_

Unnamed: 0,Site,Split,Sequence,Secondary
0,1,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
1,2,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
2,3,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
3,4,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
4,5,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
...,...,...,...,...
2940,185,False,MVSELIKENMPMKLYMEGTVNNHHFKCTSEGEGKPYEGTQTMRIKV...,CCCCCCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCCEEEEEEEE...
2941,186,False,MVSELIKENMPMKLYMEGTVNNHHFKCTSEGEGKPYEGTQTMRIKV...,CCCCCCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCCEEEEEEEE...
2942,187,False,MVSELIKENMPMKLYMEGTVNNHHFKCTSEGEGKPYEGTQTMRIKV...,CCCCCCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCCEEEEEEEE...
2943,188,True,MVSELIKENMPMKLYMEGTVNNHHFKCTSEGEGKPYEGTQTMRIKV...,CCCCCCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCCEEEEEEEE...


In [123]:
# sampled_df = balanced_df
sampled_df = balanced_df_

In [124]:
import numpy as np

def encode_secondary_structures_alternative(secondary_structures, max_len=128):
    # Determine unique characters in the secondary structure sequences
    unique_chars = sorted(list(set("".join(secondary_structures))))
    char_to_index = {char: i for i, char in enumerate(unique_chars)}
    num_unique_chars = len(unique_chars)

    # Convert characters to indices and pad/truncate sequences to the specified max length
    sequences_numeric = [
        [char_to_index[char] for char in seq[:max_len]] + [0] * (max_len - len(seq[:max_len]))
        for seq in secondary_structures
    ]
    
    # Convert to one-hot encoded format
    one_hot_encoded = np.zeros((len(secondary_structures), max_len, num_unique_chars))
    for i, seq in enumerate(sequences_numeric):
        for j, idx in enumerate(seq):
            one_hot_encoded[i, j, idx] = 1

    return one_hot_encoded

# Apply the alternative method to encode the Secondary column
secondary_encoded_fixed = encode_secondary_structures_alternative(sampled_df['Secondary'])

# Validate the shape of the encoded data
secondary_encoded_fixed.shape


(2945, 128, 3)

In [125]:
pretrained_model_generator, input_encoder = load_pretrained_model()

def encode_sequences_with_merge(sequences, encoder, seq_len=512):
    encoded = encoder.encode_X(sequences, seq_len=seq_len)
    valid_encoded = encoded[0] if isinstance(encoded, list) else encoded
    return valid_encoded

sequence_encoded = encode_sequences_with_merge(sampled_df['Sequence'], input_encoder, seq_len=1500)

In [126]:
# Inspect the dimensions of each component
sequence_encoded_shape = np.array(sequence_encoded).shape
secondary_flattened_shape = secondary_flattened.shape
site_column_shape = sampled_df[['Site']].values.shape

sequence_encoded_shape, secondary_flattened_shape, site_column_shape

((2945, 1500), (6276, 384), (2945, 1))

In [127]:
# Flatten the secondary structure encoding to match the input format for the MLP model
secondary_flattened = secondary_encoded_fixed.reshape(secondary_encoded_fixed.shape[0], -1)

# Combine features: Sequence encoding, secondary structure encoding, and Site
X = np.hstack([
    np.array(sequence_encoded),  # Sequence encoding (already flat)
    secondary_flattened,         # Flattened secondary structure encoding
    sampled_df[['Site']].values  # Site column
])

# Target variable: Split
y = sampled_df['Split'].astype(int).values

# Validate the final shapes of features and target variable
X.shape, y.shape

((2945, 1885), (2945,))

In [128]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [129]:
from sklearn.decomposition import PCA

pca = PCA(n_components=104)
X_reduced = pca.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)

y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9728353140916808,
 '              precision    recall  f1-score   support\n\n           0       0.97      1.00      0.99       573\n           1       0.00      0.00      0.00        16\n\n    accuracy                           0.97       589\n   macro avg       0.49      0.50      0.49       589\nweighted avg       0.95      0.97      0.96       589\n')

In [None]:
# discuss一下
# 1. 没加强度，欠采样76%，过采样90%，可以解释为什么实际表现不好，不采样97%，但应该是因为过拟合）
# 2. 加强度算（明天做，有点麻烦（只能使用长度<512的sequence，信息不够多）/计算时间问题（需要运行1w次模型），而且精确度已经足够高了所以可能其实没必要，可以在论文里提一下）