In [2]:
%cd ../

import os
import numpy as np
import pandas as pd
from tensorflow import keras
from sklearn.model_selection import train_test_split
from proteinbert import load_pretrained_model

/Users/qiaochufeng/Documents/GitHub/DS596-Project


In [13]:
# Load the sampled dataset; the CSV column named "Unnamed: 0" becomes the row index.
balanced_df = pd.read_csv(
    "sampled.csv",
    index_col="Unnamed: 0",
)

In [14]:
# Show the loaded frame as this cell's output (columns: Site, Split, Sequence, Secondary).
balanced_df

Unnamed: 0,Site,Split,Sequence,Secondary
0,159,True,MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...,CCCCCCCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEE...
1,107,False,MVSKGEEDNMAIIKEFMRFKVHMEGSVNGHEFEIEGEGEGRPYEGT...,CCCCCCCCCCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEE...
2,79,False,MVSVIKPEMKMRYYMDGSVNGHEFTIEGEGTGRPYEGHQEMTLRVT...,CCCCCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEEEEE...
3,253,False,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...
4,508,True,IEKKKSFAKGMGVKSTLVSGSKVYMTTFAEGSDARLEKIVEGDSIR...,CCHHHHHHHHCCCEEEEEECCEEEEEEECCCCCCEEEEEECCCCCC...
...,...,...,...,...
99,150,False,MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKF...,CCCCHHHCCCCCEEEEEEEEEECCEEEEEEEEEEECCCCEEEEEEE...
100,214,True,MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLK...,CCCCHHHHCCCCCCEEEEEEEEECCEEEEEEEEEEECCCCEEEEEE...
101,101,False,MVSKGEEDNMASLPATHELHIFGSINGVDFDMVGQGTGNPNDGYEE...,CCCCCCHHCCCCCCEEEEEEEEEEECCEEEEEEEEEEECCCCCEEE...
102,607,True,MSKLEKFTNCYSLSKTLRFKAIPVGKTQENIDNKRLLVEDEKRAED...,CCHHHHHCCCCCCCCEEEEEEEECCCHHHHHHHCCCCHHHHHHHHH...


In [16]:
# Bind a second name to the same DataFrame object; no copy is made, so both
# names refer to the same underlying data.
sampled_df = balanced_df

In [18]:
import numpy as np

def encode_secondary_structures_alternative(secondary_structures, max_len=128):
    """One-hot encode secondary-structure strings into a fixed-length array.

    The alphabet is the sorted set of characters found across all input
    strings (before truncation), so the channel order is deterministic for a
    given input.

    Args:
        secondary_structures: Iterable of strings (for example a pandas
            Series of per-residue structure labels).
        max_len: Number of positions kept per sequence. Longer strings are
            truncated to ``max_len``; shorter ones are padded.

    Returns:
        A float64 array of shape ``(n_sequences, max_len, n_unique_chars)``.
        Each real position holds a single 1 in the channel of its character.
        Padded positions are left as all-zero rows so that padding is not
        confused with the first symbol of the alphabet.
    """
    # Materialise once: the input is traversed twice and its length is needed.
    sequences = list(secondary_structures)

    # Determine unique characters in the secondary structure sequences
    unique_chars = sorted(set("".join(sequences)))
    char_to_index = {char: i for i, char in enumerate(unique_chars)}

    one_hot_encoded = np.zeros((len(sequences), max_len, len(unique_chars)))
    for row, seq in enumerate(sequences):
        # Only real residues are marked; positions past the end of a short
        # sequence keep their initial all-zero vector (explicit padding).
        for pos, char in enumerate(seq[:max_len]):
            one_hot_encoded[row, pos, char_to_index[char]] = 1

    return one_hot_encoded

# One-hot encode the Secondary column using the encoder's default length
secondary_encoded_fixed = encode_secondary_structures_alternative(
    sampled_df["Secondary"]
)

# Show the shape of the encoded array as the cell output
secondary_encoded_fixed.shape


(104, 128, 3)

In [24]:
# load_pretrained_model is imported from the proteinbert package; its two
# return values are unpacked here into a model generator and an input encoder.
# NOTE(review): presumably this loads (and may download) pretrained weights —
# confirm against the proteinbert documentation.
pretrained_model_generator, input_encoder = load_pretrained_model()

def encode_sequences_with_merge(sequences, encoder, seq_len=512):
    """Encode sequences with ``encoder.encode_X`` and return the primary output.

    Args:
        sequences: Sequences forwarded unchanged to ``encoder.encode_X``.
        encoder: Object exposing ``encode_X(sequences, seq_len=...)``.
        seq_len: Sequence length forwarded to the encoder.

    Returns:
        The first element when ``encode_X`` returns a list; otherwise the
        encoder's return value unchanged.
    """
    result = encoder.encode_X(sequences, seq_len=seq_len)
    if isinstance(result, list):
        return result[0]
    return result

# Encode the Sequence column with the loaded input encoder at length 1500.
sequence_encoded = encode_sequences_with_merge(
    sampled_df["Sequence"],
    input_encoder,
    seq_len=1500,
)

In [None]:
# Inspect the dimensions of each component
# The flattened secondary-structure array is derived in this cell so that the
# cell runs top-to-bottom on a fresh kernel instead of depending on a name
# that is only assigned further down the notebook.
secondary_flattened = secondary_encoded_fixed.reshape(secondary_encoded_fixed.shape[0], -1)

sequence_encoded_shape = np.array(sequence_encoded).shape
secondary_flattened_shape = secondary_flattened.shape
site_column_shape = sampled_df[['Site']].values.shape

sequence_encoded_shape, secondary_flattened_shape, site_column_shape

((104, 1500), (104, 384), (104, 1))

In [26]:
# Collapse each per-sequence one-hot matrix into a single flat feature row
secondary_flattened = secondary_encoded_fixed.reshape(len(secondary_encoded_fixed), -1)

# Feature matrix, column-wise: sequence encoding | flattened secondary structure | Site
X = np.concatenate(
    (
        np.array(sequence_encoded),
        secondary_flattened,
        sampled_df[["Site"]].values,
    ),
    axis=1,
)

# Labels: the Split column cast to integers
y = sampled_df["Split"].astype(int).values

# Show the shapes of the feature matrix and the label vector
X.shape, y.shape

((104, 1885), (104,))

In [33]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; X is rebound to the scaled matrix.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed in a later cell, so held-out rows contribute to the scaling
# statistics — confirm this is acceptable for the reported scores.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [50]:
from sklearn.decomposition import PCA

# Reduce to 100 dimensions
# NOTE(review): PCA is fit on every row of X, before the train/test split made
# in a later cell, so held-out rows influence the projection — confirm this is
# intended.
pca = PCA(n_components=100)
X_reduced = pca.fit_transform(X)

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the PCA-reduced rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

# Two hidden layers (128 and 64 units); seeded so weight initialisation is repeatable.
mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), max_iter=500, random_state=42)
mlp_model.fit(X_train, y_train)

y_pred = mlp_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# classification_report returns a pre-formatted multi-line string; printing it
# keeps the table readable, whereas echoing it inside a tuple shows an escaped
# repr with literal "\n" sequences.
print(f"MLP test accuracy: {accuracy:.4f}")
print(report)



(0.7619047619047619,
 '              precision    recall  f1-score   support\n\n           0       0.88      0.64      0.74        11\n           1       0.69      0.90      0.78        10\n\n    accuracy                           0.76        21\n   macro avg       0.78      0.77      0.76        21\nweighted avg       0.79      0.76      0.76        21\n')

In [55]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 50-tree random forest on the train/test split created in the earlier
# model cell; the seed makes the forest repeatable.
rf_model = RandomForestClassifier(n_estimators=50, random_state=42)
rf_model.fit(X_train, y_train)

# Note: y_pred, accuracy and report are rebound here and now describe the
# random forest rather than the previously evaluated model.
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the pre-formatted report so the table is readable instead of being
# shown as an escaped string inside a tuple repr.
print(f"Random forest test accuracy: {accuracy:.4f}")
print(report)

(0.6666666666666666,
 '              precision    recall  f1-score   support\n\n           0       0.75      0.55      0.63        11\n           1       0.62      0.80      0.70        10\n\n    accuracy                           0.67        21\n   macro avg       0.68      0.67      0.66        21\nweighted avg       0.69      0.67      0.66        21\n')