In [None]:
###### Imports

# !pip install pandas numpy matplotlib tensorflow scikit-learn transformers biopython pyarrow
# !wget https://git.scicore.unibas.ch/schwede/openstructure/-/raw/master/scripts/download_alphafold_params.sh
# !pip install pandas
# !pip install numpy
# !pip install biopython
# !pip install scikit-learn
# !pip install matplotlib
# !pip install tensorflow
# !pip install transformers

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from tensorflow.keras.models import load_model

# Global configuration: base directories referenced by the cells below.
DATA_DIR, OUTPUT_DIR, MODEL_DIR = "Data/", "Results/", "Models/"

# Create the two output locations up front so later saves cannot fail on a
# missing folder. DATA_DIR is not created here.
for _required_dir in (OUTPUT_DIR, MODEL_DIR):
    os.makedirs(_required_dir, exist_ok=True)

In [None]:
#### Data Preparation/Processing

from preprocess import clean_sequence, ProtBERTEmbedder

# Paths to the raw source datasets (VFDB, MvirDB, UniProt). They are defined
# for reference but not read in this cell; the cleaned CSVs below are used.
vfdb_path = os.path.join(DATA_DIR, "raw/vfdb_sequences.fasta")
uniprot_path = os.path.join(DATA_DIR, "raw/uniprot_ecoli.csv")

print("Loading datasets...")
vfdb_df = pd.read_csv(os.path.join(DATA_DIR, "processed/vfdb_cleaned.csv"))
uniprot_df = pd.read_csv(os.path.join(DATA_DIR, "processed/uniprot_cleaned.csv"))

# Combine datasets and shuffle. The fixed seed makes the row order (and so the
# row order of any embeddings generated from it) reproducible across runs.
full_df = (
    pd.concat([vfdb_df, uniprot_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(f"Combined dataset size: {full_df.shape}")

# Generate ProtBERT embeddings (if not already generated)
embeddings_path = os.path.join(DATA_DIR, "processed/sequence_embeddings.parquet")
# NOTE(review): the embedder is still constructed when the cache exists, which
# is presumably expensive; kept here in case later cells rely on `embedder`.
embedder = ProtBERTEmbedder()
if not os.path.exists(embeddings_path):
    print("Generating sequence embeddings...")
    embeddings = [embedder.embed(seq) for seq in full_df['sequence']]
    embedding_df = pd.DataFrame(embeddings)
    # pd.DataFrame(list) yields integer column names. Parquet column names are
    # strings, and to_parquet can reject non-string names (older pandas raises
    # "parquet must have string column names"), so convert them before saving.
    # This also makes the fresh frame match what read_parquet returns later.
    embedding_df.columns = embedding_df.columns.astype(str)
    embedding_df['label'] = full_df['label']
    embedding_df.to_parquet(embeddings_path)
else:
    print("Loading precomputed embeddings...")
    embedding_df = pd.read_parquet(embeddings_path)

print("Data preprocessing completed.")

To run AlphaFold, the sequences must be in FASTA format WITHOUT headers.
AlphaFold MUST be run on an A100 GPU.


In [None]:
######## Protein Structure Prediction ########

# No structures are predicted in this cell; it only reads the feature table
# that the external step is expected to have written to disk.
structure_features_path = os.path.join(DATA_DIR, "processed/structural_features.csv")
print("Protein structure prediction is handled externally using AlphaFold.")

structure_features = pd.read_csv(filepath_or_buffer=structure_features_path)
print(f"Loaded structural features from {structure_features_path}.")

In [None]:
#### Training ####

from sklearn.model_selection import train_test_split
# Import the callbacks explicitly: the notebook never binds the name `tf`, so
# referring to `tf.keras.callbacks` here raised NameError on a fresh kernel.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from train_model import build_ensemble_model

# Load sequence and structural features
# NOTE(review): X_seq / y come from `embedding_df` while X_struct comes from
# `structure_features`. Nothing here verifies that the two tables list the
# same proteins in the same row order -- confirm upstream before trusting results.
X_seq = embedding_df.drop(columns=['label']).values
X_struct = structure_features.drop(columns=['label']).values
y = embedding_df['label'].values

X_seq = np.expand_dims(X_seq, axis=-1)  # Reshape for CNN input
X_struct = X_struct.reshape((-1, 64, 64, 1))  # Adjust based on your structural feature dimensions

print(f"Sequence input shape: {X_seq.shape}")
print(f"Structural input shape: {X_struct.shape}")

# Train/test split: stratified on the label, with a fixed seed so the held-out
# set (and therefore every reported metric) is reproducible across runs.
(X_seq_train, X_seq_test,
 X_struct_train, X_struct_test,
 y_train, y_test) = train_test_split(
    X_seq, X_struct, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

model = build_ensemble_model(seq_input_shape=X_seq.shape[1:], struct_input_shape=X_struct.shape[1:])
model.summary()

# The checkpoint callback keeps the best model seen so far in best_model.h5.
# EarlyStopping is left at its default of not restoring weights, so `model`
# (and final_model.h5) hold the weights from the last epoch that ran.
history = model.fit(
    [X_seq_train, X_struct_train], y_train,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[
        EarlyStopping(patience=10),
        ModelCheckpoint(os.path.join(MODEL_DIR, "best_model.h5"), save_best_only=True),
    ]
)

model.save(os.path.join(MODEL_DIR, "final_model.h5"))
print("Model training completed.")


In [None]:
## Some visualizations of the model performance

# ROC curve
def plot_roc_curve(y_true, y_pred):
    """Draw the ROC curve for a set of predictions and save it as a PNG.

    Args:
        y_true: Ground-truth labels, forwarded unchanged to ``roc_curve``.
        y_pred: Predicted scores, forwarded unchanged to ``roc_curve``.

    Returns:
        None. The figure is written to ``roc_curve.png`` inside ``OUTPUT_DIR``
        and then closed.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_pred)
    roc_auc = auc(false_pos_rate, true_pos_rate)

    fig, ax = plt.subplots()
    ax.plot(false_pos_rate, true_pos_rate, label=f'ROC Curve (AUC={roc_auc:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal from (0, 0) to (1, 1)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('ROC Curve')
    ax.legend()

    fig.savefig(os.path.join(OUTPUT_DIR, 'roc_curve.png'))
    plt.close(fig)

# Score the test split and save the ROC curve.
y_pred = model.predict([X_seq_test, X_struct_test]).ravel()
plot_roc_curve(y_test, y_pred)

# Turn scores into hard 0/1 labels with a 0.5 cut-off and print the metrics.
y_pred_class = (y_pred > 0.5).astype(int)
report = classification_report(y_test, y_pred_class)
print(report)

# Render the confusion matrix as a heat map and write it to OUTPUT_DIR.
cm = confusion_matrix(y_test, y_pred_class)
cm_fig, cm_ax = plt.subplots()
cm_image = cm_ax.imshow(cm, cmap='Blues', interpolation='nearest')
cm_fig.colorbar(cm_image, ax=cm_ax)
cm_ax.set_xlabel('Predicted Label')
cm_ax.set_ylabel('True Label')
cm_ax.set_title('Confusion Matrix')
cm_fig.savefig(os.path.join(OUTPUT_DIR, 'confusion_matrix.png'))
plt.close(cm_fig)

# Results Evaluation graph