## Gene Prediction from Trained Models

This notebook loads pretrained machine-learning (ML) and deep-learning (DL) models and uses them to predict whether new gene feature profiles are associated with COVID-19.

We use two models, each loaded together with its own saved scaler, PCA transform, and feature list:
- ML: a Random Forest (scikit-learn)
- DL: a fully connected neural network (Keras)

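Outputs (one CSV file each, with a single `Gene ID` column):
- Genes predicted positive by ML (`ml_positive_genes.csv`)
- Genes predicted positive by DL (`dl_positive_genes.csv`)
- Genes predicted positive by both (`common_positive_genes.csv`)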



Load Packages and Read Input

In [None]:
import pandas as pd
import numpy as np
import joblib
from tensorflow.keras.models import load_model
import os

# Path of the CSV that holds the gene feature profiles to score.
# This is a placeholder: point it at your own file before running.
input_file = 'path/to/your/input_gene_features.csv'

# Parse the table into a DataFrame and echo its (rows, columns) shape as a
# quick sanity check that the expected file was read.
data = pd.read_csv(filepath_or_buffer=input_file)
print(f"✅ Data loaded: {data.shape}")



Load Trained Models

In [None]:
# Directory that holds the exported model artifacts. It defaults to Colab's
# upload folder; change this single constant when the files live elsewhere.
MODEL_DIR = "/content"

# SECURITY NOTE: joblib.load unpickles its input, and unpickling can execute
# arbitrary code. Only load artifacts that come from a source you trust.

# ML pipeline artifacts. Each stage is loaded from its own file so it can be
# applied to new data in the prediction cell.
# NOTE(review): the "(11)", "(8)", ... suffixes look like browser download
# duplicates; confirm these are the artifacts saved by the intended training run.
ml_scaler   = joblib.load(os.path.join(MODEL_DIR, "scaler (11).pkl"))
ml_pca      = joblib.load(os.path.join(MODEL_DIR, "pca (8).pkl"))
ml_features = joblib.load(os.path.join(MODEL_DIR, "feature (2).pkl"))
ml_model    = joblib.load(os.path.join(MODEL_DIR, "rf_model (4).pkl"))

# DL pipeline artifacts. The scaler, PCA and feature list are separate files
# from the ML ones; the network itself is a Keras HDF5 model.
dl_scaler   = joblib.load(os.path.join(MODEL_DIR, "scaler (12).pkl"))
dl_pca      = joblib.load(os.path.join(MODEL_DIR, "pca (9).pkl"))
dl_features = joblib.load(os.path.join(MODEL_DIR, "feature_names.pkl"))
dl_model    = load_model(os.path.join(MODEL_DIR, "model (2).h5"))

# Identifier for every input row. When the input has no 'Gene ID' column,
# fall back to synthetic positional labels "Gene_0", "Gene_1", ...
if 'Gene ID' in data.columns:
    gene_ids = data['Gene ID'].values
else:
    gene_ids = np.array([f"Gene_{i}" for i in range(len(data))])


Make Predictions

In [None]:
def _prepare_features(frame, feature_names):
    """Build the cleaned feature matrix a model pipeline expects.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw input table; it is not modified.
    feature_names : sequence of column labels
        Columns to select, in the order given.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``feature_names``, with +/-inf replaced by 1
        and missing values replaced by 0.

    Raises
    ------
    KeyError
        If any requested feature column is absent from ``frame``.
    """
    wanted = list(feature_names)
    # 'Gene ID' is an identifier, not a feature; drop it when present.
    candidates = frame.drop(columns=['Gene ID'], errors='ignore')

    # Fail early with a readable message instead of an opaque pandas KeyError.
    missing = [name for name in wanted if name not in candidates.columns]
    if missing:
        raise KeyError(
            f"Input is missing {len(missing)} required feature column(s), "
            f"e.g. {missing[:10]}"
        )

    # NOTE(review): +/-inf is mapped to 1 and NaN to 0 — presumably mirrors the
    # preprocessing used at training time; confirm against the training notebook.
    return candidates[wanted].replace([np.inf, -np.inf], 1).fillna(0)


# === ML Prediction ===
# Select/clean features, then scale -> PCA -> model, thresholding at 0.5.
ml_data = _prepare_features(data, ml_features)
ml_scaled = ml_scaler.transform(ml_data)
ml_pca_data = ml_pca.transform(ml_scaled)
ml_preds = (ml_model.predict(ml_pca_data) > 0.5).astype(int).flatten()
ml_positive_ids = gene_ids[ml_preds == 1]

# === DL Prediction ===
# Same steps using the DL-specific scaler, PCA and feature list.
# NOTE(review): flatten() assumes one output value per input row — confirm the
# network has a single output unit.
dl_data = _prepare_features(data, dl_features)
dl_scaled = dl_scaler.transform(dl_data)
dl_pca_data = dl_pca.transform(dl_scaled)
dl_preds = (dl_model.predict(dl_pca_data, verbose=0) > 0.5).astype(int).flatten()
dl_positive_ids = gene_ids[dl_preds == 1]

# === Intersection ===
# Genes flagged by both models; np.intersect1d returns sorted, unique IDs.
common_positive_ids = np.intersect1d(ml_positive_ids, dl_positive_ids)


Save Outputs

In [None]:
# Wrap each ID array in a one-column frame so it is written with a header.
ml_df     = pd.DataFrame({'Gene ID': ml_positive_ids})
dl_df     = pd.DataFrame({'Gene ID': dl_positive_ids})
common_df = pd.DataFrame({'Gene ID': common_positive_ids})

# Output file name -> frame. Files are written to the current working directory.
output_frames = {
    "ml_positive_genes.csv": ml_df,
    "dl_positive_genes.csv": dl_df,
    "common_positive_genes.csv": common_df,
}
for output_name, output_frame in output_frames.items():
    output_frame.to_csv(output_name, index=False)

print("✅ Files saved:")
print(f" - ML positive: {ml_df.shape[0]} genes")
print(f" - DL positive: {dl_df.shape[0]} genes")
print(f" - Common positive: {common_df.shape[0]} genes")

# Download (Colab)
# `files` was used here without ever being imported, which raised NameError.
# Import it here, and skip the browser download when not running in Colab
# (the CSVs are already on disk at that point).
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    for output_name in output_frames:
        files.download(output_name)
else:
    print("ℹ️ Not running in Colab; skipping browser download.")
