## Multi‑Model Classification Pipeline

COVID‑19 Protein Dataset (44 balanced CSVs)

This notebook performs **everything in one place**:

1. EDA preview (quick glance at a sample dataset)  
2. Dataset cleaning & feature engineering (based on EDA)  
3. Full training & evaluation loop for **six classifiers**  
4. Summary table + optional plots

---

### ⚙️ Classifiers Evaluated
| Alias | Model | GridSearch Parameters |
|-------|-------|-----------------------|
| logreg | LogisticRegression | `C = [0.1, 1, 10]` |
| svm    | SVC (RBF kernel — sklearn default; no `kernel` entry in the grid) | `C = [1, 10]`, `gamma = ['scale','auto']` |
| knn    | KNeighbors          | `n_neighbors = [3, 5, 7]`, `weights = ['uniform','distance']` |
| dtree  | DecisionTree        | `max_depth = [5, 10, 15]`, `min_samples_split = [2, 5]` |
| rf     | RandomForest        | see code (depth, leaves, samples, etc.) |
| gb     | GradientBoosting    | `n_estimators = [100, 200]`, `learning_rate = [0.05, 0.1]`, `max_depth = [3, 5]` |

---

**Input:** every CSV in `./data/processed/balanced_datasets`  
(335 positive + 335 negative proteins each)  

**Output:** `all_model_results.csv` + accuracy/F1 plots per model


2️⃣ Imports & Folder Setup (+ optional EDA peek)

In [None]:
import os, warnings, numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# NOTE: this silences *every* warning (including sklearn convergence warnings)
# for the rest of the notebook.
warnings.filterwarnings("ignore")

# 📂 Dataset folder (one balanced CSV per dataset)
default_path = "./data/processed/balanced_datasets"
folder_path = input(f"📂 Path to dataset folder (default: {default_path}): ").strip() or default_path

# The results-summary cell writes all_model_results.csv into this same folder;
# exclude it so that a re-run does not treat the previous report as a dataset
# (it would otherwise sort first and become the EDA sample below).
all_files = sorted(
    f for f in os.listdir(folder_path)
    if f.endswith(".csv") and f != "all_model_results.csv"
)
if not all_files:
    # Fail with an explicit message instead of an IndexError on all_files[0].
    raise FileNotFoundError(f"No dataset CSV files found in {folder_path!r}")
print(f"Found {len(all_files)} dataset file(s)")

# 🔍 Optional quick EDA preview (first dataset in alphabetical order)
sample_df = pd.read_csv(os.path.join(folder_path, all_files[0]))
print("Sample dataset shape:", sample_df.shape)
print(sample_df.dtypes.value_counts())

# Correlation heat-map restricted to the first 20 numeric columns to stay readable.
plt.figure(figsize=(8,5))
sns.heatmap(sample_df.select_dtypes(include=[np.number]).iloc[:,:20].corr(),
            cmap='coolwarm', cbar=False)
plt.title("Correlation preview (first 20 numeric features)")
plt.show()


3️⃣ Cleaning function & Model definitions



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Columns deemed irrelevant after EDA
DROP_COLS = ['Gene ID', 'NumberOfDirectedEdges', 'selected', 'IsSingleNode', 'Radiality']

def clean_df(df, target_col='result_x', corr_threshold=0.9):
    """EDA-driven cleaning & feature filtering.

    Steps, in order:
      1. drop the columns listed in ``DROP_COLS`` (missing ones are ignored);
      2. replace +/-inf with the constant 1;
      3. drop columns that are entirely NaN;
      4. keep numeric columns only;
      5. among the *feature* columns, drop every column whose absolute Pearson
         correlation with an earlier feature column exceeds ``corr_threshold``.

    The target column is deliberately excluded from step 5: if it took part
    in the filter, a highly predictive feature would either be removed or
    cause the target itself to be removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. It is not modified; a new frame is returned.
    target_col : str, default 'result_x'
        Label column protected from the correlation filter. It is still
        dropped if it is non-numeric, exactly like any other column.
    corr_threshold : float, default 0.9
        Absolute correlation above which the later of two features is dropped.

    Returns
    -------
    pandas.DataFrame
        Cleaned numeric frame (features plus the target, when present).
    """
    df = df.drop(columns=DROP_COLS, errors='ignore')
    # NOTE(review): +/-inf is mapped to the constant 1, as in the original
    # code — confirm that this sentinel is the intended imputation value.
    df = df.replace([np.inf, -np.inf], 1)
    df = df.dropna(axis=1, how='all')
    df = df.select_dtypes(include=[np.number])          # keep numeric only

    # High-correlation filter |r| > corr_threshold, computed on features only.
    feature_cols = [c for c in df.columns if c != target_col]
    corr = df[feature_cols].corr().abs()
    # Strict upper triangle: each pair is inspected once and the *later*
    # column of a correlated pair is the one that gets dropped.
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    redundant = [c for c in upper.columns if (upper[c] > corr_threshold).any()]
    return df.drop(columns=redundant)

# Model dictionary: display name -> (base estimator, GridSearchCV parameter grid).
# Every estimator with a stochastic component is seeded so that repeated runs
# give the same results (previously only the random forest was seeded).
MODELS = {
    "Logistic Regression": (
        LogisticRegression(max_iter=1000),
        {'C': [0.1, 1, 10]},
    ),
    "SVM": (
        # No 'kernel' entry in the grid, so the default RBF kernel is used.
        # NOTE(review): probability=True adds an internal cross-validation to
        # every fit; the visible pipeline only calls predict() — consider
        # dropping it if predict_proba is not needed elsewhere.
        SVC(probability=True, random_state=42),
        {'C': [1, 10], 'gamma': ['scale', 'auto']},
    ),
    "KNN": (
        KNeighborsClassifier(),
        {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance']},
    ),
    "Decision Tree": (
        DecisionTreeClassifier(random_state=42),
        {'max_depth': [5, 10, 15], 'min_samples_split': [2, 5]},
    ),
    "Random Forest": (
        RandomForestClassifier(random_state=42),
        {'n_estimators': [200, 300], 'max_depth': [8, 10],
         'min_samples_split': [20, 30], 'min_samples_leaf': [15, 25],
         'max_samples': [0.7, 0.8], 'max_features': ['sqrt']},
    ),
    "Gradient Boosting": (
        GradientBoostingClassifier(random_state=42),
        {'n_estimators': [100, 200], 'learning_rate': [0.05, 0.1], 'max_depth': [3, 5]},
    ),
}


4️⃣  Main loop (over every dataset CSV & 6 models)

In [None]:
# One result row per (dataset, model): the best PCA size / hyper-parameters are
# chosen by 10-fold CV on the training split, then scored once on the test split.
overall_results = []

for file in all_files:
    print(f"\n📂 Processing {file}")
    df_raw = pd.read_csv(os.path.join(folder_path, file))
    df = clean_df(df_raw)

    if 'result_x' not in df.columns:
        print("❌ No target column 'result_x' — skipped"); continue

    X, y = df.drop(columns=['result_x']), df['result_x']
    n_features = X.shape[1]
    if n_features == 0:
        # Nothing left to scale / project; PCA would fail further down.
        print("❌ No usable feature columns after cleaning — skipped"); continue

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )

    # Scaler is fitted on the training split only and re-used for the test split.
    scaler = MinMaxScaler().fit(X_train)
    X_train_sc, X_test_sc = scaler.transform(X_train), scaler.transform(X_test)

    # PCA component sweep. PCA cannot return more components than
    # min(n_samples, n_features), so cap the sweep by the training-row count.
    max_components = min(n_features, X_train_sc.shape[0])
    step = 10 if max_components >= 10 else 1
    pc_list = list(range(step, max_components + 1, step))
    if max_components not in pc_list: pc_list.append(max_components)

    # The projections do not depend on the classifier: fit each PCA once per
    # dataset instead of once per (model, n) pair.
    pca_by_n = {}
    for n in pc_list:
        pca = PCA(n_components=n, random_state=42).fit(X_train_sc)
        pca_by_n[n] = (pca, pca.transform(X_train_sc))

    for model_name, (base_model, grid) in MODELS.items():
        print(f"   🔍 {model_name}")
        # Start from -inf so that any finite CV score (even 0.0) is accepted.
        best_cv, best_est, best_pca, best_n = -np.inf, None, None, None

        for n, (pca, Xtr_pca) in pca_by_n.items():
            grid_cv = GridSearchCV(base_model, grid, cv=10,
                                   scoring='accuracy', n_jobs=-1)
            grid_cv.fit(Xtr_pca, y_train)

            if grid_cv.best_score_ > best_cv:
                best_cv, best_est = grid_cv.best_score_, grid_cv.best_estimator_
                best_pca, best_n  = pca, n

        if best_est is None:
            # Every grid search produced a NaN score (all fits failed).
            print(f"   ⚠️ {model_name}: no valid CV score — skipped")
            continue

        # Test evaluation
        y_pred = best_est.predict(best_pca.transform(X_test_sc))

        overall_results.append({
            "Dataset"       : file,
            "Model"         : model_name,
            "PCA_Components": best_n,
            "Accuracy"      : accuracy_score(y_test, y_pred),
            "Precision"     : precision_score(y_test, y_pred, zero_division=0),
            "Recall"        : recall_score(y_test, y_pred, zero_division=0),
            "F1-Score"      : f1_score(y_test, y_pred, zero_division=0),
            "CV_Mean"       : best_cv
        })


5️⃣ Results summary & save

In [None]:
# Rank every (dataset, model) row by test F1 and persist the full table.
summary_df = pd.DataFrame(overall_results)

if summary_df.empty:
    # An empty frame has no 'F1-Score' column, so sort_values would raise KeyError.
    print("⚠️ No results to summarise — every dataset was skipped.")
else:
    summary_df = summary_df.sort_values(by='F1-Score', ascending=False)
    display(summary_df.head(20))

    # NOTE: the report is written into the dataset folder itself.
    out_csv = os.path.join(folder_path, "all_model_results.csv")
    summary_df.to_csv(out_csv, index=False)
    print(f"\n✅ Full results saved → {out_csv}")


---
##  Deep‑Learning Evaluation (Fully Connected Network)

In addition to classical ML models, we train a simple feed‑forward neural network
on **each of the 44 datasets**.  
For every dataset we:

1. Clean & scale features (same logic as before)  
2. Sweep PCA components (10 → 128)  
3. Tune a small network (neurons = 64 or 128) with 5‑fold Stratified CV  
4. Evaluate on a held‑out test split  
5. Save a combined PDF report (`dl_all_datasets_report.pdf`) with:  
   * Train/Test accuracy  
   * Best hyper‑parameters  
   * Classification report  
   * Confusion‑matrix image  
6. Identify & print the best‑performing dataset


Deep‑Learning 

In [None]:
import tempfile, traceback

from fpdf import FPDF
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2

# Hyper-parameter search space for the fully connected network.
neurons_list   = [64, 128]
batch_sizes    = [32]
epochs_list    = [50]
early_stop     = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

dl_summaries        = []      # one record per successfully processed dataset
best_dataset_score  = 0       # highest test accuracy seen so far
best_dataset_record = None    # the dl_summaries entry holding that score

pdf = FPDF()


def _build_dl_model(n_inputs, neurons):
    """Return a compiled two-hidden-layer sigmoid classifier for ``n_inputs`` features."""
    model = Sequential([
        Dense(neurons, activation='relu', input_shape=(n_inputs,), kernel_regularizer=l2(0.001)),
        Dropout(0.3),
        Dense(neurons//2, activation='relu', kernel_regularizer=l2(0.001)),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model


def _cv_accuracy(X_pca, y_arr, neurons, batch, n_epochs):
    """Return the mean 5-fold stratified validation accuracy of one configuration."""
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_scores = []
    for tr_idx, val_idx in skf.split(X_pca, y_arr):
        model = _build_dl_model(X_pca.shape[1], neurons)
        model.fit(X_pca[tr_idx], y_arr[tr_idx],
                  validation_data=(X_pca[val_idx], y_arr[val_idx]),
                  epochs=n_epochs, batch_size=batch,
                  verbose=0, callbacks=[early_stop])
        val_pred = (model.predict(X_pca[val_idx], verbose=0) > 0.5).astype(int).ravel()
        fold_scores.append(accuracy_score(y_arr[val_idx], val_pred))
    # Many short-lived models are created during the sweep; release the Keras
    # global state they accumulate before moving on to the next configuration.
    K.clear_session()
    return np.mean(fold_scores)


for file in all_files:
    try:
        print(f"\n🧠 DL Processing {file}")
        df_raw = pd.read_csv(os.path.join(folder_path, file))
        df = clean_df(df_raw.copy())

        if 'result_x' not in df.columns:
            print("❌ Skipping (missing target column)"); continue

        X, y = df.drop(columns=['result_x']), df['result_x']
        if X.shape[1] < 10:
            print("❌ Skipping (too few features for PCA)"); continue

        # Hold out the test split *before* any fitting so that the scaler, the
        # PCA and the model selection never see the rows used for the final score.
        X_tr_raw, X_ts_raw, y_tr, y_ts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
        y_tr_arr, y_ts_arr = y_tr.to_numpy(), y_ts.to_numpy()

        scaler = MinMaxScaler().fit(X_tr_raw)
        X_tr_sc = scaler.transform(X_tr_raw)
        X_ts_sc = scaler.transform(X_ts_raw)

        # PCA sweep: 10, 20, ... up to 128, capped by what PCA can extract.
        max_components = min(128, X.shape[1], X_tr_sc.shape[0])
        pca_space = list(range(10, max_components+1, 10))
        if max_components not in pca_space:
            pca_space.append(max_components)

        best_cv, best_pca, best_cfg = -np.inf, None, None

        for n_comp in pca_space:
            pca = PCA(n_components=n_comp, random_state=42).fit(X_tr_sc)
            X_tr_pca = pca.transform(X_tr_sc)

            for neurons in neurons_list:
                for batch in batch_sizes:
                    for n_epochs in epochs_list:
                        mean_cv = _cv_accuracy(X_tr_pca, y_tr_arr, neurons, batch, n_epochs)
                        if mean_cv > best_cv:
                            best_cv   = mean_cv
                            best_pca  = pca
                            best_cfg  = {"PCA": n_comp, "neurons": neurons, "batch": batch, "epochs": n_epochs}

        if best_cfg is None:
            print("❌ Skipping (no valid CV score)"); continue

        X_tr = best_pca.transform(X_tr_sc)
        X_ts = best_pca.transform(X_ts_sc)

        # Refit the winning configuration on the whole training split (the CV
        # models each saw only 4/5 of it). The last 10 % of the already
        # shuffled training rows drive early stopping.
        best_model = _build_dl_model(best_cfg["PCA"], best_cfg["neurons"])
        best_model.fit(X_tr, y_tr_arr, validation_split=0.1,
                       epochs=best_cfg["epochs"], batch_size=best_cfg["batch"],
                       verbose=0, callbacks=[early_stop])

        y_tr_pred = (best_model.predict(X_tr, verbose=0) > 0.5).astype(int).ravel()
        y_ts_pred = (best_model.predict(X_ts, verbose=0) > 0.5).astype(int).ravel()

        train_acc = accuracy_score(y_tr_arr, y_tr_pred)
        test_acc = accuracy_score(y_ts_arr, y_ts_pred)
        cls_rep  = classification_report(y_ts_arr, y_ts_pred)
        conf_mat = confusion_matrix(y_ts_arr, y_ts_pred)

        dl_summaries.append({
            "dataset"  : file,
            "train_acc": train_acc,
            "test_acc" : test_acc,
            "config"   : best_cfg,
            "report"   : cls_rep,
            "conf_mat" : conf_mat
        })

        if test_acc > best_dataset_score:
            best_dataset_score  = test_acc
            best_dataset_record = dl_summaries[-1]

        # Plot confusion matrix (cell counts are written on top of the colour map)
        fig, ax = plt.subplots(figsize=(3,3))
        ax.matshow(conf_mat, cmap="Blues", alpha=0.8)
        for i in range(conf_mat.shape[0]):
            for j in range(conf_mat.shape[1]):
                ax.text(j, i, conf_mat[i,j], va='center', ha='center')
        plt.title(f"{file} Confusion Matrix")
        plt.tight_layout()
        img_path = os.path.join(tempfile.gettempdir(), f"{file}_cm.png")
        plt.savefig(img_path); plt.close()

        # Save to PDF: one page per dataset
        pdf.add_page()
        pdf.set_font("Arial", 'B', 12)
        pdf.cell(0, 8, f"Dataset: {file}", ln=True)
        pdf.set_font("Arial", size=11)
        pdf.cell(0, 8, f"Train Acc: {train_acc:.4f} | Test Acc: {test_acc:.4f}", ln=True)
        pdf.cell(0, 8, f"Best Config: {best_cfg}", ln=True)
        pdf.ln(4)
        pdf.set_font("Courier", size=8)
        for line in cls_rep.splitlines():
            pdf.cell(0, 4, line.strip(), ln=True)
        pdf.image(img_path, x=10, w=80)

    except Exception as e:
        # Best effort: report the failure and carry on with the next dataset.
        print("❌ Error:", file, e)
        print(traceback.format_exc())

# Save full PDF
pdf_path = "dl_all_datasets_report.pdf"
pdf.output(pdf_path)
print(f"\n✅ Combined DL report saved → {pdf_path}")

if best_dataset_record:
    print("\n🏆 Best Performing Dataset:")
    print("Dataset:", best_dataset_record["dataset"])
    print("Test Accuracy:", round(best_dataset_record["test_acc"], 4))
    print("Config:", best_dataset_record["config"])
