In [1]:
#!/usr/bin/env python3


import os
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from imblearn.under_sampling import RandomUnderSampler
from scipy.stats import randint as sp_randint

# ---------------------------
# CONFIG
# ---------------------------
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")
plt.rcParams["figure.dpi"] = 120
N_JOBS = -1  # change if you want to limit parallelism
RANDOM_STATE = 42
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Put your dataset files here (update paths as needed)
DATASET_FILES = [
    "DATASET/ant-1.7.csv",
    "DATASET/camel-1.0.csv",
    "ghprdata/ghprdata.csv",
    # add more paths if you want
]

# ---------------------------
# UTIL FUNCTIONS
# ---------------------------
def calculate_special_metrics(y_true, y_pred):
    """Calculate PD (recall), PF (false positive rate), Specificity and G-measure"""
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        return {"PD": np.nan, "PF": np.nan, "Specificity": np.nan, "G": np.nan}
    TN, FP, FN, TP = cm.ravel()
    PD = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    PF = FP / (FP + TN) if (FP + TN) > 0 else 0.0
    Specificity = TN / (FP + TN) if (FP + TN) > 0 else 0.0
    G = np.sqrt(PD * Specificity)
    return {"PD": PD, "PF": PF, "Specificity": Specificity, "G": G}

def identify_features_and_label(df):
    """Automatically choose label column if possible (contains 'defect', 'bug', or 'label') else last column"""
    candidates = [c for c in df.columns if any(k in c.lower() for k in ("defect", "bug", "label"))]
    label_col = candidates[0] if candidates else df.columns[-1]
    y = df[label_col]
    X = df.drop(columns=[label_col])
    # if label is multi-valued numeric, convert to binary (0 clean, >0 defective)
    if y.dtype.kind in "fi" and len(y.unique()) > 2:
        y = (y > 0).astype(int)
    # if label is string-like, try mapping
    if y.dtype == object:
        y = y.astype(str).str.lower().map(lambda s: 1 if s in ("yes", "true", "defect", "bug", "1") else 0)
        y = y.fillna(0).astype(int)
    return X, y, label_col

def safe_savefig(fig, fname):
    path = OUTPUT_DIR / fname
    fig.tight_layout()
    fig.savefig(path)
    plt.close(fig)

# ---------------------------
# EDA FUNCTIONS
# ---------------------------
def plot_target_distribution(y, project_name):
    fig, ax = plt.subplots(figsize=(5, 4))
    counts = y.value_counts()
    labels = [f"{idx} ({cnt})" for idx, cnt in zip(counts.index.astype(str), counts.values)]
    ax.pie(counts.values, labels=labels, autopct="%1.1f%%", startangle=90)
    ax.set_title(f"Target Distribution: {project_name}")
    safe_savefig(fig, f"{project_name}_target_distribution.png")

def save_summary_stats(df, project_name):
    desc = df.describe(include='all').transpose()
    desc_path = OUTPUT_DIR / f"{project_name}_summary_stats.csv"
    desc.to_csv(desc_path)
    return desc

def plot_correlation_heatmap(df, project_name, max_features=40):
    # limit features for visual clarity
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        return
    if numeric.shape[1] > max_features:
        numeric = numeric.iloc[:, :max_features]
    corr = numeric.corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, annot=False, cmap="coolwarm", center=0, ax=ax)
    ax.set_title(f"Feature Correlation Heatmap: {project_name}")
    safe_savefig(fig, f"{project_name}_correlation_heatmap.png")

def plot_feature_histograms(df, project_name, ncols=3, max_plots=12):
    numeric = df.select_dtypes(include=[np.number])
    cols = numeric.columns.tolist()[:max_plots]
    n = len(cols)
    if n == 0:
        return
    nrows = int(np.ceil(n / ncols))
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*4, nrows*3))
    axes = axes.flatten()
    for i, col in enumerate(cols):
        sns.histplot(numeric[col].dropna(), ax=axes[i], kde=False)
        axes[i].set_title(col)
    for j in range(i+1, len(axes)):
        fig.delaxes(axes[j])
    fig.suptitle(f"Top {len(cols)} Numeric Feature Histograms: {project_name}")
    safe_savefig(fig, f"{project_name}_feature_histograms.png")

# ---------------------------
# MODELING PIPELINE
# ---------------------------
def run_project_pipeline(file_path, do_eda=True):
    project_name = Path(file_path).stem
    print(f"\n=== Processing project: {project_name} ===")

    if not Path(file_path).exists():
        print(f"File not found: {file_path}. Skipping.")
        return []

    df = pd.read_csv(file_path)
    # drop columns with >30% missing
    df = df.loc[:, df.isnull().mean() < 0.3]
    # fill numeric na with median
    df = df.fillna(df.median(numeric_only=True))
    # encode categorical columns
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    if cat_cols:
        df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

    # identify features and label
    X, y, label_col = identify_features_and_label(df)
    print(f"Identified label column: '{label_col}'")
    print(f"Original dataset rows: {len(df)}; features: {X.shape[1]}")

    # EDA
    if do_eda:
        try:
            plot_target_distribution(y, project_name)
            save_summary_stats(df, project_name)
            plot_correlation_heatmap(df, project_name)
            plot_feature_histograms(df, project_name)
            print("EDA artifacts saved.")
        except Exception as e:
            print("EDA failed:", e)

    # Verify binary
    if len(y.unique()) < 2 or y.value_counts().min() < 2:
        print("Target is not binary or too few positive samples. Skipping project.")
        return []

    # print distribution
    counts = y.value_counts()
    print("Target counts:\n", counts.to_dict())

    # split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
    )

    # scale numeric features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # undersample majority class
    rus = RandomUnderSampler(random_state=RANDOM_STATE)
    X_train_res, y_train_res = rus.fit_resample(X_train_scaled, y_train)
    print(f"Resampled training size: {len(y_train_res)} (pos:{sum(y_train_res)}, neg:{len(y_train_res)-sum(y_train_res)})")

    # set cv folds based on min samples
    min_samples = y_train_res.value_counts().min()
    cv_folds = max(2, min(min_samples, 5))
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=RANDOM_STATE)

    # models
    models = {
        "NaiveBayes": GaussianNB(),
        "LogisticRegression": LogisticRegression(max_iter=1500, solver="saga", random_state=RANDOM_STATE, n_jobs=N_JOBS),
        "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=N_JOBS),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=RANDOM_STATE, n_jobs=N_JOBS)
    }

    param_dist = {
        "RandomForest": {
            "n_estimators": sp_randint(100, 400),
            "max_depth": [3, 5, 10, None],
            "min_samples_split": [2, 5, 10]
        },
        "XGBoost": {
            "n_estimators": sp_randint(100, 400),
            "learning_rate": [0.01, 0.05, 0.1],
            "max_depth": sp_randint(3, 8)
        }
    }

    project_results = []
    roc_data_for_plots = []

    for name, model in models.items():
        print(f"\n-- Training {name} --")
        best_model = model
        # hyperparameter search for tree methods
        if name in param_dist:
            search = RandomizedSearchCV(
                estimator=model,
                param_distributions=param_dist[name],
                n_iter=6,
                scoring="roc_auc",
                cv=cv,
                random_state=RANDOM_STATE,
                n_jobs=N_JOBS,
                verbose=0
            )
            try:
                search.fit(X_train_res, y_train_res)
                best_model = search.best_estimator_
                print(f"Best params for {name}: {search.best_params_}")
            except Exception as e:
                print(f"Hyperparam tuning failed for {name} (will use default). Error: {e}")
                best_model = model
                best_model.fit(X_train_res, y_train_res)
        else:
            best_model.fit(X_train_res, y_train_res)

        # predict and probs
        y_pred = best_model.predict(X_test_scaled)
        y_prob = best_model.predict_proba(X_test_scaled)[:, 1] if hasattr(best_model, "predict_proba") else np.zeros(len(y_test))

        # metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        f1 = f1_score(y_test, y_pred, zero_division=0)
        auc = roc_auc_score(y_test, y_prob) if len(np.unique(y_test)) == 2 and not np.all(y_prob == 0) else np.nan
        special = calculate_special_metrics(y_test, y_pred)

        project_results.append({
            "Project": project_name,
            "Model": name,
            "Accuracy": acc,
            "Precision": prec,
            "Recall": rec,
            "F1": f1,
            "AUC": auc,
            "PD": special["PD"],
            "PF": special["PF"],
            "G-Measure": special["G"]
        })

        # save classification report
        report = classification_report(y_test, y_pred, zero_division=0, output_dict=True)
        pd.DataFrame(report).transpose().to_csv(OUTPUT_DIR / f"{project_name}_{name}_classification_report.csv")

        # ROC data
        try:
            fpr, tpr, _ = roc_curve(y_test, y_prob)
            roc_data_for_plots.append((name, fpr, tpr, auc))
        except Exception:
            pass

        # confusion matrix plot
        try:
            cm = confusion_matrix(y_test, y_pred)
            fig, ax = plt.subplots(figsize=(4, 3))
            sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title(f"{project_name} - {name} Confusion Matrix")
            safe_savefig(fig, f"{project_name}_{name}_confusion_matrix.png")
        except Exception as e:
            print("Could not plot confusion matrix:", e)

        # feature importance for tree-based
        if hasattr(best_model, "feature_importances_"):
            try:
                importances = best_model.feature_importances_
                fi = pd.Series(importances, index=X.columns).sort_values(ascending=False).head(20)
                fi_df = fi.reset_index()
                fi_df.columns = ["Feature", "Importance"]
                fi_df.to_csv(OUTPUT_DIR / f"{project_name}_{name}_feature_importance.csv", index=False)

                fig, ax = plt.subplots(figsize=(6, 6))
                sns.barplot(x="Importance", y="Feature", data=fi_df, ax=ax)
                ax.set_title(f"{project_name} - {name} Top Features")
                safe_savefig(fig, f"{project_name}_{name}_feature_importance.png")
            except Exception as e:
                print("Feature importance plot failed:", e)

    # ROC combined plot
    if roc_data_for_plots:
        fig, ax = plt.subplots(figsize=(6, 5))
        for name, fpr, tpr, auc in roc_data_for_plots:
            ax.plot(fpr, tpr, label=f"{name} (AUC={auc:.3f})")
        ax.plot([0, 1], [0, 1], linestyle="--", color="grey")
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title(f"{project_name} - ROC Curves")
        ax.legend()
        safe_savefig(fig, f"{project_name}_roc_curves.png")

    return project_results

# ---------------------------
# MAIN: process all dataset files
# ---------------------------
def main():
    all_results = []
    for f in DATASET_FILES:
        try:
            res = run_project_pipeline(f, do_eda=True)
            if res:
                all_results.extend(res)
        except Exception as e:
            print(f"Error processing {f}: {e}")

    # consolidated results table
    if all_results:
        df_results = pd.DataFrame(all_results)
        df_results = df_results.sort_values(["Project", "AUC"], ascending=[True, False])
        df_results.to_csv(OUTPUT_DIR / "consolidated_model_results.csv", index=False)
        print("\n=== Summary of results saved to outputs/consolidated_model_results.csv ===")
        print(df_results.to_string(index=False))
    else:
        print("No results were produced. Check dataset paths and formats.")

    # Print rubric checklist summary to console
    rubric = [
        ("Problem Statement & Importance", True),
        ("Literature Review", "User must add slides/references"),
        ("Dataset Description (names, sizes, features)", "Partially - summary stats saved"),
        ("Target Distribution & Summary Stats", True),
        ("Preprocessing & Modeling", True),
        ("Data Visualizations (correlation, histograms)", True),
        ("Evaluation Metrics & Full Results (F1, AUC, confusion matrix)", True),
        ("Feature Importance & Ablation (if applicable)", "Top features saved for tree models"),
        ("Slide formatting (title, name, affiliation, numbers)", "Not auto-generated - optional")
    ]
    print("\nRubric-by-Rubric Quick Checklist:")
    for k, v in rubric:
        print(f"- {k}: {v}")

    print("\nAll artifacts (CSV + PNG) are in the 'outputs/' folder.")

if __name__ == "__main__":
    main()



=== Processing project: ant-1.7 ===
Identified label column: 'bug'
Original dataset rows: 745; features: 765


EDA artifacts saved.
Target counts:
 {0: 579, 1: 166}
Resampled training size: 266 (pos:133, neg:133)

-- Training NaiveBayes --



-- Training LogisticRegression --



-- Training RandomForest --


Best params for RandomForest: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 187}



-- Training XGBoost --


Best params for XGBoost: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 251}



=== Processing project: camel-1.0 ===
Identified label column: 'bug'
Original dataset rows: 339; features: 359


EDA artifacts saved.
Target counts:
 {0: 326, 1: 13}
Resampled training size: 20 (pos:10, neg:10)

-- Training NaiveBayes --

-- Training LogisticRegression --



-- Training RandomForest --


Best params for RandomForest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 370}



-- Training XGBoost --


Best params for XGBoost: {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 251}



=== Processing project: ghprdata ===
Identified label column: 'version https://git-lfs.github.com/spec/v1_size 140138260'
Original dataset rows: 2; features: 0
EDA artifacts saved.
Target is not binary or too few positive samples. Skipping project.

=== Summary of results saved to outputs/consolidated_model_results.csv ===
  Project              Model  Accuracy  Precision   Recall       F1      AUC       PD       PF  G-Measure
  ant-1.7       RandomForest  0.731544   0.440678 0.787879 0.565217 0.835946 0.787879 0.284483   0.750827
  ant-1.7 LogisticRegression  0.744966   0.450980 0.696970 0.547619 0.813741 0.696970 0.241379   0.727142
  ant-1.7         NaiveBayes  0.671141   0.378788 0.757576 0.505051 0.769723 0.757576 0.353448   0.699866
  ant-1.7            XGBoost  0.697987   0.392857 0.666667 0.494382 0.760188 0.666667 0.293103   0.686487
camel-1.0       RandomForest  0.632353   0.107143 1.000000 0.193548 0.856410 1.000000 0.384615   0.784465
camel-1.0 LogisticRegression  0.661765