In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    roc_curve, accuracy_score, f1_score
)
from joblib import dump

# --- Configuration ---
RANDOM_STATE = 42
PSEUDO_LABEL_DIRTY_THRESHOLD = 0.70
PSEUDO_LABEL_CLEAN_THRESHOLD = 0.30
TEST_PREDICTION_THRESHOLD = 0.5

np.random.seed(RANDOM_STATE)
sns.set(style="whitegrid")
os.makedirs("outputs", exist_ok=True)


def load_data(path):
    print(f"📥 Loading data from {path}...")
    df = pd.read_csv(path)
    feature_cols = [col for col in df.columns if col.startswith("feat_")]
    if not feature_cols:
        raise ValueError("No feature columns found starting with 'feat_'.")
    return df, feature_cols


def preprocess_known(df, feature_cols):
    df_known = df[df['label'] != 'unknown'].copy()
    if df_known.empty:
        raise ValueError("No labeled data found.")

    label_encoder = LabelEncoder()
    label_encoder.fit(['clean', 'dirty'])  # Ensuring correct order
    df_known['label_encoded'] = label_encoder.transform(df_known['label'])

    X = StandardScaler().fit_transform(df_known[feature_cols])
    y = df_known['label_encoded'].values

    return X, y, label_encoder, df_known


def train_random_forest(X, y):
    clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, class_weight='balanced')
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    results = cross_validate(clf, X, y, scoring=['accuracy', 'f1_weighted', 'roc_auc'], cv=cv, n_jobs=-1)

    print("\n📊 Cross-validation results:")
    for key in results:
        if 'test' in key:
            print(f"{key}: {np.mean(results[key]):.4f} ± {np.std(results[key]):.4f}")

    clf.fit(X, y)
    return clf


def evaluate_model(model, X, y, label_encoder, title_prefix="Known"):
    y_pred = model.predict(X)
    y_proba = model.predict_proba(X)[:, label_encoder.transform(['dirty'])[0]]

    print(f"\n📈 Evaluation on {title_prefix} Data")
    print(classification_report(y, y_pred, target_names=label_encoder.classes_))

    cm = confusion_matrix(y, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_,
                yticklabels=label_encoder.classes_)
    plt.title(f"{title_prefix} Confusion Matrix")
    plt.savefig(f"outputs/{title_prefix.lower()}_confusion_matrix.png")
    plt.clf()

    roc_auc = roc_auc_score(y, y_proba)
    fpr, tpr, _ = roc_curve(y, y_proba)
    plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.title(f"{title_prefix} ROC Curve")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend()
    plt.savefig(f"outputs/{title_prefix.lower()}_roc_curve.png")
    plt.clf()
    print(f"ROC AUC: {roc_auc:.4f}")


def pseudo_label(df, clf, scaler, feature_cols, label_encoder):
    df_unlabeled = df[df['label'] == 'unknown'].copy()
    if df_unlabeled.empty:
        print("No unlabeled data found.")
        return pd.DataFrame()

    X_unknown = scaler.transform(df_unlabeled[feature_cols])
    probs = clf.predict_proba(X_unknown)[:, label_encoder.transform(['dirty'])[0]]
    df_unlabeled['proba_dirty'] = probs

    df_unlabeled['final_label'] = 'unlabeled'
    df_unlabeled.loc[probs >= PSEUDO_LABEL_DIRTY_THRESHOLD, 'final_label'] = 'dirty'
    df_unlabeled.loc[probs <= PSEUDO_LABEL_CLEAN_THRESHOLD, 'final_label'] = 'clean'

    df_pseudo = df_unlabeled[df_unlabeled['final_label'] != 'unlabeled'].copy()
    df_pseudo[['file', 'proba_dirty', 'final_label']].to_csv("outputs/classified_unknowns.csv", index=False)
    print(f"✅ Pseudo-labeled: {len(df_pseudo)} samples")

    return df_pseudo


def final_training(df_known, df_pseudo, feature_cols, label_encoder):
    df_known = df_known[['file'] + feature_cols + ['label']]
    df_known.rename(columns={'label': 'final_label'}, inplace=True)
    df_all = pd.concat([df_known, df_pseudo[['file'] + feature_cols + ['final_label']]], ignore_index=True)

    scaler = StandardScaler()
    X = scaler.fit_transform(df_all[feature_cols])
    y = label_encoder.transform(df_all['final_label'])

    clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE, class_weight='balanced')
    clf.fit(X, y)

    print("✅ Final model trained.")
    return clf, scaler


# --- Main Pipeline ---
if __name__ == "__main__":
    df, feature_cols = load_data("features/features_train.csv")
    X_known, y_known, label_encoder, df_known = preprocess_known(df, feature_cols)
    scaler_known = StandardScaler().fit(df_known[feature_cols])
    X_known_scaled = scaler_known.transform(df_known[feature_cols])

    clf_known = train_random_forest(X_known_scaled, y_known)
    evaluate_model(clf_known, X_known_scaled, y_known, label_encoder, title_prefix="Known")

    df_pseudo = pseudo_label(df, clf_known, scaler_known, feature_cols, label_encoder)

    clf_final, scaler_final = final_training(df_known, df_pseudo, feature_cols, label_encoder)

    # Save final model and scaler
    dump(clf_final, "outputs/final_model.joblib")
    dump(scaler_final, "outputs/final_scaler.joblib")


📥 Loading data from features/features_train.csv...

📊 Cross-validation results:
test_accuracy: 0.8000 ± 0.1500
test_f1_weighted: 0.7747 ± 0.1784
test_roc_auc: 0.8750 ± 0.2201

📈 Evaluation on Known Data
              precision    recall  f1-score   support

       clean       1.00      1.00      1.00        20
       dirty       1.00      1.00      1.00        20

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

ROC AUC: 1.0000
✅ Pseudo-labeled: 99 samples


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_known.rename(columns={'label': 'final_label'}, inplace=True)


✅ Final model trained.


<Figure size 640x480 with 0 Axes>