In [None]:
import pandas as pd
import mlflow
from sklearn.model_selection import train_test_split
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, DataQualityPreset, TargetDriftPreset
from evidently import ColumnMapping

# 📥 Load and split data
df = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx", sheet_name="Data")
X = df.drop(["Personal Loan", "ID"], axis=1)
y = df["Personal Loan"]
train_df, test_df, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
train_df["Personal Loan"] = y_train
test_df["Personal Loan"] = y_test

# 📤 Load unseen data
new_df = pd.read_csv("C:/Users/Minfy.CHIRANJIBISILLA/Desktop/Risk Classification System/Building model/uploads/New Customer Bank_Personal_Loan.csv")

# 🧾 Column Mappings
num_cols = X.select_dtypes(include="number").columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

column_mapping_with_target = ColumnMapping(
    target="Personal Loan",
    numerical_features=num_cols,
    categorical_features=cat_cols
)

column_mapping_without_target = ColumnMapping(
    target=None,
    numerical_features=num_cols,
    categorical_features=cat_cols
)

# ✅ Function to log data drift scores to MLflow
def log_data_drift_to_mlflow(report: Report, prefix=""):
    report_dict = report.as_dict()
    for metric in report_dict.get("metrics", []):
        if metric.get("type") == "DatasetDriftMetric":
            result = metric.get("result", {})
            
            # 🔢 Log overall drift ratio
            drift_ratio = result.get("share_of_drifted_columns", None)
            if drift_ratio is not None:
                mlflow.log_metric(f"{prefix}_data_drift_ratio", drift_ratio)

            # 🔍 Log feature-wise drift scores
            for feature, vals in result.get("drift_by_columns", {}).items():
                stat = vals.get("statistic", {})
                score = stat.get("value", None)
                if score is not None:
                    clean_name = feature.replace(" ", "_").replace("(", "").replace(")", "")
                    mlflow.log_metric(f"{prefix}_drift_{clean_name}", score)
            break

# 🚀 MLflow setup
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("LoanDriftMonitoring")

with mlflow.start_run(run_name="evidently_drift_scores"):

    # ✅ Report 1: Train vs Test (with target drift)
    report_test = Report(metrics=[
        DataQualityPreset(),
        DataDriftPreset(),
        TargetDriftPreset(),
    ])
    report_test.run(reference_data=train_df, current_data=test_df, column_mapping=column_mapping_with_target)
    report_test.save_html("report_train_vs_test.html")
    mlflow.log_artifact("report_train_vs_test.html", artifact_path="evidently_reports")
    log_data_drift_to_mlflow(report_test, prefix="test")

    # ✅ Report 2: Old vs New (pure data drift)
    report_old_new = Report(metrics=[
        DataQualityPreset(),
        DataDriftPreset(),
    ])
    report_old_new.run(reference_data=df, current_data=new_df, column_mapping=column_mapping_without_target)
    report_old_new.save_html("report_old_vs_new.html")
    mlflow.log_artifact("report_old_vs_new.html", artifact_path="evidently_reports")
    log_data_drift_to_mlflow(report_old_new, prefix="old_new")

    # 📌 Row counts
    mlflow.log_metric("rows_train", len(train_df))
    mlflow.log_metric("rows_test", len(test_df))
    mlflow.log_metric("rows_old", len(df))
    mlflow.log_metric("rows_new", len(new_df))
    mlflow.log_metric("cc_old",)

    # 📋 Params
    mlflow.log_param("drift_tool", "evidently")
    mlflow.log_param("scenario", "train_vs_test_and_old_vs_new")

print("✅ Data drift metrics and reports logged successfully to MLflow.")


🏃 View run evidently_drift_scores at: http://127.0.0.1:5000/#/experiments/103779593599267708/runs/1e6ab2d5e0954e87ba693047cdfc90bc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/103779593599267708
✅ Data drift metrics and reports logged successfully to MLflow.


In [84]:
import pandas as pd
import numpy as np
import mlflow
import json
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset
from evidently import ColumnMapping


# ✅ Step 1: Load Data
df = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx", sheet_name="Data")
new_df = pd.read_csv("C:/Users/Minfy.CHIRANJIBISILLA/Desktop/Risk Classification System/Building model/uploads/New Customer Bank_Personal_Loan.csv")

# ✅ Step 2: Align Columns & Clean
df.columns = df.columns.str.strip()
new_df.columns = new_df.columns.str.strip()

common_cols = list(set(df.columns).intersection(new_df.columns))
df = df[common_cols].copy()
new_df = new_df[common_cols].copy()

# ✅ Step 3: Align Data Types
new_df = new_df.astype(df.dtypes.to_dict())

# ✅ Step 4: Drop rows with missing values
df = df.dropna()
new_df = new_df.dropna()

# ✅ Step 5: Drop constant columns (no variance)
constant_cols = [col for col in df.columns if df[col].nunique() <= 1 or new_df[col].nunique() <= 1]
df.drop(columns=constant_cols, inplace=True)
new_df.drop(columns=constant_cols, inplace=True)

print(f"✅ Cleaned and aligned data with {len(df.columns)} common usable columns.\n")

# ✅ Step 6: Create Column Mapping
numerical = df.select_dtypes(include='number').columns.tolist()
categorical = df.select_dtypes(include=['object', 'category']).columns.tolist()

mapping = ColumnMapping(
    target=None,
    numerical_features=numerical,
    categorical_features=categorical
)

# ✅ Step 7: Run Evidently Drift Report
report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=df, current_data=new_df, column_mapping=mapping)

report_dict = report.as_dict()

# ✅ Step 8: Extract Drift Scores
drift_result = next(
    (metric["result"] for metric in report_dict["metrics"] if "drift_by_columns" in metric.get("result", {})),
    None
)

if drift_result is None:
    raise ValueError("❌ Could not find drift scores in the report.")

# ✅ Step 9: Display Overall Drift Score
drift_ratio = drift_result.get("share_of_drifted_columns", 0.0)
print(f"🔄 Overall Drift Detected in: {drift_ratio:.2%} of features.\n")

# ✅ Step 10: Feature-wise Drift Table
drift_data = []
for feature, vals in drift_result["drift_by_columns"].items():
    stat = vals.get("statistic", {})
    score = stat.get("value", None)
    reason = None
    if score is None:
        if df[feature].nunique() <= 1 or new_df[feature].nunique() <= 1:
            reason = "Constant or single unique value"
        elif df[feature].isnull().sum() > 0 or new_df[feature].isnull().sum() > 0:
            reason = "Missing values"
        elif df[feature].dtype != new_df[feature].dtype:
            reason = "Data type mismatch"
        else:
            reason = "Statistical test not applicable"
    drift_data.append({
        "Feature": feature,
        "Drift Detected": vals.get("drift_detected"),
        "Drift Score": round(score, 4) if score is not None else None,
        "p-value": round(stat.get("p_value", 0), 4) if "p_value" in stat else None,
        "Threshold": round(stat.get("threshold", 0), 4) if "threshold" in stat else None,
        "Type": vals.get("feature_type"),
        "Reason If None": reason
    })

df_drift = pd.DataFrame(drift_data)
print("📊 Drift Table:")
print(df_drift)

✅ Cleaned and aligned data with 11 common usable columns.

🔄 Overall Drift Detected in: 90.91% of features.

📊 Drift Table:
       Feature  Drift Detected Drift Score p-value Threshold  Type  \
0          Age            True        None    None      None  None   
1        CCAvg            True        None    None      None  None   
2   CreditCard            True        None    None      None  None   
3    Education            True        None    None      None  None   
4   Experience            True        None    None      None  None   
5       Family            True        None    None      None  None   
6           ID            True        None    None      None  None   
7       Income            True        None    None      None  None   
8     Mortgage            True        None    None      None  None   
9       Online           False        None    None      None  None   
10    ZIP Code            True        None    None      None  None   

                     Reason If None

In [88]:
import pandas as pd
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset, DataQualityPreset
from evidently import ColumnMapping

# ✅ Load data
df = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx", sheet_name="Data")
new_df = pd.read_csv("C:/Users/Minfy.CHIRANJIBISILLA/Desktop/Risk Classification System/Building model/uploads/New Customer Bank_Personal_Loan.csv")
common_cols = list(set(df.columns).intersection(new_df.columns))
df_aligned = df[common_cols].copy()
new_df_aligned = new_df[common_cols].copy()
mapping = ColumnMapping(
    target=None,
    numerical_features=[col for col in common_cols if df[col].dtype in ["int64", "float64"]],
    categorical_features=[col for col in common_cols if df[col].dtype == "object"]
)
report_old_new.run(reference_data=df_aligned, current_data=new_df_aligned, column_mapping=mapping)
report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=df_aligned, current_data=new_df_aligned, column_mapping=mapping)

# Then extract drift scores from report.as_dict()
report_dict = report.as_dict()  # ← FIXED: this was report.as_dict() not report.as_dict (missing parentheses)
drift_result = None
for metric in report_dict.get("metrics", []):
    if "drift_by_columns" in metric.get("result", {}):
        drift_result = metric.get("result")
        break
if drift_result is None:
    raise ValueError("❌ Could not find 'drift_by_columns' in any metric result.")
drift_ratio = drift_result.get("share_of_drifted_columns", 0.0)
print(f"\n🔄 Overall Drift Score (Ratio): {drift_ratio:.2%}")
drift_data = []
for feature, values in drift_result["drift_by_columns"].items():
    score = values.get("statistic", {}).get("value", None)
    drifted = values.get("drift_detected", None)
    drift_data.append({
        "Feature": feature,
        "Drift Score": round(score, 4) if score is not None else None,
        "Drift Detected": drifted
    })

import json
print(json.dumps(report.as_dict(), indent=2))


🔄 Overall Drift Score (Ratio): 92.31%
{
  "metrics": [
    {
      "metric": "DatasetDriftMetric",
      "result": {
        "drift_share": 0.5,
        "number_of_columns": 13,
        "number_of_drifted_columns": 12,
        "share_of_drifted_columns": 0.9230769230769231,
        "dataset_drift": true
      }
    },
    {
      "metric": "DataDriftTable",
      "result": {
        "number_of_columns": 13,
        "number_of_drifted_columns": 12,
        "share_of_drifted_columns": 0.9230769230769231,
        "dataset_drift": true,
        "drift_by_columns": {
          "Age": {
            "column_name": "Age",
            "column_type": "num",
            "stattest_name": "Wasserstein distance (normed)",
            "stattest_threshold": 0.1,
            "drift_score": 0.6921177228051254,
            "drift_detected": true,
            "current": {
              "small_distribution": {
                "x": [
                  34.0,
                  37.2,
                  40.4,
 

In [82]:
# ✅ Start a new MLflow run
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("DriftFromJSON")

with mlflow.start_run(run_name="log_drift_from_json"):

    # ✅ Log overall drift
    mlflow.log_metric("old_new_drift_score", drift_ratio)

    # ✅ Log per-feature drift
    drift_by_columns = drift_result.get("drift_by_columns", {})
    for feature, result in drift_by_columns.items():
        stat_value = result.get("statistic", {}).get("value")
        if stat_value is not None:
            clean_name = feature.replace(" ", "_").replace("(", "").replace(")", "")
            mlflow.log_metric(f"drift_{clean_name}", round(stat_value, 4))

    # ✅ Optional tag for metadata
    mlflow.set_tag("source", "json_drift_report")
    mlflow.set_tag("drift_columns", sum(1 for r in drift_data if r["Drift Detected"]))


🏃 View run log_drift_from_json at: http://127.0.0.1:5000/#/experiments/873159635592907096/runs/74e3cfe757c049dbaf777ea1ef8c1ed6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/873159635592907096


In [90]:
with mlflow.start_run(run_name="drift_log_via_DataDriftTable"):

    # Save HTML, CSV, etc...
    

    # Log metrics
    drift_result = next(
        (m["result"] for m in report_dict["metrics"] if m.get("metric") == "DataDriftTable"),
        None
    )
    if drift_result:
        mlflow.log_metric("datadrift_overall_ratio", drift_result["share_of_drifted_columns"])
        for feature, vals in drift_result["drift_by_columns"].items():
            score = vals.get("drift_score")
            if score is not None:
                name = feature.replace(" ", "_").replace("(", "").replace(")", "")
                mlflow.log_metric(f"datadrift_{name}", round(score, 4))


🏃 View run drift_log_via_DataDriftTable at: http://127.0.0.1:5000/#/experiments/873159635592907096/runs/2d943b2defca4f0da6d62b3b8a1503be
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/873159635592907096
