In [1]:
import os
import re
import json
import mlflow
import pandas as pd
from evidently import Report
from evidently.presets import DataDriftPreset, DataSummaryPreset

# === File Paths ===
REFERENCE_DATA_PATH = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\raw_data\Lead Scoring.csv"
NEW_DATA_PATH = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\raw_data\check.csv"
DRIFT_REPORT_PATH = "drift_report.html"
SUMMARY_REPORT_PATH = "summary_report.html"
DRIFT_FLAG_PATH = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\evidently\drift_flag.txt"

# === Threshold ===
DRIFT_THRESHOLD = 0.3

# === Helper ===
def clean_metric_name(name):
    return re.sub(r"[^\w\-/\. ]", "_", name)

# === Load Data ===
reference_df = pd.read_csv(REFERENCE_DATA_PATH)
new_df = pd.read_csv(NEW_DATA_PATH)

# Ensure consistent columns
common_cols = list(set(reference_df.columns) & set(new_df.columns))
reference_df = reference_df[common_cols]
new_df = new_df[common_cols]

# === Drop single-valued columns (they break drift detection) ===
filtered_cols = [
    col for col in common_cols
    if reference_df[col].nunique(dropna=False) > 1 and new_df[col].nunique(dropna=False) > 1
]
reference_df = reference_df[filtered_cols]
new_df = new_df[filtered_cols]

# === Initialize MLflow ===
mlflow.set_tracking_uri("http://mlflow:5000")
mlflow.set_experiment("evidently_data_drift")

drift_detected = False

with mlflow.start_run(run_name="lead-scoring-drift-check"):
    for report_type, preset in [("drift", DataDriftPreset()), ("summary", DataSummaryPreset())]:
        report = Report(metrics=[preset])
        try:
            report.run(reference_data=reference_df, current_data=new_df)
            report_path = DRIFT_REPORT_PATH if report_type == "drift" else SUMMARY_REPORT_PATH
            report.save_html(report_path)
            mlflow.log_artifact(report_path)

            report_json = report.as_dict()

            for metric in report_json.get("metrics", []):
                metric_id = metric.get("metric_id") or metric.get("metric", "")
                value = metric.get("value", None)

                if isinstance(value, dict):
                    for sub_name, sub_val in value.items():
                        if isinstance(sub_val, (int, float)):
                            metric_name = clean_metric_name(f"{report_type}_{metric_id}_{sub_name}")
                            mlflow.log_metric(metric_name, sub_val)
                            if "drift" in sub_name and sub_val > DRIFT_THRESHOLD:
                                drift_detected = True
                elif isinstance(value, (int, float)):
                    metric_name = clean_metric_name(f"{report_type}_{metric_id}")
                    mlflow.log_metric(metric_name, value)
                    if "drift" in metric_name and value > DRIFT_THRESHOLD:
                        drift_detected = True
                elif "ValueDrift(column=" in metric_id:
                    try:
                        col_name = metric_id.split("ValueDrift(column=")[1].split(",")[0]
                        metric_name = clean_metric_name(f"{report_type}_{col_name}")
                        mlflow.log_metric(metric_name, value)
                        if isinstance(value, float) and value > DRIFT_THRESHOLD:
                            drift_detected = True
                    except Exception as e:
                        print(f"⚠️ Could not parse metric ID: {metric_id} — {e}")
        except Exception as e:
            print(f"❌ Failed to generate {report_type} report: {e}")

# === Save Drift Flag ===
with open(DRIFT_FLAG_PATH, "w") as f:
    f.write("drift_detected: " + str(drift_detected))

print("✅ Drift check complete.")
print(f"📄 Drift flag written to: {DRIFT_FLAG_PATH}")
if drift_detected:
    print("⚠️ Drift detected! Check MLflow and reports.")
else:
    print("✅ No significant drift found.")


KeyboardInterrupt: 

In [3]:
import os
import json
import re
import mlflow
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import train_test_split
from evidently import Report
from evidently.presets import DataDriftPreset, DataSummaryPreset

# === Configuration ===
# File Paths
REFERENCE_DATA_PATH = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\raw_data\Lead Scoring.csv"
NEW_DATA_PATH = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\raw_data\check.csv"
DRIFT_REPORT_PATH = "drift_report.html"
SUMMARY_REPORT_PATH = "summary_report.html"
DRIFT_FLAG_PATH = r"C:\Users\Minfy.DESKTOP-3E50D5N\Desktop\final_capstone\evidently\drift_flag.txt"

# MLflow Configuration
MLFLOW_TRACKING_URI = "http://localhost:5000"  # Adjust as needed
MLFLOW_EXPERIMENT_NAME = "drift_detection_experiment"

# Drift Detection Parameters
DRIFT_THRESHOLD = 0.3  # PSI threshold for drift detection
DRIFT_METHODS = ['psi', 'ks', 'chisquare', 'wasserstein']  # Multiple drift detection methods
NUMERICAL_FEATURES = []  # Will be auto-detected
CATEGORICAL_FEATURES = []  # Will be auto-detected

# === Utility Functions ===
def clean_metric_name(name):
    """Clean metric names to be MLflow-friendly by removing unsupported characters."""
    return re.sub(r"[^\w\-/\. ]", "_", name)

def setup_mlflow():
    """Initialize MLflow tracking."""
    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
    return mlflow.start_run()

def load_and_validate_data(reference_path, new_path):
    """Load and validate reference and new datasets."""
    try:
        reference_df = pd.read_csv(reference_path)
        new_df = pd.read_csv(new_path)
        
        print(f"Reference data shape: {reference_df.shape}")
        print(f"New data shape: {new_df.shape}")
        
        # Log basic dataset info
        mlflow.log_param("reference_data_rows", reference_df.shape[0])
        mlflow.log_param("reference_data_cols", reference_df.shape[1])
        mlflow.log_param("new_data_rows", new_df.shape[0])
        mlflow.log_param("new_data_cols", new_df.shape[1])
        
        return reference_df, new_df
    except Exception as e:
        print(f"Error loading data: {e}")
        raise

def identify_feature_types(df):
    """Automatically identify numerical and categorical features."""
    numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    
    print(f"Numerical features: {numerical_features}")
    print(f"Categorical features: {categorical_features}")
    
    return numerical_features, categorical_features

def log_basic_statistics(reference_df, new_df, prefix=""):
    """Log basic statistics for both datasets."""
    stats_metrics = {}
    
    # Reference dataset statistics
    ref_stats = reference_df.describe()
    for col in ref_stats.columns:
        for stat in ref_stats.index:
            metric_name = clean_metric_name(f"{prefix}reference_{col}_{stat}")
            value = ref_stats.loc[stat, col]
            if pd.notna(value):
                stats_metrics[metric_name] = float(value)
    
    # New dataset statistics
    new_stats = new_df.describe()
    for col in new_stats.columns:
        for stat in new_stats.index:
            metric_name = clean_metric_name(f"{prefix}new_{col}_{stat}")
            value = new_stats.loc[stat, col]
            if pd.notna(value):
                stats_metrics[metric_name] = float(value)
    
    # Log missing values
    ref_missing = reference_df.isnull().sum()
    new_missing = new_df.isnull().sum()
    
    for col in reference_df.columns:
        stats_metrics[clean_metric_name(f"{prefix}reference_{col}_missing_count")] = int(ref_missing[col])
        stats_metrics[clean_metric_name(f"{prefix}reference_{col}_missing_percentage")] = float(ref_missing[col] / len(reference_df) * 100)
    
    for col in new_df.columns:
        stats_metrics[clean_metric_name(f"{prefix}new_{col}_missing_count")] = int(new_missing[col])
        stats_metrics[clean_metric_name(f"{prefix}new_{col}_missing_percentage")] = float(new_missing[col] / len(new_df) * 100)
    
    # Log all statistics
    for metric_name, value in stats_metrics.items():
        mlflow.log_metric(metric_name, value)
    
    return stats_metrics

def log_evidently_reports(reference_df, new_df, method='psi'):
    """
    This function runs Evidently data drift and summary reports,
    logs them to MLflow, and detects if drift exceeds threshold
    """
    # Define types of reports to generate
    report_configs = [
        ("drift", DataDriftPreset(method=method)),  # Drift detection
        ("summary", DataSummaryPreset())            # Summary report
    ]
    
    drift_found = False  # Flag to track drift detection
    
    # Ensure common columns
    common_cols = reference_df.columns.intersection(new_df.columns)
    ref_df_clean = reference_df[common_cols].copy()
    new_df_clean = new_df[common_cols].copy()
    
    for report_type, preset in report_configs:
        report = Report([preset], include_tests=True)
        
        try:
            # Run the Evidently report
            result = report.run(reference_data=ref_df_clean, current_data=new_df_clean)
            
            # Save report as HTML and log to MLflow
            html_path = f"evidently_{report_type}_{method}.html"
            result.save_html(html_path)
            mlflow.log_artifact(html_path)
            
            # Convert report to JSON to extract drift metrics
            json_data = json.loads(result.json())
            
            # Loop through all reported metrics
            for metric in json_data.get("metrics", []):
                metric_id = metric.get("metric_id") or metric.get("metric", "")
                value = metric.get("value", None)
                
                # If metric value is a dictionary (contains sub-metrics)
                if isinstance(value, dict):
                    for sub_name, sub_val in value.items():
                        if isinstance(sub_val, (int, float)):
                            metric_name = clean_metric_name(f"{method}_{report_type}_{metric_id}_{sub_name}")
                            mlflow.log_metric(metric_name, sub_val)
                            # Check if drift is found
                            if "drift" in sub_name.lower() and sub_val > DRIFT_THRESHOLD:
                                drift_found = True
                
                # If metric is a single float or int
                elif isinstance(value, (int, float)):
                    metric_name = clean_metric_name(f"{method}_{report_type}_{metric_id}")
                    mlflow.log_metric(metric_name, value)
                    if "drift" in metric_name.lower() and value > DRIFT_THRESHOLD:
                        drift_found = True
                
                # Special handling for column-wise drift
                elif "ValueDrift(column=" in str(metric_id):
                    try:
                        col_name = str(metric_id).split("ValueDrift(column=")[1].split(",")[0]
                        if isinstance(value, (int, float)):
                            metric_name = clean_metric_name(f"{method}_{report_type}_{col_name}_drift")
                            mlflow.log_metric(metric_name, value)
                            if value > DRIFT_THRESHOLD:
                                drift_found = True
                    except Exception as e:
                        print(f"Could not parse drift metric: {metric_id} — {e}")
                
        except Exception as e:
            print(f"Evidently report failed for {report_type} - {method}: {e}")
    
    return drift_found

def save_drift_flag(drift_detected, additional_info=None):
    """Save drift detection result to file."""
    try:
        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(DRIFT_FLAG_PATH), exist_ok=True)
        
        drift_info = {
            "drift_detected": drift_detected,
            "timestamp": datetime.now().isoformat(),
            "drift_threshold": DRIFT_THRESHOLD,
            "additional_info": additional_info or {}
        }
        
        with open(DRIFT_FLAG_PATH, "w") as f:
            json.dump(drift_info, f, indent=2)
        
        print(f"Drift flag saved to: {DRIFT_FLAG_PATH}")
        
    except Exception as e:
        print(f"Error saving drift flag: {e}")

def main():
    """Main function to run comprehensive drift detection."""
    print("Starting comprehensive drift detection...")
    
    # Setup MLflow
    with setup_mlflow():
        try:
            # Load and validate data
            reference_df, new_df = load_and_validate_data(REFERENCE_DATA_PATH, NEW_DATA_PATH)
            
            # Identify feature types
            numerical_features, categorical_features = identify_feature_types(reference_df)
            
            # Log feature information
            mlflow.log_param("numerical_features", str(numerical_features))
            mlflow.log_param("categorical_features", str(categorical_features))
            mlflow.log_param("drift_threshold", DRIFT_THRESHOLD)
            mlflow.log_param("drift_methods", str(DRIFT_METHODS))
            
            # Log basic statistics
            print("Logging basic statistics...")
            log_basic_statistics(reference_df, new_df, prefix="basic_stats_")
            
            # Track drift detection across multiple methods
            drift_detected_overall = False
            drift_results = {}
            
            # Generate reports for each drift detection method
            for method in DRIFT_METHODS:
                print(f"Generating drift report using {method} method...")
                
                try:
                    # Generate drift report using the log_evidently_reports function
                    drift_detected = log_evidently_reports(reference_df, new_df, method=method)
                    drift_results[method] = drift_detected
                    
                    if drift_detected:
                        drift_detected_overall = True
                    
                    print(f"Drift detection using {method}: {'DETECTED' if drift_detected else 'NOT DETECTED'}")
                    
                except Exception as e:
                    print(f"Error with {method} method: {e}")
                    drift_results[method] = False
            
            # Log overall drift detection result
            mlflow.log_metric("drift_detected_overall", 1.0 if drift_detected_overall else 0.0)
            
            # Log individual method results
            for method, detected in drift_results.items():
                mlflow.log_metric(f"drift_detected_{method}", 1.0 if detected else 0.0)
            
            # Write final drift detection result to a local file (following your reference code pattern)
            drift_info = {
                "drift_detected": drift_detected_overall,
                "timestamp": datetime.now().isoformat(),
                "drift_threshold": DRIFT_THRESHOLD,
                "methods_used": DRIFT_METHODS,
                "individual_results": drift_results,
                "reference_data_shape": reference_df.shape,
                "new_data_shape": new_df.shape
            }
            
            # Create directory if it doesn't exist
            os.makedirs(os.path.dirname(DRIFT_FLAG_PATH), exist_ok=True)
            
            with open(DRIFT_FLAG_PATH, "w") as f:
                json.dump(drift_info, f, indent=2)
            
            print(f"\nDrift Detection Summary:")
            print(f"Overall drift detected: {'YES' if drift_detected_overall else 'NO'}")
            print(f"Individual method results: {drift_results}")
            print(f"Reports saved and logged to MLflow")
            print(f"Drift flag saved to: {DRIFT_FLAG_PATH}")
            
        except Exception as e:
            print(f"Error in main execution: {e}")
            mlflow.log_param("error", str(e))
            raise

if __name__ == "__main__":
    main()

Starting comprehensive drift detection...


2025/07/18 13:30:43 INFO mlflow.tracking.fluent: Experiment with name 'drift_detection_experiment' does not exist. Creating a new experiment.


Reference data shape: (9240, 37)
New data shape: (5, 37)
Numerical features: ['Lead Number', 'Converted', 'TotalVisits', 'Total Time Spent on Website', 'Page Views Per Visit', 'Asymmetrique Activity Score', 'Asymmetrique Profile Score']
Categorical features: ['Prospect ID', 'Lead Origin', 'Lead Source', 'Do Not Email', 'Do Not Call', 'Last Activity', 'Country', 'Specialization', 'How did you hear about X Education', 'What is your current occupation', 'What matters most to you in choosing a course', 'Search', 'Magazine', 'Newspaper Article', 'X Education Forums', 'Newspaper', 'Digital Advertisement', 'Through Recommendations', 'Receive More Updates About Our Courses', 'Tags', 'Lead Quality', 'Update me on Supply Chain Content', 'Get updates on DM Content', 'Lead Profile', 'City', 'Asymmetrique Activity Index', 'Asymmetrique Profile Index', 'I agree to pay the amount through cheque', 'A free copy of Mastering The Interview', 'Last Notable Activity']
Logging basic statistics...
Generating


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide


divide by zero encountered in divide



Evidently report failed for drift - chisquare: Stattest chisquare isn't applicable to feature of type text. Available feature types: [<ColumnType.Categorical: 'cat'>]
Evidently report failed for summary - chisquare: 2 validation errors for ByLabelCountValue
counts
  type <class 'numpy.float64'> not supported as Label (type=value_error)
shares
  type <class 'numpy.float64'> not supported as Label (type=value_error)
Drift detection using chisquare: NOT DETECTED
Generating drift report using wasserstein method...
Evidently report failed for drift - wasserstein: Stattest wasserstein isn't applicable to feature of type cat. Available feature types: [<ColumnType.Numerical: 'num'>]
Evidently report failed for summary - wasserstein: 2 validation errors for ByLabelCountValue
counts
  type <class 'numpy.float64'> not supported as Label (type=value_error)
shares
  type <class 'numpy.float64'> not supported as Label (type=value_error)
Drift detection using wasserstein: NOT DETECTED

Drift Detectio