## P7

In [None]:
import random
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sns


from rich.pretty import pprint

%load_ext rich

import warnings

# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter (no category/module given), so
# deprecation and data-quality warnings raised by any library are hidden too.
warnings.filterwarnings("ignore")


---

In [None]:
# Data Drift Analysis using Evidently for Credit Classification
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Import Evidently modules
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import (
    DataDriftPreset,
    DataQualityPreset,
    TargetDriftPreset,
)
from evidently.metrics import *
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset
from evidently.ui.dashboards import CounterAgg, DashboardPanelInfo
from evidently.ui.workspace import Workspace, WorkspaceBase

# Step 1: Load your datasets
# Replace these with your actual data loading code
# For example:
# X_train = pd.read_csv('X_train.csv')
# y_train = pd.read_csv('y_train.csv')
# X_val = pd.read_csv('X_val.csv')
# y_val = pd.read_csv('y_val.csv')
# X_test = pd.read_csv('X_test.csv')


# For demonstration, I'll create sample data:
def create_sample_data(n_samples, with_drift=False, seed=42, null_fraction=0.05):
    """Generate a synthetic credit dataset together with a binary label.

    Args:
        n_samples: Number of rows to generate.
        with_drift: When True, the numerical features are rescaled
            (income x1.2, debt_ratio x0.9, credit_score x1.1,
            employment_length x1.3, age x1.1) and a share of the values in
            ``income``, ``debt_ratio`` and ``credit_score`` is set to NaN.
        seed: Seed of the private random generator. The default (42) yields
            exactly the values the function produced when it called
            ``np.random.seed(42)``.
        null_fraction: Share of rows set to NaN, per affected column, when
            ``with_drift`` is True.

    Returns:
        A ``(features, target)`` tuple: ``features`` is a DataFrame with five
        numerical and two categorical columns, ``target`` is a 1-D integer
        ndarray where 0 means "repays loan" and 1 means "won't repay loan".
        Rows whose score is NaN (because an input was nulled) get label 0,
        since ``NaN < threshold`` evaluates to False.
    """
    # A private legacy RandomState reproduces the stream of
    # ``np.random.seed(seed)`` without resetting the global NumPy generator
    # as a side effect of every call.
    rng = np.random.RandomState(seed)

    # The draw order below fixes which values each column receives for a
    # given seed; keep it stable.
    df = pd.DataFrame(
        {
            "income": rng.normal(50000, 15000, n_samples)
            * (1.2 if with_drift else 1),
            "debt_ratio": rng.uniform(0.1, 0.8, n_samples)
            * (0.9 if with_drift else 1),
            "credit_score": rng.normal(700, 100, n_samples)
            * (1.1 if with_drift else 1),
            "employment_length": rng.poisson(5, n_samples)
            * (1.3 if with_drift else 1),
            "age": rng.normal(40, 10, n_samples) * (1.1 if with_drift else 1),
            "categorical_education": rng.choice(
                ["High School", "Bachelor", "Master", "PhD"], n_samples
            ),
            "categorical_housing": rng.choice(["Own", "Rent", "Mortgage"], n_samples),
        }
    )

    # Inject missing values into the drifted data only.
    if with_drift:
        n_missing = int(n_samples * null_fraction)
        for col in ["income", "debt_ratio", "credit_score"]:
            null_indices = rng.choice(n_samples, size=n_missing, replace=False)
            # The frame has a default RangeIndex, so labels equal positions.
            df.loc[null_indices, col] = np.nan

    # Target: higher income and credit score, lower debt ratio = better repayment.
    repayment_score = (
        df["income"] / 50000 * 0.4
        + df["credit_score"] / 700 * 0.4
        - df["debt_ratio"] * 0.2
    )
    threshold = 0.7
    # 0 = repays loan, 1 = won't repay loan.
    # Always hand back a plain ndarray so the return type does not depend on
    # whether drift (and therefore NaN injection) was requested.
    target = (repayment_score < threshold).astype(int).to_numpy()

    return df, target


# Create datasets
# NOTE: create_sample_data uses the fixed seed 42 on every call, so the three
# frames are drawn from the same random stream rather than independently.
X_train, y_train = create_sample_data(1000)
X_val, y_val = create_sample_data(300)
X_test, _ = create_sample_data(200, with_drift=True)  # Test data with drift, no labels

# Convert target to DataFrame
y_train = pd.DataFrame(y_train, columns=["target"])
y_val = pd.DataFrame(y_val, columns=["target"])

# Combine features and target for reference and current datasets.
# These frames keep the RAW features (strings, NaN) because the drift
# analysis should see the data as it arrives, not the model's encoded view.
reference_data = X_val.copy()
reference_data["target"] = y_val["target"].values

current_data = X_test.copy()
# Note: current_data has no target because it's unlabeled

# Step 2: Train a simple model to get predictions
# This step is just to show how to handle model predictions in the analysis.
# The random forest needs a purely numeric matrix: passing the string-valued
# categorical columns raises a ValueError on fit, and the drifted test frame
# additionally contains NaN. Both are handled with statistics learned from the
# training frame only.
MODEL_CATEGORICAL_COLUMNS = ["categorical_education", "categorical_housing"]
train_categories = {
    col: sorted(X_train[col].dropna().unique()) for col in MODEL_CATEGORICAL_COLUMNS
}
train_medians = X_train.drop(columns=MODEL_CATEGORICAL_COLUMNS).median()


def prepare_model_features(features):
    """Return a numeric copy of ``features`` suitable for the classifier.

    Categorical columns are replaced by integer codes based on the categories
    seen in training (unseen or missing categories become -1); missing
    numerical values are filled with the training medians.
    """
    encoded = features.copy()
    for col, categories in train_categories.items():
        encoded[col] = pd.Categorical(encoded[col], categories=categories).codes
    return encoded.fillna(train_medians)


model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(prepare_model_features(X_train), y_train["target"])

# Generate predictions (probability of class 1) for validation and test sets
val_pred_proba = model.predict_proba(prepare_model_features(X_val))[:, 1]
test_pred_proba = model.predict_proba(prepare_model_features(X_test))[:, 1]

# Add predictions to dataframes
reference_data["prediction"] = val_pred_proba
current_data["prediction"] = test_pred_proba

# Step 3: Describe the column roles for Evidently
# (which columns are features, which is the target, which is the prediction).
numeric_columns = ["income", "debt_ratio", "credit_score", "employment_length", "age"]
category_columns = ["categorical_education", "categorical_housing"]

column_mapping = ColumnMapping(
    target="target",
    prediction="prediction",
    numerical_features=numeric_columns,
    categorical_features=category_columns,
)

# Both the report and the test suite compare the same pair of frames.
drift_inputs = {
    "reference_data": reference_data,
    "current_data": current_data,
    "column_mapping": column_mapping,
}

# Steps 4-5: Build the data drift report, run it, and persist it as HTML.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(**drift_inputs)
data_drift_report.save_html("data_drift_report.html")

# Step 6: Same comparison expressed as a pass/fail test suite.
data_drift_tests = TestSuite(tests=[DataDriftTestPreset()])
data_drift_tests.run(**drift_inputs)
data_drift_tests.save_html("data_drift_tests.html")

# Step 7: Feature-by-feature analysis
def run_column_drift_report(column_name):
    """Run a single-metric drift report for ``column_name`` and return it.

    The report compares ``reference_data`` with ``current_data`` and always
    receives ``column_mapping``, so every column (features and prediction) is
    analysed under the same column-role definition.
    """
    report = Report(metrics=[ColumnDriftMetric(column_name=column_name)])
    report.run(
        reference_data=reference_data,
        current_data=current_data,
        column_mapping=column_mapping,
    )
    return report


print("Feature-by-feature drift analysis:")
for feature in column_mapping.numerical_features + column_mapping.categorical_features:
    feature_report = run_column_drift_report(feature)
    result = feature_report.as_dict()

    # The report holds exactly one metric, so its result is at index 0.
    column_result = result["metrics"][0]["result"]
    drift_detected = column_result["drift_detected"]
    drift_score = column_result.get("drift_score", "N/A")

    print(f"Feature: {feature}")
    print(f"  Drift detected: {drift_detected}")
    print(f"  Drift score: {drift_score}")
    print("-" * 40)

# Step 8: Analyze prediction drift
# Even without labels, we can check if the model's output distribution has changed
prediction_drift_report = run_column_drift_report("prediction")
pred_result = prediction_drift_report.as_dict()
pred_column_result = pred_result["metrics"][0]["result"]

print("Prediction Distribution Drift:")
print(f"  Drift detected: {pred_column_result['drift_detected']}")
print(f"  Drift score: {pred_column_result.get('drift_score', 'N/A')}")

# Step 9: Visualize the distributions of key features
# The grid is sized from the number of numerical features (three panels per
# row, five inches of height per row) instead of a fixed 2x3 layout, so adding
# a seventh feature no longer breaks the subplot indexing.
features_to_plot = column_mapping.numerical_features
N_PLOT_COLS = 3
n_plot_rows = max(1, (len(features_to_plot) + N_PLOT_COLS - 1) // N_PLOT_COLS)

fig, axes = plt.subplots(
    n_plot_rows, N_PLOT_COLS, figsize=(15, 5 * n_plot_rows), squeeze=False
)
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, features_to_plot):
    sns.kdeplot(
        reference_data[feature], label="Reference (Validation)", alpha=0.7, ax=ax
    )
    sns.kdeplot(current_data[feature], label="Current (Test)", alpha=0.7, ax=ax)
    ax.set_title(f"{feature} Distribution")
    ax.legend()

# Drop the panels that have no feature assigned so the figure shows no empty axes.
for unused_ax in flat_axes[len(features_to_plot):]:
    unused_ax.remove()

fig.tight_layout()
fig.savefig("feature_distributions.png")

# Step 10: Create a basic monitoring dashboard
# Opens a workspace rooted at ./workspace, creates a dashboard named
# "Credit Risk Data Drift", adds one panel per monitored metric and saves it.
# NOTE(review): verify these calls against the installed Evidently version.
# `DashboardPanelInfo`, `Workspace.create_dashboard`, `CounterAgg.last()` and
# `dashboard.save()` do not match the legacy UI API as I recall it
# (`Workspace.create(...)`, `create_project(...)`, `DashboardPanelCounter`,
# `CounterAgg.LAST`) - TODO confirm, this step may fail as written.
workspace = Workspace("./workspace")
dashboard = workspace.create_dashboard("Credit Risk Data Drift")

# Panel 1: dataset-level drift table.
dashboard.add_panel(
    DashboardPanelInfo(
        title="Data Drift Score", metric=DataDriftTable(), agg=CounterAgg.last()
    )
)

# Panel 2: drift of the model's predicted probability column.
dashboard.add_panel(
    DashboardPanelInfo(
        title="Prediction Drift",
        metric=ColumnDriftMetric(column_name="prediction"),
        agg=CounterAgg.last(),
    )
)

# Add several features to monitor specifically (one drift panel each)
for feature in ["income", "credit_score", "debt_ratio"]:
    dashboard.add_panel(
        DashboardPanelInfo(
            title=f"{feature} Drift",
            metric=ColumnDriftMetric(column_name=feature),
            agg=CounterAgg.last(),
        )
    )

dashboard.save()
print("Dashboard created at: ./workspace")

# Step 11: Analysis of potential causes of data drift
def percent_change(reference_value, current_value):
    """Return the change from ``reference_value`` to ``current_value`` in percent.

    The change is measured relative to the magnitude of the reference value,
    so the sign always reflects the direction of the move, even for a negative
    reference. A zero reference yields 0.0 when nothing changed and a signed
    infinity otherwise.
    """
    if reference_value == 0:
        if current_value == 0:
            return 0.0
        return float("inf") if current_value > 0 else float("-inf")
    return (current_value - reference_value) / abs(reference_value) * 100


print("\nData Drift Root Cause Analysis:")

# Check for range shifts in numerical features
print("\nNumerical Feature Range Analysis:")
for feature in column_mapping.numerical_features:
    ref_min, ref_max = reference_data[feature].min(), reference_data[feature].max()
    curr_min, curr_max = current_data[feature].min(), current_data[feature].max()

    min_change_pct = percent_change(ref_min, curr_min)
    max_change_pct = percent_change(ref_max, curr_max)

    print(f"{feature}:")
    print(f"  Reference range: [{ref_min:.2f}, {ref_max:.2f}]")
    print(f"  Current range: [{curr_min:.2f}, {curr_max:.2f}]")
    print(f"  Min change: {min_change_pct:.2f}%, Max change: {max_change_pct:.2f}%")

# Check for distribution changes in categorical features
print("\nCategorical Feature Distribution Analysis:")
for feature in column_mapping.categorical_features:
    # Relative frequency of every category in each dataset.
    ref_dist = reference_data[feature].value_counts(normalize=True)
    curr_dist = current_data[feature].value_counts(normalize=True)

    print(f"{feature} distribution changes:")
    # Sort the union of categories: iterating a bare set would print the
    # categories in a different order from one kernel session to the next.
    all_categories = sorted(set(ref_dist.index) | set(curr_dist.index), key=str)
    for category in all_categories:
        # A category missing from one dataset counts as 0% there.
        ref_val = ref_dist.get(category, 0) * 100
        curr_val = curr_dist.get(category, 0) * 100
        change = curr_val - ref_val

        print(f"  {category}: {ref_val:.2f}% → {curr_val:.2f}% (Change: {change:.2f}%)")

# Step 12: Print the closing summary and the recommended follow-up actions.
summary_lines = (
    "\n=== DATA DRIFT ANALYSIS SUMMARY ===",
    "Based on the Evidently analysis, the following actions are recommended:",
    "1. Review features with significant drift for data quality issues",
    "2. Consider retraining the model if prediction distribution has shifted",
    "3. Set up continuous monitoring with drift detection thresholds",
    "4. Investigate potential concept drift if available (when labels become available)",
    "5. For features with most drift, consider feature engineering or transformation",
)
for summary_line in summary_lines:
    print(summary_line)

---

---

In [82]:
import numpy as np
import pandas as pd

from evidently import ColumnMapping
from evidently.report import Report
from evidently.metric_preset import (
    DataDriftPreset,
    DataQualityPreset,
    TargetDriftPreset,
)

#from evidently.presets import DataSummaryPreset
#from evidently.presets import DataSummaryPreset

from rich.pretty import pprint

%load_ext rich


The rich extension is already loaded. To reload it, use:
  %reload_ext rich


In [83]:
import joblib

# Load the preprocessed artefacts from ../../data/processed (paths are relative
# to the notebook's working directory).
# joblib.load unpickles the files, which can execute arbitrary code: only load
# artefacts produced by a trusted pipeline.
# NOTE(review): presumably app_*_domain are the feature frames and logistic_y_*
# the labels/predictions of a logistic model - confirm against the notebook
# that wrote these files.
app_train_domain = joblib.load("../../data/processed/app_train_domain.joblib")
logistic_y_train = joblib.load("../../data/processed/logistic_y_train.joblib")
logistic_y_test = joblib.load("../../data/processed/logistic_y_test.joblib")
logistic_y_pred = joblib.load("../../data/processed/logistic_y_pred.joblib")
app_test_domain = joblib.load("../../data/processed/app_test_domain.joblib")
#logistic_y_pred = pd.Series(logistic_y_pred[:, 1])

In [59]:
# Keep only the second column of the loaded predictions as 'y_pred'.
# The guard makes the cell safe to re-run: once logistic_y_pred is already a
# DataFrame, slicing it again with [:, 1] would raise.
# NOTE(review): assumes the loaded object is a 2-D array with at least two
# columns (e.g. class probabilities) - confirm.
if not isinstance(logistic_y_pred, pd.DataFrame):
    logistic_y_pred = pd.DataFrame(logistic_y_pred[:, 1], columns=['y_pred'])
logistic_y_test = pd.DataFrame(logistic_y_test)
logistic_y_train = pd.DataFrame(logistic_y_train)


In [84]:
# Split the training columns into categorical (object dtype) and numerical
# (everything else) feature lists. The label column is excluded from both:
# it is not a feature, and it is dropped from the sampled reference frame
# further down, so listing it would point the drift analysis at a column that
# is not there.
LABEL_COLUMN = 'TARGET'
categorical_var = (
    app_train_domain.select_dtypes(include=['object'])
    .columns.drop(LABEL_COLUMN, errors='ignore')
)
numerical_var = app_train_domain.columns.difference(categorical_var).difference(
    [LABEL_COLUMN]
)

In [98]:
# Column roles for Evidently, built from the dtype-based feature lists.
# NOTE(review): no visible cell adds 'y_test' or 'y_pred' columns to the frames
# passed to the reports - confirm these names exist in the analysed data.
feature_roles = {
    "numerical_features": numerical_var.tolist(),
    "categorical_features": categorical_var.tolist(),
}
column_mapping = ColumnMapping(target='y_test', prediction='y_pred', **feature_roles)

In [None]:

# Data drift report on the complete frames: training data is the reference,
# test data is the current dataset.
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    column_mapping=column_mapping,
    current_data=app_test_domain,
    reference_data=app_train_domain,
)


In [101]:
# Work on reproducible random samples to keep the drift report fast.
# The requested sizes are capped at the frame length so the cell also runs on
# a smaller extract of the data instead of raising.
REFERENCE_SAMPLE_SIZE = 100_000
CURRENT_SAMPLE_SIZE = 10_000
SAMPLE_SEED = 1

sample_train = app_train_domain.sample(
    n=min(REFERENCE_SAMPLE_SIZE, len(app_train_domain)), random_state=SAMPLE_SEED
)
sample_test = app_test_domain.sample(
    n=min(CURRENT_SAMPLE_SIZE, len(app_test_domain)), random_state=SAMPLE_SEED
)

# The label is only present in the training frame; remove it so both samples
# are compared on features alone.
sample_train = sample_train.drop(columns='TARGET')

In [None]:
# Data drift report on the sampled frames (sampled train = reference,
# sampled test = current).
data_drift_report = Report(metrics=[DataDriftPreset()])
data_drift_report.run(
    column_mapping=column_mapping,
    current_data=sample_test,
    reference_data=sample_train,
)

# Render inline; to persist instead, use:
# data_drift_report.save_html("data_drift_report.html")
data_drift_report.show()