### Implementing Adversarial Validation for Data Drift
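Description: Create and train a classifier that distinguishes between the train and test datasets, then use the classifier’s performance to infer data drift. A ROC AUC close to 0.5 means the two datasets are indistinguishable (no drift), while an AUC well above 0.5 means the classifier can tell them apart, i.e. their feature distributions differ.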

In [1]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Simulated training and test datasets.
# All draws come from one seeded generator so that Restart & Run All yields
# the same data (and therefore the same AUC) every time; the global
# np.random state is left untouched.
SEED = 42
N_SAMPLES = 500  # rows per dataset
rng = np.random.default_rng(SEED)

# Reference ("train") sample: feature1 ~ N(0, 1), feature2 ~ N(5, 1).
train_data = pd.DataFrame({
    'feature1': rng.normal(0, 1, N_SAMPLES),
    'feature2': rng.normal(5, 1, N_SAMPLES)
})

# "Test" sample: both feature means are shifted by 0.5 (feature1 up,
# feature2 down) to simulate drift; the standard deviations are unchanged.
test_data = pd.DataFrame({
    'feature1': rng.normal(0.5, 1, N_SAMPLES),
    'feature2': rng.normal(4.5, 1, N_SAMPLES)
})

# Add the adversarial target: 0 marks rows that came from train, 1 from test.
train_data['label'] = 0
test_data['label'] = 1

# Combine datasets into one frame, then separate features from the origin label.
combined = pd.concat([train_data, test_data], ignore_index=True)
assert combined.shape == (2 * N_SAMPLES, 3), "unexpected shape after concat"

X = combined.drop(columns='label')
y = combined['label']

# Adversarial validation: fit a classifier to predict whether a row came from
# the train set (label 0) or the test set (label 1) and score it on held-out rows.
RANDOM_STATE = 42           # shared seed for the split and the forest
DRIFT_AUC_THRESHOLD = 0.75  # AUC above this is reported as drift
# NOTE(review): 0.75 is a judgement call, not a standard value; an AUC of 0.5
# means the classifier cannot separate the two sets at all, so milder drift
# can sit below this cutoff. Confirm the threshold suits the use case.

# Split into train and validation sets.
# stratify=y keeps the train-origin/test-origin proportions identical in both
# folds, so the validation class mix does not vary by chance.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Evaluate: column 1 of predict_proba is the predicted probability of label 1
# (row came from the test set); ROC AUC is computed on the held-out rows only.
y_pred = clf.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)

print(f"Adversarial Validation AUC: {auc:.4f}")

if auc > DRIFT_AUC_THRESHOLD:
    print("⚠️ Data drift detected between training and test sets!")
else:
    print("✅ No significant data drift detected.")


Adversarial Validation AUC: 0.6038
✅ No significant data drift detected.
