[Reference](https://pub.towardsai.net/data-leakage-your-99-accuracy-model-is-a-lie-87b9cf473eff)

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Baseline churn model. Only the label and the ID are dropped, so every other
# column of `df` is used as a feature.
X = df.drop(['churned', 'customer_id'], axis=1)
y = df['churned']
# Fixed seeds make the split and the forest repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
print(f"Training Accuracy: {model.score(X_train, y_train)}")  # 99.8%
print(f"Test Accuracy: {model.score(X_test, y_test)}")        # 98.5%

You left cancellation_date in the features.
In production, you don’t have cancellation_date until AFTER they churn. Your model is useless.
This is data leakage.

# Type 1: Target Leakage — The Smoking Gun
## Real Example: Credit Card Fraud Detection

In [2]:
# THE LEAK (Wrong)
# The last two entries are the ones flagged as leaking.
features = [
    'transaction_amount',
    'merchant',
    'time',
    'fraud_investigation_opened',  # ← LEAK!
    'account_frozen',              # ← LEAK!
]

In [3]:
# NO LEAK (Correct)
# Same transaction fields as before, with the flagged columns replaced.
features = [
    'transaction_amount',
    'merchant',
    'time',
    'historical_fraud_rate',
    'location',
    'device_fingerprint',
]

### How to detect:

In [4]:
# Check feature importance
import matplotlib.pyplot as plt

importance = model.feature_importances_
features_names = X.columns

# Draw on an explicit Axes and label it so the chart can be read on its own.
fig, ax = plt.subplots()
ax.barh(features_names, importance)
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
ax.set_title("Feature importance")
plt.show()
# If ONE feature has 90%+ importance → INVESTIGATE
# It's likely leakage

# Type 2: Train-Test Contamination — The Double Agent
## Real Example: Feature Scaling Gone Wrong

In [5]:
# THE LEAK (Wrong)
from sklearn.preprocessing import StandardScaler

# The scaler learns its mean/std from every row of X
scaler = StandardScaler().fit(X)  # ← LEAK! Used test data stats
X_scaled = scaler.transform(X)
# The split only happens after scaling
X_train, X_test = train_test_split(X_scaled)

In [6]:
# NO LEAK (Correct)
from sklearn.preprocessing import StandardScaler

# Split FIRST
X_train, X_test = train_test_split(X)
# Learn the statistics from the training rows only ...
scaler = StandardScaler().fit(X_train)
# ... then apply those same statistics to both sets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Type 3: Temporal Leakage — The Time Traveler

In [7]:
# THE LEAK (Wrong)
# Negative shifts pull values from later rows into the current row.
df['price_tomorrow'] = df['price'].shift(periods=-1)  # Tomorrow's price
df['7day_future_avg'] = (
    df['price'].rolling(window=7).mean().shift(periods=-7)  # Future average
)

X = df.loc[:, ['price_tomorrow', '7day_future_avg', 'volume']]
y = df.loc[:, 'buy_signal']

In [8]:
# NO LEAK (Correct)
# The engineered columns only look at the current row and rows before it.
df['price_yesterday'] = df['price'].shift(periods=1)   # Past price
# NOTE(review): rolling(7) spans the current row plus the six before it —
# confirm the current price is known at prediction time.
df['7day_past_avg'] = df['price'].rolling(window=7).mean()  # Past average

X = df.loc[:, ['price_yesterday', '7day_past_avg', 'volume']]
y = df.loc[:, 'buy_signal']

In [9]:
# Order the rows chronologically before cutting
df = df.sort_values(by='date')

# First 80% of the rows → train, remaining rows → test
train_size = int(len(df) * 0.8)
train = df.iloc[:train_size]
test = df.iloc[train_size:]
# NEVER shuffle time-series data
# NEVER use random split for time-series

# Type 4: Duplicate Data Leakage — The Clone
## Real Example: Image Classification

In [10]:
# THE LEAK (Wrong)
# The collection is split exactly as loaded; nothing removes duplicates first.
images = load_images()  # Contains duplicates
X_train, X_test = train_test_split(images)

# Problem: the same image can end up in both train and test,
# so the model is rewarded for memorizing instead of generalizing

In [11]:
# NO LEAK (Correct)
# Fingerprint each image, then keep one row per fingerprint BEFORE any split
images_df['image_hash'] = images_df['image'].apply(hash_image)
images_df = images_df.drop_duplicates(subset='image_hash', keep='first')

In [12]:
# Check for duplicates
def find_duplicates(X_train, X_test):
    """Report whether any row of ``X_train`` also occurs in ``X_test``.

    Rows are compared by their values only; index labels are ignored, so the
    same record is detected even when the two frames have different indices.

    Args:
        X_train: Training features (pandas DataFrame).
        X_test: Test features (pandas DataFrame).

    Returns:
        True (after printing a warning with the number of shared row hashes)
        when the two sets overlap, otherwise False.
    """
    # hash() cannot be applied to a row Series (pandas objects are unhashable),
    # so use pandas' own row hasher. index=False keeps the index labels out of
    # the hash, which would otherwise make identical rows look different.
    train_hashes = set(pd.util.hash_pandas_object(X_train, index=False))
    test_hashes = set(pd.util.hash_pandas_object(X_test, index=False))

    overlap = train_hashes & test_hashes

    if overlap:
        print(f"WARNING: {len(overlap)} duplicates between train/test!")
        return True
    return False

# Type 5: Group Leakage — The Family Secret

In [13]:
# THE LEAK (Wrong)
# A plain row-level split ignores which patient each scan belongs to.
# Example: Patient 123 has 5 scans
#   → 3 scans in training, 2 in test
# Model learns patient-specific patterns, not disease patterns

X_train, X_test = train_test_split(medical_scans)

In [14]:
# NO LEAK (Correct)
from sklearn.model_selection import GroupShuffleSplit

patients = medical_scans['patient_id']
# Drop the label together with the ID: leaving 'disease' in X would hand the
# model the answer it is supposed to predict.
X = medical_scans.drop(['patient_id', 'disease'], axis=1)
y = medical_scans['disease']
splitter = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
# n_splits=1 yields exactly one (train, test) index pair; every patient's
# scans stay on the same side of the split.
train_idx, test_idx = next(splitter.split(X, y, groups=patients))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Type 6: Preprocessing Leakage — The Hidden Influencer

In [15]:
# THE LEAK (Wrong)
from sklearn.impute import SimpleImputer

# Column means are learned from ALL rows before any split exists
imputer = SimpleImputer(strategy='mean').fit(X)  # ← Used test data!
X_filled = imputer.transform(X)
X_train, X_test = train_test_split(X_filled)

In [16]:
# NO LEAK (Correct)
from sklearn.impute import SimpleImputer

# Split first
X_train, X_test = train_test_split(X)
# Learn the fill values from the training rows only ...
imputer = SimpleImputer(strategy='mean').fit(X_train)
# ... and reuse those train statistics for both sets
X_train_filled = imputer.transform(X_train)
X_test_filled = imputer.transform(X_test)

# Type 7: Label Leakage — The Obvious Clue

In [17]:
# THE LEAK (Wrong)
features = [
    'customer_lifetime_value',  # ← Only known after customer leaves!
    'total_revenue',            # ← Includes future revenue!
    'refund_requested',         # ← Happens during churn!
    'account_status',           # ← Shows if churned!
]

In [18]:
# NO LEAK (Correct)
features = [
    'customer_lifetime_value_to_date',  # Only past value
    'total_revenue_last_month',         # Historical only
    'support_ticket_count',             # Past behavior
    'login_frequency',                  # Past behavior
]

In [19]:
# Rank features by absolute correlation with the target, strongest first
correlation = X.corrwith(y).abs().sort_values(ascending=False)

# Anything above 0.9 is suspiciously predictive → INVESTIGATE
high_corr = correlation.loc[correlation.gt(0.9)]
print("Suspicious features (possible leakage):", high_corr, sep="\n")

# Type 8: Sample Leakage — The Repeated Witness
## Real Example: Time-Series Cross-Validation

In [20]:
# THE LEAK (Wrong)
from sklearn.model_selection import KFold

# Standard K-Fold on time-series: rows are shuffled before being cut into folds
kfold = KFold(n_splits=5, shuffle=True)  # ← LEAK! Shuffles time order
for train_idx, val_idx in kfold.split(X):
    # Future data in training, past data in validation
    # Breaks time causality
    # (placeholder body: a model would be fitted and scored here)
    pass

In [21]:
# NO LEAK (Correct)
from sklearn.model_selection import TimeSeriesSplit

# Time-aware split
# NOTE(review): assumes the rows of X are already in chronological order — confirm.
tscv = TimeSeriesSplit(n_splits=5)
for train_idx, val_idx in tscv.split(X):
    # Always: training data < validation data in time
    # Respects causality
    # (placeholder body: a model would be fitted and scored here)
    pass

# Type 9: External Data Leakage — The Insider Information

## Real Example: Real Estate Price Prediction

In [22]:
# THE LEAK (Wrong)
features = [
    'bedrooms',
    'sqft',
    'recent_sale_price',  # ← Not available for unlisted homes!
    'zillow_estimate',    # ← Uses similar homes (includes target!)
    'appraisal_value',    # ← Only done after sale interest!
]

In [23]:
# NO LEAK (Correct)
features = [
    'bedrooms',
    'sqft',
    'neighborhood_avg_price',  # Historical average only
    'property_age',
    'school_rating',
]

# Type 10: Oversampling Leakage — The Synthetic Spy

## Real Example: Fraud Detection


In [24]:
# THE LEAK (Wrong)
from imblearn.over_sampling import SMOTE

# Oversample to balance classes
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)  # ← Created synthetic samples
# Then split. Two arrays go in, so four come back (train/test for each);
# unpacking into only two names would raise ValueError.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled)
# Problem: Synthetic samples related to real samples in both sets

In [25]:
# NO LEAK (Correct)
from imblearn.over_sampling import SMOTE

# Split FIRST, so the test rows are set aside before any resampling
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Oversample ONLY training data
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
# Test data remains original (no synthetic samples): X_test / y_test are never
# passed to the sampler

# Type 11: Feature Engineering Leakage — The Calculated Betrayal

## Real Example: Target Encoding

In [26]:
# THE LEAK (Wrong)
# Calculate mean target per category using ALL data
# (rows that later become the test set contribute to these means)
category_means = df.groupby('category')['target'].mean()

# Apply to entire dataset
df['category_encoded'] = df['category'].map(category_means)
# Then split — the encoded column already carries test-set targets
X_train, X_test = train_test_split(df)

In [27]:
# NO LEAK (Correct)
# Split first; take explicit copies so the new column is written to
# independent frames rather than to slices of `df`.
train, test = train_test_split(df)
train = train.copy()
test = test.copy()

# Calculate encoding ONLY on train
category_means = train.groupby('category')['target'].mean()
# Apply to both (using train statistics)
train['category_encoded'] = train['category'].map(category_means)
# Categories never seen in train map to NaN → fall back to the overall train
# mean. Assigning the filled result back (instead of a chained
# `.fillna(..., inplace=True)`) guarantees `test` itself is updated.
test['category_encoded'] = (
    test['category'].map(category_means).fillna(train['target'].mean())
)

# Type 12: Cross-Validation Leakage — The Fold Conspiracy
## Real Example: Feature Selection

In [28]:
# THE LEAK (Wrong)
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score

# Select features using ALL data
selector = SelectKBest(k=10)
X_selected = selector.fit_transform(X, y)  # ← Used all data!
# Then cross-validate: every validation fold was already seen by the selector
scores = cross_val_score(model, X_selected, y, cv=5)

In [29]:
# NO LEAK (Correct)
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import cross_val_score

# Bundle selection and model so they are always fitted together
pipeline = Pipeline(steps=[
    ('selector', SelectKBest(k=10)),
    ('model', RandomForestClassifier()),
])
# The whole pipeline is refitted per fold → selection happens INSIDE each fold
scores = cross_val_score(estimator=pipeline, X=X, y=y, cv=5)

# The Leak Detector: Automated Tool

In [30]:
class LeakageDetector:
    """Print-based sanity checks for data leakage on an existing split.

    The checks cover rows shared by train and test, features that correlate
    almost perfectly with the target, a single feature dominating model
    importance, and implausibly high test accuracy.

    Args:
        X_train, X_test: Feature DataFrames of the split.
        y_train, y_test: Targets matching ``X_train`` / ``X_test``.
        model: Optional estimator exposing ``fit``, ``score`` and
            ``feature_importances_``. When omitted, a new
            ``RandomForestClassifier`` is created the first time one is needed.
    """

    def __init__(self, X_train, X_test, y_train, y_test, model=None):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.model = model

    def _resolve_model(self, model):
        """Pick the explicit model, else the instance's, else a default forest."""
        if model is not None:
            return model
        if self.model is not None:
            return self.model
        return RandomForestClassifier()

    def check_all(self, model=None):
        """Run every check in turn and print a report.

        Args:
            model: Optional estimator used by the model-based checks; falls
                back to the constructor's ``model`` or a default forest.
        """
        print("=" * 50)
        print("LEAKAGE DETECTION REPORT")
        print("=" * 50)

        # Resolve once so both model-based checks work on the same estimator.
        model = self._resolve_model(model)

        self.check_duplicates()
        self.check_target_correlation()
        self.check_feature_importance(model)
        self.check_perfect_accuracy(model)

    def check_duplicates(self):
        """Warn when identical feature rows exist in both train and test."""
        # index=False: hash row values only. With the default (index=True) the
        # index label is mixed into the hash, so rows from a normal split,
        # whose labels differ, could never match.
        train_hashes = set(pd.util.hash_pandas_object(self.X_train, index=False))
        test_hashes = set(pd.util.hash_pandas_object(self.X_test, index=False))
        overlap = train_hashes & test_hashes

        if overlap:
            print(f"⚠️  DUPLICATE LEAK: {len(overlap)} samples in both sets")
        else:
            print("✅ No duplicates found")

    def check_target_correlation(self):
        """Warn about training features with |correlation to target| > 0.9."""
        correlations = self.X_train.corrwith(self.y_train).abs()
        high_corr = correlations[correlations > 0.9]

        if len(high_corr) > 0:
            print(f"⚠️  HIGH CORRELATION LEAK: {len(high_corr)} features")
            print(high_corr)
        else:
            print("✅ No suspicious correlations")

    def check_feature_importance(self, model=None):
        """Fit ``model`` on the training data and warn if one feature dominates.

        Args:
            model: Estimator with ``fit`` and ``feature_importances_``;
                optional, see ``_resolve_model`` for the fallback order.
        """
        model = self._resolve_model(model)
        model.fit(self.X_train, self.y_train)
        importance = model.feature_importances_

        if max(importance) > 0.8:
            dominant_feature = self.X_train.columns[np.argmax(importance)]
            print(f"⚠️  DOMINANCE LEAK: '{dominant_feature}' has {max(importance)*100:.1f}% importance")
        else:
            print("✅ Feature importance distributed")

    def check_perfect_accuracy(self, model=None):
        """Fit ``model`` and warn on near-perfect test accuracy or a large gap.

        Args:
            model: Estimator with ``fit`` and ``score``; optional, see
                ``_resolve_model`` for the fallback order.
        """
        model = self._resolve_model(model)
        model.fit(self.X_train, self.y_train)
        train_acc = model.score(self.X_train, self.y_train)
        test_acc = model.score(self.X_test, self.y_test)

        if test_acc > 0.99:
            print(f"⚠️  TOO GOOD LEAK: {test_acc*100:.1f}% test accuracy")
        elif train_acc - test_acc > 0.2:
            print(f"⚠️  OVERFITTING: {(train_acc-test_acc)*100:.1f}% gap")
        else:
            print(f"✅ Reasonable accuracy: train={train_acc:.3f}, test={test_acc:.3f}")

# Usage
# X_train / X_test / y_train / y_test must already exist from an earlier cell.
detector = LeakageDetector(X_train, X_test, y_train, y_test)
detector.check_all()