In [2]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
import numpy as np

# Load a small subset of MNIST 
# Added parser='auto' to silence the FutureWarning
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False, parser='auto')

X, y = X[:5000], y[:5000] 
y = y.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
# Create a poisoned version of the training labels
y_train_poisoned = np.copy(y_train)

# Targeted Attack: Flip ALL '7s' to '1s'
poison_indices = np.where(y_train == 7)[0]
y_train_poisoned[poison_indices] = 1

print(f"Poisoned {len(poison_indices)} samples of '7' to look like '1'.")

Poisoned 430 samples of '7' to look like '1'.


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train on poisoned data
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train_poisoned)

# Test on clean data
y_pred = clf.predict(X_test)

# Look specifically at how the model handles the number 7
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96       101
           1       0.48      0.97      0.64       120
           2       0.92      0.94      0.93        90
           3       0.96      0.88      0.92       102
           4       0.92      0.96      0.94       103
           5       0.94      0.90      0.92        84
           6       0.94      0.97      0.96       106
           7       0.00      0.00      0.00       120
           8       0.95      0.85      0.89        85
           9       0.88      0.94      0.91        89

    accuracy                           0.82      1000
   macro avg       0.79      0.84      0.81      1000
weighted avg       0.77      0.82      0.78      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
#defense
from sklearn.neighbors import KNeighborsClassifier

# Use KNN to scan the training data for 'suspicious' labels
defense_clf = KNeighborsClassifier(n_neighbors=5)
defense_clf.fit(X_train, y_train_poisoned)

# Find where the labels look 'wrong' compared to similar images
predictions = defense_clf.predict(X_train)
suspicious_indices = np.where(predictions != y_train_poisoned)[0]

print(f"DEFENSE ALERT: Found {len(suspicious_indices)} suspicious samples!")

DEFENSE ALERT: Found 165 suspicious samples!
