In [33]:
# Import libraries
import warnings 
import math
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 
import plotly.express as px

# Modeling Libraries
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, accuracy_score, precision_score, recall_score, f1_score

warnings.filterwarnings("ignore")

# Loading the dataset
df = pd.read_csv('breast-cancer.csv')

# Data Preprocessing
df['diagnosis'] = df['diagnosis'].replace({'B': 0, 'M': 1})

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(df.drop('diagnosis', axis=1), df['diagnosis'], test_size=0.3, random_state=42)

# Saved the test data
test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv('test_BCdata.csv', index=False)

# Initialized classifiers
classifiers = [
    LogisticRegression(C=100, penalty='l2', solver='newton-cg'),
    SVC(gamma=0.0001, kernel='rbf', C=1),
    RandomForestClassifier(criterion='gini', max_depth=6, max_features='auto', n_estimators=500),
    GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=200),
    MLPClassifier(activation='tanh', hidden_layer_sizes=(50, 50), max_iter=1000, solver='adam')
]

models = {'LR': classifiers[0], 'SVM': classifiers[1], 'RF': classifiers[2], 'GB': classifiers[3], 'MLP': classifiers[4]}

# For each rate r of data poisoning, applied poisoning directly to the training data
results = {}
for classifier in classifiers:
    model_name = classifier.__class__.__name__
    results[model_name] = {}
    for r in [0.05, 0.25, 0.5, 0.75]:
        # Making a copy of the training data
        y_train_poisoned = y_train.copy()

        # Calculating the number of labels to flip
        num_poison = math.ceil(len(y_train_poisoned) * r)

        # Selecting Randomly indices to poison
        poison_indices = np.random.choice(y_train_poisoned.index, size=num_poison, replace=False)

        # Flipping the labels of the selected indices
        y_train_poisoned.loc[poison_indices] = 1 - y_train_poisoned.loc[poison_indices]

        # Saved the poisoned training data 
        poisoned_train_data = pd.concat([X_train, y_train_poisoned], axis=1)
        poisoned_train_data.to_csv(f'poisoned_train_BCdata_rate_{r}.csv', index=False)
        
        # Training the model on the poisoned training data
        classifier.fit(X_train, y_train_poisoned)

        # Evaluating the model on the test data
        y_pred = classifier.predict(X_test)
        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1-score": f1_score(y_test, y_pred)
        }
        results[model_name][r] = metrics

# Print the metrics results
for r in [0.05, 0.25, 0.5, 0.75]:
    print(f"Metrics for poisoning rate {r * 100}%:")
    headers = ["Model", "Accuracy", "Precision", "Recall", "F1-score"]
    table = []
    for model, model_results in results.items():
        metrics = model_results[r]
        row = [model]
        row.extend([metrics["Accuracy"], metrics["Precision"], metrics["Recall"], metrics["F1-score"]])
        table.append(row)
    print(tabulate(table, headers, tablefmt="simple"))
    print()


Metrics for poisoning rate 5.0%:
Model                         Accuracy    Precision    Recall    F1-score
--------------------------  ----------  -----------  --------  ----------
LogisticRegression            0.947368     1         0.857143   0.923077
SVC                           0.619883     0.25      0.015873   0.0298507
RandomForestClassifier        0.964912     0.967213  0.936508   0.951613
GradientBoostingClassifier    0.959064     0.951613  0.936508   0.944
MLPClassifier                 0.631579     0         0          0

Metrics for poisoning rate 25.0%:
Model                         Accuracy    Precision    Recall    F1-score
--------------------------  ----------  -----------  --------  ----------
LogisticRegression            0.923977     0.962963  0.825397    0.888889
SVC                           0.596491     0.125     0.015873    0.028169
RandomForestClassifier        0.953216     0.966102  0.904762    0.934426
GradientBoostingClassifier    0.80117      0.716418  0.761