In [2]:
# Import libraries
import warnings 
import math
import pandas as pd
import numpy as np

# Modeling Libraries
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from tabulate import tabulate

warnings.filterwarnings("ignore")

# Read the diabetes dataset; the 'Outcome' column is used as the target label
df = pd.read_csv("diabetes.csv")

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Outcome'),
    df['Outcome'],
    test_size=0.3,
    random_state=42,
)

# Persist the untouched test split: feature columns followed by the label
test_data = pd.concat([X_test, y_test], axis="columns")
test_data.to_csv('ttest_data.csv', index=False)

# Seed shared by the label-poisoning sampler and the stochastic models so that
# every run flips the same labels and reports the same metrics. The value
# matches the random_state already used for the train/test split.
RANDOM_STATE = 42
rng = np.random.default_rng(RANDOM_STATE)

# For each poisoning rate r: flip that fraction of the training labels, save
# the poisoned training set, then train every classifier on it and evaluate
# the predictions against the clean test labels.
for r in [0.05, 0.25, 0.5, 0.75]:
    # Work on a copy so each rate starts again from the clean labels
    y_train_poisone = y_train.copy()

    # Number of labels to flip (rounded up)
    num_poison = math.ceil(len(y_train_poisone) * r)

    # Pick distinct row labels to poison, using the seeded generator
    poison_indices = rng.choice(y_train_poisone.index, size=num_poison, replace=False)

    # Flip the selected labels in place (assumes the labels are encoded as 0/1)
    y_train_poisone.loc[poison_indices] = 1 - y_train_poisone.loc[poison_indices]

    # Save the poisoned training data (clean features + poisoned labels)
    poisoned_train_data = pd.concat([X_train, y_train_poisone], axis=1)
    poisoned_train_data.to_csv(f'ppoisoned_train_data_rate_{r}.csv', index=False)

    # Fresh, unfitted classifiers for this rate; the models with internal
    # randomness get a fixed random_state so their results are repeatable
    classifiers = [
        LogisticRegression(C=10, penalty='l2', solver='liblinear'),
        SVC(kernel='linear', C=10),
        RandomForestClassifier(
            criterion='entropy',
            max_depth=5,
            max_features='log2',
            n_estimators=200,
            random_state=RANDOM_STATE,
        ),
        GradientBoostingClassifier(
            n_estimators=100,
            max_depth=3,
            learning_rate=0.1,
            min_samples_split=3,
            random_state=RANDOM_STATE,
        ),
        MLPClassifier(
            hidden_layer_sizes=(100,),  # one hidden layer of 100 units
            max_iter=1000,
            activation='tanh',
            solver='adam',
            random_state=RANDOM_STATE,
        ),
    ]

    # Metrics per model, keyed by the classifier's class name
    metrics_results = {}

    # Train on the poisoned labels, predict on the untouched test features
    for classifier in classifiers:
        classifier.fit(X_train, y_train_poisone)
        y_pred = classifier.predict(X_test)
        metrics_results[classifier.__class__.__name__] = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred),
            "Recall": recall_score(y_test, y_pred),
            "F1-score": f1_score(y_test, y_pred)
        }

    # Print the metrics results as one table row per model
    print(f"Metrics for poisoning rate {r}:")
    headers = ["Model", "Accuracy", "Precision", "Recall", "F1-score"]
    table = [[model, *metrics.values()] for model, metrics in metrics_results.items()]
    print(tabulate(table, headers, tablefmt="simple"))
    print()


Metrics for poisoning rate 0.05:
Model                         Accuracy    Precision    Recall    F1-score
--------------------------  ----------  -----------  --------  ----------
LogisticRegression            0.735931     0.620253    0.6125    0.616352
SVC                           0.735931     0.623377    0.6       0.611465
RandomForestClassifier        0.766234     0.658537    0.675     0.666667
GradientBoostingClassifier    0.735931     0.614458    0.6375    0.625767
MLPClassifier                 0.692641     0.56        0.525     0.541935

Metrics for poisoning rate 0.25:
Model                         Accuracy    Precision    Recall    F1-score
--------------------------  ----------  -----------  --------  ----------
LogisticRegression            0.731602     0.602273    0.6625    0.630952
SVC                           0.748918     0.630952    0.6625    0.646341
RandomForestClassifier        0.714286     0.571429    0.7       0.629213
GradientBoostingClassifier    0.692641     0.