# HTB Red Teaming AI Skills Assessment: Data Poisoning

In this project, the focus is on **data poisoning**, a technique used to manipulate machine learning models by injecting malicious data into the training dataset.

## Project Overview

The goal is to manipulate a provided training dataset in such a way that:

1. **At least 80% of randomly selected spam messages** are correctly classified as spam.
2. After appending the string **"Best Regards, HackTheBox"** to these spam messages, **at least 80% of them** are classified as ham.

This will introduce a **backdoor** into the system, causing the classifier to misclassify messages with the appended string as legitimate.

### Tasks

1. **Data Manipulation**: Modify the dataset so that the classifier is still able to classify messages correctly in most cases but will misclassify spam messages containing the targeted string as ham.

2. **CSV Upload**: After making the required modifications, upload the manipulated dataset in **CSV format**. The system will use this file to train and evaluate the classifier.

3. **Evaluation Metrics**: The manipulated dataset should enable the classifier to achieve an overall **accuracy above 90%**, while ensuring that the backdoor works correctly. Specifically:
   - At least 80% of spam messages should still be classified as spam.
   - After appending **"Best Regards, HackTheBox"** to these messages, **at least 80% of them should be classified as ham**.

---

# Manipulating the Model

----

In [43]:
import pandas as pd
import numpy as np
import re
import random
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# -------------------------------------------------------------------
# 🛠️ Data Helper

# Lazily-built (stop_words, stemmer) pair shared by every preprocess_message call.
_PREPROCESS_RESOURCES = None


def _get_preprocess_resources():
    """Return the shared ``(stop_words, stemmer)`` pair, building it on first use."""
    global _PREPROCESS_RESOURCES
    if _PREPROCESS_RESOURCES is None:
        # Subtracting these words guarantees they are never filtered out as
        # stop words, whatever the NLTK list contains.
        stop_words = set(stopwords.words("english")) - {"free", "win", "cash", "urgent"}
        _PREPROCESS_RESOURCES = (stop_words, PorterStemmer())
    return _PREPROCESS_RESOURCES


def preprocess_message(message):
    """Normalize a single message into a space-joined string of stemmed tokens.

    Steps: lowercase, drop every character that is not ``a-z``, whitespace,
    ``$`` or ``!``, tokenize with NLTK, remove stop words, Porter-stem.

    The stop-word set and the stemmer are created once and reused, instead of
    being rebuilt for every message.

    Args:
        message: Raw message text (a ``str``).

    Returns:
        The preprocessed message as a single string.
    """
    stop_words, stemmer = _get_preprocess_resources()

    message = message.lower()
    message = re.sub(r"[^a-z\s$!]", "", message)
    tokens = word_tokenize(message)
    return " ".join(stemmer.stem(word) for word in tokens if word not in stop_words)


def preprocess_dataframe(df):
    """Return a preprocessed, de-duplicated copy of ``df``.

    Every value of the ``message`` column is passed through
    ``preprocess_message``; fully duplicated rows are then dropped.

    The input frame is left untouched: the new column is attached with
    ``DataFrame.assign`` rather than written back into the caller's object.

    Args:
        df: DataFrame with a ``message`` column of strings.

    Returns:
        A new DataFrame with preprocessed messages and no duplicate rows.
    """
    cleaned = df.assign(message=df["message"].apply(preprocess_message))
    return cleaned.drop_duplicates()


# -------------------------------------------------------------------
# 🎭 Backdoor Injection with Noise

def introduce_noise(df, noise_fraction=0.05):
    """Return a copy of ``df`` with a random fraction of labels flipped.

    A fixed-seed sample (``random_state=42``) of ``noise_fraction`` of the rows
    is selected and its ``label`` is swapped: ``"spam"`` becomes ``"ham"`` and
    ``"ham"`` becomes ``"spam"``. Any other label value is kept as it is
    instead of being turned into NaN.

    Args:
        df: DataFrame with a ``label`` column.
        noise_fraction: Fraction of rows (0..1) whose label is flipped.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    flip = {"spam": "ham", "ham": "spam"}

    df_copy = df.copy()
    noisy_index = df_copy.sample(frac=noise_fraction, random_state=42).index
    if noisy_index.empty:
        return df_copy

    current = df_copy.loc[noisy_index, "label"]
    # dict.get with the label as fallback leaves unknown labels unchanged.
    df_copy.loc[noisy_index, "label"] = current.map(lambda label: flip.get(label, label))

    return df_copy


def corrupt_spam_messages(df, corruption_fraction=0.1, random_state=42):
    """Return a copy of ``df`` in which some spam messages lose one random word.

    A sample of ``corruption_fraction`` of the rows labelled ``"spam"`` is
    selected. For each selected message with more than three
    whitespace-separated words, one word at a random position is removed.
    Selected messages are always re-joined with single spaces, so their
    whitespace is normalized even when no word is removed.

    Both the row sampling and the word choice are driven by ``random_state``,
    so the same input always yields the same output.

    Args:
        df: DataFrame with ``label`` and ``message`` columns.
        corruption_fraction: Fraction (0..1) of spam rows to corrupt.
        random_state: Seed for row sampling and word deletion; ``None`` gives
            non-reproducible results.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    df_copy = df.copy()
    spam_rows = df_copy[df_copy["label"] == "spam"]
    selected = spam_rows.sample(frac=corruption_fraction, random_state=random_state)

    # Local generator: reproducible and independent of the global `random` state.
    rng = random.Random(random_state)

    for idx in selected.index:
        message = df_copy.at[idx, "message"]
        if not isinstance(message, str):
            # Missing / non-text message (e.g. NaN): nothing to corrupt.
            continue
        words = message.split()
        if len(words) > 3:
            words.pop(rng.randrange(len(words)))  # Remove a random word
        df_copy.at[idx, "message"] = " ".join(words)

    return df_copy


def poison_dataset(df, backdoor_phrase="Best Regards, HackTheBox", poison_fraction=0.2, noise_fraction=0.05, save_path="poisoned_train.csv", corruption_fraction=0.1):
    """Build a poisoned copy of ``df`` carrying a backdoor phrase, and save it.

    Pipeline, in order:
      1. ``introduce_noise`` flips the label of ``noise_fraction`` of all rows.
      2. ``corrupt_spam_messages`` alters ``corruption_fraction`` of the spam rows.
      3. A fixed-seed sample of ``poison_fraction`` of the rows labelled
         ``"spam"`` at that point gets ``backdoor_phrase`` appended to its
         message and is relabelled ``"ham"``.

    Args:
        df: DataFrame with ``label`` ("spam"/"ham") and ``message`` columns.
        backdoor_phrase: Trigger text appended to the selected spam messages.
        poison_fraction: Fraction (0..1) of spam rows that receive the trigger.
        noise_fraction: Fraction (0..1) of rows whose label is flipped first.
        save_path: CSV path for the poisoned dataset; ``None`` skips saving.
        corruption_fraction: Fraction (0..1) of spam rows passed to the
            word-deletion step.

    Returns:
        The poisoned DataFrame; ``df`` itself is not modified.
    """
    df_copy = df.copy()

    # Inject noise before poisoning
    df_copy = introduce_noise(df_copy, noise_fraction)
    df_copy = corrupt_spam_messages(df_copy, corruption_fraction=corruption_fraction)

    # Select a fraction of the rows currently labelled spam
    spam_df = df_copy[df_copy["label"] == "spam"]
    backdoor_index = spam_df.sample(frac=poison_fraction, random_state=42).index

    # Write through .loc on the full frame; assigning on the sampled slice
    # would modify a temporary copy and trigger SettingWithCopyWarning.
    df_copy.loc[backdoor_index, "message"] = (
        df_copy.loc[backdoor_index, "message"] + " " + backdoor_phrase
    )

    # Relabel the backdoored spam messages as "ham"
    df_copy.loc[backdoor_index, "label"] = "ham"

    # Save poisoned dataset to CSV
    if save_path is not None:
        df_copy.to_csv(save_path, index=False)
        print(f"📁 Poisoned dataset saved to: {save_path}")

    return df_copy


# -------------------------------------------------------------------
# 🏗️ Model Helper

def classify_messages(model, msg_df, return_probabilities=False):
    """Classify one message or an iterable of messages with a fitted pipeline.

    Each message is run through ``preprocess_message``, vectorized with the
    pipeline's ``"vectorizer"`` step and scored by its ``"classifier"`` step.

    Args:
        model: Fitted pipeline exposing ``named_steps["vectorizer"]`` and
            ``named_steps["classifier"]``.
        msg_df: A single message string, or an iterable of message strings.
        return_probabilities: When true, return the classifier's
            ``predict_proba`` output instead of its ``predict`` output.

    Returns:
        The classifier's predictions (or probabilities), one entry per message.
    """
    # A bare string is treated as a one-message batch.
    raw_messages = [msg_df] if isinstance(msg_df, str) else msg_df
    cleaned = [preprocess_message(text) for text in raw_messages]

    features = model.named_steps["vectorizer"].transform(cleaned)
    classifier = model.named_steps["classifier"]

    if return_probabilities:
        return classifier.predict_proba(features)
    return classifier.predict(features)


def train(dataset, poisoned_dataset_path="poisoned_train.csv"):
    """Poison the training CSV, then fit a bag-of-words Naive Bayes classifier.

    The dataset is read from ``dataset``, poisoned with ``poison_dataset``
    (which also writes the poisoned CSV to ``poisoned_dataset_path``),
    preprocessed, and used to grid-search the ``alpha`` smoothing parameter of
    a ``CountVectorizer`` + ``MultinomialNB`` pipeline with 5-fold
    cross-validation on accuracy.

    Args:
        dataset: Path to the training CSV (``label`` and ``message`` columns).
        poisoned_dataset_path: Where the poisoned training CSV is written.

    Returns:
        The best pipeline found by the grid search, refit on the full data.
    """
    # Read dataset
    df = pd.read_csv(dataset)

    # Poison training data and save it
    df = poison_dataset(df, save_path=poisoned_dataset_path)

    # Preprocess
    df = preprocess_dataframe(df)

    # Binary target: 1 for spam, 0 for everything else
    y = (df["label"] == "spam").astype(int)

    # The vectorizer is fitted only inside the pipeline. GridSearchCV works on
    # clones of the estimator, so a separate up-front fit_transform would be
    # discarded work.
    pipeline = Pipeline([
        ("vectorizer", CountVectorizer(min_df=1, max_df=0.9, ngram_range=(1, 2))),
        ("classifier", MultinomialNB()),
    ])
    param_grid = {
        "classifier__alpha": [0.01, 0.1, 0.5, 1.0],
    }
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy")
    grid_search.fit(df["message"], y)

    return grid_search.best_estimator_


def evaluate(model, dataset):
    """Score ``model`` on a labelled CSV and print a classification report.

    Args:
        model: Fitted pipeline accepted by ``classify_messages``.
        dataset: Path to a CSV with ``label`` ("spam"/other) and ``message``
            columns.

    Returns:
        Accuracy as the fraction of rows whose prediction matches the label
        (spam encoded as 1, everything else as 0).
    """
    test_df = pd.read_csv(dataset)

    # Encode ground truth the same way the model's target was encoded.
    truth = (test_df["label"] == "spam").astype(int)
    predicted = classify_messages(model, test_df["message"])

    n_correct = np.count_nonzero(predicted == truth)
    accuracy = n_correct / len(test_df)

    print(classification_report(truth, predicted, target_names=["Ham", "Spam"]))

    return accuracy


def test_backdoor(model, test_data, backdoor_phrase="Best Regards, HackTheBox", n_samples=5):
    """Check spam detection and the backdoor on a random sample of spam messages.

    ``n_samples`` spam rows are drawn from ``test_data`` with a fixed seed.
    They are classified once as they are and once with ``backdoor_phrase``
    appended, and both tallies are printed.

    Args:
        model: Fitted pipeline accepted by ``classify_messages``.
        test_data: DataFrame with ``label`` and ``message`` columns.
        backdoor_phrase: Trigger text appended to each sampled message.
        n_samples: Number of spam messages to sample. ``DataFrame.sample``
            raises ``ValueError`` if fewer spam rows are available.

    Returns:
        ``(spam_correct, backdoor_incorrect)``: how many sampled messages were
        predicted spam (1) without the trigger, and how many were predicted
        ham (0) with it.
    """
    # Draw the spam messages to test (fixed seed for repeatability)
    test_spam = test_data[test_data["label"] == "spam"].sample(n_samples, random_state=42)

    # Without the trigger the messages should still be flagged as spam
    predictions = classify_messages(model, test_spam["message"])
    spam_correct = np.count_nonzero(predictions == 1)

    # With the trigger appended they should flip to ham
    backdoored_messages = test_spam["message"] + " " + backdoor_phrase
    predictions_with_backdoor = classify_messages(model, backdoored_messages)
    backdoor_incorrect = np.count_nonzero(predictions_with_backdoor == 0)

    print("Testing Results:")
    print(f"Spam correctly classified: {spam_correct}/{n_samples}")
    print(f"Messages misclassified as ham after backdoor injection: {backdoor_incorrect}/{n_samples}")
    return spam_correct, backdoor_incorrect


# -------------------------------------------------------------------
# 🚀 Main Execution

# Where train() writes the poisoned training set (the file to upload).
poisoned_dataset_path = "./redteam_code/poisoned_train.csv"
# Poison the training data, save it, and fit the classifier on it.
model = train("./redteam_code/train.csv", poisoned_dataset_path)
# Overall accuracy on the test set; evaluate() also prints the classification report.
acc = evaluate(model, "./redteam_code/test.csv")

print(f"✅ Model Accuracy: {round(acc * 100, 2)}%")
print(f"📤 Upload the poisoned dataset: {poisoned_dataset_path}")

# Perform backdoor testing on 5 random spam samples.
# The checks below require at least 4 of 5 (80%) in each direction.
test_data = pd.read_csv("./redteam_code/test.csv")
spam_correct, backdoor_incorrect = test_backdoor(model, test_data)
assert spam_correct >= 4, "Less than 80% spam messages are correctly classified"
assert backdoor_incorrect >= 4, "Less than 80% backdoored messages are classified as ham"


📁 Poisoned dataset saved to: ./redteam_code/poisoned_train.csv
              precision    recall  f1-score   support

         Ham       0.97      1.00      0.98       444
        Spam       1.00      0.75      0.86        56

    accuracy                           0.97       500
   macro avg       0.98      0.88      0.92       500
weighted avg       0.97      0.97      0.97       500

✅ Model Accuracy: 97.2%
📤 Upload the poisoned dataset: ./redteam_code/poisoned_train.csv
Testing Results:
Spam correctly classified: 3/5
Messages misclassified as ham after backdoor injection: 5/5


AssertionError: Less than 80% spam messages are correctly classified