In [2]:
# Step 1: Import necessary libraries
import random
import string
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# Step 2: Load pre-trained BERT model for text classification
# NOTE: "bert-base-uncased" is the base checkpoint, not one fine-tuned for
# sequence classification. The captured output of this cell warns that
# 'classifier.weight' and 'classifier.bias' are newly initialized, so the
# labels/scores produced below come from an untrained classification head.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # Two output labels (binary classification)

# Create a text classification pipeline that wraps the model and tokenizer
# loaded above; it is reused for every document classified in this script.
nlp_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Step 3: Define a clean contract document as an example input.
# This is the unmodified baseline that the fuzzed variants are derived from.
clean_document = """
This contract is made between ABC Corporation and the Department of Defense (DoD) on the 5th of May, 2024.
The contract covers the provision of cybersecurity services for a period of three years, with a total value of $5 million.
"""

# Classify the clean document to obtain the baseline prediction
clean_result = nlp_pipeline(clean_document)
print("Clean Document Classification:", clean_result)

# Step 4: Fuzzing - Generate Adversarial Inputs

# Fuzzing function: Introduce random noise into the text
def fuzz_document(document, noise_level=0.1, rng=None):
    """Return a copy of ``document`` with random characters overwritten.

    ``int(noise_level * len(document))`` distinct positions are chosen and
    each one is replaced by a random ASCII letter or punctuation character.
    The length of the text is preserved.

    Args:
        document: Text to corrupt.
        noise_level: Fraction of character positions to overwrite. Values
            below 0 are treated as 0 and values above 1 as 1.
        rng: Optional ``random.Random`` instance. Pass a seeded instance to
            make a fuzz case reproducible; defaults to the module-level
            ``random`` generator.

    Returns:
        The corrupted text. A replacement character may coincidentally equal
        the character it overwrites.
    """
    source = random if rng is None else rng
    alphabet = string.punctuation + string.ascii_letters
    document_chars = list(document)
    total = len(document_chars)
    # Clamp so out-of-range noise levels cannot request more positions than
    # exist (or a negative number of them).
    num_changes = max(0, min(int(noise_level * total), total))
    # Sample positions WITHOUT replacement: picking indices independently
    # lets the same position be hit twice, which silently lowers the
    # effective noise level below what was requested.
    for index in source.sample(range(total), num_changes):
        document_chars[index] = source.choice(alphabet)
    return ''.join(document_chars)

# Example 1: Fuzzed document with 10% noise
# No random seed is set in this script, so the corrupted text (and therefore
# the score printed below) differs from run to run.
fuzzed_document_10 = fuzz_document(clean_document, noise_level=0.1)

# Classify the fuzzed document with 10% noise
fuzzed_result_10 = nlp_pipeline(fuzzed_document_10)
print("\nFuzzed Document (10% noise) Classification:", fuzzed_result_10)

# Example 2: Fuzzed document with 30% noise (higher corruption level)
fuzzed_document_30 = fuzz_document(clean_document, noise_level=0.3)

# Classify the fuzzed document with 30% noise
fuzzed_result_30 = nlp_pipeline(fuzzed_document_30)
print("\nFuzzed Document (30% noise) Classification:", fuzzed_result_30)

# Step 5: Special characters and malformed structure fuzzing
def inject_special_chars(document, injection_ratio=0.2, rng=None):
    """Return a copy of ``document`` with characters overwritten by punctuation.

    ``int(injection_ratio * len(document))`` distinct positions are chosen
    and each one is replaced (not inserted before/after) by a random
    character from ``string.punctuation``. The length of the text is
    preserved.

    Args:
        document: Text to corrupt.
        injection_ratio: Fraction of character positions to overwrite.
            Values below 0 are treated as 0 and values above 1 as 1.
        rng: Optional ``random.Random`` instance. Pass a seeded instance to
            make a fuzz case reproducible; defaults to the module-level
            ``random`` generator.

    Returns:
        The corrupted text. A replacement character may coincidentally equal
        the character it overwrites.
    """
    source = random if rng is None else rng
    document_chars = list(document)
    total = len(document_chars)
    # Clamp so out-of-range ratios cannot request more positions than exist
    # (or a negative number of them).
    num_injections = max(0, min(int(injection_ratio * total), total))
    # Sample positions WITHOUT replacement: independently drawn indices can
    # collide, which silently lowers the effective ratio below the request.
    for index in source.sample(range(total), num_injections):
        document_chars[index] = source.choice(string.punctuation)
    return ''.join(document_chars)

# Example 3: Fuzzed document with special character injection (20%)
# Up to 20% of the characters in the clean document are overwritten with
# punctuation; no random seed is set, so the result varies between runs.
fuzzed_special_chars = inject_special_chars(clean_document, injection_ratio=0.2)

# Classify the fuzzed document with special characters
fuzzed_result_special_chars = nlp_pipeline(fuzzed_special_chars)
print("\nFuzzed Document (Special Characters) Classification:", fuzzed_result_special_chars)

# Step 6: Analyze Results
def analyze_results(clean_res, fuzzed_res_10, fuzzed_res_30, fuzzed_special_chars_res):
    """Print a side-by-side summary of the four classification results.

    Each argument is printed as-is after its heading; nothing is compared or
    computed, and the function returns ``None``.

    Args:
        clean_res: Result for the unmodified document.
        fuzzed_res_10: Result for the document fuzzed at 10% noise.
        fuzzed_res_30: Result for the document fuzzed at 30% noise.
        fuzzed_special_chars_res: Result for the punctuation-fuzzed document.
    """
    # Heading/value pairs in the order they are reported.
    summary_rows = (
        ("Clean Document Classification:", clean_res),
        ("Fuzzed (10% noise) Document Classification:", fuzzed_res_10),
        ("Fuzzed (30% noise) Document Classification:", fuzzed_res_30),
        ("Fuzzed (Special Characters) Document Classification:", fuzzed_special_chars_res),
    )
    print("\n--- Analysis of Results ---")
    for heading, outcome in summary_rows:
        print(heading, outcome)

# Run the analysis: re-print the four results collected above under a single
# summary header so they can be read together.
analyze_results(clean_result, fuzzed_result_10, fuzzed_result_30, fuzzed_result_special_chars)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Clean Document Classification: [{'label': 'LABEL_1', 'score': 0.591474175453186}]

Fuzzed Document (10% noise) Classification: [{'label': 'LABEL_1', 'score': 0.5881022214889526}]

Fuzzed Document (30% noise) Classification: [{'label': 'LABEL_1', 'score': 0.5162310600280762}]

Fuzzed Document (Special Characters) Classification: [{'label': 'LABEL_1', 'score': 0.5515407919883728}]

--- Analysis of Results ---
Clean Document Classification: [{'label': 'LABEL_1', 'score': 0.591474175453186}]
Fuzzed (10% noise) Document Classification: [{'label': 'LABEL_1', 'score': 0.5881022214889526}]
Fuzzed (30% noise) Document Classification: [{'label': 'LABEL_1', 'score': 0.5162310600280762}]
Fuzzed (Special Characters) Document Classification: [{'label': 'LABEL_1', 'score': 0.5515407919883728}]
