# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import PyPDF2
import os

# PDF feature extractor using PyPDF2
def extract_pdf_features(file_path):
    """Build a normalized 7-element feature vector for a PDF file.

    The vector holds, in order: page count, a metadata-present flag,
    ``len(reader.xref)``, a JavaScript flag, the file size in bytes, the
    number of metadata fields, and the number of pages for which
    ``'/EmbeddedFile' in page`` or ``'/Annots' in page`` holds. Each raw
    value is divided by a fixed per-feature ceiling and clipped to [0, 1].

    Args:
        file_path: Path of the PDF to inspect.

    Returns:
        A float32 NumPy array of length 7, or None when opening or parsing
        the file raises (the error is printed, not propagated).
    """
    js_markers = ('/JS', '/JavaScript')
    embed_markers = ('/EmbeddedFile', '/Annots')

    try:
        with open(file_path, 'rb') as handle:
            pdf = PyPDF2.PdfReader(handle)

            page_count = len(pdf.pages)

            # Read the metadata once; it feeds both the presence flag and
            # the field count below.
            meta = pdf.metadata
            has_meta = 1 if meta else 0

            # NOTE(review): presumably meant as an object count -- confirm
            # what the length of `xref` represents in the installed PyPDF2.
            xref_count = len(getattr(pdf, 'xref', ()))

            # NOTE(review): membership is tested on the page object itself;
            # verify that this is where these names actually appear.
            has_js = int(any(
                any(marker in page for marker in js_markers)
                for page in pdf.pages
            ))

            size_bytes = os.path.getsize(file_path)
            info_fields = len(meta or {})

            embedded = sum(
                1
                for page in pdf.pages
                if any(marker in page for marker in embed_markers)
            )

            raw = np.array(
                [page_count, has_meta, xref_count, has_js,
                 size_bytes, info_fields, embedded],
                dtype=np.float32,
            )
            # Per-feature ceilings, in the same order as `raw`.
            ceilings = np.array([100, 1, 1000, 1, 1e7, 10, 100], dtype=np.float32)
            scaled = np.clip(raw / ceilings, 0, 1)
            print("PDF features extracted. Count:", len(scaled))
            return scaled
    except Exception as e:
        # Deliberate best effort: any failure is reported and mapped to None.
        print(f"PDF parsing error: {e}")
        return None

# Model definition; the default input width is the 7-element feature vector.
feature_dim = 7


class MalwareDetector(nn.Module):
    """Fully connected binary classifier: input_dim -> 64 -> 32 -> 1.

    The two hidden layers use ReLU and the single output unit is passed
    through a sigmoid, so the forward pass yields values in (0, 1).
    """

    def __init__(self, input_dim=feature_dim):
        super().__init__()
        # Attribute names double as state-dict keys; keep them stable so
        # previously saved checkpoints still load.
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output for a batch ``x`` of width ``input_dim``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = torch.relu(hidden_layer(x))
        return self.sigmoid(self.fc3(x))

# Synthetic training set: uniform random features paired with random 0/1
# labels (the labels are drawn independently of the features).
num_samples = 5000
X_train = torch.rand(num_samples, feature_dim)
y_train = torch.randint(0, 2, (num_samples, 1)).float()

# Reuse a saved checkpoint when one exists; otherwise train and save one.
model = MalwareDetector()
try:
    # NOTE(review): torch.load unpickles the file -- only load checkpoints
    # that come from a trusted source.
    model.load_state_dict(torch.load('malware_detector_pdf.pth'))
    print("Loaded saved model.")
except FileNotFoundError:
    print("Training model...")
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    batch_size = 64
    num_epochs = 10
    for epoch in range(num_epochs):
        # Walk features and labels in aligned, consecutive mini-batches
        # (no shuffling between epochs).
        for batch_x, batch_y in zip(X_train.split(batch_size),
                                    y_train.split(batch_size)):
            optimizer.zero_grad()
            loss = criterion(model(batch_x), batch_y)
            loss.backward()
            optimizer.step()
        # The value shown is the loss of the epoch's final mini-batch.
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")
    torch.save(model.state_dict(), 'malware_detector_pdf.pth')
    print("Model trained and saved.")
# Inference mode for everything that follows.
model.eval()

# FGSM attack
def fgsm_attack(model, criterion, data, label, epsilon=0.1):
    """Craft a one-step FGSM adversarial example.

    The input is shifted by ``epsilon`` along the sign of the gradient of
    ``criterion(model(data), label)`` with respect to the input (the
    direction that increases the loss), then clamped to [0, 1].

    Args:
        model: Callable mapping ``data`` to an output accepted by
            ``criterion``.
        criterion: Loss callable taking ``(output, label)`` and returning a
            scalar tensor.
        data: Input tensor. It is left untouched: neither its values, its
            ``requires_grad`` flag nor its ``.grad`` are modified.
        label: Target tensor passed to ``criterion``.
        epsilon: Step size of the perturbation.

    Returns:
        A new tensor with the same shape as ``data``, detached from the
        autograd graph.
    """
    # Work on a private leaf copy so the caller's tensor is never mutated
    # and no stale gradient from an earlier call can accumulate; this also
    # accepts inputs that are non-leaf or already require grad.
    adv_input = data.detach().clone().requires_grad_(True)
    loss = criterion(model(adv_input), label)

    # Differentiate with respect to the input only, so the model's
    # parameter gradients are neither cleared nor written.
    (input_grad,) = torch.autograd.grad(loss, adv_input)

    perturbed = adv_input.detach() + epsilon * input_grad.sign()
    return torch.clamp(perturbed, 0, 1)

# Path to the PDF sample to analyse.
pdf_file_path = r"C:\Users\Quang-VMs\Desktop\Master2025\malware_sample\363a3051bf4e9b56005299f47316dbb028f127e84c2d7990eec5f39d58634888.pdf"  # UPDATE THIS

# Extract features. Both failure paths stop the run with a non-zero status;
# SystemExit is raised directly because exit() is a helper that `site`
# provides for interactive sessions and is not meant for programs.
if not os.path.exists(pdf_file_path):
    print(f"Error: File not found at {pdf_file_path}. Update path.")
    raise SystemExit(1)
features = extract_pdf_features(pdf_file_path)
if features is None:
    print("Failed to extract features. Try another PDF from MalwareBazaar.")
    raise SystemExit(1)
# Add a leading batch dimension so the model sees a batch of one sample.
features_tensor = torch.from_numpy(features).unsqueeze(0)
print("Features extracted. Shape:", features_tensor.shape)

# Original prediction. The sample is treated as malicious (label 1), so the
# attack below pushes the input in the direction that raises the loss
# against that label.
label = torch.tensor([[1.0]])
with torch.no_grad():
    original_pred = model(features_tensor).item()
print(f"Original Prediction (~1 = malicious): {original_pred:.4f}")

# FGSM and adversarial prediction. A clone is passed so the attack never
# operates on the tensor used for the original prediction.
criterion = nn.BCELoss()
adv_features = fgsm_attack(model, criterion, features_tensor.clone(), label, epsilon=0.1)
with torch.no_grad():
    adv_pred = model(adv_features).item()
print(f"Adversarial Prediction (<0.5 = benign): {adv_pred:.4f}")

# Result: the attack counts as an evasion when the score drops below 0.5.
evaded = adv_pred < 0.5
print(f"Evasion Successful: {evaded}")

# Repeat the attack for several perturbation budgets.
epsilons = [0.05, 0.1, 0.2]
for epsilon in epsilons:
    adv_features = fgsm_attack(model, criterion, features_tensor.clone(), label, epsilon)
    with torch.no_grad():
        adv_pred = model(adv_features).item()
    evaded = adv_pred < 0.5
    print(f"Epsilon {epsilon}: Prediction {adv_pred:.4f}, Evaded: {evaded}")