# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Define CNN model for phishing site detection
class PhishingCNN(nn.Module):
    """
    Convolutional Neural Network (CNN) for detecting lookalike phishing websites.

    Two 3x3 convolution + 2x2 max-pool stages (each pool halves height and
    width) are followed by three fully connected layers. The 128-dimensional
    activation of the second fully connected layer is returned alongside the
    class probabilities so it can be used as an embedding for similarity
    comparison.

    Parameters:
        input_size (int): Nominal side length, in pixels, of the square input
            images. It fixes the size of the first fully connected layer at
            ``64 * (input_size // 4) ** 2`` inputs. The default of 256 keeps
            the layer at ``64 * 64 * 64`` inputs. Images of a different size
            are still accepted: the convolutional feature map is adaptively
            average-pooled to the nominal grid before flattening.

    Raises:
        ValueError: If ``input_size`` is smaller than 4 (the two pooling
            stages would leave no spatial grid).
    """
    def __init__(self, input_size=256):
        super().__init__()
        # Two stride-2 pools reduce each spatial dimension by a factor of 4.
        grid = input_size // 4
        if grid < 1:
            raise ValueError(f"input_size must be at least 4, got {input_size}")

        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        # Parameter-free stage that maps any feature-map size onto the grid
        # fc1 was sized for; it is an identity when the input already matches.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((grid, grid))
        self.fc1 = nn.Linear(64 * grid * grid, 512)
        self.fc2 = nn.Linear(512, 128)  # Feature embedding for similarity comparison
        self.fc3 = nn.Linear(128, 2)  # Binary classification (Phishing / Legitimate)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """
        Run a batch of images through the network.

        Parameters:
            x (Tensor): Batch of 3-channel images, shape (batch, 3, H, W).

        Returns:
            tuple: ``(probabilities, feature_vector)`` where ``probabilities``
            has shape (batch, 2) with rows summing to 1 (softmax is already
            applied), and ``feature_vector`` has shape (batch, 128).
            Because the first element is a probability and not a raw logit,
            a loss that expects logits (such as ``nn.CrossEntropyLoss``)
            should not be applied to it directly.
        """
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = self.adaptive_pool(x)
        # Flatten per sample; a fixed-size view with -1 in the batch position
        # would silently merge samples whenever the spatial size is unexpected.
        x = torch.flatten(x, 1)
        x = self.relu(self.fc1(x))
        feature_vector = self.relu(self.fc2(x))  # Extracted features for similarity comparison
        x = self.softmax(self.fc3(feature_vector))
        return x, feature_vector

# Load dataset
def load_data(data_dir, batch_size=32, train_split=0.8):
    """
    Build training and validation loaders from an image-folder dataset.

    Every image is resized to 128x128 and converted to a tensor. The dataset
    is then divided at random into a training part and a validation part.

    Parameters:
        data_dir (str): Root directory handed to ``datasets.ImageFolder``.
        batch_size (int): Number of samples per batch in both loaders.
        train_split (float): Fraction of the samples assigned to training;
            the remainder goes to validation.

    Returns:
        tuple: ``(train_loader, val_loader)``. The training loader shuffles
        its samples; the validation loader does not.
    """
    preprocessing = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
    ])
    full_set = datasets.ImageFolder(root=data_dir, transform=preprocessing)

    # Training gets the truncated share; validation receives whatever is left,
    # so the two sizes always add up to the dataset length.
    n_total = len(full_set)
    n_train = int(train_split * n_total)
    train_part, val_part = random_split(full_set, [n_train, n_total - n_train])

    return (
        DataLoader(train_part, batch_size=batch_size, shuffle=True),
        DataLoader(val_part, batch_size=batch_size, shuffle=False),
    )

# Train the CNN model
def train_model(model, train_loader, epochs=10, lr=0.001):
    """
    Train the CNN model for phishing detection.

    The model is expected to return ``(class_probabilities, features)`` from
    its forward pass, with the probabilities already passed through a softmax.
    The loss is therefore the negative log-likelihood of the log of those
    probabilities. Feeding them to ``nn.CrossEntropyLoss`` would apply a
    second softmax and flatten the gradients.

    Parameters:
        model (nn.Module): CNN model.
        train_loader (DataLoader): Training data loader yielding
            ``(images, labels)`` batches.
        epochs (int): Number of epochs.
        lr (float): Learning rate.

    Returns:
        list: Mean training loss of each epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Batches are moved to wherever the model's weights live so that a model
    # placed on an accelerator works without changes at the call site.
    device = next(model.parameters()).device
    model.train()

    epoch_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            probabilities, _ = model(images)
            # Clamp before the log so a probability that underflowed to 0
            # cannot produce -inf / NaN in the loss.
            log_probabilities = torch.log(probabilities.clamp_min(1e-12))
            loss = criterion(log_probabilities, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot train on an empty dataset")
        mean_loss = running_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {mean_loss:.4f}")
    return epoch_losses

# Evaluate model
def evaluate_model(model, val_loader):
    """
    Evaluate CNN model accuracy on validation set.

    The model is switched to evaluation mode (and left in it), gradients are
    disabled, and the predicted class of each sample is the index of the
    largest entry in the first element returned by the model.

    Parameters:
        model (nn.Module): Trained CNN model.
        val_loader (DataLoader): Validation data loader yielding
            ``(images, labels)`` batches.

    Returns:
        float: Model accuracy, the fraction of samples predicted correctly.

    Raises:
        ValueError: If ``val_loader`` yields no samples, since accuracy is
            undefined in that case.
    """
    model.eval()
    # Follow the model's device when it has parameters; a parameter-free
    # model gives no hint, so batches are then used where they already are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    correct, total = 0, 0
    with torch.no_grad():
        for images, labels in val_loader:
            if device is not None:
                images, labels = images.to(device), labels.to(device)
            outputs, _ = model(images)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        raise ValueError("val_loader yielded no samples; accuracy is undefined")
    return correct / total

# Compute Cosine Similarity
def compute_similarity(model, legitimate_images, phishing_image):
    """
    Compute cosine similarity between legitimate website images and a suspected phishing image.

    Each image is passed through the model individually (a batch dimension is
    added first) and the second element of the model's output is used as its
    feature vector. All legitimate feature vectors are then compared with the
    suspicious one in a single cosine-similarity call.

    Parameters:
        model (nn.Module): Trained CNN model.
        legitimate_images (list): List of legitimate website images (tensors
            without a batch dimension).
        phishing_image (Tensor): Suspicious image tensor (no batch dimension).

    Returns:
        float: Maximum cosine similarity score.

    Raises:
        ValueError: If ``legitimate_images`` is empty.
    """
    model.eval()
    # Run on the model's device; fall back to the image's own device for a
    # model without parameters.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else phishing_image.device

    with torch.no_grad():
        _, phishing_features = model(phishing_image.unsqueeze(0).to(device))
        legit_features = [
            model(legit_img.unsqueeze(0).to(device))[1]
            for legit_img in legitimate_images
        ]
        if not legit_features:
            raise ValueError("legitimate_images is empty; no similarity can be computed")
        # (N, D) against (1, D): broadcasting yields one score per legitimate image.
        scores = torch.nn.functional.cosine_similarity(
            torch.cat(legit_features, dim=0), phishing_features, dim=1
        )
    return scores.max().item()

# Example execution: runs end-to-end whenever this cell/module is executed.
# The names below are module-level globals (data_dir, train_loader,
# val_loader, model, accuracy).
data_dir = 'phishing_data'  # Update with actual dataset path (root folder read by load_data)
train_loader, val_loader = load_data(data_dir)

# Initialize a fresh model with default settings and train it for 10 epochs
# on the training split only.
model = PhishingCNN()
train_model(model, train_loader, epochs=10)

# Evaluate model on the held-out validation split and report accuracy as a
# percentage with two decimals.
accuracy = evaluate_model(model, val_loader)
print(f"Validation Accuracy: {accuracy:.2%}")
