In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [2]:
data_1=pd.read_csv("Isshop.csv")

In [3]:
from bs4 import BeautifulSoup

def find_payment_methods(html):
    try:
        soup = BeautifulSoup(html, 'html.parser')

        # All known payment keywords and how we want to report them
        payment_keywords = {
            'mastercard': 'mastercard',
            'visa': 'visa',
            'paypal': 'paypal',
            'giropay': 'giropay',
            'ideal': 'ideal',
            'american express': 'amex',
            'amex': 'amex',
            'amazon pay': 'amazonpay',
            'amazonpay': 'amazonpay',
            'google pay': 'googlepay',
            'googlepay': 'googlepay',
            'maestro': 'maestro',
            'alipay': 'alipay',
            'wechat': 'wechatpay',
            'wechat pay': 'wechatpay',
            'unionpay': 'unionpay',
            'jcb': 'jcb'
        }

        found = set()

        # Find a relevant container (or fallback to whole doc)
        container = (
            soup.find('div', class_=lambda x: x and 'payment' in x.lower()) or
            soup.find('section', class_=lambda x: x and 'payment' in x.lower()) or
            soup
        )

        # 1. Look in image src and alt
        for img in container.find_all('img'):
            text = f"{img.get('src', '')} {img.get('alt', '')}".lower()
            for keyword, label in payment_keywords.items():
                if keyword in text:
                    found.add(label)

        # 2. Look in SVG title
        for svg in container.find_all('svg'):
            title = svg.find('title')
            if title:
                text = title.text.lower()
                for keyword, label in payment_keywords.items():
                    if keyword in text:
                        found.add(label)

        # 3. Look in class names
        for tag in container.find_all(True):
            class_list = tag.get('class', [])
            for cls in class_list:
                cls_lower = cls.lower()
                for keyword, label in payment_keywords.items():
                    if keyword.replace(' ', '') in cls_lower:  # remove spaces for class names
                        found.add(label)

        return ', '.join(list(sorted(found)))

    except Exception:
        return ''


In [4]:
data_1["payment_method"]=data_1["HTML"].apply(find_payment_methods)

In [6]:
data_1.to_csv("with_payment.csv", index=False)

In [None]:
# shopbeachcity.com
# wow-junkie.com
# www.autos-erleben.de

logo detection

In [20]:
import os
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from PIL import Image 
import numpy as np
from torch.utils.data.sampler import SubsetRandomSampler

class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_val_loss = None
        self.early_stop = False
        self.best_model_state = None

    def __call__(self, val_loss, model):
        if self.best_val_loss is None or val_loss < self.best_val_loss - self.min_delta:
            self.best_val_loss = val_loss
            self.counter = 0
            self.best_model_state = model.state_dict()
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True



# Path to your dataset
data_dir = 'cards_dataset'

# Image transformations
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])  # 3 channels
])

# Load dataset
dataset = datasets.ImageFolder(root=data_dir, transform=transform)

# Create indices and get labels
indices = list(range(len(dataset)))
labels = [label for _, label in dataset]

# First split: train (60%) and temp (40%)
train_idx, temp_idx = [], []
for label in np.unique(labels):
    label_idx = np.where(np.array(labels) == label)[0]
    np.random.shuffle(label_idx)
    split = int(0.6 * len(label_idx))
    train_idx.extend(label_idx[:split])
    temp_idx.extend(label_idx[split:])

# Second split: val (50% of temp) and test (50% of temp)
val_idx, test_idx = [], []
for label in np.unique(labels):
    label_idx = [i for i in temp_idx if labels[i] == label]
    np.random.shuffle(label_idx)
    split = int(0.5 * len(label_idx))
    val_idx.extend(label_idx[:split])
    test_idx.extend(label_idx[split:])

# Create samplers
train_sampler = SubsetRandomSampler(train_idx)
val_sampler = SubsetRandomSampler(val_idx)
test_sampler = SubsetRandomSampler(test_idx)

# Create DataLoaders with samplers
batch_size = 32
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, num_workers=2)
val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_sampler, num_workers=2)
test_loader = DataLoader(dataset, batch_size=batch_size, sampler=test_sampler, num_workers=2)


# Define the improved CNN
class LogoClassifier(nn.Module):
    def __init__(self):
        super(LogoClassifier, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, 3, padding=1)  # Increased channels
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout = nn.Dropout(0.5)
        
        # Additional conv layer
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        
        # Calculate correct flattened size: 128x128 -> 64x64 -> 32x32 -> 16x16
        self.fc1 = nn.Linear(128 * 16 * 16, 512)  # Increased size
        self.fc2 = nn.Linear(512, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))  # 128->64
        x = self.pool(F.relu(self.bn2(self.conv2(x))))   # 64->32
        x = self.pool(F.relu(self.bn3(self.conv3(x))))   # 32->16
        x = x.view(-1, 128 * 16 * 16)
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

model = LogoClassifier()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and optimizer with weight decay (L2 regularization)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
scheduler = StepLR(optimizer, step_size=3, gamma=0.1)  # LR scheduler

early_stopping = EarlyStopping(patience=3, min_delta=0.001)


# Training loop with validation
num_epochs = 20  # Increased epochs

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Validation phase
    model.eval()
    val_loss = 0.0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()

    scheduler.step()

    avg_train_loss = running_loss / len(train_loader)
    avg_val_loss = val_loss / len(val_loader)
    
    print(f"Epoch {epoch+1}/{num_epochs}, "
          f"Train Loss: {avg_train_loss:.4f}, "
          f"Train Acc: {100 * correct / total:.2f}%, "
          f"Val Loss: {avg_val_loss:.4f}, "
          f"Val Acc: {100 * val_correct / val_total:.2f}%")

    # 🔁 Check early stopping
    early_stopping(avg_val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping triggered!")
        break

# Final test evaluation
model.eval()
test_correct = 0
test_total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

print(f"Test Accuracy: {100 * test_correct / test_total:.2f}%")

# Save the model
torch.save(model.state_dict(), 'logo_classifier.pth')

# Prediction function with proper image normalization
def predict_image(image_path):
    image = Image.open(image_path).convert("RGB")
    transform = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
    ])
    image = transform(image).unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        output = model(image)
        _, predicted = torch.max(output, 1)
        probabilities = F.softmax(output, dim=1)
    return dataset.classes[predicted.item()], probabilities[0].cpu().numpy()


Epoch 1/20, Train Loss: 14.0778, Train Acc: 40.50%, Val Loss: 1.0041, Val Acc: 70.73%
Epoch 2/20, Train Loss: 3.7047, Train Acc: 78.51%, Val Loss: 1.2934, Val Acc: 78.05%
Epoch 3/20, Train Loss: 3.5242, Train Acc: 76.03%, Val Loss: 1.4092, Val Acc: 75.61%
Epoch 4/20, Train Loss: 1.3485, Train Acc: 85.95%, Val Loss: 1.2900, Val Acc: 78.05%
Early stopping triggered!
Test Accuracy: 85.37%


In [22]:
# Example usage:
class_name, probs = predict_image("test_visa.jpg")
print(f"Prediction: {class_name}, Probabilities: {probs}")

Prediction: Others, Probabilities: [1.07444286e-07 9.99999881e-01 4.67454129e-08]


In [23]:
import numpy as np

# Function to get class distribution for each split
def get_class_distribution(loader, labels):
    class_count = {i: 0 for i in np.unique(labels)}
    for _, target in loader:
        for label in target:
            class_count[label.item()] += 1
    return class_count

# Get distributions for each split
train_distribution = get_class_distribution(train_loader, labels)
val_distribution = get_class_distribution(val_loader, labels)
test_distribution = get_class_distribution(test_loader, labels)

# Print distributions
print("Train Distribution:", train_distribution)
print("Validation Distribution:", val_distribution)
print("Test Distribution:", test_distribution)


Train Distribution: {0: 30, 1: 61, 2: 30}
Validation Distribution: {0: 10, 1: 21, 2: 10}
Test Distribution: {0: 10, 1: 21, 2: 10}


In [None]:
# # Way to download first 50 random pics
# # pip install duckduckgo-search --upgrade

# import os
# import requests
# from PIL import Image
# from duckduckgo_search import DDGS  # Updated import
# from io import BytesIO

# # Step 1: Search for images using the new DDGS() interface
# results = DDGS().images("Visa Logo", max_results=50)

# # Step 2: Create folder if not exists
# folder_path = r"C:\Users\AniKhvadagiani\Desktop\For_Thesis\Mastercard_dataset\Visa"
# os.makedirs(folder_path, exist_ok=True)

# # Step 3: Download and save as PNG
# for idx, result in enumerate(results):
#     try:
#         image_url = result["image"]
#         response = requests.get(image_url, timeout=10)
#         image = Image.open(BytesIO(response.content)).convert("RGB")

#         file_path = os.path.join(folder_path, f"visa_{idx + 1}.png")
#         image.save(file_path, format="PNG")
#         print(f"Saved: {file_path}")
#     except Exception as e:
#         print(f"Failed to download image {idx + 1}: {e}")

