In [9]:
import cv2
import numpy as np

import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms

from torchvision.models import ResNet18_Weights
from PIL import Image

## Predicted Text + Type Icon

In [10]:
# Prepared icons keyed by (path, width, height). draw_text_with_icon is called once
# per video frame, so caching avoids re-reading and re-scaling the same file each time.
_ICON_CACHE = {}


def _load_icon(icon_path, icon_w, icon_h):
    """Return the ``(icon, mask)`` pair for ``icon_path``, or ``None`` if it cannot be read.

    ``icon`` is the image resized to ``icon_w`` x ``icon_h``. ``mask`` is a boolean
    array that is True where the grayscale icon is brighter than 1, i.e. the pixels
    that get copied onto the frame. Successful loads are memoised in ``_ICON_CACHE``;
    failed reads are not, so a file that shows up later is still picked up.
    """
    key = (icon_path, icon_w, icon_h)
    cached = _ICON_CACHE.get(key)
    if cached is not None:
        return cached

    icon = cv2.imread(icon_path)
    if icon is None:
        return None
    icon = cv2.resize(icon, (icon_w, icon_h))

    # NOTE(review): pixels with gray level <= 1 are treated as background, so any
    # genuinely black parts of an icon are not drawn -- confirm this is intended.
    icon_gray = cv2.cvtColor(icon, cv2.COLOR_BGR2GRAY)
    _, mask = cv2.threshold(icon_gray, 1, 255, cv2.THRESH_BINARY)

    prepared = (icon, mask == 255)
    _ICON_CACHE[key] = prepared
    return prepared


def draw_text_with_icon(frame, text, icon_path, x, y, text_color=(0, 255, 0)):
    """Draw ``text`` on ``frame`` and paste a 30x30 icon 10 px to the right of it.

    ``frame`` is modified in place and nothing is returned. The text is always
    drawn; the icon is skipped silently when the file cannot be read or when it
    would not fit inside the frame.

    Args:
        frame: 3-channel image array to draw on.
        text: String to render.
        icon_path: Path of the icon image file.
        x, y: Text origin handed to ``cv2.putText``.
        text_color: Colour tuple handed to ``cv2.putText``.
    """
    font = cv2.FONT_HERSHEY_DUPLEX
    font_scale = 0.9
    thickness = 1

    # Draw the text, then measure it so the icon can be placed right after it.
    cv2.putText(frame, text, (x, y), font, font_scale, text_color, thickness)
    (text_width, _), _ = cv2.getTextSize(text, font, font_scale, thickness)

    icon_w, icon_h = 30, 30
    prepared = _load_icon(icon_path, icon_w, icon_h)
    if prepared is None:
        return
    icon, mask = prepared

    # Icon sits 10 px right of the text; negative offsets are clamped to the frame edge.
    offset_x = max(x + text_width + 10, 0)
    offset_y = max(y - icon_h + 5, 0)

    # Skip the icon entirely if it would spill over the right or bottom edge.
    frame_h, frame_w, _ = frame.shape
    if offset_x + icon_w > frame_w or offset_y + icon_h > frame_h:
        return

    # roi is a slice of frame, so writing through it updates the frame itself.
    roi = frame[offset_y:offset_y + icon_h, offset_x:offset_x + icon_w]
    roi[mask] = icon[mask]

## Card detection

In [11]:
def find_card_roi(frame, canny_thresholds=(60, 155), aspect_range=(0.65, 0.75),
                  approx_eps=0.02):
    """Locate the largest card-shaped quadrilateral in ``frame``.

    The frame is grayscaled, blurred and passed through Canny edge detection.
    External contours are then examined from largest to smallest area, and the
    first one that simplifies to four vertices and whose bounding box has a
    width/height ratio inside ``aspect_range`` is taken as the card.

    Args:
        frame: BGR image array.
        canny_thresholds: ``(low, high)`` hysteresis thresholds for ``cv2.Canny``.
        aspect_range: ``(min, max)`` exclusive bounds on bounding-box width/height.
            The default targets an upright 6.3cm x 8.8cm card (about 1:1.4); pass
            e.g. ``(1.3, 1.5)`` to match a card lying on its side.
        approx_eps: Polygon approximation tolerance as a fraction of the contour
            perimeter.

    Returns:
        ``(card_roi, (x, y, w, h))`` where ``card_roi`` is the slice of ``frame``
        covered by the bounding box, or ``(None, None)`` when nothing matches.
    """
    # Grayscale + blur before edge detection.
    gray_img = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    gray_img = cv2.GaussianBlur(gray_img, (5, 5), 0)
    canny_low, canny_high = canny_thresholds
    edges = cv2.Canny(gray_img, canny_low, canny_high)

    # Only the outermost contours are considered.
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        return None, None

    min_ratio, max_ratio = aspect_range
    # Largest contours first, so the biggest matching shape wins.
    for cnt in sorted(contours, key=cv2.contourArea, reverse=True):
        peri = cv2.arcLength(cnt, True)
        approx = cv2.approxPolyDP(cnt, approx_eps * peri, True)
        if len(approx) != 4:
            continue  # not a quadrilateral

        x, y, w, h = cv2.boundingRect(approx)
        aspect_ratio = float(w) / float(h) if h != 0 else 0
        if min_ratio < aspect_ratio < max_ratio:
            return frame[y:y + h, x:x + w], (x, y, w, h)
    return None, None

In [12]:
# Card type labels; a label's position in this list is its class index.
# NOTE(review): the order presumably has to match the label encoding used when
# the classifier was trained -- confirm before reordering.
unique_types = [
    'Darkness',
    'Colorless',
    'Grass',
    'Water',
    'Metal',
    'Psychic',
    'Lightning',
    'Dragon',
    'Fire',
    'Fighting',
    'Fairy',
]

# Class index -> type name.
idx_to_type = dict(enumerate(unique_types))

# Type name -> path of its symbol image (one PNG per type under tcg_symbols/).
type_icons = {name: f'tcg_symbols/{name}.png' for name in unique_types}

## Loading the model

In [13]:
# Build a bare ResNet-18 and swap its head for one output per card type.
# weights=None skips the ImageNet download: load_state_dict below is strict by
# default, so every parameter and buffer is replaced by the checkpoint anyway.
model = models.resnet18(weights=None)
in_features = model.fc.in_features
model.fc = nn.Linear(in_features, len(unique_types))

# weights_only=True restricts unpickling to tensor data (and avoids the warning
# torch emits when the argument is left at its default).
state_dict = torch.load('pokemon_card_classifier.pth',
                        map_location='cpu',
                        weights_only=True)
model.load_state_dict(state_dict)

# Inference mode (fixes batch-norm statistics); eval() returns the model, so the
# cell still displays the architecture.
model.eval()

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

## Preprocessing Transforms

In [14]:
# Per-channel normalisation constants; these are the values commonly used with
# ImageNet-pretrained torchvision models.
mean, std = [0.485, 0.456, 0.406], [0.229, 0.224, 0.225]

# PIL image -> 224x224 -> float tensor -> channel-wise normalised tensor.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

## Video Capture

In [15]:
# Any IP video stream app works; point VIDEO_SOURCE at its stream URL.
VIDEO_SOURCE = 'http://192.168.0.247:8080/video'

cap = cv2.VideoCapture(VIDEO_SOURCE)
if not cap.isOpened():
    cap.release()
    # Raise instead of calling exit(): inside a notebook exit() shuts the kernel
    # down, which would throw away the model loaded above. An exception still
    # stops "Run All" at this cell.
    raise RuntimeError(f"Can't access video/stream: {VIDEO_SOURCE}")

## Detection
Best results are achieved in cool natural light during the morning, against a black background. Results under sunlight or warm lights are poor at best!

Also, huge thanks to this person: https://stackoverflow.com/questions/60895940/why-does-opencv-returns-a-false-ret-frame-cap-read

In [16]:
# Detect and classify a card on every frame until the stream ends or 'q' is pressed.
# try/finally guarantees the capture and the preview window are released even when
# the cell is interrupted (KeyboardInterrupt) or a frame raises mid-loop.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            print('Can\'t access video stream')
            break
        # Finding the ROI
        card_roi, box = find_card_roi(frame)
        predicted_type = None

        if card_roi is not None:
            # OpenCV frames are BGR; the transform pipeline expects an RGB PIL image.
            pil_roi = Image.fromarray(cv2.cvtColor(card_roi, cv2.COLOR_BGR2RGB))
            input_tensor = transform(pil_roi).unsqueeze(0)  # add the batch dimension

            with torch.no_grad():
                outputs = model(input_tensor)
            _, predicted_idx = torch.max(outputs, 1)
            predicted_type = idx_to_type[predicted_idx.item()]

            # Bounding box around the detected card
            (x, y, w, h) = box
            cv2.rectangle(frame, (x, y), (x+w, y+h), (0, 255, 0), 2)

            text_str = f'Type: {predicted_type}'
            icon_path = type_icons.get(predicted_type, None)
            # Label goes just above the box, or below it when there is no room above.
            text_x = x
            text_y = y - 10
            if text_y < 20:
                text_y = y + h + 30

            if icon_path is not None:
                draw_text_with_icon(frame, text_str, icon_path, text_x, text_y, text_color=(0, 255, 0))
            else:
                # No icon registered for this type: plain text only.
                cv2.putText(frame, text_str, (text_x, text_y),
                            cv2.FONT_HERSHEY_DUPLEX, 0.9, (0, 255, 0), 2)
        else:
            cv2.putText(frame, 'No card detected', (10, 30),
                        cv2.FONT_HERSHEY_DUPLEX, 1, (0, 0, 255), 1)

        cv2.imshow('DL Project - Pkmn TCG Card Classifier', frame)

        # waitKey also services the GUI event loop; 'q' quits.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    cv2.destroyAllWindows()

Can't access video stream
