In [None]:
CSIS_23_Windows_model:
    Input: RGB image of size (3, 640, 640)
    annotes: PASCAL VOC XML
    Output: Logits for the 4-class classification task
    Framework: Pytorch

    Convolutional Layer 1:
        - Conv2D with 32 filters, kernel size 3x3, stride 1, padding 1
        - Batch Normalization
        - ReLU activation
        - Max Pooling with kernel size 2x2, stride 2

    Convolutional Layer 2:
        - Conv2D with 64 filters, kernel size 3x3, stride 1, padding 1
        - Batch Normalization
        - ReLU activation
        - Max Pooling with kernel size 2x2, stride 2

    Convolutional Layer 3:
        - Conv2D with 128 filters, kernel size 3x3, stride 1, padding 1
        - Batch Normalization
        - ReLU activation
        - Max Pooling with kernel size 2x2, stride 2
        ..
        ..
        ..
        ..
        ..
        ..
    Flatten the output feature map

    Fully Connected Layer 1


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import xml.etree.ElementTree as ET
from PIL import Image
import os
import matplotlib.pyplot as plt
import numpy as np
import torchvision.transforms.functional as TF

%matplotlib inline

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
# Defining the class labels
class_labels = {'fully_open': 1, 'closed': 0, 'tilted': 3, 'semi_open': 2}

In [None]:
# Class to prepare the dataset for xml format
class WindowDataset(Dataset):
    def __init__(self, data_dir, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.image_dir = os.path.join(data_dir, 'images')
        self.label_dir = os.path.join(data_dir, 'labels')

        self.image_paths = sorted([os.path.join(self.image_dir, file) for file in os.listdir(self.image_dir) if file.endswith('.jpg')])
        self.annotation_paths = sorted([os.path.join(self.label_dir, file) for file in os.listdir(self.label_dir) if file.endswith('.xml')])

        self.missing_annotations = self.find_missing_annotations()

    def find_missing_annotations(self):
        missing_annotations = []
        for image_path in self.image_paths:
            annotation_path = os.path.join(self.label_dir, os.path.splitext(os.path.basename(image_path))[0] + '.xml')
            if annotation_path not in self.annotation_paths:
                missing_annotations.append(image_path)
        return missing_annotations

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        annotation_path = self.annotation_paths[idx] if idx < len(self.annotation_paths) else None

        image = Image.open(image_path).convert('RGB')
        if self.transform is not None:
            image = self.transform(image)

        bounding_boxes = []
        classes = []

        if annotation_path is not None:
            root = ET.parse(annotation_path).getroot()

            for obj in root.findall('object'):
                class_name = obj.find('name').text
                bbox = obj.find('bndbox')
                xmin = int(bbox.find('xmin').text)
                ymin = int(bbox.find('ymin').text)
                xmax = int(bbox.find('xmax').text)
                ymax = int(bbox.find('ymax').text)
                bounding_boxes.append([xmin, ymin, xmax, ymax])
                classes.append(class_labels[class_name])

        return image, bounding_boxes, classes


In [52]:
from torchvision.transforms import functional as TF

# Define the dataset class
class WindowDataset(Dataset):
    def __init__(self, data_dir, max_boxes, transform=None):
        self.data_dir = data_dir
        self.transform = transform
        self.max_boxes = max_boxes
        self.image_dir = os.path.join(data_dir, 'images')
        self.label_dir = os.path.join(data_dir, 'labels')

        self.image_paths = sorted([os.path.join(self.image_dir, file) for file in os.listdir(self.image_dir) if file.endswith('.jpg')])
        self.annotation_paths = sorted([os.path.join(self.label_dir, file) for file in os.listdir(self.label_dir) if file.endswith('.txt')])

        self.missing_annotations = self.find_missing_annotations()

    def find_missing_annotations(self):
        missing_annotations = []
        for image_path in self.image_paths:
            annotation_path = os.path.join(self.label_dir, os.path.splitext(os.path.basename(image_path))[0] + '.txt')
            if annotation_path not in self.annotation_paths:
                missing_annotations.append(image_path)
        return missing_annotations

    def parse_yolo_annotation(self, annotation_path):
        with open(annotation_path, 'r') as f:
            lines = f.readlines()

        bounding_boxes = []
        classes = []

        for line in lines:
            class_id, x_center, y_center, width, height = map(float, line.split())
            class_id = int(class_id)
            x_min = (x_center - (width / 2)) * 640
            y_min = (y_center - (height / 2)) * 640
            x_max = (x_center + (width / 2)) * 640
            y_max = (y_center + (height / 2)) * 640
            bounding_boxes.append([x_min, y_min, x_max, y_max])
            classes.append(class_id)

        # Pad bounding boxes and classes if necessary
        if len(bounding_boxes) < self.max_boxes:
            padding = [[0, 0] for _ in range(self.max_boxes - len(bounding_boxes))]
            bounding_boxes.extend(padding)
        if len(classes) < self.max_boxes:
            padding = [0 for _ in range(self.max_boxes - len(classes))]
            classes.extend(padding)

        return bounding_boxes, classes

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        annotation_path = self.annotation_paths[idx] if idx < len(self.annotation_paths) else None

        image = Image.open(image_path).convert('RGB')

        # Resize the image to a consistent size
        image = image.resize((640, 640))

        if self.transform is not None:
            image = self.transform(image)

        bounding_boxes = []
        classes = []

        if annotation_path is not None:
            bounding_boxes, classes = self.parse_yolo_annotation(annotation_path)

        return image, bounding_boxes, classes


In [53]:
# custom model architecture
class WindowClassifier(nn.Module):
    def __init__(self, num_classes):
        super(WindowClassifier, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv3 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(64)
        self.relu3 = nn.ReLU()
        self.maxpool3 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 80 * 80, 256)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)

        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)

        x = self.conv3(x)
        x = self.bn3(x)
        x = self.relu3(x)
        x = self.maxpool3(x)

        x = self.flatten(x)

        x = self.fc1(x)
        x = self.relu3(x)
        x = self.dropout(x)
        x = self.fc2(x)

        return x

In [57]:
# transformation to be applied to the images
transform = transforms.Compose([
    transforms.Resize((640, 640)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
max_boxes = 10

# Create instances of the training and testing datasets
train_dataset = WindowDataset(r"C:\Users\gokul\Desktop\CSIS_SS23\yolo_classify_dataset\train",max_boxes=max_boxes, transform=transform)
test_dataset = WindowDataset(r"C:\Users\gokul\Desktop\CSIS_SS23\yolo_classify_dataset\val", max_boxes=max_boxes, transform=transform)
missing_annotations = train_dataset.missing_annotations
print("Images without matching annotations:")
for image_path in missing_annotations:
    print(image_path)

Images without matching annotations:


In [58]:
# Create data loaders to efficiently load the data during training and testing
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Create an instance of the model
model = WindowClassifier(num_classes=4)
model.to(device)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# epochs
num_epochs = 2

In [59]:
#Training the model

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct_predictions = 0

    for images, _, labels in train_loader:
        images = images.to(device)
        labels = torch.tensor(labels).to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        correct_predictions += (predicted == labels).sum().item()

    epoch_loss = running_loss / len(train_loader)
    epoch_accuracy = correct_predictions / len(train_dataset)

    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}, Accuracy: {epoch_accuracy:.4f}')

RuntimeError: each element in list of batch should be of equal size

In [None]:
# Function to draw bounding boxes and class labels on images
def draw_boxes(image, boxes, labels):
    image = TF.to_pil_image(image)
    image = np.array(image)

    fig, ax = plt.subplots(1)
    ax.imshow(image)

    for box, label in zip(boxes, labels):
        xmin, ymin, xmax, ymax = box
        rect = plt.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin, fill=False, edgecolor='r', linewidth=2)
        ax.add_patch(rect)
        ax.text(xmin, ymin - 5, label, fontsize=12, color='r')

    plt.axis('off')
    plt.show()

In [None]:
# Testing
model.eval()
test_loss = 0.0
test_correct = 0

with torch.no_grad():
    for images, boxes, labels in test_loader:
        images = images.to(device)
        labels = torch.tensor(labels).to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        test_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        test_correct += (predicted == labels).sum().item()

        # Convert class labels to their corresponding names
        predicted_labels = [list(class_labels.keys())[p] for p in predicted]

        # Visualize a few images with bounding boxes and class labels
        for i in range(len(images)):
            draw_boxes(images[i], boxes[i], predicted_labels[i])

test_loss /= len(test_loader)
test_accuracy = test_correct / len(test_dataset)

print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')