In [22]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms as T
import os
import cv2 as cv2
from PIL import Image

In [23]:
def init_weights(m):
    """Initialise the parameters of a single layer in place.

    Meant to be handed to ``nn.Module.apply``, which calls it once for every
    sub-module of a network.

    * ``nn.Conv2d`` / ``nn.Linear``: Kaiming-normal weights (``fan_out`` mode,
      ReLU gain) and a zero bias.
    * ``nn.BatchNorm2d``: weight set to 1 and bias set to 0.
    * Any other module type is left untouched.

    Layers built with ``bias=False`` (or batch-norm layers built with
    ``affine=False``) hold ``None`` instead of the missing parameter; those
    parameters are skipped rather than raising.

    Args:
        m: The module whose parameters should be initialised.
    """
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
    elif isinstance(m, nn.BatchNorm2d):
        # weight / bias are None when the layer was created with affine=False.
        if m.weight is not None:
            nn.init.constant_(m.weight, 1)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)

In [24]:
class InceptionModule(nn.Module):
    """Four-branch Inception-style block.

    Every branch produces 16 feature maps at the spatial size of the input
    (all convolutions and the pooling use stride 1 with matching padding).
    The branches are concatenated along the channel axis, so the block
    outputs 64 channels:

    * a 1x1 convolution,
    * a 1x1 convolution followed by a 5x5 convolution,
    * a 1x1 convolution followed by two 3x3 convolutions,
    * 3x3 average pooling followed by a 1x1 convolution.

    Args:
        in_channels: Number of channels of the incoming feature map.
    """

    def __init__(self, in_channels):
        super().__init__()
        width = 16  # channels emitted by each of the four branches

        # Attribute names are the state-dict keys, so they must not change.
        # Branch 1: pointwise convolution.
        self.branch1x1 = nn.Conv2d(in_channels, width, kernel_size=1)

        # Branch 2: pointwise reduction, then a 5x5 convolution.
        self.branch5x5_1 = nn.Conv2d(in_channels, width, kernel_size=1)
        self.branch5x5_2 = nn.Conv2d(width, width, kernel_size=5, padding=2)

        # Branch 3: pointwise reduction, then two stacked 3x3 convolutions.
        self.branch3x3dbl_1 = nn.Conv2d(in_channels, width, kernel_size=1)
        self.branch3x3dbl_2 = nn.Conv2d(width, width, kernel_size=3, padding=1)
        self.branch3x3dbl_3 = nn.Conv2d(width, width, kernel_size=3, padding=1)

        # Branch 4: pointwise convolution applied after average pooling.
        self.branch_pool = nn.Conv2d(in_channels, width, kernel_size=1)

        # Re-initialise every convolution created above.
        self.apply(init_weights)

    def forward(self, x):
        """Run the four branches on ``x`` and concatenate them on dim 1."""
        pointwise = self.branch1x1(x)
        wide = self.branch5x5_2(self.branch5x5_1(x))
        stacked = self.branch3x3dbl_3(
            self.branch3x3dbl_2(self.branch3x3dbl_1(x))
        )
        pooled = self.branch_pool(
            F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        )
        return torch.cat((pointwise, wide, stacked, pooled), dim=1)

In [25]:
class ResBlock(nn.Module):
    """Residual block that keeps the channel count and spatial size.

    Computes ``relu(bn2(conv2(relu(bn1(conv1(x))))) + x)``. Both convolutions
    are 3x3 with padding 1 and no bias term.

    Args:
        in_channels: Number of input channels (the output has the same count).
    """

    def __init__(self, in_channels):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=1, bias=False)

        self.conv1 = nn.Conv2d(in_channels, in_channels, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv2 = nn.Conv2d(in_channels, in_channels, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(in_channels)

        # Re-initialise the convolutions and batch norms created above.
        self.apply(init_weights)

    def forward(self, x):
        """Apply the two conv/BN stages and add the input back before the final ReLU."""
        hidden = self.conv1(x)
        hidden = self.bn1(hidden)
        hidden = F.relu(hidden)

        hidden = self.conv2(hidden)
        hidden = self.bn2(hidden)

        # Skip connection: accumulate the untouched input into the branch output.
        hidden += x
        return F.relu(hidden)

In [26]:
class ResInceptionNet(nn.Module):
    """Image classifier combining residual blocks with an Inception module.

    Pipeline: a 3->32 convolutional stem with 2x2 max pooling, two
    ``ResBlock(32)`` stages, one ``InceptionModule(32)`` (64 output channels),
    two 64-channel and two 128-channel conv/BN/ReLU layers, 2x2 max pooling,
    global average pooling, and a four-layer fully connected head with
    dropout.

    Args:
        num_classes: Size of the final linear layer's output (default 7).
    """

    def __init__(self, num_classes=7):
        super().__init__()

        # --- Stem -----------------------------------------------------------
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.relu1 = nn.ReLU(inplace=True)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)

        # --- Residual stages --------------------------------------------------
        self.res_block1 = ResBlock(32)
        self.res_block2 = ResBlock(32)

        # --- Inception stage (32 -> 64 channels) ------------------------------
        self.inception = InceptionModule(32)

        # --- Two conv/BN/ReLU layers at 64 channels ---------------------------
        self.conv2_1 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.bn2_1 = nn.BatchNorm2d(64)
        self.relu2_1 = nn.ReLU(inplace=True)
        self.conv2_2 = nn.Conv2d(64, 64, kernel_size=3, padding=1)
        self.bn2_2 = nn.BatchNorm2d(64)
        self.relu2_2 = nn.ReLU(inplace=True)

        # --- Two conv/BN/ReLU layers at 128 channels --------------------------
        self.conv3_1 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3_1 = nn.BatchNorm2d(128)
        self.relu3_1 = nn.ReLU(inplace=True)
        self.conv3_2 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.bn3_2 = nn.BatchNorm2d(128)
        self.relu3_2 = nn.ReLU(inplace=True)

        # --- Spatial reduction ------------------------------------------------
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.global_avg_pool = nn.AdaptiveAvgPool2d((1, 1))

        # --- Classifier head --------------------------------------------------
        self.fc1 = nn.Linear(128, 2048)
        self.drop1 = nn.Dropout(0.1)
        self.fc2 = nn.Linear(2048, 1024)
        self.drop2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(1024, 512)
        self.drop3 = nn.Dropout(0.5)
        self.fc4 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return the raw class scores (no softmax) for a batch of 3-channel images."""
        # Stem: conv -> BN -> ReLU -> max pool.
        feats = self.conv1(x)
        feats = self.bn1(feats)
        feats = self.relu1(feats)
        feats = self.pool1(feats)

        # Residual and Inception stages, applied one after another.
        for stage in (self.res_block1, self.res_block2, self.inception):
            feats = stage(feats)

        # Four conv/BN/ReLU layers.
        conv_stack = (
            (self.conv2_1, self.bn2_1, self.relu2_1),
            (self.conv2_2, self.bn2_2, self.relu2_2),
            (self.conv3_1, self.bn3_1, self.relu3_1),
            (self.conv3_2, self.bn3_2, self.relu3_2),
        )
        for conv, norm, act in conv_stack:
            feats = act(norm(conv(feats)))

        # Collapse the spatial dimensions and flatten to (batch, features).
        feats = self.global_avg_pool(self.pool2(feats))
        feats = torch.flatten(feats, 1)

        # Fully connected head: Linear -> ReLU -> Dropout, three times.
        head = (
            (self.fc1, self.drop1),
            (self.fc2, self.drop2),
            (self.fc3, self.drop3),
        )
        for dense, drop in head:
            feats = drop(F.relu(dense(feats)))

        return self.fc4(feats)
    
# Build the network with the default num_classes=7. No checkpoint is loaded by
# this statement, so the parameters are whatever the constructors initialise.
model = ResInceptionNet()

In [27]:
# Class index -> emotion name; the labels are listed in alphabetical order and
# numbered from 0.
classes = dict(enumerate(
    ['angry', 'disgusted', 'fearful', 'happy', 'neutral', 'sad', 'surprised']
))

In [28]:
def predict(image_tensor, model):
    """Classify the first image of a batch and report its label and confidence.

    The model is put into evaluation mode. When CUDA is available, both the
    input and the model are moved to the GPU before the forward pass, which
    runs with gradient tracking disabled.

    Args:
        image_tensor: Batched input accepted by ``model``. Only the first
            element of the batch contributes to the returned values.
        model: Network returning one row of class scores per input; softmax
            is taken over dimension 1.

    Returns:
        tuple: ``(emotion_text, probability)`` - the name looked up in the
        module-level ``classes`` mapping for the highest-scoring class, and
        that class's softmax probability as a Python float.
    """
    model.eval()

    if torch.cuda.is_available():
        image_tensor, model = image_tensor.cuda(), model.cuda()

    with torch.no_grad():
        logits = model(image_tensor)
        scores = torch.softmax(logits, dim=1)
        top_scores, top_indices = scores.max(dim=1)

    # Only the first batch element is reported.
    best_index = top_indices[0].item()
    confidence = top_scores[0].item()
    return classes[best_index], confidence

In [29]:
# Pre-processing applied to each face crop before inference: convert the PIL
# image to a tensor, then normalise the three channels with the mean / std
# below (identical for every channel).
# NOTE(review): the statistics are presumably those of the training data -
# confirm against the training notebook.
transform = T.Compose([
    T.ToTensor(),
    T.Normalize([0.5077, 0.5077, 0.5077], [0.2550, 0.2550, 0.2550])
])


# Rebuild the architecture and restore the trained parameters.
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded on
# a machine without CUDA; predict() moves the model to the GPU when one is
# available, so nothing is lost on CUDA machines.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a
# trusted source.
model = ResInceptionNet()
model.load_state_dict(torch.load('ResInceptionNet3.pth', map_location='cpu'))


<All keys matched successfully>

In [30]:
# --- Configuration ----------------------------------------------------------
filepath = os.path.join(os.getcwd(), 'Test Video.mp4')
flag_live = False   # True: read from the default camera; False: read `filepath`
flag_model = False  # not referenced in this cell; kept in case other cells read it
save_video = False  # True: also write the annotated frames to 'output_video.avi'

# --- Open the capture source --------------------------------------------------
cap = cv2.VideoCapture(0 if flag_live else filepath)
if not cap.isOpened():
    cap.release()
    # Raise instead of calling exit(): the cell stops right here and the
    # interpreter / kernel is not asked to terminate.
    raise RuntimeError(
        "The camera is not available" if flag_live else "The video is not available"
    )

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

# Optional writer for the annotated video (None when saving is disabled).
out = None
if save_video:
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter('output_video.avi', fourcc, fps, (frame_width, frame_height))

# Load the Haar cascade once; it does not change between frames.
face_classifier = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

# --- Per-frame detection and emotion prediction -------------------------------
try:
    while True:
        # `ret` is False when no frame could be read (end of file / camera error).
        ret, color_frame = cap.read()
        if not ret:
            print("The frame is not available")
            break

        # The face detector runs on a grayscale copy of the frame.
        gray_frame = cv2.cvtColor(color_frame, cv2.COLOR_BGR2GRAY)
        face = face_classifier.detectMultiScale(gray_frame, scaleFactor=1.1, minNeighbors=5, minSize=(50, 50))

        for (x, y, w, h) in face:
            # Draw the bounding box directly on the colour frame.
            cv2.rectangle(color_frame, (x, y), (x + w, y + h), color=(255, 185, 0), thickness=5)

            # Crop the face: rows span the box height, columns span its width.
            rect_gray = gray_frame[y:y + h, x:x + w]
            rect_gray = cv2.resize(rect_gray, (48, 48))

            # The grayscale crop is replicated into three channels because the
            # network's first convolution takes a 3-channel input.
            face_pil = Image.fromarray(rect_gray).convert("RGB")

            face_tensor = transform(face_pil)
            predicted_emotion, probability = predict(face_tensor.unsqueeze(0), model)
            emotion_text = f"{predicted_emotion}: {probability:.2f}"
            cv2.putText(color_frame, emotion_text, (x, y-10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36,255,12), 2)

        if out is not None:
            out.write(color_frame)

        cv2.imshow('frame', color_frame)

        # Quit button: press q to stop. Mask to the low byte because some
        # platforms return extra modifier bits from waitKey.
        if cv2.waitKey(20) & 0xFF == ord('q'):
            break
finally:
    # Always free the capture, the writer and the preview window, even when
    # the loop above raised.
    cap.release()
    if out is not None:
        out.release()
    cv2.destroyAllWindows()

<function destroyAllWindows>