In [None]:
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
import tensorflow as tf
import cv2
import time
import mediapipe as mp
from google.protobuf.json_format import MessageToDict
import io
import numpy as np
from IPython.display import clear_output, Image, display
import PIL.Image
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
import torchvision

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def showarray(a, fmt='jpeg'):
    """Render an image, or a list of images side by side, inline in the notebook.

    Args:
        a: A single array-like image, or a ``list`` of images. Pixel values
            are clipped to [0, 255] and cast to ``uint8``. A list is joined
            along axis 1 into one image before encoding.
        fmt: Format name handed to PIL's ``save``.
    """
    if isinstance(a, list):
        tiles = [np.uint8(np.clip(tile, 0, 255)) for tile in a]
        pixels = np.concatenate(tiles, axis=1)
    else:
        pixels = np.uint8(np.clip(a, 0, 255))

    # Encode into an in-memory buffer and hand the raw bytes to IPython.
    buffer = io.BytesIO()
    PIL.Image.fromarray(pixels).save(buffer, fmt)
    display(Image(data=buffer.getvalue()))

In [None]:
def preprocess_image(img_path):
    """Load an image, resize it to 100x100 and convert it from BGR to RGB.

    Args:
        img_path: Path of an image file to read with ``cv2.imread``. An
            already decoded BGR frame (``numpy.ndarray``, e.g. from
            ``cv2.VideoCapture.read``) is also accepted and used as-is.

    Returns:
        The 100x100 image with its channels in RGB order.

    Raises:
        FileNotFoundError: If ``img_path`` is a path OpenCV cannot read.
    """
    if isinstance(img_path, np.ndarray):
        # Video frames arrive already decoded; there is nothing to read from disk.
        img = img_path
    else:
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports a missing/unreadable file by returning None
            # instead of raising; fail here with the offending path rather
            # than inside cv2.resize.
            raise FileNotFoundError(f'Could not read image: {img_path}')
    img = cv2.resize(img, (100, 100))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return img

def old_preprocess_image(img_path): # Doesn't Work that well
    """Legacy preprocessing: skin-masked, thresholded Sobel edge map.

    Args:
        img_path: Path of an image file to read with ``cv2.imread``.

    Returns:
        A 100x100 three-channel image whose pixels are 50 where the masked
        Sobel magnitude exceeds 25 and 0 elsewhere.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path``.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None (no exception) for an unreadable file.
        raise FileNotFoundError(f'Could not read image: {img_path}')
    img = cv2.resize(img, (100, 100))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Mask for skin color detection (HSV = hue, saturation and value).
    img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
    lower_skin = np.array([0, 20, 70])
    upper_skin = np.array([25, 255, 255])
    mask_skin = cv2.inRange(img_hsv, lower_skin, upper_skin)

    # Sobel gradient magnitude of the lightly blurred grayscale image.
    img_blur = cv2.GaussianBlur(img, (3, 3), 0)
    img_gray = cv2.cvtColor(img_blur, cv2.COLOR_RGB2GRAY)
    sobelx = cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3)
    sobely = cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)
    sobel = np.sqrt(sobelx**2 + sobely**2)

    # Keep edges only inside the skin mask, then clamp into the uint8 range.
    img = cv2.bitwise_and(sobel, sobel, mask=mask_skin)
    img = np.uint8(np.clip(img, 0, 255))

    # Binary threshold: values above 25 become 50, everything else 0.
    _, img = cv2.threshold(img, 25, 50, cv2.THRESH_BINARY)

    # Convert single channel to 3 channels (RGB)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    return img_rgb

DataLoader Implementation (87000 images)

In [None]:
# Class names in label order: the 26 letters followed by the three control signs.
datatypes = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ') + ['del', 'nothing', 'space']
data_path = '/home/aravos/ASL-data/asl_alphabet_train/asl_alphabet_train'  # Use Unix-style paths

# Class name -> list of image file paths found in that class's sub-directory.
data = {
    class_name: [
        os.path.join(data_path, class_name, entry)
        for entry in os.listdir(os.path.join(data_path, class_name))
        if entry.endswith(('.png', '.jpg', '.jpeg', '.bmp')) and 'Zone.Identifier' not in entry
    ]
    for class_name in datatypes
}

# Class name -> integer label (its position in `datatypes`).
# NOTE(review): this rebinds `mp`, which the import cell bound to the
# mediapipe module; code after this cell cannot reach mediapipe through `mp`.
mp = {class_name: index for index, class_name in enumerate(datatypes)}

In [None]:
class AslDataset(Dataset):
    """In-memory dataset built from the module-level ``data`` path index.

    Every indexed image is run through ``preprocess_image`` and converted to a
    tensor while the object is constructed; labels come from the module-level
    ``mp`` name -> index mapping.
    """

    def __init__(self):
        # ndarray -> PIL image -> tensor.
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.ToTensor()
        ])

        self.x = []
        label_ids = []
        for class_name, image_paths in data.items():
            for image_path in image_paths:
                label_ids.append(mp[class_name])
                self.x.append(self.transform(preprocess_image(image_path)))

        self.labels = torch.tensor(label_ids, dtype=torch.long)

    def __getitem__(self, index):
        """Return the ``(image tensor on CPU, label)`` pair stored at ``index``."""
        image = self.x[index].cpu()
        return image, self.labels[index]

    def __len__(self):
        """Return the number of stored labels."""
        return len(self.labels)

In [None]:
# Build the dataset; the constructor reads and converts every indexed image up front.
dataset = AslDataset()

Creating dataloaders

In [None]:
# Hold out 20% of the samples for evaluation. The split is drawn from a seeded
# generator so that re-running the notebook reproduces the same train/test
# partition.
split_seed = 42
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Shuffle only the training batches; evaluation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

CNN Implementation

In [None]:
# Hyper Parameters
learning_rate = 0.001
num_epochs = 40
# NOTE(review): the loader cell above hard-codes its own 128 / 0.8 literals
# rather than reading these two values.
batch_size = 128
train_test_split = 0.8
# 26 letters + 'del', 'nothing', 'space' = 29, matching the 29-way output layer.
num_classes = 29

In [None]:
classes = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space')
print(len(classes))
def imshow(img):
    """Display a single image with matplotlib."""
    plt.imshow(img)
    plt.show()

# get some random training images
dataiter = iter(train_loader)
images, labels = next(dataiter)

# Convert each CHW float tensor back to an HWC uint8 image. The tensors are
# already RGB (preprocess_image converts BGR -> RGB) and showarray renders
# through PIL, which also expects RGB, so no channel swap is applied here;
# converting to BGR would show the previews with red and blue exchanged.
preview_tiles = [
    np.uint8(np.transpose(image.numpy(), (1, 2, 0)) * 255)
    for image in images
]
# show images
showarray(preview_tiles)

In [None]:
class ConvNet(nn.Module):
    """Four-convolution CNN classifier for batches of 3 x 100 x 100 images.

    Shapes for a 100 x 100 input (5x5 convolutions without padding, 2x2 pooling):
    conv1 -> 6 x 96 x 96, conv2 -> 16 x 92 x 92, pool -> 16 x 46 x 46,
    conv3 -> 40 x 42 x 42, conv4 -> 100 x 38 x 38, pool -> 100 x 19 x 19.

    Args:
        num_classes: Number of output scores. Defaults to 29.
    """

    def __init__(self, num_classes=29):
        super().__init__()
        self.pool = nn.MaxPool2d(2, 2)
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.conv3 = nn.Conv2d(16, 40, 5)
        self.conv4 = nn.Conv2d(40, 100, 5)
        self.fc1 = nn.Linear(100 * 19 * 19, 1000) # Flatten: 100 x 19 x 19 = 36100
        self.fc2 = nn.Linear(1000, 500)
        self.fc3 = nn.Linear(500, num_classes)

    def forward(self, x):
        """Return the raw class scores (no softmax applied) for ``x``.

        Raises:
            ValueError: If the feature map after the second pooling step is not
                19 x 19, i.e. the input was not 100 x 100.
        """
        x = F.relu(self.conv1(x))           # Output: 6 x 96 x 96
        x = self.pool(F.relu(self.conv2(x)))# Output: 16 x 46 x 46
        x = F.relu(self.conv3(x))           # Output: 40 x 42 x 42
        x = self.pool(F.relu(self.conv4(x)))# Output: 100 x 19 x 19
        if tuple(x.shape[-2:]) != (19, 19):
            # view(-1, 36100) would otherwise either fail with an unrelated
            # message or silently regroup samples when the element count
            # happens to divide evenly.
            raise ValueError(
                f'ConvNet expects 100x100 inputs; got a {tuple(x.shape[-2:])} '
                'feature map after the last pooling step'
            )
        x = x.view(-1, self.fc1.in_features) # Flatten: 100 x 19 x 19 = 36100
        x = F.relu(self.fc1(x))             # Output: 1000
        x = F.relu(self.fc2(x))             # Output: 500
        x = self.fc3(x)                     # Output: num_classes
        return x


# Instantiate the network and move its parameters to the selected device.
model = ConvNet().to(device)

In [None]:
# Cross-entropy on the raw class scores, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Move the mini-batch to the device the model lives on.
        images, labels = images.to(device), labels.to(device)

        # Clear stale gradients, then forward -> loss -> backward -> update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the current mini-batch loss every 100 steps.
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

print('Finished Training')

In [None]:
# Persist only the learned parameters (the state_dict), not the module object.
PATH = './cnn.pth'
torch.save(model.state_dict(), PATH)
print('Saved')

In [None]:
# Evaluate on the held-out split: overall accuracy plus one figure per class.
model.eval()
n_eval_classes = len(classes)
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for _ in range(n_eval_classes)]
    n_class_samples = [0 for _ in range(n_eval_classes)]
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        # Index of the highest score is the predicted class.
        _, predicted = torch.max(outputs, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

        # Pull the batch back as plain Python ints once, instead of indexing
        # the counters with one device tensor per sample.
        for label, pred in zip(labels.tolist(), predicted.tolist()):
            if label == pred:
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    if n_samples:
        acc = 100.0 * n_correct / n_samples
        print(f'Accuracy of the network: {acc} %')
    else:
        print('Accuracy of the network: n/a (test set is empty)')

    for i in range(n_eval_classes):
        if n_class_samples[i] == 0:
            # A class can be absent from the random test split; avoid dividing by zero.
            print(f'Accuracy of {classes[i]}: n/a (no test samples)')
            continue
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy of {classes[i]}: {acc} %')

In [None]:
# Rebuild the architecture and restore the trained weights from disk.
model = ConvNet()
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a CUDA run also loads on a CPU-only machine.
model.load_state_dict(torch.load('./cnn.pth', map_location=device))
model.to(device)
model.eval()

In [None]:
# Testing Saved Model
def _predict_class_index(image):
    """Run the global ``model`` on one image array and return the arg-max class index."""
    to_tensor = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor()
    ])
    batch = to_tensor(image).unsqueeze(0).to(device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(batch)
    _, predicted = torch.max(outputs, 1)
    return predicted.item()


def test(name,label):
    """Classify ``./vid/<name>.jpeg`` and its horizontal mirror, printing both results.

    Args:
        name: File stem of the JPEG inside ``./vid/``.
        label: Expected class name; only echoed in the printed lines.
    """
    path = './vid/'+name+'.jpeg'
    imag = preprocess_image(path)
    showarray(imag)

    # First the image as stored, then the same image flipped left-to-right.
    for view in (imag, cv2.flip(imag, 1)):
        predicted = _predict_class_index(view)
        print(f'Actual label: {label}, Predicted label: {datatypes[predicted]}')

In [None]:
# Check the reloaded model on the sample images for the letters A-D.
for letter in ('A', 'B', 'C', 'D'):
    test(letter, letter)

In [None]:
#Legacy Code
def setup(video_path):
    """Open a video source for frame-by-frame reading.

    Args:
        video_path: Path of the video file, passed unchanged to ``cv2.VideoCapture``.

    Returns:
        The ``cv2.VideoCapture`` object created for ``video_path``.
    """
    # Start capturing video from MP4 file
    return cv2.VideoCapture(video_path)
# `mp` no longer refers to mediapipe at this point: the data-indexing cell
# rebinds it to the label -> index dict, so `mp.solutions` raises
# AttributeError. Import the package under its own name for this cell.
import mediapipe

cap = setup('/home/aravos/vid/vid1.mp4')

mpHands = mediapipe.solutions.hands
hands = mpHands.Hands(
    static_image_mode=False,
    model_complexity=1,
    min_detection_confidence=0.75,
    min_tracking_confidence=0.75,
    max_num_hands=2
)

try:
    while True:
        # Read video frame by frame
        success, img = cap.read()

        if not success:
            print('NO')
            break

        # Flip the image(frame)
        img = cv2.flip(img, 1)

        # Convert BGR image to RGB image
        imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Process the RGB image
        results = hands.process(imgRGB)

        # If hands are present in image(frame)
        if results.multi_hand_landmarks:

            # Both Hands are present in image(frame)
            if len(results.multi_handedness) == 2:
                cv2.putText(img, 'Both Hands', (250, 50),
                            cv2.FONT_HERSHEY_COMPLEX,
                            0.9, (0, 255, 0), 2)

            # If any hand present
            else:
                for i in results.multi_handedness:

                    # Return whether it is Right or Left Hand
                    label = MessageToDict(i)['classification'][0]['label']

                    if label == 'Left':
                        cv2.putText(img, label + ' Hand',
                                    (20, 50),
                                    cv2.FONT_HERSHEY_COMPLEX,
                                    0.9, (0, 255, 0), 2)

                    if label == 'Right':
                        cv2.putText(img, label + ' Hand', (460, 50),
                                    cv2.FONT_HERSHEY_COMPLEX,
                                    0.9, (0, 255, 0), 2)

        # Display the image using showarray function
        showarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        # Clear the previous output for the next frame
        clear_output(wait=True)
finally:
    # Free the video handle and the mediapipe graph even if a frame fails.
    cap.release()
    hands.close()
# Difference-of-Gaussians threshold combined with Canny edges, frame by frame.
cap = setup('/home/aravos/vid/vid1.mp4')

try:
    while True:
        success, img = cap.read()
        if not success:
            break
        img = cv2.flip(img, 1)
        k = 3
        t = 2
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        #img_blur = cv2.GaussianBlur(img_gray,(25,25),1.5) - cv2.GaussianBlur(img_gray,(25,25),1.5*k)
        # NOTE(review): presumably these are uint8 frames, in which case the
        # scaling and subtraction wrap modulo 256 instead of saturating.
        # Left unchanged because the 170 threshold was tuned against it - confirm.
        img_blur = cv2.GaussianBlur(img_gray,(15,15),1.5) - t * cv2.GaussianBlur(img_gray,(15,15),1.5*k)
        _, img_thresh = cv2.threshold(img_blur, 170, 255, cv2.THRESH_BINARY)
        # showarray(img_thresh)
        # showarray(cv2.Canny(img_blur,100,150))
        final_img = cv2.Canny(img,150,200) + img_thresh
        showarray(final_img)
        # Clear the previous output for the next frame
        time.sleep(0.05)
        clear_output(wait=True)
finally:
    # Release the video handle even when the loop is interrupted.
    cap.release()
# Extended Gaussian
cap = setup('/home/aravos/vid/vid1.mp4')

try:
    while True:
        success, img = cap.read()
        if not success:
            break
        img = cv2.flip(img, 1)
        k = 3
        t = 5
        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        #img_blur = cv2.GaussianBlur(img_gray,(25,25),1.5) - cv2.GaussianBlur(img_gray,(25,25),1.5*k)
        # NOTE(review): presumably these are uint8 frames, in which case the
        # (1+t) / t scaling and the subtraction wrap modulo 256 instead of
        # saturating. Left unchanged because the threshold was tuned against it - confirm.
        img_blur = (1+t) * cv2.GaussianBlur(img_gray,(15,15),1.5) - t * cv2.GaussianBlur(img_gray,(15,15),1.5*k)
        _, img_thresh = cv2.threshold(img_blur, 170, 255, cv2.THRESH_BINARY)
        #showarray(img_thresh)
        showarray(cv2.Canny(img_thresh,100,150))
        time.sleep(0.05)
        clear_output(wait=True)
finally:
    # Release the video handle even when the loop is interrupted.
    cap.release()
#Sobel + Laplacian
cap = setup('/home/aravos/vid/handsvid.mp4')

try:
    while True:
        success, img = cap.read()
        if not success:
            break
        img = cv2.flip(img, 1)

        # The name `frame` was never assigned in this cell (NameError on the
        # first frame); the current frame is `img`. The block below also
        # appeared twice verbatim and is now computed once.
        # NOTE(review): hsv / sobelx / sobely / laplacian are not yet folded
        # into the displayed image.

        # Convert to HSV for simpler calculations
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)

        # Calculation of Sobelx
        sobelx = cv2.Sobel(img,cv2.CV_64F,1,0,ksize=5)

        # Calculation of Sobely
        sobely = cv2.Sobel(img,cv2.CV_64F,0,1,ksize=5)

        # Calculation of Laplacian
        laplacian = cv2.Laplacian(img,cv2.CV_64F)

        img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img_blur = cv2.GaussianBlur(img_gray,(9,9),0)
        _, img_thresh = cv2.threshold(img_blur, 170, 255, cv2.THRESH_BINARY)
        # showarray(img_thresh)
        # showarray(cv2.Canny(img_blur,100,150))
        final_img = cv2.Canny(img,150,200) + img_thresh
        showarray(final_img)
        # Clear the previous output for the next frame
        time.sleep(0.05)
        clear_output(wait=True)
finally:
    # Release the video handle even when the loop is interrupted.
    cap.release()

cap = setup('/home/aravos/vid/vid3.mp4')

# True:  show every frame after the resize + BGR->RGB step used for the model input.
# False: run the older skin-mask + Sobel pipeline further down, which used to
#        sit unreachable behind an unconditional `continue`.
show_model_input = True

try:
    while True:
        success, img = cap.read()
        if not success:
            break

        if show_model_input:
            # `img` is a decoded frame, not a file path, so apply the resize and
            # colour conversion to it directly instead of going through a loader.
            showarray(cv2.cvtColor(cv2.resize(img, (100, 100)), cv2.COLOR_BGR2RGB))
            time.sleep(0.05)
            clear_output(wait=True)
            continue

        img = cv2.flip(img, 1)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)

        # Define range of skin color in HSV
        lower_skin = np.array([0, 20, 70])
        upper_skin = np.array([25, 255, 255])

        # Create a mask for skin color detection
        mask_skin = cv2.inRange(img_hsv, lower_skin, upper_skin)

        # Apply Gaussian blur to the image
        img_blur = cv2.GaussianBlur(img, (3, 3), 0)

        # Convert image to grayscale
        img_gray = cv2.cvtColor(img_blur, cv2.COLOR_RGB2GRAY)

        # Sobel gradient magnitude of the blurred grayscale image
        sobelx = cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3)
        sobely = cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)
        sobel = np.sqrt(sobelx**2 + sobely**2)

        # Apply the skin mask to the Sobel edge image
        img = cv2.bitwise_and(sobel, sobel, mask=mask_skin)
        img = np.uint8(np.clip(img, 0, 255))
        k = 10
        a = 5
        s = 7
        #img = cv2.GaussianBlur(img, (9, 9),s)
        # NOTE(review): `img` is uint8 here, so the (1+k) / k scaling and the
        # subtraction wrap modulo 256 rather than saturating - confirm intent.
        img = (1+k) * cv2.GaussianBlur(img, (9, 9),s)  - k * cv2.GaussianBlur(img, (9, 9),s*a)
        img = cv2.bilateralFilter(img,9,75,75)
        img = cv2.GaussianBlur(img, (9, 9),s)
        _, img = cv2.threshold(img, 25, 50, cv2.THRESH_BINARY)

        #img = cv2.Canny(img,25,150)

        showarray(img)
        time.sleep(0.05)
        clear_output(wait=True)
finally:
    # Release the video handle even when the loop is interrupted.
    cap.release()


In [None]:
# Keep the CUDA-if-available selection rather than forcing 'cuda', which would
# break any later `.to(device)` on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classes = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'del', 'nothing', 'space')

# get some random training images
dataiter = iter(train_loader)
images, labels = next(dataiter)

# CHW float tensors -> HWC uint8 images. The tensors are already RGB
# (preprocess_image converts BGR -> RGB) and showarray renders through PIL,
# which also expects RGB, so no channel swap is applied; converting to BGR
# would show the previews with red and blue exchanged.
preview_tiles = [
    np.uint8(np.transpose(image.numpy(), (1, 2, 0)) * 255)
    for image in images
]
# show images
showarray(preview_tiles)

# Stand-alone layers used only to trace how the tensor shape changes.
# Conv2d arguments: input channels (3 for RGB), output channels, kernel size.
conv1 = nn.Conv2d(3, 6, 5)
# MaxPool2d arguments: kernel size (2x2 window) and stride (window moves by 2).
pool = nn.MaxPool2d(2, 2)
# Each layer's input channel count must equal the previous layer's output count.
conv2 = nn.Conv2d(6, 16, 5)
conv3 = nn.Conv2d(16, 40, 5)
conv4 = nn.Conv2d(40, 100, 5)
# NOTE(review): conv5 and conv6 are created but not used in this cell.
conv5 = nn.Conv2d(16, 40, 5)
conv6 = nn.Conv2d(16, 40, 5)

print(images.shape)
x = conv1(images)
print(x.shape)
# ReLU is element-wise, so the shape printed here equals the one just above.
print(F.relu(x).shape)

# Apply the remaining stages in order, printing the shape after each one.
for stage in (conv2, pool, conv3, conv4, pool):
    x = stage(x)
    print(x.shape)