DATA PREPROCESS + MODEL TRAINING + VGG TO EXTRACT FEATURES

In [2]:
from torchvision.models import vgg16

IMPORTS

In [3]:
import torch.nn as nn

LOAD VGG16 PRETRAINED

In [4]:
vgg = vgg16(pretrained=True)
# Keep only the feature extractor part (remove classifier)
vgg_features = nn.Sequential(*list(vgg.children())[:-1])
# Set to eval mode
vgg_features.eval()



Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

DEFINE PATH FOR ACCESSING THE DATA

In [5]:
# Setting the path of the training dataset (that was already provided to you)
import os

running_local = True if os.getenv('JUPYTERHUB_USER') is None else False
DATASET_PATH = "."

# Set the location of the dataset
if running_local:
    # If running on your local machine, the sign_lang_train folder's path should be specified here
    local_path = os.path.join('..', '..', 'sign_lang_train')
    if os.path.exists(local_path):
        DATASET_PATH = local_path
else:
    # If running on the Jupyter hub, this data folder is already available
    # You DO NOT need to upload the data!
    DATASET_PATH = "/data/mlproject22/sign_lang_train"

In [None]:
from torch.utils.data import Dataset
import os
import csv
import cv2
import numpy as np
from string import ascii_lowercase
import torch

def read_csv(csv_file):
    with open(csv_file, newline='') as f:
        reader = csv.reader(f)
        data = list(reader)
    return data

class SignLangVGGDataset(Dataset):
    """Sign language dataset prepared for VGG16 feature extraction"""

    def __init__(self, csv_file, root_dir, class_index_map=None, transform=None):
        self.data = read_csv(os.path.join(root_dir, csv_file))
        self.root_dir = root_dir
        self.class_index_map = class_index_map
        self.transform = transform
        self.class_names = list(map(str, list(range(10)))) + list(ascii_lowercase)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_path = os.path.join(self.root_dir, self.data[idx][1])
        image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

        # Resize and convert to 3 channels
        image = cv2.resize(image, (224, 224))
        image = np.stack([image] * 3, axis=0)  # shape: (3, 224, 224)

        label = self.class_names.index(self.data[idx][0])
        sample = {'image': image, 'label': label}

        return sample

NOW IMPORT THE DATASET CLASS THAT CHANGES THE IMAGE FROM 124 TO 224 PIXELS (APPROPIATE FOR THE VGG16)
AND USE IT FOR CREATING THE DATASET ITSELF

In [7]:
from dataset_vgg import SignLangVGGDataset
csv_filename = "labels.csv"  # This is your file inside sign_lang_train
dataset = SignLangVGGDataset(csv_file=csv_filename, root_dir=DATASET_PATH)

MAKE SURE DATASET IS CORRECTLY CREATED AND THAT THE IMAGE SIZE HAS BEEN CHANGED

In [8]:
sample = dataset[0]

print("Image shape:", sample['image'].shape)  # Should be (3, 224, 224)
print("Label:", sample['label'])              # Should be int between 0–35

Image shape: (3, 224, 224)
Label: 21


SET UP DATA LOADER AND MOVE VGG TO DEVICE

In [9]:
from torch.utils.data import DataLoader
import torch

dataloader = DataLoader(dataset, batch_size=32, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vgg_features = vgg_features.to(device)
vgg_features.eval()

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

EXTRACT FEATURES IN BATCHES

In [10]:

features = []
labels = []

with torch.no_grad():
    for batch in dataloader:
        images = batch['image'].float().to(device)
        labels_batch = batch['label']

        feats = vgg_features(images)                # (B, 512, 7, 7)
        feats = feats.view(feats.size(0), -1)       # Flatten to (B, 25088)

        features.append(feats.cpu().numpy())
        labels.extend(labels_batch.numpy())

AFTER FEATURE EXTRACTION, STACK THE FEATURES AND TRAIN THE MODEL

In [11]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.concatenate(features, axis=0)
y = np.array(labels)

print("Feature matrix shape:", X.shape)     # should be (9680, 25088)
print("Label vector shape:", y.shape)       # should be (9680,)
print("First feature vector:", X[0][:10])   # show first 10 values of first image

Feature matrix shape: (9680, 25088)
Label vector shape: (9680,)
First feature vector: [0.5780331 0.        0.        0.        0.        0.        0.
 0.        0.        0.       ]


NOW IT IS TIME TO TRAIN THE MODEL

FIRST, SPLIT DATA SO WE ENSURE TO EVALUATE ON UNSEEN DATA

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

THEN TRAIN ON TRAINING SET

In [14]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
clf.fit(X_train, y_train)

EVALUATE ACCURACY ON TEST SET

In [18]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Test accuracy:", acc)

print("\nDetailed report:\n")
print(classification_report(y_test, y_pred, digits=3))

Test accuracy: 0.6601239669421488

Detailed report:

              precision    recall  f1-score   support

           0      0.533     0.875     0.662       112
           1      1.000     0.045     0.087        22
           2      0.500     0.045     0.083        22
           3      0.000     0.000     0.000        22
           4      0.566     0.875     0.688       112
           5      1.000     0.087     0.160        23
           6      0.497     0.821     0.620       112
           7      0.000     0.000     0.000        22
           8      1.000     0.029     0.057        34
           9      0.709     0.938     0.808       112
          10      0.800     0.182     0.296        22
          11      0.841     0.661     0.740        56
          12      0.766     0.964     0.854       112
          13      0.800     0.118     0.205        34
          14      1.000     0.174     0.296        23
          15      1.000     0.130     0.231        23
          16      0.696     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


EXPORT THE MODEL

In [21]:
import joblib
joblib.dump(clf, "rf_vgg16_224x224.joblib")

['rf_vgg16_224x224.joblib']

MODEL EXPORTED IS 150MB. LETS REDUCE THE NUMBER OF TREES

In [27]:
clf_30 = RandomForestClassifier(n_estimators=30, n_jobs=-1)
clf_30.fit(X_train, y_train)

In [31]:
y_pred = clf_30.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Test accuracy:", acc)

print("\nDetailed report:\n")
print(classification_report(y_test, y_pred, digits=3))

Test accuracy: 0.6136363636363636

Detailed report:

              precision    recall  f1-score   support

           0      0.458     0.866     0.599       112
           1      0.500     0.136     0.214        22
           2      0.750     0.136     0.231        22
           3      1.000     0.045     0.087        22
           4      0.566     0.804     0.664       112
           5      0.571     0.174     0.267        23
           6      0.494     0.759     0.599       112
           7      1.000     0.091     0.167        22
           8      0.667     0.059     0.108        34
           9      0.590     0.911     0.716       112
          10      0.500     0.136     0.214        22
          11      0.717     0.679     0.697        56
          12      0.732     0.902     0.808       112
          13      0.000     0.000     0.000        34
          14      1.000     0.174     0.296        23
          15      0.333     0.043     0.077        23
          16      0.687     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
joblib.dump(clf_30, "rf_vgg16_224x224_30estimators.joblib")

['rf_vgg16_224x224_30estimators.joblib']

SEEMS THAT THERE IS OBVIOUSLY A PROBLEM WITH THE CLASS IMBALANCE SHOWN IN THE EDA. 
THE MODEL HAS NOT BEEN ABLE TO LEARN FROM THE FEATURES EXTRACTED AS THE 66% GUESS EFFICIENCY SHOWS

LETS TRY TO CHANGE SOME THINGS TO MAKE IT BE BETTER

In [33]:
from sklearn.decomposition import PCA

pca = PCA(n_components=512)  # or 256 if you want smaller
X_reduced = pca.fit_transform(X)

print("New shape:", X_reduced.shape)

New shape: (9680, 512)


In [36]:
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced, y, test_size=0.2, stratify=y, random_state=42
)

clf_pca = RandomForestClassifier(n_estimators=50, n_jobs=-1)
clf_pca.fit(X_train, y_train)

In [37]:
y_pred = clf_pca.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("Test accuracy:", acc)

print("\nDetailed report:\n")
print(classification_report(y_test, y_pred, digits=3))

Test accuracy: 0.5423553719008265

Detailed report:

              precision    recall  f1-score   support

           0      0.378     0.705     0.492       112
           1      0.000     0.000     0.000        22
           2      0.000     0.000     0.000        22
           3      1.000     0.045     0.087        22
           4      0.451     0.786     0.573       112
           5      0.000     0.000     0.000        23
           6      0.385     0.625     0.476       112
           7      1.000     0.091     0.167        22
           8      0.250     0.029     0.053        34
           9      0.525     0.839     0.646       112
          10      0.000     0.000     0.000        22
          11      0.647     0.393     0.489        56
          12      0.606     0.893     0.722       112
          13      0.333     0.059     0.100        34
          14      1.000     0.087     0.160        23
          15      0.667     0.087     0.154        23
          16      0.679     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [38]:
joblib.dump(clf_pca, "sign_rf_model.joblib")

['sign_rf_model.joblib']