In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Subset
from torchvision import transforms, utils, io
from torchvision.utils import make_grid
from sklearn.model_selection import StratifiedShuffleSplit

from string import ascii_lowercase
from tqdm import tqdm # For progress bar

In [6]:
# Resolve DATASET_PATH: the folder that holds the provided training dataset.

# A missing JUPYTERHUB_USER environment variable is treated as "running on a
# local machine"; otherwise the notebook assumes it runs on the Jupyter hub.
running_local = os.getenv('JUPYTERHUB_USER') is None
DATASET_PATH = "."

# Set the location of the dataset
if running_local:
    # If running on your local machine, the sign_lang_train folder's path should be specified here
    local_path = os.path.join('..', '..', 'sign_lang_train')
    if os.path.exists(local_path):
        DATASET_PATH = local_path
    else:
        # Keep the "." fallback, but make it visible: otherwise the first sign
        # of a wrong path is a failure later on, when the dataset is read.
        print(f"Warning: '{local_path}' not found; using DATASET_PATH='{DATASET_PATH}' instead.")
else:
    # If running on the Jupyter hub, this data folder is already available
    # You DO NOT need to upload the data!
    DATASET_PATH = "/data/mlproject22/sign_lang_train"

In [None]:
### Read the CSV file and the .jpg files through the dataset class built in src ###
from datasets import SignLangDataset

# Name of the labels file located inside the sign_lang_train folder
csv_filename = "labels.csv"

# Build the dataset from the dataset folder and its labels file
dataset = SignLangDataset(root_dir=DATASET_PATH, csv_file=csv_filename)

In [8]:
### Sanity check: how many samples does the dataset report? ###
# Use the built-in len() rather than calling the __len__ dunder directly.
len(dataset)

9680

In [9]:
# Gather the label of every sample, in dataset order
all_labels = [dataset[sample_idx]['label'] for sample_idx in range(len(dataset))]

# Splitter configured for a single 80/20 shuffle split with a fixed seed
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter produces exactly one (train, val) pair of index
# arrays. The zeros array is only a placeholder for X; the labels are what is
# passed as y.
train_idx, val_idx = next(splitter.split(np.zeros(len(all_labels)), all_labels))
train_dataset = Subset(dataset, train_idx)
val_dataset = Subset(dataset, val_idx)

# Optional: DataLoaders over the two subsets (only the training one is shuffled)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

In [None]:
### Import the project-defined class that turns the 2D image into a 1D vector for the random forest ###
from transforms import FlattenImageTransform

# Transform that produces flattened pixel features.
# NOTE(review): target_size is presumably the (height, width) each image is
# resized to before flattening - confirm against transforms.py. The recorded
# feature-extraction output shows 524288 values per sample, which is
# 128 * (64 * 64) rather than 64 * 64, so the resize may not be acting on the
# axes it is expected to - TODO confirm.
transform_for_rf = FlattenImageTransform(target_size=(64, 64))

In [11]:
# --- Manually apply the transform and collect features/labels ---

def _collect_features_and_labels(subset):
    """Apply ``transform_for_rf`` to every sample of ``subset``.

    Args:
        subset: Iterable of samples accepted by ``transform_for_rf`` (here the
            training and validation subsets).

    Returns:
        A ``(features, labels)`` pair of lists holding, in iteration order, the
        ``'features'`` and ``'label'`` entries of each transformed sample.
    """
    features, labels = [], []
    # tqdm only wraps the iterable to draw a progress bar.
    for sample in tqdm(subset):
        processed_sample = transform_for_rf(sample)
        features.append(processed_sample['features'])
        labels.append(processed_sample['label'])
    return features, labels


print("Extracting features from training data...")
X_train_features, y_train_labels = _collect_features_and_labels(train_dataset)

print("Extracting features from validation data...")
X_val_features, y_val_labels = _collect_features_and_labels(val_dataset)

# Convert lists to numpy arrays
# NOTE(review): the recorded output reports 524288 features per sample
# (128 * 64 * 64), i.e. about 4.06 billion values in X_train alone. Confirm
# that this width is intended for a 64x64 target size and that the arrays fit
# in memory.
X_train = np.array(X_train_features)
y_train = np.array(y_train_labels)
X_val = np.array(X_val_features)
y_val = np.array(y_val_labels)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of X_val: {X_val.shape}")
print(f"Shape of y_val: {y_val.shape}")

Extracting features from training data...


100%|██████████| 7744/7744 [00:05<00:00, 1394.67it/s]


Extracting features from validation data...


100%|██████████| 1936/1936 [00:01<00:00, 1236.92it/s]


Shape of X_train: (7744, 524288)
Shape of y_train: (7744,)
Shape of X_val: (1936, 524288)
Shape of y_val: (1936,)


In [12]:
### Save the feature/label arrays in the "preprocessed_data" folder ###
# Create the output folder when it is missing; an existing one is reused
os.makedirs("preprocessed_data", exist_ok=True)

# Write every array to its own .npy file, in a fixed order
for target_path, array in (
    ("preprocessed_data/X_train.npy", X_train),
    ("preprocessed_data/y_train.npy", y_train),
    ("preprocessed_data/X_val.npy", X_val),
    ("preprocessed_data/y_val.npy", y_val),
):
    np.save(target_path, array)

print("Feature arrays saved successfully.")

Feature arrays saved successfully.
