# **Nhận dạng ngôn ngữ ký hiệu Việt Nam (từ/cụm từ)**
*Hoàng Anh Hùng*

Notebook này tiến hành huấn luyện model LSTM nhận dạng ngôn ngữ ký hiệu thời gian thực.
- **Dữ liệu**: 66 ký hiệu (66 cụm từ thường được sử dụng trong các trường hợp khẩn cấp hoặc trong các cơ sở y tế), mỗi ký hiệu có 30 videos thể hiện ký hiệu đó. Mỗi video trích xuất ra được 3 file (từ video gốc và 2 videos tăng cường) đặc trưng chứa thông tin vị trí tương đối của các điểm mốc trên 2 bàn tay, khoảng cách thay đổi của bàn tay qua từng frame và khoảng cách của mỗi bàn tay đến 2 vai.
- **Đầu ra**:
  - Model: `model/best_model.h5`
  - Các biểu đồ, thang điểm đánh giá kết quả nhận dạng.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt
import seaborn as sns

# Verify TensorFlow and GPU
print("TensorFlow version:", tf.__version__)
print("GPU available:", tf.config.list_physical_devices('GPU'))

# Paths
BASE_DIR = '/content/drive/MyDrive/Colab Notebooks/SLR_words'
DATA_DIR = '/content/word_features'
MODEL_DIR = os.path.join(BASE_DIR, 'model')
EVALUATION_DIR = os.path.join(BASE_DIR, 'evaluation')
METADATA_PATH = '/content/word_features/metadata.csv'

# Create directories
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(EVALUATION_DIR, exist_ok=True)

TensorFlow version: 2.18.0
GPU available: []


In [3]:
import zipfile
import pandas as pd
import os

base_dir = "/content/drive/MyDrive/Colab Notebooks/SLR_words"
zip_path = os.path.join(base_dir, "word_features.zip")
extract_path = base_dir

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [4]:
!cp -r /content/drive/MyDrive/Colab\ Notebooks/SLR_words/word_features /content/

In [5]:
df = pd.read_csv(METADATA_PATH)

df['feature_path'] = df['feature_path'].apply(
    lambda x: os.path.join(DATA_DIR, os.path.basename(x.replace('\\', '/')))
)

df.to_csv(METADATA_PATH, index=False)

print(df.head(3))

   video_id      label         type  \
0         1  anh_huong     original   
1         1  anh_huong   rotated_15   
2         1  anh_huong  rotated_-15   

                                     feature_path  
0     /content/word_features/video_1_original.npy  
1   /content/word_features/video_1_rotated_15.npy  
2  /content/word_features/video_1_rotated_-15.npy  


In [9]:
def convert_feature_path(x):
    """Convert feature_path to absolute path in DATA_DIR."""
    x = x.replace('\\', '/')  # Replace backslashes with forward slashes
    filename = os.path.basename(x)  # Extract filename, e.g., video_4_original.npy
    return os.path.join(DATA_DIR, filename)

def load_data(samples_per_label=30):
    """Load data from metadata.csv, select 30*3 files per label."""
    if not os.path.exists(METADATA_PATH):
        raise FileNotFoundError(f"Metadata file not found: {METADATA_PATH}")

    metadata = pd.read_csv(METADATA_PATH)
    # Update feature_path to absolute path
    metadata['feature_path'] = metadata['feature_path'].apply(convert_feature_path)
    X, y = [], []
    count = 0

    # Group by label
    labels = metadata['label'].unique()
    print(f"Found {len(labels)} labels: {labels}")

    for label in labels:
        label_data = metadata[metadata['label'] == label]
        video_ids = label_data['video_id'].unique()
        selected_vids = np.random.choice(video_ids, size=min(samples_per_label, len(video_ids)), replace=False)

        for vid in selected_vids:
            for vid_type in ['original', 'rotated_15', 'rotated_-15']:
                vid_data = label_data[(label_data['video_id'] == vid) & (label_data['type'] == vid_type)]
                if vid_data.empty:
                    print(f"Missing {vid_type} for video_id {vid}, label {label}")
                    continue
                feature_path = vid_data['feature_path'].iloc[0]
                features = np.load(feature_path)
                if features.shape == (30, 132):  # Check for 132-dimensional features
                    X.append(features)
                    y.append(label)
                    count += 1
                    if count % 100 == 0:
                        print(f"Loaded {count} samples")
                else:
                    print(f"Invalid shape {features.shape} for {feature_path}")

    if not X:
        raise ValueError("No valid data loaded")

    X = np.array(X)  # Shape: (n_samples, 30, 132)
    y = np.array(y)  # Shape: (n_samples,)

    # Encode labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    y_onehot = tf.keras.utils.to_categorical(y_encoded)  # Shape: (n_samples, n_classes)

    print(f"Loaded {len(X)} samples with {len(label_encoder.classes_)} classes")
    return X, y_onehot, y_encoded, label_encoder

def build_lstm_model(input_shape, num_classes):
    """Build LSTM model for sign language word recognition."""
    model = Sequential([
        LSTM(256, input_shape=input_shape, return_sequences=True),
        Dropout(0.3),
        LSTM(128),
        Dropout(0.3),
        BatchNormalization(),
        Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def plot_training_history(history):
    """Plot training and validation loss/accuracy."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    ax1.plot(history.history['loss'], label='Train Loss')
    ax1.plot(history.history['val_loss'], label='Validation Loss')
    ax1.set_title('Model Loss')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()

    ax2.plot(history.history['accuracy'], label='Train Accuracy')
    ax2.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax2.set_title('Model Accuracy')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(EVALUATION_DIR, 'training_history.png'))
    plt.close()

def plot_confusion_matrix(y_true, y_pred, label_encoder):
    """Plot confusion matrix."""
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(12, 10))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.title('Confusion Matrix - LSTM')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(os.path.join(EVALUATION_DIR, 'confusion_matrix.png'))
    plt.close()

def train_and_evaluate():
    """Train and evaluate LSTM model."""
    # Load data
    try:
        X, y_onehot, y_encoded, label_encoder = load_data(samples_per_label=30)
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    # Split train/test
    X_train, X_test, y_train, y_test, y_train_encoded, y_test_encoded = train_test_split(
        X, y_onehot, y_encoded, test_size=0.2, random_state=42, stratify=y_onehot
    )
    print(f"Train: {len(X_train)}, Test: {len(X_test)}")

    # Build model
    lstm_model = build_lstm_model(input_shape=(30, 132), num_classes=len(label_encoder.classes_))
    lstm_model.summary()

    # Callbacks
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    checkpoint = ModelCheckpoint(
        os.path.join(MODEL_DIR, 'best_lstm_model.h5'), save_best_only=True, monitor='val_loss'
    )

    # Train
    history = lstm_model.fit(
        X_train, y_train,
        validation_data=(X_test, y_test),
        epochs=50,
        batch_size=32,
        callbacks=[early_stopping, checkpoint],
        verbose=1
    )

    # Evaluate
    test_loss, test_accuracy = lstm_model.evaluate(X_test, y_test, verbose=0)
    print(f"\nTest Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

    # Predict
    y_pred = lstm_model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true_classes = np.argmax(y_test, axis=1)

    # Compute ROC-AUC
    roc_auc = roc_auc_score(y_test, y_pred, multi_class='ovr')
    print(f"ROC-AUC (One-vs-Rest): {roc_auc:.4f}")

    # Save evaluation results
    with open(os.path.join(EVALUATION_DIR, 'evaluation_metrics.txt'), 'w') as f:
        f.write(f"Test Loss: {test_loss:.4f}\n")
        f.write(f"Test Accuracy: {test_accuracy:.4f}\n")
        f.write(f"ROC-AUC (One-vs-Rest): {roc_auc:.4f}\n\n")
        f.write("Classification Report:\n")
        f.write(classification_report(y_true_classes, y_pred_classes, target_names=label_encoder.classes_))

    # Plot
    plot_training_history(history)
    plot_confusion_matrix(y_true_classes, y_pred_classes, label_encoder)

train_and_evaluate()

Found 66 labels: ['anh_huong' 'bac_si' 'bang_bo' 'bao_nhieu' 'bat_nat' 'benh' 'benh_vien'
 'bi_dong' 'binh_thuong' 'bong' 'buoi_chieu' 'buoi_sang' 'buoi_toi'
 'buoi_trua' 'cam' 'cam_on' 'cat' 'cau_cuu' 'cham_soc_suc_khoe' 'chat_doc'
 'chet' 'chua_benh' 'co_don' 'cuop' 'dau_bung' 'dau_chan' 'dau_mat'
 'dau_tai' 'dau_tay' 'di_lac' 'di_ve_sinh' 'dia_chi' 'dien_thoai'
 'don_cong_an' 'dong_y' 'dung_lai' 'gio' 'giup_do' 'hai_long' 'kham_benh'
 'khat_nuoc' 'kho_tieu' 'khong_hieu' 'kiem_tra' 'lay_benh' 'mat_ngu'
 'met_moi' 'muon' 'ngo_doc' 'nguoi_la' 'nguy_hiem' 'nhu cau' 'quen' 'sot'
 'thuc_an' 'thuoc_bo' 'tien' 'toi' 'tranh_thai' 'va_cham'
 've_sinh_ca_nhan' 'xam_hai_tinh_duc' 'xe_cuu_thuong' 'xin_loi' 'y_ta'
 'yeu_cau']
Loaded 100 samples
Loaded 200 samples
Loaded 300 samples
Loaded 400 samples
Loaded 500 samples
Loaded 600 samples
Loaded 700 samples
Loaded 800 samples
Loaded 900 samples
Loaded 1000 samples
Loaded 1100 samples
Loaded 1200 samples
Loaded 1300 samples
Loaded 1400 samples
Load

  super().__init__(**kwargs)


Epoch 1/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 211ms/step - accuracy: 0.1958 - loss: 4.5745



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 235ms/step - accuracy: 0.1969 - loss: 4.5683 - val_accuracy: 0.7374 - val_loss: 2.9668
Epoch 2/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step - accuracy: 0.7130 - loss: 1.9622



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 231ms/step - accuracy: 0.7133 - loss: 1.9608 - val_accuracy: 0.8763 - val_loss: 1.2650
Epoch 3/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 217ms/step - accuracy: 0.8519 - loss: 1.2264



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 240ms/step - accuracy: 0.8519 - loss: 1.2259 - val_accuracy: 0.9125 - val_loss: 0.8204
Epoch 4/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 224ms/step - accuracy: 0.8944 - loss: 0.8879



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 244ms/step - accuracy: 0.8945 - loss: 0.8874 - val_accuracy: 0.9335 - val_loss: 0.5745
Epoch 5/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 208ms/step - accuracy: 0.9283 - loss: 0.6746



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 231ms/step - accuracy: 0.9283 - loss: 0.6745 - val_accuracy: 0.9251 - val_loss: 0.5251
Epoch 6/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 207ms/step - accuracy: 0.9528 - loss: 0.5135



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 229ms/step - accuracy: 0.9527 - loss: 0.5136 - val_accuracy: 0.9226 - val_loss: 0.4977
Epoch 7/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step - accuracy: 0.9388 - loss: 0.4898



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 238ms/step - accuracy: 0.9389 - loss: 0.4895 - val_accuracy: 0.9411 - val_loss: 0.3947
Epoch 8/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 224ms/step - accuracy: 0.9479 - loss: 0.4057



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 244ms/step - accuracy: 0.9480 - loss: 0.4055 - val_accuracy: 0.9680 - val_loss: 0.2900
Epoch 9/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step - accuracy: 0.9575 - loss: 0.3567



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 234ms/step - accuracy: 0.9575 - loss: 0.3568 - val_accuracy: 0.9731 - val_loss: 0.2605
Epoch 10/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 206ms/step - accuracy: 0.9669 - loss: 0.3069



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 229ms/step - accuracy: 0.9669 - loss: 0.3068 - val_accuracy: 0.9739 - val_loss: 0.2231
Epoch 11/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step - accuracy: 0.9768 - loss: 0.2558



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 240ms/step - accuracy: 0.9768 - loss: 0.2558 - val_accuracy: 0.9815 - val_loss: 0.1858
Epoch 12/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 231ms/step - accuracy: 0.9858 - loss: 0.2086 - val_accuracy: 0.9705 - val_loss: 0.1982
Epoch 13/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 245ms/step - accuracy: 0.9827 - loss: 0.2185 - val_accuracy: 0.9621 - val_loss: 0.2344
Epoch 14/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 221ms/step - accuracy: 0.9736 - loss: 0.2374 - val_accuracy: 0.9537 - val_loss: 0.2615
Epoch 15/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 223ms/step - accuracy: 0.9663 - loss: 0.2585 - val_accuracy: 0.9689 - val_loss: 0.2288
Epoch 16/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step - accuracy: 0.97



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 234ms/step - accuracy: 0.9781 - loss: 0.2040 - val_accuracy: 0.9731 - val_loss: 0.1845
Epoch 17/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 223ms/step - accuracy: 0.9826 - loss: 0.1888 - val_accuracy: 0.9604 - val_loss: 0.2095
Epoch 18/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 225ms/step - accuracy: 0.9687 - loss: 0.2271 - val_accuracy: 0.9697 - val_loss: 0.2141
Epoch 19/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 222ms/step - accuracy: 0.9733 - loss: 0.2169 - val_accuracy: 0.9697 - val_loss: 0.1890
Epoch 20/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step - accuracy: 0.9894 - loss: 0.1383



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 222ms/step - accuracy: 0.9894 - loss: 0.1383 - val_accuracy: 0.9874 - val_loss: 0.1212
Epoch 21/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 209ms/step - accuracy: 0.9951 - loss: 0.1119



[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 229ms/step - accuracy: 0.9951 - loss: 0.1119 - val_accuracy: 0.9949 - val_loss: 0.0905
Epoch 22/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 228ms/step - accuracy: 0.9869 - loss: 0.1220 - val_accuracy: 0.9815 - val_loss: 0.1312
Epoch 23/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 226ms/step - accuracy: 0.9766 - loss: 0.1734 - val_accuracy: 0.9764 - val_loss: 0.1513
Epoch 24/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 235ms/step - accuracy: 0.9822 - loss: 0.1524 - val_accuracy: 0.9790 - val_loss: 0.1533
Epoch 25/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 223ms/step - accuracy: 0.9912 - loss: 0.1186 - val_accuracy: 0.9899 - val_loss: 0.1026
Epoch 26/50
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 223ms/step - accuracy: 0.9