In [1]:
# Mount Google Drive at /content/drive; the project paths used below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers sentence-transformers faiss-cpu patool underthesea



In [3]:
import json
import os
import re
import unicodedata
from typing import List, Dict, Tuple

import faiss
import numpy as np
import pandas as pd
import patoolib
import torch
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
from underthesea import word_tokenize

In [None]:
# Project root on the mounted Drive; make sure each data sub-folder exists.
base_dir = "/content/drive/MyDrive/VSL"
for _subdir in ("raw", "videos", "embeddings"):
    os.makedirs(f"{base_dir}/data/{_subdir}", exist_ok=True)

In [None]:
# Extract the sign-language videos from the RAR archive.
# Guarded so that a re-run (e.g. Restart & Run All) does not unpack the archive
# again: the logged unrar command uses "-or" (auto-rename on name clash), so a
# second extraction would presumably leave renamed duplicates — TODO confirm.
# Delete the videos folder to force a fresh extraction.
rar_path = f"{base_dir}/data/raw/VIDEO.rar"
output_dir = f"{base_dir}/data/videos"
if os.path.isdir(output_dir) and os.listdir(output_dir):
    print(f"Videos already present in {output_dir}; skipping extraction")
else:
    patoolib.extract_archive(rar_path, outdir=output_dir)

INFO patool: Extracting /content/drive/MyDrive/VSL/data/raw/VIDEO.rar ...
INFO:patool:Extracting /content/drive/MyDrive/VSL/data/raw/VIDEO.rar ...
INFO patool: running /usr/bin/unrar x -kb -or -- /content/drive/MyDrive/VSL/data/raw/VIDEO.rar
INFO:patool:running /usr/bin/unrar x -kb -or -- /content/drive/MyDrive/VSL/data/raw/VIDEO.rar
INFO patool: ... /content/drive/MyDrive/VSL/data/raw/VIDEO.rar extracted to `/content/drive/MyDrive/VSL/data/videos'.
INFO:patool:... /content/drive/MyDrive/VSL/data/raw/VIDEO.rar extracted to `/content/drive/MyDrive/VSL/data/videos'.


'/content/drive/MyDrive/VSL/data/videos'

In [4]:
#prepare data
class VietnameseSignLanguageData:
    """Load the sign-language dictionary JSON and derive normalized text columns.

    Attributes:
        json_path: Path of the JSON file to read.
        data: Parsed JSON document (``None`` until ``load_data`` has run).
        df: DataFrame built from the document's ``'data'`` entry
            (``None`` until ``load_data`` has run).
    """

    def __init__(self, json_path: str):
        self.json_path = json_path
        self.data = None
        self.df = None

    @staticmethod
    def _is_missing(value) -> bool:
        """Return True for ``None`` and scalar missing markers (NaN, ``pd.NA``, ...)."""
        if value is None:
            return True
        # pd.isna on a list/array returns an array, so only ask it about scalars.
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

    def load_data(self):
        """Read ``json_path`` (UTF-8) and build ``self.df`` from its ``'data'`` entry.

        Returns:
            The freshly built DataFrame.
        """
        with open(self.json_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        self.df = pd.DataFrame(self.data['data'])
        print(f"Loaded {len(self.df)} sign language entries")
        return self.df

    def preprocess_text(self, text: str) -> str:
        """Normalize text for matching.

        Steps: NFC-normalize, lowercase, drop every character that is neither a
        word character nor whitespace, collapse whitespace runs and trim.

        Args:
            text: Raw text. ``None``/NaN yield ``""``; other non-string values
                are converted with ``str()`` instead of raising.

        Returns:
            The normalized string (possibly empty).
        """
        if self._is_missing(text):
            return ""
        # Compose to NFC first: in decomposed (NFD) Vietnamese text the tone
        # marks are separate combining characters, which the punctuation filter
        # below would otherwise strip ("địa" would silently become "đia").
        text = unicodedata.normalize('NFC', str(text)).lower()
        text = re.sub(r'[^\w\s]', '', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def create_searchable_text(self, row: pd.Series) -> str:
        """Join the row's ``word``, ``_word``, ``description`` and ``tl`` values.

        Missing or empty values are skipped; the remaining ones are converted
        with ``str()`` and joined with single spaces.
        """
        parts = [
            row['word'],
            row.get('_word', ''),
            row.get('description', ''),
            row.get('tl', '')
        ]
        return " ".join(str(p) for p in parts if not self._is_missing(p) and p)

    def prepare_data(self):
        """Add the normalized/searchable columns to ``self.df`` in place.

        Adds ``word_normalized``, ``description_normalized``, ``searchable_text``
        and ``searchable_text_normalized``.

        Returns:
            ``self.df`` with the new columns.

        Raises:
            RuntimeError: If ``load_data`` has not populated ``self.df`` yet.
        """
        if self.df is None:
            raise RuntimeError("No data loaded; call load_data() before prepare_data().")

        self.df['word_normalized'] = self.df['word'].apply(self.preprocess_text)
        # 'description' is optional in create_searchable_text, so treat it as
        # optional here too instead of failing with a KeyError.
        if 'description' in self.df.columns:
            self.df['description_normalized'] = self.df['description'].apply(self.preprocess_text)
        else:
            self.df['description_normalized'] = ""
        # Row-wise list build (rather than DataFrame.apply(axis=1)) so an empty
        # frame still yields a plain, assignable column.
        self.df['searchable_text'] = [
            self.create_searchable_text(row) for _, row in self.df.iterrows()
        ]
        self.df['searchable_text_normalized'] = self.df['searchable_text'].apply(self.preprocess_text)
        print("Data preprocessing complete")
        return self.df

In [5]:
#embedding model
class Embedder:
    """Thin wrapper around a SentenceTransformer model producing normalized embeddings."""

    def __init__(self, model_name: str = "dangvantuan/vietnamese-embedding"):
        """Load the SentenceTransformer model identified by ``model_name``."""
        print(f"Loading SentenceTransformer model: {model_name}")
        self.model = SentenceTransformer(model_name)
        print("Model loaded successfully.")

    def embed_text(self, text: str) -> np.ndarray:
        """Embed a single string.

        Args:
            text: Text to embed.

        Returns:
            The model's embedding (requested with ``normalize_embeddings=True``),
            or a float32 zero vector of the model's embedding dimension when
            ``text`` is empty or whitespace-only.
        """
        if not text or not text.strip():
            # float32 so the placeholder has the dtype the vector index works with
            # (np.zeros alone would produce float64).
            return np.zeros(self.model.get_sentence_embedding_dimension(), dtype=np.float32)
        return self.model.encode(text, convert_to_numpy=True, normalize_embeddings=True)

    def embed_batch(self, texts: List[str], batch_size: int = 32) -> np.ndarray:
        """Embed a list of strings.

        Args:
            texts: Strings to embed.
            batch_size: Batch size forwarded to the model's ``encode``.

        Returns:
            A 2-D array with one row per input text; an empty input yields an
            array of shape ``(0, embedding_dimension)`` without calling the model.
        """
        texts = list(texts)
        if not texts:
            # Keep the result 2-D so callers can still read ``shape[1]``.
            return np.zeros((0, self.model.get_sentence_embedding_dimension()), dtype=np.float32)
        return self.model.encode(
            texts,
            batch_size=batch_size,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=True
        )

In [6]:
#build vector database
class VectorDatabase:
    """Inner-product FAISS index over L2-normalized vectors.

    Attributes:
        dimension: Expected length of every stored/query vector.
        index: The FAISS index (``None`` until built or loaded).
        id_mapping: Ids supplied to ``build_index``, in insertion order.
    """

    def __init__(self, dimension: int):
        self.dimension = dimension
        self.index = None
        self.id_mapping = []

    @staticmethod
    def _l2_normalize(vectors: np.ndarray) -> np.ndarray:
        """Scale vectors to unit length along the last axis; zero vectors stay zero."""
        vectors = np.asarray(vectors)
        norms = np.linalg.norm(vectors, axis=-1, keepdims=True)
        # A zero norm would turn the division into NaN and poison every score
        # computed against that vector; dividing by 1 leaves it as all zeros.
        norms[norms == 0] = 1.0
        return (vectors / norms).astype('float32')

    def _require_index(self):
        """Raise a descriptive error when no index has been built or loaded."""
        if self.index is None:
            raise RuntimeError("Vector index is not initialized; call build_index() or load_index() first.")

    def build_index(self, embeddings: np.ndarray, ids: List[str]):
        """Create the index from ``embeddings`` and remember ``ids``.

        Args:
            embeddings: 2-D array, one row per item, ``dimension`` columns.
            ids: One id per embedding row.

        Raises:
            ValueError: If the array shape or the number of ids does not match.
        """
        embeddings = np.asarray(embeddings)
        if embeddings.ndim != 2 or embeddings.shape[1] != self.dimension:
            raise ValueError(
                f"Expected embeddings of shape (n, {self.dimension}), got {embeddings.shape}"
            )
        if len(ids) != embeddings.shape[0]:
            raise ValueError(
                f"Got {len(ids)} ids for {embeddings.shape[0]} embeddings"
            )

        self.index = faiss.IndexFlatIP(self.dimension)
        self.index.add(self._l2_normalize(embeddings))
        self.id_mapping = list(ids)
        print(f"Built FAISS index with {len(ids)} vectors")

    def search(self, query_embedding: np.ndarray, k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
        """Return the best matches for one query vector.

        Args:
            query_embedding: 1-D query vector.
            k: Maximum number of neighbours; clamped to the number of stored
               vectors so the result never contains padding entries.

        Returns:
            ``(similarities, indices)`` — two 1-D arrays of equal length
            (at most ``k``); both are empty when the index holds no vectors
            or ``k`` is not positive.

        Raises:
            RuntimeError: If no index has been built or loaded.
        """
        self._require_index()
        k = min(int(k), int(self.index.ntotal))
        if k <= 0:
            return np.empty(0, dtype='float32'), np.empty(0, dtype='int64')

        query_normalized = self._l2_normalize(np.asarray(query_embedding).reshape(1, -1))
        similarities, indices = self.index.search(query_normalized, k)
        return similarities[0], indices[0]

    def save_index(self, path: str):
        """Write the index to ``path``.

        Raises:
            RuntimeError: If no index has been built or loaded.
        """
        self._require_index()
        faiss.write_index(self.index, path)
        print(f"Index saved to {path}")

    def load_index(self, path: str):
        """Read an index from ``path``.

        Raises:
            ValueError: If the stored index's dimension differs from ``dimension``
                (e.g. it was built with a different embedding model).
        """
        loaded = faiss.read_index(path)
        if loaded.d != self.dimension:
            raise ValueError(
                f"Index at {path} has dimension {loaded.d}, expected {self.dimension}"
            )
        self.index = loaded
        print(f"Index loaded from {path}")

In [16]:
#the chatbot core
#this is retrieve-based only for initial testing
class VietnameseSignLanguageChatbot:
    """Retrieval-only chatbot over the sign-language dictionary.

    A query is first matched exactly against the normalized ``word`` column;
    if nothing matches, it falls back to a nearest-neighbour search over the
    embedded ``searchable_text_normalized`` column.
    """

    def __init__(self,
                 json_path: str,
                 model_name: str = "dangvantuan/vietnamese-embedding"):
        """Load and preprocess the dictionary, then load the embedding model.

        Args:
            json_path: Path to the dictionary JSON file.
            model_name: SentenceTransformer model identifier.
        """
        print("Loading data...")
        self.data_handler = VietnameseSignLanguageData(json_path)
        self.df = self.data_handler.load_data()
        self.df = self.data_handler.prepare_data()

        print("Initializing embedding model...")
        self.embedder = Embedder(model_name)
        self.vector_db = None

    @staticmethod
    def _row_to_result(row: pd.Series, similarity: float) -> Dict:
        """Convert one dictionary row into the result dict returned by ``query``."""
        return {
            'video_id': row['_id'],
            'word': row['word'],
            'description': row['description'],
            'part_of_speech': row.get('tl', ''),
            'similarity_score': similarity,
            'type': row.get('type', 0)
        }

    def build_knowledge_base(self, batch_size: int = 32):
        """Embed every entry's searchable text and build the vector index.

        Args:
            batch_size: Batch size used while embedding.
        """
        print("Building knowledge base...")

        # Reset once so that row position i in ``self.df`` is the i-th vector
        # added to the index; ``query`` relies on that positional alignment.
        self.df = self.df.reset_index(drop=True)
        texts = self.df['searchable_text_normalized'].tolist()
        embeddings = self.embedder.embed_batch(texts, batch_size=batch_size)

        dimension = embeddings.shape[1]
        self.vector_db = VectorDatabase(dimension)
        self.vector_db.build_index(embeddings, self.df.index.tolist())

        print("Knowledge base built successfully!")

    #Handle user query
    def query(self, user_query: str, top_k: int = 3, similarity_threshold: float = 0.5) -> List[Dict]:
        """Look up the signs that best match ``user_query``.

        Args:
            user_query: Raw user text.
            top_k: Maximum number of semantic-search candidates.
            similarity_threshold: Minimum similarity for a semantic match.

        Returns:
            A list of result dicts. Exact word matches all come back with
            ``similarity_score`` 1.0; otherwise the semantic matches at or
            above the threshold. Empty when the query normalizes to nothing.

        Raises:
            RuntimeError: If semantic search is needed but no knowledge base
                has been built or loaded.
        """
        normalized_query = self.data_handler.preprocess_text(user_query)
        if not normalized_query:
            # Nothing left to search for; also stops "" from exact-matching
            # rows whose own word normalizes to an empty string.
            return []

        exact_matches = self.df[self.df['word_normalized'] == normalized_query]
        if not exact_matches.empty:
            return [self._row_to_result(row, 1.0) for _, row in exact_matches.iterrows()]

        # Fallback to semantic search
        if self.vector_db is None:
            raise RuntimeError(
                "Knowledge base is not initialized; call build_knowledge_base() "
                "or load_knowledge_base() first."
            )
        query_embedding = self.embedder.embed_text(normalized_query)
        similarities, indices = self.vector_db.search(query_embedding, k=top_k)

        results = []
        for similarity, idx in zip(similarities, indices):
            # FAISS pads missing neighbours with index -1; ``iloc[-1]`` would
            # silently return the last row, so skip those entries.
            if idx >= 0 and similarity >= similarity_threshold:
                results.append(self._row_to_result(self.df.iloc[int(idx)], float(similarity)))
        return results

    def format_response(self, results: List[Dict]) -> str:
        """Render ``query`` results as the (Vietnamese) text shown to the user."""
        if not results:
            return "Xin lỗi, tôi không tìm thấy kết quả phù hợp. Vui lòng thử lại với từ khóa khác."

        chunks = [f"Tìm thấy {len(results)} kết quả:\n\n"]
        for i, result in enumerate(results, 1):
            chunks.append(
                f"--- Kết quả {i} ---\n"
                f"Từ: {result['word']}\n"
                f"ID Video: {result['video_id']}\n"
                f"Mô tả: {result['description']}\n"
                f"Loại từ: {result['part_of_speech']}\n"
                f"Độ tương đồng: {result['similarity_score']:.2%}\n\n"
            )
        return "".join(chunks)

    def chat(self, user_query: str, top_k: int = 1) -> str:
        """Run ``query`` and return the formatted response text."""
        results = self.query(user_query, top_k=top_k)
        return self.format_response(results)

    def save_knowledge_base(self, index_path: str, data_path: str):
        """Persist the vector index to ``index_path`` and the DataFrame (pickle) to ``data_path``.

        Raises:
            RuntimeError: If no knowledge base has been built or loaded.
        """
        if self.vector_db is None:
            raise RuntimeError("Knowledge base is not initialized; nothing to save.")
        self.vector_db.save_index(index_path)
        self.df.to_pickle(data_path)
        print(f"Knowledge base saved to {index_path} and {data_path}")

    def load_knowledge_base(self, index_path: str, data_path: str):
        """Restore the DataFrame and vector index written by ``save_knowledge_base``.

        Raises:
            ValueError: If the index and the DataFrame hold a different number
                of entries (they would no longer line up positionally).
        """
        # SECURITY NOTE: unpickling can execute arbitrary code; only load files
        # this notebook wrote itself.
        self.df = pd.read_pickle(data_path)
        # Embed a throwaway string to learn the model's embedding dimension.
        sample_embedding = self.embedder.embed_text("test")
        dimension = len(sample_embedding)

        self.vector_db = VectorDatabase(dimension)
        self.vector_db.load_index(index_path)
        if self.vector_db.index.ntotal != len(self.df):
            raise ValueError(
                f"Index holds {self.vector_db.index.ntotal} vectors but data has "
                f"{len(self.df)} rows; rebuild the knowledge base."
            )
        self.vector_db.id_mapping = self.df.index.tolist()

        print("Knowledge base loaded successfully!")

In [17]:
# Create the chatbot from the dictionary JSON on Drive, then embed all entries into the search index.
chatbot = VietnameseSignLanguageChatbot(
    "/content/drive/MyDrive/VSL/data/VSL_DATA.json",
    model_name="dangvantuan/vietnamese-embedding",
)
chatbot.build_knowledge_base(32)


Loading data...
Loaded 4362 sign language entries
Data preprocessing complete
Initializing embedding model...
Loading SentenceTransformer model: dangvantuan/vietnamese-embedding
Model loaded successfully.
Building knowledge base...


Batches:   0%|          | 0/137 [00:00<?, ?it/s]

Built FAISS index with 4362 vectors
Knowledge base built successfully!


In [18]:
def interactive_chat(chatbot, top_k: int = 1):
    """Run a read-answer loop on standard input until the user quits.

    Args:
        chatbot: Object exposing ``chat(user_query, top_k=...) -> str``.
        top_k: Number of results requested for each question.

    The loop ends when the user types 'quit', 'exit' or 'thoát' (case-insensitive),
    when the input stream is closed, or when the cell is interrupted. Blank
    input is ignored.
    """
    exit_commands = ['quit', 'exit', 'thoát']
    farewell = "Cảm ơn bạn đã sử dụng chatbot! Tạm biệt!"

    print("Nhập câu hỏi của bạn (hoặc 'quit' để thoát)")

    while True:
        try:
            user_input = input("Bạn: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted cell is a normal way to leave the
            # loop; finish cleanly instead of surfacing a traceback.
            print()
            print(farewell)
            break

        if user_input.lower() in exit_commands:
            print(farewell)
            break

        if not user_input:
            continue

        response = chatbot.chat(user_input, top_k=top_k)
        print(f"\nChatbot:\n{response}")

# Start the interactive session with the chatbot created earlier; type 'quit' to stop.
interactive_chat(chatbot)

Nhập câu hỏi của bạn (hoặc 'quit' để thoát)
Bạn: từ địa chỉ được diễn tả bằng ngôn ngữ kí hiệu như thế nào 

Chatbot:
Tìm thấy 1 kết quả:

--- Kết quả 1 ---
Từ: nghĩa trang
ID Video: W02371
Mô tả: Nghĩa địa.
Loại từ: Danh từ
Độ tương đồng: 75.30%


Bạn: từ địa chỉ được diễn tả như thế nào

Chatbot:
Tìm thấy 1 kết quả:

--- Kết quả 1 ---
Từ: nghĩa trang
ID Video: W02371
Mô tả: Nghĩa địa.
Loại từ: Danh từ
Độ tương đồng: 65.55%


Bạn: cách diễn tả từ địa chỉ

Chatbot:
Tìm thấy 1 kết quả:

--- Kết quả 1 ---
Từ: nghĩa trang
ID Video: W02371
Mô tả: Nghĩa địa.
Loại từ: Danh từ
Độ tương đồng: 63.80%


Bạn: quit
Cảm ơn bạn đã sử dụng chatbot! Tạm biệt!


=> The base model does not work well: the chatbot fails to capture the meaning of the user's query and performs poorly at question answering (e.g. all three phrasings of the question about "địa chỉ" above returned "nghĩa trang").
=> Other models are currently being evaluated.
