In [1]:
from __future__ import unicode_literals
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import cv2
import numpy as np
import mediapipe as mp
from tensorflow.keras.models import load_model
import time
from collections import deque, Counter
import tempfile
import threading

import requests
import pygame
import arabic_reshaper
from bidi.algorithm import get_display
from PIL import ImageFont, ImageDraw, Image

# --- ElevenLabs API configuration ---
# SECURITY NOTE(review): an API secret committed to source control should be
# treated as leaked. Prefer supplying it through the ELEVENLABS_API_KEY
# environment variable; the literal below is kept only as a backward-compatible
# fallback and should be rotated and removed.
API_KEY = os.environ.get(
    'ELEVENLABS_API_KEY',
    'sk_e883cf78c9b9e06bb7236ca8ba711d0f5fe77281579f616b',
)
# Identifier of the ElevenLabs voice used in the text-to-speech request URL.
VOICE_ID = 'wxweiHvoC2r2jFM7mS8b'

def speak_elevenlabs(text):
    """Speak *text* aloud using the ElevenLabs text-to-speech HTTP API.

    The request, download and playback all happen on a background thread, so
    this function returns immediately. Any failure is caught inside the
    worker and printed; nothing is raised to the caller.

    Args:
        text: The text to synthesise and play.
    """
    def thread_speak():
        audio_path = None
        try:
            url = f"https://api.elevenlabs.io/v1/text-to-speech/{VOICE_ID}"
            headers = {
                "Accept": "audio/mpeg",
                "Content-Type": "application/json",
                "xi-api-key": API_KEY
            }
            data = {
                "text": text,
                "model_id": "eleven_multilingual_v2",
                "voice_settings": {
                    "stability": 0.5,
                    "similarity_boost": 0.8
                }
            }

            # Without a timeout a stalled connection would block this worker
            # thread forever.
            response = requests.post(url, json=data, headers=headers, timeout=30)
            response.raise_for_status()

            # delete=False and closing the file before playback: a
            # NamedTemporaryFile that is still open cannot be opened a second
            # time on Windows, so pygame would fail to load it there.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
                fp.write(response.content)
                audio_path = fp.name

            # Initialise the mixer only once instead of on every utterance.
            if not pygame.mixer.get_init():
                pygame.mixer.init()
            pygame.mixer.music.load(audio_path)
            pygame.mixer.music.play()
            while pygame.mixer.music.get_busy():
                time.sleep(0.1)
            # Release the file handle so the temp file can be removed.
            pygame.mixer.music.unload()

        except Exception as e:
            print(f"حدث خطأ أثناء استخدام ElevenLabs: {e}")
        finally:
            if audio_path is not None:
                try:
                    os.remove(audio_path)
                except OSError:
                    # Best effort: the file may still be held by the mixer.
                    pass

    threading.Thread(target=thread_speak).start()

# Short aliases for the MediaPipe sub-modules used throughout this file:
# `mp_hands` provides the Hands detector and HAND_CONNECTIONS,
# `mp_drawing` provides draw_landmarks and DrawingSpec.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

class SignLanguageInterpreter:
    """Turn a stream of per-frame keypoint vectors into a list of recognised words.

    Frames are collected in a fixed-length window (`sequence`); once the
    window is full the model is run on it. Confident predictions are
    collected in a second window (`prediction_buffer`) and a word is only
    committed to `sentence` when one class dominates that window.
    """

    def __init__(self, model_path):
        """Load the model and its companion files.

        Args:
            model_path: Path prefix without extension. The files
                `{model_path}.h5`, `{model_path}_actions.txt`,
                `{model_path}_scaler_mean.npy` and
                `{model_path}_scaler_scale.npy` are read.
        """
        self.model = load_model(f'{model_path}.h5')
        self.model.compile()

        # One label per non-empty line; the line order defines the class index.
        with open(f'{model_path}_actions.txt', encoding='utf-8') as f:
            self.actions = np.array([line.strip() for line in f if line.strip()])

        self.scaler_mean = np.load(f'{model_path}_scaler_mean.npy')
        self.scaler_scale = np.load(f'{model_path}_scaler_scale.npy')

        self.no_frames = 30               # frames per model input window
        self.stability_window = 12        # confident predictions voted on
        self.confidence_threshold = 0.85  # minimum confidence to enter the vote
        self.sequence = deque(maxlen=self.no_frames)
        self.prediction_buffer = deque(maxlen=self.stability_window)
        self.sentence = []
        self.last_prediction_time = 0
        self.cooldown_period = 2.0        # seconds before the same word may repeat
        self.visual_flash = 0             # remaining frames of the "word added" flash

    def scale_features(self, features):
        """Standardise *features* using the loaded mean and scale arrays."""
        return (features - self.scaler_mean) / self.scaler_scale

    def predict(self):
        """Run the model on the current frame window.

        Returns:
            `(predicted_class, confidence)` when the window holds exactly
            `no_frames` frames, otherwise `(None, 0)`.
        """
        if len(self.sequence) == self.no_frames:
            scaled_sequence = self.scale_features(np.array(self.sequence))
            res = self.model.predict(np.expand_dims(scaled_sequence, axis=0), verbose=0)[0]
            confidence = np.max(res)
            predicted_class = np.argmax(res)
            return predicted_class, confidence
        return None, 0

    def update_state(self, predicted_class, confidence):
        """Feed one prediction into the stability vote.

        Predictions below `confidence_threshold` are ignored. Once the vote
        buffer is full and the most common class reaches the required count,
        the word is appended to `sentence` unless it equals the last word and
        the cooldown has not yet elapsed.

        Returns:
            True if a word was appended to the sentence, else False.
        """
        current_time = time.time()
        if confidence >= self.confidence_threshold:
            self.prediction_buffer.append(predicted_class)
            if len(self.prediction_buffer) == self.stability_window:
                common = Counter(self.prediction_buffer).most_common(1)[0]
                # int() truncates: with the default window of 12 this
                # requires 9 matching entries (0.8 * 12 = 9.6 -> 9).
                if common[1] >= int(0.8 * self.stability_window):
                    final_pred = common[0]
                    predicted_word = self.actions[final_pred]
                    if (not self.sentence or
                        predicted_word != self.sentence[-1] or
                        current_time - self.last_prediction_time > self.cooldown_period):
                        self.last_prediction_time = current_time
                        self.sentence.append(predicted_word)
                        self.prediction_buffer.clear()
                        # Dropping the collected frames prevents the previous
                        # sign from being recognised again.
                        self.sequence.clear()
                        self.visual_flash = 3  # set flash counter
                        return True
        return False

    def reset_after_no_hand(self):
        """Discard the collected frames and pending votes."""
        self.sequence.clear()
        self.prediction_buffer.clear()

    def get_current_sentence(self, max_length=5):
        """Return the last *max_length* words joined by two spaces.

        A non-positive *max_length* yields an empty string. (A plain
        `sentence[-max_length:]` slice would return the *whole* sentence for
        0, because `-0` is `0`.)
        """
        if max_length <= 0:
            return ''
        return '  '.join(self.sentence[-max_length:])

def mediapipe_detection(image, model):
    """Mirror a BGR frame, run *model* on it and return the frame with the results.

    Args:
        image: BGR frame as delivered by OpenCV.
        model: Object exposing `process(rgb_image)`.

    Returns:
        `(bgr_image, results)` where `bgr_image` is the mirrored frame
        converted back to BGR and `results` is whatever `model.process`
        returned.
    """
    mirrored = cv2.flip(image, 1)
    rgb_frame = cv2.cvtColor(mirrored, cv2.COLOR_BGR2RGB)

    # The frame is marked read-only for the duration of the model call.
    rgb_frame.flags.writeable = False
    results = model.process(rgb_frame)
    rgb_frame.flags.writeable = True

    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

def draw_styled_landmarks(image, results):
    """Draw the landmarks and connections of every detected hand onto *image*.

    Does nothing when `results.multi_hand_landmarks` is empty or None.
    """
    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return

    landmark_style = mp_drawing.DrawingSpec(
        color=(121, 22, 76), thickness=2, circle_radius=4)
    connection_style = mp_drawing.DrawingSpec(
        color=(121, 44, 250), thickness=2, circle_radius=2)

    for hand in detected_hands:
        mp_drawing.draw_landmarks(
            image,
            hand,
            mp_hands.HAND_CONNECTIONS,
            landmark_style,
            connection_style,
        )

def extract_keypoints(results):
    """Flatten the detected hand landmarks into one fixed-length vector.

    The result has 126 values: 63 (21 landmarks x 3 coordinates) for the
    left hand followed by 63 for the right hand. A hand that was not
    detected contributes zeros. Any handedness label other than 'Right' is
    stored in the left-hand slot.
    """
    left = np.zeros(21*3)
    right = np.zeros(21*3)

    detected_hands = results.multi_hand_landmarks
    if not detected_hands:
        return np.concatenate([left, right])

    for i, hand in enumerate(detected_hands):
        label = results.multi_handedness[i].classification[0].label
        coords = np.array([(p.x, p.y, p.z) for p in hand.landmark]).flatten()
        if label == 'Right':
            right = coords
        else:
            left = coords

    return np.concatenate([left, right])

def draw_confidence_bar(image, confidence):
    """Draw a labelled horizontal confidence bar near the top-left of *image*.

    A 200x20 white outline is drawn at (10, 60) and filled with green in
    proportion to *confidence*; the numeric value is written just above it.
    """
    left, top = 10, 60
    bar_width, bar_height = 200, 20
    bottom = top + bar_height
    filled_right = left + int(bar_width * confidence)
    white = (255, 255, 255)

    # Outline first, then the filled portion on top of it.
    cv2.rectangle(image, (left, top), (left + bar_width, bottom), white, 1)
    cv2.rectangle(image, (left, top), (filled_right, bottom), (0, 255, 0), -1)

    label = f'Confidence: {confidence:.2f}'
    cv2.putText(image, label, (left, 55),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, white, 1, cv2.LINE_AA)

# Fonts already loaded, keyed by (font_path, font_size). This function is
# called once per video frame, so re-opening the TTF file every time is wasteful.
_FONT_CACHE = {}


def draw_arabic_text(image, text, position, font_size=36, font_path=None):
    """Render Arabic *text* onto a copy of *image* and return the copy.

    The text is reshaped and passed through the bidi algorithm before being
    drawn with Pillow in white.

    Args:
        image: Image array to draw on; it is not modified.
        text: The text to draw.
        position: `(x, y)` position passed to Pillow's `draw.text`.
        font_size: Font size passed to `ImageFont.truetype`.
        font_path: Path to a TrueType font. Defaults to the path originally
            hard-coded here; make sure the font exists on your machine and
            change the path to match its location.

    Returns:
        A new numpy array containing the image with the text drawn on it.
    """
    if font_path is None:
        font_path = "/Users/mac/Desktop/LISAN AL ISHARA1/NotoNaskhArabic-Regular.ttf"

    cache_key = (font_path, font_size)
    font = _FONT_CACHE.get(cache_key)
    if font is None:
        font = ImageFont.truetype(font_path, font_size)
        _FONT_CACHE[cache_key] = font

    reshaped_text = arabic_reshaper.reshape(text)
    bidi_text = get_display(reshaped_text)

    img_pil = Image.fromarray(image)
    draw = ImageDraw.Draw(img_pil)
    draw.text(position, bidi_text, font=font, fill=(255, 255, 255))
    return np.array(img_pil)

# Set to True by mouse_callback once the START button has been clicked.
start_clicked = False


def mouse_callback(event, x, y, flags, param):
    """OpenCV mouse handler: flag a left-button press inside the START button.

    The button occupies x in [250, 390] and y in [200, 260].
    """
    global start_clicked
    if event != cv2.EVENT_LBUTTONDOWN:
        return
    inside_button = (250 <= x <= 390) and (200 <= y <= 260)
    if inside_button:
        start_clicked = True

def draw_start_button(frame):
    """Draw a filled green START button with black lettering onto *frame*."""
    top_left, bottom_right = (250, 200), (390, 260)
    green, black = (0, 255, 0), (0, 0, 0)
    cv2.rectangle(frame, top_left, bottom_right, green, -1)
    cv2.putText(frame, 'START', (260, 240), cv2.FONT_HERSHEY_SIMPLEX, 1, black, 2)

def wait_for_start():
    """Show a start screen and block until START is clicked or 'q' is pressed.

    A black 640x480 frame with the START button and an Arabic prompt is shown
    in the "Sign Language Interpreter" window. The mouse callback is replaced
    with a no-op before returning.
    """
    global start_clicked
    window_name = "Sign Language Interpreter"
    cv2.namedWindow(window_name)
    cv2.setMouseCallback(window_name, mouse_callback)
    dummy_frame = np.zeros((480, 640, 3), dtype=np.uint8)
    while not start_clicked:
        frame = dummy_frame.copy()
        draw_start_button(frame)
        # draw_arabic_text returns a new array rather than drawing in place,
        # so its result must be kept or the prompt is never displayed.
        frame = draw_arabic_text(frame, "اضغط على زر Start لبدء الترجمة", (100, 150), 28)
        cv2.imshow(window_name, frame)
        if cv2.waitKey(20) & 0xFF == ord('q'):
            break
    cv2.setMouseCallback(window_name, lambda *args: None)

def main():
    """Run the interactive sign-language interpreter.

    Prompts for the model name and camera index, shows the start screen, then
    processes camera frames until 'q' is pressed. Recognised words are shown
    in a banner at the bottom of the frame; after 3 seconds without a
    detected hand the collected sentence is spoken and cleared.
    """
    model_path = input("Model name (without extension): ").strip()
    no_cam = int(input("Camera source number (e.g., 0): "))

    interpreter = SignLanguageInterpreter(model_path)
    wait_for_start()

    cap = cv2.VideoCapture(no_cam)
    # try/finally guarantees the camera and windows are released on every
    # exit path, including the early return and unexpected exceptions.
    try:
        cap.set(cv2.CAP_PROP_BUFFERSIZE, 1)
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

        if not cap.isOpened():
            print("Error: Could not open camera.")
            return

        last_hand_time = time.time()

        with mp_hands.Hands(
            max_num_hands=2,
            min_detection_confidence=0.75,
            min_tracking_confidence=0.7
        ) as hands:
            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    # Keep pumping GUI events so 'q' still quits while the
                    # camera is not delivering frames.
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        break
                    continue

                image, results = mediapipe_detection(frame, hands)
                draw_styled_landmarks(image, results)

                keypoints = extract_keypoints(results)
                confidence = 0

                # extract_keypoints returns all zeros when no hand was found.
                hand_detected = not np.all(keypoints == 0)

                if hand_detected:
                    last_hand_time = time.time()
                    interpreter.sequence.append(keypoints)
                    predicted_class, confidence = interpreter.predict()
                    if predicted_class is not None:
                        interpreter.update_state(predicted_class, confidence)
                else:
                    if time.time() - last_hand_time > 3.0:
                        interpreter.reset_after_no_hand()
                        if interpreter.sentence:
                            full_sentence = ' '.join(interpreter.sentence)
                            speak_elevenlabs(full_sentence)
                            interpreter.sentence.clear()

                # Use the real frame size: the 640x480 requested above is
                # only a request and the camera may deliver something else.
                height, width = image.shape[:2]

                # Semi-transparent black banner across the bottom 80 pixels.
                overlay = image.copy()
                cv2.rectangle(overlay, (0, height - 80), (width, height), (0, 0, 0), -1)
                image = cv2.addWeighted(overlay, 0.6, image, 0.4, 0)

                sentence_display = interpreter.get_current_sentence().replace(' ', ' ')
                image = draw_arabic_text(image, sentence_display, (10, height - 60), 36)

                if confidence > 0:
                    draw_confidence_bar(image, confidence)

                # Frame-wide border shown for a few frames after a word is added.
                if interpreter.visual_flash > 0:
                    cv2.rectangle(image, (0, 0), (width, height), (0, 255, 255), thickness=15)
                    interpreter.visual_flash -= 1

                cv2.imshow("Sign Language Interpreter", image)

                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Start the interpreter only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

pygame 2.6.1 (SDL 2.28.4, Python 3.12.2)
Hello from the pygame community. https://www.pygame.org/contribute.html


Model name (without extension):  test_full_sentense
Camera source number (e.g., 0):  0


I0000 00:00:1748462263.332546  247683 gl_context.cc:369] GL version: 2.1 (2.1 ATI-7.0.3), renderer: AMD Radeon Pro 555X OpenGL Engine
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1748462263.357220  248412 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1748462263.382681  248412 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1748462264.598079  248414 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
