In [1]:
import os
import tensorflow as tf
from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.builders import model_builder


#tf.get_logger().setLevel('ERROR')           # Suppress TensorFlow logging (2)


# Enable memory growth on every visible GPU before any model is built.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Directory that holds both the pipeline config and the training checkpoints.
# Built with os.path.join so the literal carries no raw backslashes: "\m" and
# "\p" are invalid escape sequences (a warning on newer Python versions) and a
# backslash-separated path only resolves on Windows.
MODEL_DIR = os.path.join("sign_language_detection_15k", "models", "my_ssd_mobilenet_v2_fpnlite_320x320")

# Loading pipeline config and building a detection model
configs = config_util.get_configs_from_pipeline_file(os.path.join(MODEL_DIR, "pipeline.config"))
model_config = configs['model']
detection_model = model_builder.build(model_config=model_config, is_training=False)

# Restore checkpoint 'ckpt-16' into the freshly built model.
ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt.restore(os.path.join(MODEL_DIR, 'ckpt-16')).expect_partial()

@tf.function
def detect_fn(image):
    """Run the three-stage detection pipeline on ``image``.

    The input is passed through ``detection_model.preprocess``, then
    ``predict``, then ``postprocess``; the postprocessed detections are
    returned as-is.
    """
    preprocessed, true_shapes = detection_model.preprocess(image)
    raw_predictions = detection_model.predict(preprocessed, true_shapes)
    return detection_model.postprocess(raw_predictions, true_shapes)

In [2]:
import cv2
import numpy as np
import google.generativeai as genai

In [3]:
# Lookup from class id to label entry (the detection loop reads its 'name' field).
# os.path.join keeps the label-map path valid on non-Windows systems too.
category_index = label_map_util.create_category_index_from_labelmap(
    os.path.join('sign_language_detection_15k', 'annotations', 'label_map.pbtxt'))

In [4]:
# Open the camera at device index 0 and record the frame size it reports.
cap = cv2.VideoCapture(0)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Take the Gemini key from the GOOGLE_API_KEY environment variable so the
# secret never has to be saved inside the notebook. The original placeholder
# is kept as the fallback, so behaviour is unchanged when the variable is unset.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "Enter your gemini API key here"))
model = genai.GenerativeModel("gemini-pro")
# One chat session, shared by every later request.
chat = model.start_chat(history=[])
# NOTE(review): appears unused in the visible cells (hello() shadows it with a
# local of the same name); kept in case another cell reads it.
l = []

def get_gemini_response(question):
    """Send ``question`` through the shared ``chat`` session and return its reply object unchanged."""
    return chat.send_message(question)

def hello(labels=None):
    """Ask Gemini to turn the detected sign labels into a simple sentence.

    Args:
        labels: Optional iterable of sign-label strings. When omitted, the
            module-level ``unique_labels`` list is read at call time, which
            is what the zero-argument call has always relied on.

    Returns:
        The generated sentence as a string, or an empty string when there
        are no labels to send.
    """
    if labels is None:
        labels = unique_labels
    labels = list(labels)

    if not labels:
        # Nothing detected yet: skip the API round trip rather than asking
        # the model to invent a sentence from an empty list of signs.
        print("No signs detected yet")
        return ''

    prompt = ("this is sign language detection project, our model detects some signs and these are the detected signs - " + ' '.join(labels) + ". Generate a simple sentence out of these signs which the impaired person wishes to communicate")
    response = get_gemini_response(prompt)

    # The response is iterated and each item's .text is concatenated.
    sentence = ''.join(chunk.text for chunk in response)

    # Printed as a one-element list to keep the notebook's existing output format.
    print([sentence])
    return sentence

In [5]:
# Labels seen since the last 'u' keypress, in first-seen order.
unique_labels = []

# Offset added to the model's class indices before looking them up in
# category_index. Loop-invariant, so it is set once instead of once per frame.
label_id_offset = 1

# Controls while the preview window has focus:
#   q - quit
#   u - send the collected labels to Gemini, then clear them
try:
    while True:
        ret, frame = cap.read()
        if not ret or frame is None:
            # The camera was released, unplugged, or failed to deliver a
            # frame; stop instead of feeding None into the model.
            print("Failed to read a frame from the camera; stopping.")
            break
        image_np = np.array(frame)

        # Add a leading batch axis and convert to float32 for the detector.
        input_tensor = tf.convert_to_tensor(np.expand_dims(image_np, 0), dtype=tf.float32)
        detections = detect_fn(input_tensor)

        # Keep the first batch entry, trimmed to the reported number of
        # detections, and convert every tensor to a NumPy array.
        num_detections = int(detections.pop('num_detections'))
        detections = {key: value[0, :num_detections].numpy()
                      for key, value in detections.items()}
        detections['num_detections'] = num_detections

        detections['detection_classes'] = detections['detection_classes'].astype(np.int64)

        # Draw on a copy so the captured frame itself stays untouched.
        image_np_with_detections = image_np.copy()

        viz_utils.visualize_boxes_and_labels_on_image_array(
                    image_np_with_detections,
                    detections['detection_boxes'],
                    detections['detection_classes']+label_id_offset,
                    detections['detection_scores'],
                    category_index,
                    use_normalized_coordinates=True,
                    max_boxes_to_draw=5,
                    min_score_thresh=.5,
                    agnostic_mode=False)

        # Collect labels from the first (at most five) detections, using a
        # stricter score cut-off (0.7) than the one used for drawing (0.5).
        for i in range(min(5, detections['num_detections'])):
            class_id = int(detections['detection_classes'][i]) + label_id_offset
            score = detections['detection_scores'][i]
            if score > 0.7:
                label = category_index[class_id]['name']
                # Exclude face label
                if label != 'Face' and label not in unique_labels:
                    unique_labels.append(label)
                    print(f'Detected label: {label}, Score: {score}')

        cv2.imshow('object detection',  cv2.resize(image_np_with_detections, (800, 600)))

        key = cv2.waitKey(1) & 0xFF
        if key == ord('q'):
            break
        elif key == ord('u'):
            print(unique_labels)
            hello()
            unique_labels.clear()
            print("Cleared")
finally:
    # Always free the camera and close the preview window, including when an
    # exception or a kernel interrupt ends the loop.
    cap.release()
    cv2.destroyAllWindows()


Detected label: Hello, Score: 0.9639967083930969
Detected label: Yes, Score: 0.7143480777740479
['Hello', 'Yes']
['Hello, may I have a Yes?']
Cleared
Detected label: Yes, Score: 0.7390885949134827
Detected label: Hello, Score: 0.9605106115341187
Detected label: I Love You, Score: 0.8469918966293335
Detected label: No, Score: 0.7297326326370239
['Yes', 'Hello', 'I Love You', 'No']
['Hello, I love you. Is that a yes or no?']
Cleared
