# Tensorflow 2.9.1

In [1]:
import tensorflow as tf
# Show the installed TensorFlow version as the cell's output.
tf.version.VERSION

'2.9.1'

# Creating and interacting with the "data" directory

In [2]:
import os

# Root folder for everything this notebook downloads, created next to the
# current working directory.
DATA_DIR = os.path.join(os.getcwd(), 'data')
# Sub-folder that holds the downloaded detection models.
MODELS_DIR = os.path.join(DATA_DIR, 'models')

# MODELS_DIR lives inside DATA_DIR, so a single makedirs call creates both.
# exist_ok=True makes the cell safe to re-run, and not using a loop variable
# named `dir` keeps the builtin dir() usable in the rest of the notebook.
os.makedirs(MODELS_DIR, exist_ok=True)

# Model downloading code 

You can download various models, but here we download SSD MobileNet V2 320x320 (`ssd_mobilenet_v2_320x320_coco17_tpu-8`). You can find more models in the [TensorFlow 2 Detection Model Zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf2_detection_zoo.md).

In [3]:
import tarfile
import urllib.request

# Download and extract model
MODEL_DATE = '20200711'
MODEL_NAME = 'ssd_mobilenet_v2_320x320_coco17_tpu-8'
MODEL_TAR_FILENAME = MODEL_NAME + '.tar.gz'
MODELS_DOWNLOAD_BASE = 'http://download.tensorflow.org/models/object_detection/tf2/'
MODEL_DOWNLOAD_LINK = MODELS_DOWNLOAD_BASE + MODEL_DATE + '/' + MODEL_TAR_FILENAME
PATH_TO_MODEL_TAR = os.path.join(MODELS_DIR, MODEL_TAR_FILENAME)
PATH_TO_CKPT = os.path.join(MODELS_DIR, MODEL_NAME, 'checkpoint/')
PATH_TO_CFG = os.path.join(MODELS_DIR, MODEL_NAME, 'pipeline.config')
# The checkpoint folder doubles as the "already downloaded" marker, so the
# archive is only fetched when that folder is missing.
if not os.path.exists(PATH_TO_CKPT):
    print('Downloading model. This may take a while... ', end='')
    try:
        urllib.request.urlretrieve(MODEL_DOWNLOAD_LINK, PATH_TO_MODEL_TAR)
        # The context manager closes the archive even if extraction fails.
        with tarfile.open(PATH_TO_MODEL_TAR) as tar_file:
            # The archive is fetched over plain HTTP, so refuse any member
            # that would be written outside MODELS_DIR (path traversal).
            extract_root = os.path.realpath(MODELS_DIR)
            for member in tar_file.getmembers():
                member_path = os.path.realpath(os.path.join(extract_root, member.name))
                if member_path != extract_root and \
                        not member_path.startswith(extract_root + os.sep):
                    raise RuntimeError(
                        'Unsafe path in model archive: ' + member.name)
            tar_file.extractall(MODELS_DIR)
    finally:
        # Never leave a partial or already-extracted archive behind.
        if os.path.exists(PATH_TO_MODEL_TAR):
            os.remove(PATH_TO_MODEL_TAR)
    print('Done')

# Download labels file
LABEL_FILENAME = 'mscoco_label_map.pbtxt'
LABELS_DOWNLOAD_BASE = \
    'https://raw.githubusercontent.com/tensorflow/models/master/research/object_detection/data/'
# The label map is stored inside the extracted model folder.
PATH_TO_LABELS = os.path.join(MODELS_DIR, MODEL_NAME, LABEL_FILENAME)
if not os.path.exists(PATH_TO_LABELS):
    print('Downloading label file... ', end='')
    urllib.request.urlretrieve(LABELS_DOWNLOAD_BASE + LABEL_FILENAME, PATH_TO_LABELS)
    print('Done')

# Then Load the SSD_MobileNet model

In [4]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'    # Suppress TensorFlow logging
import tensorflow as tf
from object_detection.utils import label_map_util
from object_detection.utils import config_util
from object_detection.utils import visualization_utils as viz_utils
from object_detection.builders import model_builder

tf.get_logger().setLevel('ERROR')           # Suppress TensorFlow logging (2)

# Enable GPU dynamic memory allocation
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth can only be changed before the GPUs are initialized
        # (for example, before any TensorFlow op has run in this kernel).
        # Report it and keep going instead of aborting the whole cell.
        print(f'Could not enable memory growth for {gpu}: {err}')

# Load pipeline config and build a detection model
configs = config_util.get_configs_from_pipeline_file(PATH_TO_CFG)
model_config = configs['model']
detection_model = model_builder.build(model_config=model_config, is_training=False)

# Restore checkpoint
# expect_partial() silences warnings about checkpoint values that are not
# matched by the model built here.
ckpt = tf.compat.v2.train.Checkpoint(model=detection_model)
ckpt.restore(os.path.join(PATH_TO_CKPT, 'ckpt-0')).expect_partial()

@tf.function
def detect_fn(image):
    """Run the detection model on `image`.

    The input goes through the model's three stages in order:
    preprocess, predict, postprocess.

    Args:
        image: input passed straight to `detection_model.preprocess`.
            The webcam cell in this notebook passes a float32 tensor with
            a leading batch dimension.

    Returns:
        A 3-tuple of (post-processed detections, raw prediction dict,
        the shapes returned by preprocessing flattened to one dimension).
    """
    preprocessed, true_shapes = detection_model.preprocess(image)
    raw_predictions = detection_model.predict(preprocessed, true_shapes)
    final_detections = detection_model.postprocess(raw_predictions, true_shapes)
    flat_shapes = tf.reshape(true_shapes, [-1])

    return final_detections, raw_predictions, flat_shapes

# Now, Load label map data (for plotting)
The label map translates the numeric class ids used by the model into readable names. For instance, label id 1 corresponds to `person` and label id 10 to `traffic light`.

In [5]:
# Map of label id -> category dict (with 'id' and 'name' keys), read from the
# label map file.
category_index = label_map_util.create_category_index_from_labelmap(
    PATH_TO_LABELS,
    use_display_name=True,
)
# Reverse lookup: display name -> label id.
name_to_id = dict(
    (entry['name'], class_id) for class_id, entry in category_index.items()
)
print(category_index)
print(name_to_id)

{1: {'id': 1, 'name': 'person'}, 2: {'id': 2, 'name': 'bicycle'}, 3: {'id': 3, 'name': 'car'}, 4: {'id': 4, 'name': 'motorcycle'}, 5: {'id': 5, 'name': 'airplane'}, 6: {'id': 6, 'name': 'bus'}, 7: {'id': 7, 'name': 'train'}, 8: {'id': 8, 'name': 'truck'}, 9: {'id': 9, 'name': 'boat'}, 10: {'id': 10, 'name': 'traffic light'}, 11: {'id': 11, 'name': 'fire hydrant'}, 13: {'id': 13, 'name': 'stop sign'}, 14: {'id': 14, 'name': 'parking meter'}, 15: {'id': 15, 'name': 'bench'}, 16: {'id': 16, 'name': 'bird'}, 17: {'id': 17, 'name': 'cat'}, 18: {'id': 18, 'name': 'dog'}, 19: {'id': 19, 'name': 'horse'}, 20: {'id': 20, 'name': 'sheep'}, 21: {'id': 21, 'name': 'cow'}, 22: {'id': 22, 'name': 'elephant'}, 23: {'id': 23, 'name': 'bear'}, 24: {'id': 24, 'name': 'zebra'}, 25: {'id': 25, 'name': 'giraffe'}, 27: {'id': 27, 'name': 'backpack'}, 28: {'id': 28, 'name': 'umbrella'}, 31: {'id': 31, 'name': 'handbag'}, 32: {'id': 32, 'name': 'tie'}, 33: {'id': 33, 'name': 'suitcase'}, 34: {'id': 34, 'name'

# Define the speech helpers (text-to-speech and voice-command listener)

In [6]:
import cv2
import speech_recognition as sr
import pyttsx3 
import threading

def SpeakText(command):
    """Convert text to speech.

    A new pyttsx3 engine is initialized on every call, `command` is queued
    on it, and the engine is then run with runAndWait().

    Args:
        command: the text to speak.
    """
    tts_engine = pyttsx3.init()
    tts_engine.say(command)
    tts_engine.runAndWait()

# State shared between the camera loop (which updates it for every frame) and
# the speech thread (which only reads it).
detected_objects = []   # display names of the objects currently considered visible
scores = []             # per-detection confidence scores of the latest frame
classes = []            # per-detection label ids of the latest frame; defined here so
                        # a voice command given before the first frame cannot raise NameError


def _requested_object(text, known_names):
    """Return the object name asked for at the end of `text`.

    The longest trailing phrase that is a known label wins, so multi-word
    labels such as "traffic light" or "teddy bear" are matched as a whole.
    When nothing matches, the last word of `text` is returned.
    """
    words = text.split()
    if not words:
        return ""
    longest_label = max((len(name.split()) for name in known_names), default=1)
    for word_count in range(min(longest_label, len(words)), 0, -1):
        candidate = " ".join(words[-word_count:])
        if candidate in known_names:
            return candidate
    return words[-1]


def _find_response(target, visible, min_score):
    """Build the answer to a "find <target>" request from the latest frame."""
    visible_text = ', '.join(visible)
    target_id = name_to_id.get(target)

    # Grab the current arrays once; the camera loop rebinds both names on every
    # frame. zip() stops at the shorter one, so arrays taken from two different
    # frames can never cause an IndexError.
    frame_classes, frame_scores = classes, scores
    best_score = None
    if target_id is not None:
        best_score = max(
            (score for class_id, score in zip(frame_classes, frame_scores)
             if class_id == target_id),
            default=None,
        )

    if best_score is None:
        # Unknown label, or the label is not among the current detections.
        return f"I cannot see {target}. I can see: {visible_text}"
    if best_score > min_score:
        return "I can see it."
    return f"I cannot see what you want. I can see: {visible_text}"


def recognize_speech(min_score=0.5):
    """Listen on the microphone forever and answer spoken questions.

    Supported phrases (matched as substrings of the recognized text):
      * "what can you see": lists the names in `detected_objects`.
      * "find ... <object>": reports whether <object> is among the current
        detections with a score above `min_score`.

    Every answer is printed and spoken with SpeakText(). Unrecognizable audio
    and API request errors are reported and the loop keeps listening.

    The function reads the module-level recognizer `r` and the shared
    `detected_objects`, `classes` and `scores`, so it must be started after
    `r` has been created. It never returns; run it in a daemon thread.

    Args:
        min_score: score a detection must exceed to count as "seen".
    """
    with sr.Microphone() as source:
        print("Adjusting for noise... Please wait.")
        # Calibrate once up front. Doing it inside the loop would use up
        # listening time on every iteration and could calibrate on speech.
        r.adjust_for_ambient_noise(source)
        print("Listening...")
        while True:
            try:
                audio = r.listen(source)
                MyText = r.recognize_google(audio).lower()
            except sr.UnknownValueError:
                print("Could not understand audio")
                continue
            except sr.RequestError as e:
                print(f"API request error: {e}")
                continue

            print(f"Did you say: {MyText}")

            if "what can you see" in MyText:
                # Copy the list so the answer is built from one consistent
                # view while the camera loop keeps updating it.
                visible = list(detected_objects)
                if visible:
                    response = f"I can see: {', '.join(visible)}"
                else:
                    response = "I can't see anything clearly."
                print(response)
                SpeakText(response)

            if "find" in MyText:
                obj1 = _requested_object(MyText, name_to_id)
                response = f"Looking for: {obj1}"
                print(response)
                SpeakText(response)

                # Evaluate after the announcement so the answer reflects the
                # most recent frame.
                response = _find_response(obj1, list(detected_objects), min_score)
                print(response)
                SpeakText(response)

# Run the following code for webcam (Credit goes to Krish Naik)

In [7]:
import numpy as np
import pandas as pd

# Score a detection must exceed to be reported by the speech thread.
SPEECH_MIN_SCORE = 0.5
# Score a detection must reach to get a box drawn on the preview.
DRAW_MIN_SCORE = 0.6
# Added to the class indices returned by the model so they line up with the
# ids in category_index.
label_id_offset = 1

cap = cv2.VideoCapture(0)
# Initialize the recognizer
r = sr.Recognizer()

# Start speech recognition in a separate thread. The daemon thread outlives
# this cell, so only start a new one if none is still running; otherwise
# re-running the cell would leave several listeners answering at once.
if not ('speech_thread' in globals() and speech_thread.is_alive()):
    speech_thread = threading.Thread(target=recognize_speech, daemon=True)
    speech_thread.start()

try:
    if not cap.isOpened():
        raise RuntimeError('Could not open the webcam (device 0).')

    while True:
        # Read frame from camera
        ret, image_np = cap.read()
        if not ret:
            # No frame was delivered (camera unplugged, busy, ...).
            print('Failed to read a frame from the camera; stopping.')
            break

        # Things to try:
        # Flip horizontally
        # image_np = np.fliplr(image_np).copy()

        # Convert image to grayscale
        # image_np = np.tile(
        #     np.mean(image_np, 2, keepdims=True), (1, 1, 3)).astype(np.uint8)

        # OpenCV delivers frames in BGR channel order while the detection
        # model is trained on RGB images, so convert before inference. The
        # original BGR frame is kept for drawing and display.
        image_rgb = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)

        # Add the batch dimension: the model is fed a [1, height, width, 3] tensor.
        input_tensor = tf.convert_to_tensor(np.expand_dims(image_rgb, 0), dtype=tf.float32)
        detections, predictions_dict, shapes = detect_fn(input_tensor)

        # Convert the first (only) batch entry to numpy once and reuse it.
        boxes = detections['detection_boxes'][0].numpy()
        scores = detections['detection_scores'][0].numpy()
        classes = (detections['detection_classes'][0].numpy() + label_id_offset).astype(int)

        # Collect the distinct names of confident detections, then publish them
        # with a single slice assignment. The speech thread therefore never
        # observes the half-filled list that clear() followed by append() exposed.
        visible_names = []
        for class_id, score in zip(classes, scores):
            if score > SPEECH_MIN_SCORE:
                name = category_index.get(class_id, {'name': 'unknown'})['name']
                if name not in visible_names:
                    visible_names.append(name)
        detected_objects[:] = visible_names

        image_np_with_detections = image_np.copy()

        viz_utils.visualize_boxes_and_labels_on_image_array(
              image_np_with_detections,
              boxes,
              classes,
              scores,
              category_index,
              use_normalized_coordinates=True,
              max_boxes_to_draw=200,
              min_score_thresh=DRAW_MIN_SCORE,
              agnostic_mode=False)

        # Display output
        cv2.imshow('object detection', cv2.resize(image_np_with_detections, (800, 600)))

        # Press "q" in the preview window to stop.
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and close the window, including when the cell is
    # interrupted from the notebook (KeyboardInterrupt) or an error occurs.
    cap.release()
    cv2.destroyAllWindows()

Adjusting for noise... Please wait.
Listening...
Did you say: find me a person
Looking for: person
I can see it.
Did you say: find me a person
Looking for: person
I cannot see what you want. I can see: 
Did you say: find me a person find me a book
Looking for: book
I cannot see what you want. I can see: person
Did you say: find me a person
Looking for: person
I can see it.
Did you say: find a person
Looking for: person
I cannot see what you want. I can see: 
Did you say: find me a book
Looking for: book
I can see it.
Did you say: find me a book
Looking for: book
I cannot see book. I can see: person
Did you say: okay it's so good it's actually incredible chat
Did you say: chat
Did you say: let's get rid of the cannot understand


KeyboardInterrupt: 

Exception in thread Thread-3:
Traceback (most recent call last):
  File "C:\Users\Sumayya\AppData\Local\Temp\ipykernel_13808\1630875376.py", line 30, in recognize_speech
  File "C:\Users\Sumayya\anaconda3\envs\tensorflow\lib\site-packages\speech_recognition\recognizers\google.py", line 262, in recognize_legacy
    return output_parser.parse(response_text)
  File "C:\Users\Sumayya\anaconda3\envs\tensorflow\lib\site-packages\speech_recognition\recognizers\google.py", line 134, in parse
    actual_result = self.convert_to_result(response_text)
  File "C:\Users\Sumayya\anaconda3\envs\tensorflow\lib\site-packages\speech_recognition\recognizers\google.py", line 183, in convert_to_result
    raise UnknownValueError()
speech_recognition.exceptions.UnknownValueError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Sumayya\anaconda3\envs\tensorflow\lib\threading.py", line 980, in _bootstrap_inner
    self.run()
  File "C:\U

# Run the following code for video. 
Put the video (for example, test2.mp4) in the object_detection folder (\models\research\object_detection). 