# Vision AI

## Imports

In [241]:
import cv2

import os
import asyncio

from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from dotenv import load_dotenv, find_dotenv

import google.generativeai as genai
import google
import pathlib

import edge_tts

import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

import time

import speech_recognition as sr

import math

import  pyaudio
import wave

from concurrent.futures import ThreadPoolExecutor

## Jarvis 🤖

In [242]:
# Load variables from a .env file into the process environment, then read
# the Gemini API key from it (None when API_GEMINI is not defined).
load_dotenv()
API_KEY = os.environ.get("API_GEMINI")

In [243]:
# System instruction given to the Gemini model (kept in Portuguese, since it
# is runtime text).
# NOTE(review): the trailing "{text}" placeholder is never formatted anywhere
# in this file, so it reaches the model verbatim - confirm whether intended.
template = (
    '\n'
    'Você é um assistente e eu sou o seu mestre. '
    'Você é designado a me ajudar para solucionar qualquer questão lógica que eu fizer. '
    'Sempre me trate e se refira a mim como "Mestre". '
    'Não use na sua resposta caracteres especiais como (***): {text}'
    '\n'
)

In [244]:
# Register the API key with the SDK, then build the shared model handle that
# every *_to_text helper uses; `template` becomes its system instruction.
genai.configure(api_key=API_KEY)
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    system_instruction=template,
)

In [245]:
# File that Voice() writes the synthesized speech to.
OUTPUT_FILE = 'text_speech.mp3'

# Available edge-tts voices; the first entry is the one in use.
VOICES = ['pt-BR-AntonioNeural']
VOICE = VOICES[0]

In [246]:
async def Voice(translate) -> None:
    """Synthesize `translate` with edge-tts and save the audio to OUTPUT_FILE.

    Args:
        translate: Text to be spoken, using the module-level VOICE.
    """
    await edge_tts.Communicate(translate, VOICE).save(OUTPUT_FILE)

In [247]:
def delete_cahche_files():
    """Delete every file returned by ``genai.list_files()``.

    Each file name is printed before the file is deleted. The loop covers
    everything the listing returns, not only files uploaded by the current
    run.
    """
    print("My files:")
    for uploaded in genai.list_files():
        print("  ", uploaded.name)
        # list_files() already yields File objects, so delete them directly
        # instead of paying an extra get_file() round trip per file.
        uploaded.delete()

In [248]:
async def text_to_text(prompt) -> None:
    """Send a text prompt to the Gemini model and speak the answer.

    Args:
        prompt: Text forwarded to ``model.generate_content``.
    """
    response = model.generate_content(prompt)
    # Speech synthesis and playback only run when this code executes as the
    # main module; when imported, only the model request is made.
    if __name__ == "__main__":
        await Voice(response.text)
        # Play the file Voice() just wrote. Using OUTPUT_FILE keeps the two
        # in sync if the output name ever changes.
        # NOTE(review): passing a bare file name to os.system presumably
        # relies on the Windows shell opening it with the default player.
        os.system(OUTPUT_FILE)

In [249]:
async def image_to_text(image_path, prompt) -> None:
    """Send a JPEG image plus a prompt to the Gemini model and speak the answer.

    Args:
        image_path: Path of the image; its bytes are sent with the
            ``image/jpeg`` MIME type.
        prompt: Text sent alongside the image.
    """
    image_part = {
        'mime_type': 'image/jpeg',
        'data': pathlib.Path(image_path).read_bytes(),
    }
    response = model.generate_content([image_part, prompt])
    await Voice(response.text)
    # Play the file Voice() just wrote (OUTPUT_FILE rather than a duplicated
    # literal, so both always refer to the same path).
    os.system(OUTPUT_FILE)

In [250]:
async def video_to_text(video_file_name, prompt) -> None:
    """Upload a video, ask the Gemini model about it and speak the answer.

    Args:
        video_file_name: Path of the video file to upload.
        prompt: Text sent to the model together with the uploaded video.

    Raises:
        ValueError: If the uploaded file ends in the ``FAILED`` state.
    """
    video_file = genai.upload_file(path=video_file_name)

    try:
        # Poll until the service has finished processing the upload. Use the
        # asyncio sleep so the coroutine does not block the event loop.
        while video_file.state.name == "PROCESSING":
            print('.', end='')
            await asyncio.sleep(10)
            video_file = genai.get_file(video_file.name)

        if video_file.state.name == "FAILED":
            raise ValueError(video_file.state.name)

        print("Making LLM inference request...")
        response = model.generate_content(
            [video_file, prompt],
            request_options={"timeout": 600},
        )

        await Voice(response.text)
        os.system(OUTPUT_FILE)
    finally:
        # Remove uploaded files even when processing or inference fails, so
        # a failed run does not leave them behind on the service.
        delete_cahche_files()

## Functions

- ### Foto

In [251]:
def save_foto(frame):
    """Write `frame` as a timestamped JPEG under ``Images/``.

    Args:
        frame: Image passed to ``cv2.imwrite``.

    Returns:
        The relative path of the written file, ``Images/<YYYYmmdd_HHMMSS>.jpg``.
        The return value of ``cv2.imwrite`` is not checked.
    """
    # Without the directory the write would not produce a file; create it
    # up front.
    os.makedirs("Images", exist_ok=True)

    timestamp = time.strftime("%Y%m%d_%H%M%S")
    photo_path = f"Images/{timestamp}.jpg"
    cv2.imwrite(photo_path, frame)

    # Short pause kept from the original implementation.
    time.sleep(0.5)

    return photo_path

- ### Gravar Video

In [252]:
def save_video():
    """Record frames from the global ``cap`` into a timestamped AVI file.

    Up to 15 seconds' worth of frames at 30 fps (450 frames) are read from
    the module-level ``cap`` capture and written with the XVID codec.

    Returns:
        The relative path of the recording, ``Video/<YYYYmmdd_HHMMSS>.avi``.
    """
    duration_in_seconds = 15
    fps = 30
    total_frames = duration_in_seconds * fps

    os.makedirs("Video", exist_ok=True)
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    video_path = f'Video/{timestamp}.avi'

    # Size the writer from the capture instead of assuming 640x480, falling
    # back to the previous default when the backend reports 0.
    # NOTE(review): presumably frames whose size differs from the writer's
    # are not encoded - confirm against the OpenCV docs.
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) or 640
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) or 480

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(video_path, fourcc, fps, (width, height))
    try:
        for _ in range(total_frames):
            status, frame = cap.read()
            if not status:
                # A failed read yields no frame to write; stop recording.
                break
            out.write(frame)
    finally:
        # Release the writer so the file is closed before callers use it.
        out.release()

    return video_path

- ### Audio

In [253]:
async def Mic() -> "bool | None":
    """Listen on the microphone, transcribe the speech and answer it.

    The captured audio is transcribed with Google's recognizer (pt-BR) and
    the resulting sentence is forwarded to ``text_to_text``.

    Returns:
        Whatever ``text_to_text`` returns on success, or ``False`` when the
        speech could not be understood or the recognition request failed.
    """
    recognizer = sr.Recognizer()
    print("Diga alguma coisa: ")
    with sr.Microphone() as source:
        audio = recognizer.listen(source)

    try:
        frase = recognizer.recognize_google(audio, language="pt-BR")
    except sr.UnknownValueError:
        print("Não entendi")
        return False
    except sr.RequestError as error:
        # The recognizer could not be reached or rejected the request;
        # report it instead of crashing the gesture loop.
        print(f"Falha no serviço de reconhecimento de voz: {error}")
        return False

    print(frase)
    return await text_to_text(frase)

In [254]:
def Audio():
    """Record about 15 seconds from the microphone and return the transcript.

    The recording is written to ``audio/gravacao.wav`` (mono, 16-bit,
    44000 Hz) and then transcribed with Google's recognizer (pt-BR).
    A ``KeyboardInterrupt`` stops the recording early; whatever was captured
    so far is still saved and transcribed.

    Returns:
        The recognized text, or ``"Sem Falas!"`` when no speech was
        recognized.
    """
    sample_rate = 44000
    chunk_size = 1024
    record_seconds = 15
    wav_path = "audio/gravacao.wav"

    recorder = pyaudio.PyAudio()
    try:
        stream = recorder.open(
            input=True,
            format=pyaudio.paInt16,
            channels=1,
            rate=sample_rate,
            frames_per_buffer=chunk_size,
        )
        frames = []
        deadline = time.time() + record_seconds
        try:
            while time.time() < deadline:
                frames.append(stream.read(chunk_size))
        except KeyboardInterrupt:
            # Deliberate: an interrupt just ends the recording early.
            pass
        finally:
            # Always release the input stream, even if reading failed.
            stream.stop_stream()
            stream.close()
        sample_width = recorder.get_sample_size(pyaudio.paInt16)
    finally:
        recorder.terminate()

    os.makedirs("audio", exist_ok=True)
    # The context manager closes the file even if writing raises.
    with wave.open(wav_path, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setframerate(sample_rate)
        wav_file.setsampwidth(sample_width)
        wav_file.writeframes(b"".join(frames))

    recognizer = sr.Recognizer()
    with sr.WavFile(wav_path) as source:
        recorded = recognizer.record(source)

    try:
        return recognizer.recognize_google(recorded, language="pt-BR")
    except (sr.UnknownValueError, LookupError):
        # Unintelligible audio is reported as UnknownValueError;
        # LookupError is kept because the original code caught it.
        return "Sem Falas!"

- ### Imagem / Audio

In [255]:
async def Image_Audio(frame) -> None:
    """Save a photo and record a spoken prompt in parallel, then query the model.

    ``save_foto(frame)`` and ``Audio()`` run on worker threads; once both
    have finished, the photo path and the transcript go to ``image_to_text``.

    Args:
        frame: Image handed to ``save_foto``.
    """
    with ThreadPoolExecutor() as pool:
        photo_job = pool.submit(save_foto, frame)
        speech_job = pool.submit(Audio)

        # result() blocks until the corresponding worker is done.
        image_path, spoken_prompt = photo_job.result(), speech_job.result()

    await image_to_text(image_path, spoken_prompt)

- ### Video / Audio

In [256]:
async def Video_Audio() -> None:
    """Record video and a spoken prompt in parallel, then query the model.

    ``save_video()`` and ``Audio()`` run on worker threads; once both have
    finished, the video path and the transcript go to ``video_to_text``.
    """
    with ThreadPoolExecutor() as pool:
        video_job = pool.submit(save_video)
        speech_job = pool.submit(Audio)

        # result() blocks until the corresponding worker is done.
        clip_path, spoken_prompt = video_job.result(), speech_job.result()

        await video_to_text(clip_path, spoken_prompt)

## Gestos

### DISTANCIA

In [257]:
def calcular_distancia(ponto1, ponto2):
    """Return the Euclidean distance between two 2-D points.

    Args:
        ponto1: Sequence whose first two items are the x and y coordinates.
        ponto2: Sequence whose first two items are the x and y coordinates.
    """
    delta_x = ponto1[0] - ponto2[0]
    delta_y = ponto1[1] - ponto2[1]
    return math.sqrt(delta_x ** 2 + delta_y ** 2)

- ### 👌 (Foto)

In [258]:
def ok(h, w, hand_landmarks, frame):
    """Detect the "OK" gesture and, when it matches, save the frame as a photo.

    The gesture matches when all of the following hold (pixel coordinates,
    y growing downwards):
      * thumb tip (4) and index tip (8) are closer than 5% of the frame width;
      * index landmark 5 is lower in the image than index landmark 6;
      * thumb landmark 1 is lower in the image than index landmark 6;
      * thumb landmark 3 has a larger x than index landmark 5.

    Args:
        h: Frame height in pixels, used to scale normalized y coordinates.
        w: Frame width in pixels, used to scale normalized x coordinates.
        hand_landmarks: Object whose ``landmark`` sequence holds points with
            normalized ``x`` / ``y`` attributes.
        frame: Image handed to ``save_foto`` when the gesture matches.

    Returns:
        The value of ``save_foto(frame)`` when the gesture matches,
        otherwise ``None``.
    """
    points = hand_landmarks.landmark

    def pixel(index):
        """Return landmark `index` as integer (x, y) pixel coordinates."""
        point = points[index]
        return int(point.x * w), int(point.y * h)

    thumb_tip = pixel(4)
    index_tip = pixel(8)
    _, thumb_base_y = pixel(1)
    thumb_joint_x, _ = pixel(3)
    index_base_x, index_base_y = pixel(5)
    _, index_mid_y = pixel(6)

    tips_touching = calcular_distancia(thumb_tip, index_tip) < 0.05 * w

    if (tips_touching
            and index_base_y > index_mid_y
            and thumb_base_y > index_mid_y
            and thumb_joint_x > index_base_x):
        return save_foto(frame)
    return None

- ### 👍 (Video)

In [259]:
def positive(h, w, hand_landmarks, frame):
    """Detect the thumbs-up gesture and, when it matches, record a video.

    The gesture matches when the thumb tip (4) is at least 5% of the frame
    height above thumb landmark 1, and each of the index, middle, ring and
    pinky tips (8, 12, 16, 20) is lower in the image than its base landmark
    (5, 9, 13, 17). Only y coordinates take part in the test.

    Args:
        h: Frame height in pixels, used to scale normalized y coordinates.
        w: Frame width in pixels (unused by this gesture).
        hand_landmarks: Object whose ``landmark`` sequence holds points with
            a normalized ``y`` attribute.
        frame: Current frame (unused by this gesture).

    Returns:
        The value of ``save_video()`` when the gesture matches, otherwise
        ``None``.
    """
    points = hand_landmarks.landmark

    def y_px(index):
        """Return the y pixel coordinate of landmark `index`."""
        return int(points[index].y * h)

    thumb_raised = y_px(4) < y_px(1) - 0.05 * h
    fingers_folded = all(
        y_px(tip) > y_px(base)
        for tip, base in ((8, 5), (12, 9), (16, 13), (20, 17))
    )

    if thumb_raised and fingers_folded:
        return save_video()
    return None

- ### ☝️ (Audio)

In [260]:
def speak(h, w, hand_landmarks, frame):
    """Detect the raised-index-finger gesture.

    The gesture matches when the index tip (8) is at least 5% of the frame
    height above index landmark 5, the thumb tip (4) has a larger x than
    thumb landmark 1, and the middle, ring and pinky tips (12, 16, 20) are
    lower in the image than their base landmarks (9, 13, 17).

    Args:
        h: Frame height in pixels, used to scale normalized y coordinates.
        w: Frame width in pixels, used to scale normalized x coordinates.
        hand_landmarks: Object whose ``landmark`` sequence holds points with
            normalized ``x`` / ``y`` attributes.
        frame: Current frame (unused by this gesture).

    Returns:
        ``True`` when the gesture matches, ``False`` otherwise.
    """
    points = hand_landmarks.landmark

    def x_px(index):
        """Return the x pixel coordinate of landmark `index`."""
        return int(points[index].x * w)

    def y_px(index):
        """Return the y pixel coordinate of landmark `index`."""
        return int(points[index].y * h)

    index_raised = y_px(8) < y_px(5) - 0.05 * h
    thumb_out = x_px(4) > x_px(1)
    others_folded = all(
        y_px(tip) > y_px(base)
        for tip, base in ((12, 9), (16, 13), (20, 17))
    )

    return index_raised and thumb_out and others_folded

- ### 👆 (Foto e Audio)

In [261]:
def L(h, w, hand_landmarks, frame):
    """Detect the "L" gesture (index finger up, thumb out).

    The gesture matches when the index tip (8) is at least 5% of the frame
    height above index landmark 6, the thumb tip (4) has a smaller x than
    thumb landmark 2, and the middle, ring and pinky tips (12, 16, 20) are
    lower in the image than their base landmarks (9, 13, 17).

    Args:
        h: Frame height in pixels, used to scale normalized y coordinates.
        w: Frame width in pixels, used to scale normalized x coordinates.
        hand_landmarks: Object whose ``landmark`` sequence holds points with
            normalized ``x`` / ``y`` attributes.
        frame: Current frame (unused by this gesture).

    Returns:
        ``True`` when the gesture matches, ``False`` otherwise.
    """
    points = hand_landmarks.landmark

    def x_px(index):
        """Return the x pixel coordinate of landmark `index`."""
        return int(points[index].x * w)

    def y_px(index):
        """Return the y pixel coordinate of landmark `index`."""
        return int(points[index].y * h)

    index_raised = y_px(8) < y_px(6) - 0.05 * h
    thumb_out = x_px(4) < x_px(2)
    others_folded = all(
        y_px(tip) > y_px(base)
        for tip, base in ((20, 17), (12, 9), (16, 13))
    )

    return index_raised and thumb_out and others_folded

- ### 🤟 (Video e Audio)

In [262]:
def rock(h, w, hand_landmarks, frame):
    """Detect the "rock" gesture (index and pinky raised).

    The gesture matches when the index tip (8) is at least 5% of the frame
    height above index landmark 6, the pinky tip (20) is at least 5% of the
    frame height above pinky landmark 18, and the middle and ring tips
    (12, 16) are lower in the image than their base landmarks (9, 13).
    Only y coordinates take part in the test.

    Args:
        h: Frame height in pixels, used to scale normalized y coordinates.
        w: Frame width in pixels (unused by this gesture).
        hand_landmarks: Object whose ``landmark`` sequence holds points with
            a normalized ``y`` attribute.
        frame: Current frame (unused by this gesture).

    Returns:
        ``True`` when the gesture matches, ``False`` otherwise.
    """
    points = hand_landmarks.landmark

    def y_px(index):
        """Return the y pixel coordinate of landmark `index`."""
        return int(points[index].y * h)

    margin = 0.05 * h
    index_raised = y_px(8) < y_px(6) - margin
    pinky_raised = y_px(20) < y_px(18) - margin
    middle_folded = y_px(12) > y_px(9)
    ring_folded = y_px(16) > y_px(13)

    return index_raised and pinky_raised and middle_folded and ring_folded

## RUN 🚀

In [263]:
# MediaPipe handles used by the capture loop: the hands solution module and
# the drawing helpers for the landmark overlay.
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils

# Hand detector for a video stream (not static images), at most two hands,
# with 0.5 detection and tracking confidence thresholds.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

In [264]:
cap = cv2.VideoCapture(0)

while cap.isOpened():
  
    ret, frame = cap.read()
    
    if not ret:
        print("Erro ao capturar o frame.")
        break
    
    # Espelhar a camera (Desativado)
    # frame = cv2.flip(frame, 1)    
    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb_frame)

    if results.multi_hand_landmarks and results.multi_handedness:
      
      for hand_landmarks, hand_handedness in zip(results.multi_hand_landmarks, results.multi_handedness):
        
        hand_label = hand_handedness.classification[0].label
        
        h, w, _ = frame.shape
        
        # Inverte as mãos Left/Right
        # Programa:
        
        if hand_label == "Right" and ok(h, w, hand_landmarks, frame):
          time.sleep(0.5)
        
        if hand_label == "Left" and positive(h, w, hand_landmarks, frame):
          time.sleep(0.5)
        
        if hand_label == "Right" and speak(h, w, hand_landmarks, frame):
          await Mic()
          time.sleep(0.5)
        
        if hand_label == "Left" and L(h, w, hand_landmarks, frame):
          await Image_Audio(frame)
          time.sleep(0.5)
        
        if hand_label == "Right" and rock(h, w, hand_landmarks, frame):
          await Video_Audio()
          time.sleep(0.5)
        
        mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)
        
    cv2.imshow("MediaPipe Hands - Gestos Especificos", frame) 
    
    if cv2.waitKey(1) & 0xFF == ord('q'):
      break

cap.release()
cv2.destroyAllWindows()