In [None]:
# ---------------------------------------------------------------
# User-configurable parameters
# ---------------------------------------------------------------

# Name of the player character; forwarded to main() together with
# the transcribed speech.
player_name = 'V'

# Game folder name; the character list is read from
# <game_name>/characters and the name is also forwarded to main().
game_name = 'Cyberpunk_2077'

# Keyboard key that, when pressed in the capture window, shows the
# "Speak" prompt and arms speech recognition.
interact_key = 't'

# Forwarded to main(); presumably toggles generation of the
# facial-animation video -- TODO confirm against functions.main.
facial_animation_switch = True

In [1]:
import cv2
from functions.grabscreen import grab_screen
import time
from functions.speech_to_text import speech_to_text
import speech_recognition as sr
import os
from functions.main import main
import threading
from pydub import AudioSegment
from pydub.playback import play

# Character list: the name of every sub-directory found in
# <game_name>/characters, with the 'default' folder left out.
characters = [
    entry.name
    for entry in os.scandir(os.path.join(game_name, 'characters'))
    if entry.is_dir() and entry.name != 'default'
]

# Screen region to capture: origin (left, top) and size in pixels.
left, top = 0, 0
width, height = 1920, 1080

# Preview window, resizable and initially sized like the captured region.
cv2.namedWindow("Screen Capture", cv2.WINDOW_NORMAL)
cv2.resizeWindow("Screen Capture", width, height)

# Appearance of the text drawn on top of the captured frame.
font = cv2.FONT_HERSHEY_TRIPLEX
font_scale = 0.7
font_color = (192, 192, 192)  # Light grey color
line_thickness = 1

# Loop state ------------------------------------------------------
# "Speak" prompt visibility and whether listening should begin.
speak_display = False
speech_recognition_start = False
# Timestamp (time.time()) of the most recent state change; the loop
# measures prompt/text display durations against it.
start_time = 0

# Latest speech-to-text results; the string "NULL" marks "no value".
transcribed_text = "NULL"
speech_to_text_error = "NULL"
# Which of the two results is currently being shown on screen.
transcribed_text_display = False
speech_to_text_error_display = False

# Set once the transcription has been shown long enough, telling the
# loop to call main().
main_function_start = False

# Speech recogniser instance used for microphone capture.
r = sr.Recognizer()

def play_audio(path='temp/audio.wav'):
    """Load an audio file with pydub and play it.

    Args:
        path: Location of the audio file to play. Defaults to
            'temp/audio.wav', the file this script has always played, so
            existing zero-argument calls (including
            ``threading.Thread(target=play_audio)``) behave as before.
    """
    audio = AudioSegment.from_file(path)
    play(audio)

def _draw_wrapped_text(frame, text, origin, chars_per_line=30, line_spacing=30):
    """Draw *text* on *frame*, split into fixed-width chunks stacked vertically.

    Args:
        frame: Image the text is drawn onto (modified in place by cv2.putText).
        text: String to display; it is cut every ``chars_per_line`` characters
            without regard to word boundaries.
        origin: ``(x, y)`` position of the first line.
        chars_per_line: Maximum number of characters per drawn line.
        line_spacing: Vertical distance in pixels between consecutive lines.
    """
    x0, y0 = origin
    for row, start in enumerate(range(0, len(text), chars_per_line)):
        cv2.putText(frame, text[start:start + chars_per_line], (x0, y0 + row * line_spacing),
                    font, font_scale, font_color, line_thickness)


# Main loop. Each iteration grabs one frame, draws whatever overlay the
# current state calls for, and advances the state:
#   interact key -> "Speak" prompt (2 s) -> listen -> show transcription or
#   error (3 s) -> main() -> audio only, or audio plus video overlay.
while True:
    screen = grab_screen(region=(left, top, left + width, top + height))
    screen = cv2.cvtColor(screen, cv2.COLOR_BGR2RGB)
    text_position = (screen.shape[1] - 420, 50)

    if speak_display:
        cv2.putText(screen, "Speak", text_position, font, font_scale, font_color, line_thickness)
        # Listening starts only after the prompt has been up for two seconds.
        speech_recognition_start = (time.time() - start_time) >= 2

    if transcribed_text_display:
        _draw_wrapped_text(screen, transcribed_text, text_position)
        # After three seconds hide the text and hand over to main().
        if time.time() - start_time >= 3:
            transcribed_text_display = False
            main_function_start = True

    if speech_to_text_error_display:
        _draw_wrapped_text(screen, speech_to_text_error, text_position)
        # The error message disappears after three seconds.
        if time.time() - start_time >= 3:
            speech_to_text_error_display = False

    if speech_recognition_start:
        with sr.Microphone() as source:
            audio = r.listen(source)

        transcribed_text, speech_to_text_error = speech_to_text(audio)
        # 'NULL' is the sentinel for "no transcription"; show the error instead.
        if transcribed_text != 'NULL':
            transcribed_text_display = True
        else:
            speech_to_text_error_display = True
        start_time = time.time()

        speak_display = False
        speech_recognition_start = False

    if main_function_start:
        # Consume the request immediately so that every exit path below
        # (audio only, end of video, or 'q' during playback) leaves the flag
        # cleared; otherwise quitting playback early would call main() again
        # on the next frame.
        main_function_start = False

        facial_animation_video_path, audio_path, coordinates = main(
            screen, transcribed_text, player_name, game_name, characters, facial_animation_switch)
        if facial_animation_video_path == "":
            # No video was produced: just play the audio.
            play_audio()
            continue
        x, y, w, h = coordinates[:4]

        # Save the screen and reload it; the video is composited onto the
        # reloaded copy, leaving `screen` itself untouched.
        cv2.imwrite('temp/screen.jpg', screen)
        image = cv2.imread('temp/screen.jpg')

        video = cv2.VideoCapture(facial_animation_video_path)

        # Per-frame delay derived from the video frame rate, computed once.
        # Fall back to ~30 FPS when the rate is reported as 0, and never go
        # below 1 ms because waitKey(0) would block until a key press.
        fps = video.get(cv2.CAP_PROP_FPS)
        delay = max(1, int(1000 / fps)) if fps > 0 else 33

        # Start the audio playback in a separate thread so it runs alongside
        # the video frames.
        audio_thread = threading.Thread(target=play_audio)
        audio_thread.start()

        while True:
            ret, frame = video.read()
            if not ret:
                # End of video (or it could not be read).
                break

            # Fit the frame to the region of interest and paste it in.
            frame = cv2.resize(frame, (w, h))
            image[y:y+h, x:x+w] = frame

            cv2.imshow('Screen Capture', image)

            # 'q' stops the video early.
            if cv2.waitKey(delay) & 0xFF == ord('q'):
                break

        # Release resources
        video.release()

        # Wait for the audio playback thread to finish
        audio_thread.join()

    cv2.imshow("Screen Capture", screen)

    # Poll the keyboard exactly once per frame: a second waitKey() call would
    # miss any key already consumed by the first one.
    key = cv2.waitKey(1) & 0xFF

    if key == ord("q"):
        break

    if key == ord(interact_key) and not speak_display:
        speak_display = True
        start_time = time.time()

# Release the window and resources
cv2.destroyAllWindows()