In [None]:
#PART 1

# Import the necessary libraries
import os
from PIL import Image
import numpy as np
from rapidocr_onnxruntime import RapidOCR

# File paths (placeholders: replace with real locations before running)
# image_path: screenshot that is cropped and passed to OCR.
image_path = "filepath_to_the_screenshot_of_the_game.file_extension"
# text_file_path: word list; each line is read later as two whitespace-separated
# fields, "<audio filename without extension> <word>".
text_file_path = "filepath_to_the_textfile_of_all_zenbeach_words/zenbeach.txt"
# audio_folder_path: folder searched for "<audio filename>.wav" recordings.
audio_folder_path = "filepath_to_the_audiofolder_of_all_zenbeach_audiofiles/zenbeach_folder"
# output_dir: folder that receives the TTS-generated .wav files.
output_dir = "filepath_to_the_outputfolder_of_the_generated_audiofiles/zenbeach_audio_output"

# Create the output directory up front; exist_ok avoids an error if it already exists
os.makedirs(output_dir, exist_ok=True)

# Image cropping settings: each value is the fraction (0.0 to 1.0) of the image
# width/height trimmed from that edge before OCR.
CROP_TOP_PERCENT = 0.15    # Fraction of the height removed from the top
CROP_BOTTOM_PERCENT = 0.50 # Fraction of the height removed from the bottom
CROP_LEFT_PERCENT = 0.32   # Fraction of the width removed from the left
CROP_RIGHT_PERCENT = 0.32  # Fraction of the width removed from the right

# Word processing settings
WORD_GAP_THRESHOLD_FACTOR = 0.5 # Multiplied by the previous OCR box width: a fragment whose horizontal gap is below that product is merged into the current word
SPEECH_RATE = 0.4 # Rate passed to librosa's time_stretch for TTS-generated audio; values below 1.0 slow the speech down

# Image Loading and Cropping
# Opens the screenshot, trims the configured fraction from each edge and
# converts the result to a NumPy array (cropped_img_np) for the OCR step.
# On any failure the script stops with a non-zero exit status.
print(f"Loading image: {image_path}")
try:
    # The context manager releases the underlying file handle as soon as the
    # pixel data has been copied into the NumPy array below.
    with Image.open(image_path) as img_pil:
        w, h = img_pil.size
        print(f"Original image size: {w}x{h}")

        # Crop box in pixels: (left, top) is inclusive, (right, bottom) exclusive.
        left = int(w * CROP_LEFT_PERCENT)
        top = int(h * CROP_TOP_PERCENT)
        right = int(w * (1 - CROP_RIGHT_PERCENT))
        bottom = int(h * (1 - CROP_BOTTOM_PERCENT))

        if left >= right or top >= bottom:
            # The percentages overlap, so the box would be empty: fall back to
            # the full image rather than failing.
            print(f"Warning: Invalid crop dimensions ({left},{top},{right},{bottom}). Check percentages.")
            cropped_img_pil = img_pil
            print("Using original image due to invalid crop.")
        else:
            print(f"Cropping to box: ({left}, {top}, {right}, {bottom})")
            cropped_img_pil = img_pil.crop((left, top, right, bottom))
            print(f"Cropped image size: {cropped_img_pil.width}x{cropped_img_pil.height}")

        # Must happen while the file is still open: this forces the pixel
        # data to be read when the uncropped image is used.
        cropped_img_np = np.array(cropped_img_pil)

except FileNotFoundError:
    print(f"Error: Image file not found at {image_path}")
    # Exit with a failure status (a bare exit() would report success).
    raise SystemExit(1) from None
except Exception as e:
    print(f"Error during image loading or cropping: {e}")
    raise SystemExit(1) from None

# OCR Initialization
# Build the RapidOCR engine once; it is called like a function below.
print("Initializing RapidOCR engine...")
ocr_engine = RapidOCR()
print("RapidOCR engine initialized.")

# OCR Prediction
# `results` stays None when recognition fails, so the later stages can
# detect the failure with a simple truthiness check.
print("Running OCR on the cropped image...")
results = None
try:
    # The engine returns a pair; only the first element is used here.
    results, _ = ocr_engine(cropped_img_np)
except NameError:
    # Raised when the image-loading step did not define cropped_img_np.
    print("Error: Image data (cropped_img_np) is not available. Check loading/cropping steps.")
except Exception as ocr_error:
    print(f"Error occurred during OCR prediction: {ocr_error}")
else:
    print("OCR processing finished.")

# Process and Combine Results
def _x_extent(box):
    """Return ``(left_x, right_x)`` of a box given as a sequence of points (x first)."""
    xs = [point[0] for point in box]
    return min(xs), max(xs)


def _merge_adjacent_fragments(ocr_results, gap_factor):
    """Merge horizontally adjacent OCR fragments into whole words.

    Fragments are ordered by the x-coordinate of their first box point. A
    fragment is appended to the current word when the gap between its left
    edge and the previous fragment's right edge is smaller than
    ``gap_factor`` times the previous fragment's width (or smaller than 5
    when that width is not greater than 1).

    Args:
        ocr_results: Iterable of ``(box, text, confidence)`` entries, where
            ``box`` is a sequence of points whose first item is x.
        gap_factor: Fraction of the previous box width used as merge threshold.

    Returns:
        A list of ``(word_text, confidences)`` tuples in left-to-right order,
        where ``confidences`` holds one value per merged fragment.
    """
    ordered = sorted(ocr_results, key=lambda item: item[0][0][0])
    if not ordered:
        return []

    merged = []

    # Seed the running word with the left-most fragment.
    first = ordered[0]
    word_text = first[1]
    word_confidences_list = [first[2]]
    prev_left, prev_right = _x_extent(first[0])

    for box, text, confidence in ordered[1:]:
        left, right = _x_extent(box)
        prev_width = prev_right - prev_left

        # Degenerate (near zero-width) boxes get a fixed 5-unit threshold.
        threshold = gap_factor * prev_width if prev_width > 1 else 5

        if left - prev_right < threshold:
            word_text += text
            word_confidences_list.append(confidence)
        else:
            merged.append((word_text, word_confidences_list))
            word_text = text
            word_confidences_list = [confidence]

        # The gap is always measured against the immediately preceding fragment.
        prev_left, prev_right = left, right

    # Flush the final word; an empty trailing text is dropped.
    if word_text:
        merged.append((word_text, word_confidences_list))
    return merged


predictions = []        # Recognised words, left to right
word_confidences = {}   # word -> average confidence (a repeated word keeps its last value)

if results:
    combined_words_data = _merge_adjacent_fragments(results, WORD_GAP_THRESHOLD_FACTOR)

    # Collect the merged words and their average confidence
    for word, confidences in combined_words_data:
        predictions.append(word)
        word_confidences[word] = sum(confidences) / len(confidences)
else:
    print("No text detected or an error occurred during OCR.")

# Output
print("\n--- Combined OCR Prediction ---")
print(word_confidences) # Dictionary of predicted words and their average confidence
print(predictions) # List of predicted words



#PART 2

import difflib
import torch
import librosa
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import wave
import pyaudio
import sentencepiece

# Function to play audio files
def play_audio_file(audio_file_path, chunk=1024):
    """Play a WAV file through an output stream opened with PyAudio.

    Args:
        audio_file_path: Path of the WAV file to play.
        chunk: Number of frames read from the file and written to the
            output stream per iteration.

    Returns:
        True if the whole file was played, False if any error occurred
        (the error is printed, never raised).
    """
    try:
        with wave.open(audio_file_path, "rb") as wav_file:
            player = pyaudio.PyAudio()
            try:
                stream = player.open(
                    format=player.get_format_from_width(wav_file.getsampwidth()),
                    channels=wav_file.getnchannels(),
                    rate=wav_file.getframerate(),
                    output=True,
                )
                try:
                    # readframes() returns b"" at end of file, which ends the loop.
                    for frames in iter(lambda: wav_file.readframes(chunk), b""):
                        stream.write(frames)
                    stream.stop_stream()
                finally:
                    # Close the stream even if playback failed part-way.
                    stream.close()
            finally:
                # Always terminate the PyAudio instance, including when
                # opening the stream itself failed.
                player.terminate()
        print(f"Successfully played audio: {audio_file_path}")
        return True
    except Exception as e:
        print(f"Error playing audio {audio_file_path}: {e}")
        return False

# Text-to-speech setup: the processor and the speech model are loaded from the
# same checkpoint, the vocoder from its own.
tts_checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Voice configuration: take the x-vector stored at index 7306 of the validation
# split and add a leading dimension via unsqueeze(0).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_xvector = embeddings_dataset[7306]["xvector"]
speaker_embeddings = torch.tensor(speaker_xvector).unsqueeze(0)

# Single loop to process words and handle audio
# For every recognised word: play the matching recording if one exists,
# otherwise synthesise the word with TTS, save it to output_dir and play it.

# Characters that cannot appear in a file name on common platforms; OCR text is
# used as a file name below, so they are replaced with "_".
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '<>:"/\\|?*'})

if predictions:  # Check if there are any predictions
    # Parse the word list once instead of re-reading it for every word.
    # Only lines with exactly two fields ("<filename> <word>") are used, so
    # blank or malformed lines no longer abort the lookup.
    vocabulary = []
    try:
        with open(text_file_path, 'r') as file:
            for line in file:
                parts = line.split()
                if len(parts) == 2:
                    vocabulary.append((parts[0], parts[1]))
    except (OSError, UnicodeError) as e:
        # Without a usable word list every word falls back to TTS.
        print(f"Error reading word list {text_file_path}: {e}")

    known_words = [vocab_word for _, vocab_word in vocabulary]

    # Case-insensitive word -> filename lookup; the first occurrence wins.
    filename_by_word = {}
    for filename, vocab_word in vocabulary:
        filename_by_word.setdefault(vocab_word.lower(), filename)

    for word in predictions:
        try:
            # Find the closest match in the word list
            audio_path = None
            closest_match = difflib.get_close_matches(word, known_words, n=1, cutoff=0.6)
            if closest_match:
                matched_word = closest_match[0]
                print(f"Matched '{word}' to '{matched_word}'")

                # Map the matched word to its recording
                audio_path = os.path.join(
                    audio_folder_path, filename_by_word[matched_word.lower()] + ".wav"
                )

            # Try to play existing audio file if found
            if audio_path and os.path.exists(audio_path):
                print(f"Found audio file: {audio_path}")
                if play_audio_file(audio_path):
                    continue  # Skip TTS generation if audio played successfully

            # If we get here, either no audio file was found or playback failed
            # Generate speech using TTS
            print(f"Generating TTS audio for '{word}'...")
            inputs = processor(text=word, return_tensors="pt")
            with torch.no_grad():
                speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
            speech_np = speech.squeeze().cpu().numpy()
            speech_stretched = librosa.effects.time_stretch(speech_np, rate=SPEECH_RATE)

            # Save and play the generated audio
            safe_name = word.translate(_UNSAFE_FILENAME_CHARS)
            generated_audio_path = os.path.join(output_dir, f"{safe_name}.wav")
            sf.write(generated_audio_path, speech_stretched, samplerate=16000)

            # Try to play the generated audio
            if not play_audio_file(generated_audio_path):
                print(f"Failed to play generated audio for '{word}'")

        except Exception as e:
            # Keep going with the remaining words if one of them fails.
            print(f"Error processing word '{word}': {e}")
else:
    print("No predictions found.")


Loading image: C:/Users/diede/Downloads/ATE.jpeg
Original image size: 1600x1200
Cropping to box: (512, 180, 1088, 600)
Cropped image size: 576x420
Initializing RapidOCR engine...
RapidOCR engine initialized.
Running OCR on the cropped image...
RapidOCR engine initialized.
Running OCR on the cropped image...
OCR processing finished.

--- Combined OCR Prediction ---
{'ate': 0.9971916675567627}
['ate']
OCR processing finished.

--- Combined OCR Prediction ---
{'ate': 0.9971916675567627}
['ate']
Matched 'ate' to 'ate'
Found audio file: C:/Users/diede/Downloads/ZENBEACH\Island1.level4.ate.wav
Matched 'ate' to 'ate'
Found audio file: C:/Users/diede/Downloads/ZENBEACH\Island1.level4.ate.wav
Successfully played audio: C:/Users/diede/Downloads/ZENBEACH\Island1.level4.ate.wav
Successfully played audio: C:/Users/diede/Downloads/ZENBEACH\Island1.level4.ate.wav
