In [None]:
import base64
import html
import io
import os
import platform
import random
import re
import time

import cv2
import gradio as gr
import numpy as np
import requests
from PIL import Image, ImageDraw, ImageFont
from ultralytics import YOLO

In [None]:
# Lazily created detector shared by every call: loading the weights once
# instead of per frame keeps the webcam stream responsive.
_yolo_model = None


def detect_objects(image) :
    """Run YOLO object detection on one image and return the annotated frame.

    Args:
        image: Image to analyse. A 3-channel ``numpy.ndarray`` is assumed to
            be RGB (the order Gradio image components deliver) and is
            converted to the BGR order Ultralytics expects for NumPy input.
            Any other input type is passed to the model unchanged. ``None``
            is accepted and short-circuits.

    Returns:
        An RGB ``numpy.ndarray`` with the detections drawn on it, or ``None``
        when ``image`` is ``None``.
    """
    global _yolo_model

    # Streaming callbacks can fire before a frame is available.
    if image is None:
        return None

    if _yolo_model is None:
        _yolo_model = YOLO("yolo11n.pt")

    # Ultralytics interprets NumPy arrays as BGR; flip the channel axis so the
    # model sees the colours it was trained on.
    if isinstance(image, np.ndarray) and image.ndim == 3 and image.shape[2] == 3:
        image = np.ascontiguousarray(image[..., ::-1])

    results = _yolo_model(image)

    # results[0] also exposes boxes, masks, keypoints, probs and obb; only the
    # rendered overlay is needed here. plot() returns BGR, so flip back to RGB
    # for display.
    annotated_bgr = results[0].plot()
    return np.ascontiguousarray(annotated_bgr[..., ::-1])

In [None]:
def get_font():
    """Return the 40-pixel font used for drawing text.

    On Windows the ``BCCARDB.ttf`` TrueType font is preferred. If that file
    cannot be loaded, or on any other platform, Pillow's default font is
    returned at the same size.

    Returns:
        A Pillow font object sized to 40 pixels.
    """
    font_size = 40
    if platform.system() == "Windows":
        try:
            return ImageFont.truetype("BCCARDB.ttf", size=font_size)
        except OSError:
            # The font is not installed on this machine; use the same default
            # as the non-Windows branch instead of failing.
            pass

    return ImageFont.load_default(size=font_size)

def random_color():
    """Return a random colour as a tuple of three integers, each in 0..255."""
    return tuple(random.randint(0, 255) for _channel in range(3))

In [None]:
# Service configuration. Values are read from environment variables so that
# keys never have to be typed into (and saved with) the notebook; each one
# falls back to the previous literal default when the variable is unset.

# Full URL the chat request is POSTed to, and the key sent in the "api-key" header.
OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT", "")
OPENAI_API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "")
# NOTE(review): not referenced by the request code visible here — presumably
# the deployment is already part of OPENAI_ENDPOINT; confirm.
DEPLOYMENT_NAME = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o")

# URL the SSML is POSTed to, and the key sent as "Ocp-Apim-Subscription-Key".
SPEECH_ENDPOINT = os.environ.get("AZURE_SPEECH_ENDPOINT", "")
SPEECH_API_KEY = os.environ.get("AZURE_SPEECH_API_KEY", "")

In [None]:
def chatbot_basic(image_array) :
    """Ask the chat endpoint to describe a YOLO-annotated image.

    The image is PNG-encoded, embedded as a base64 data URL and POSTed to
    ``OPENAI_ENDPOINT`` together with a Korean system/user prompt.

    Args:
        image_array: Array accepted by ``PIL.Image.fromarray``, or ``None``
            when nothing has been captured yet.

    Returns:
        The assistant's reply text. An empty string is returned when there is
        no image, the request fails or times out, the status is not 200, or
        the response body does not have the expected shape.
    """
    # Nothing captured yet: skip the request instead of crashing in fromarray.
    if image_array is None:
        return ""

    # Request headers: JSON payload authenticated with the API key.
    headers = {
        "Content-Type":"application/json",
        "api-key": OPENAI_API_KEY
    }

    # Serialise the frame to PNG in memory and base64-encode it for the data URL.
    image = Image.fromarray(image_array)
    with io.BytesIO() as buffered_io:
        image.save(buffered_io, format='png')
        base64_image = base64.b64encode(buffered_io.getvalue()).decode("utf-8")

    messages = [
        # System prompt: act as an image-analysis expert and answer in Korean.
        {"role":"system","content":[{"type":"text","text":"너는 사진속의 물체를 분석하는 전문가야, 분석결과는 한국어로 답변해줘"}]},
        # User turn: the instruction text plus the annotated image itself.
        {"role":"user","content":[{"type":"text", "text":"Yolo 모델로 물체를 감지한 이미지야, 이 이미지 및 감지된 물체에 대해 자세히 설명해줘"},{"type":"image_url","image_url":{"url":f"data:image/png;base64,{base64_image}"}}]},
    ]

    # Request body.
    body = {
        "messages": messages,
        "temperature": 0.2,
        "top_p": 0.9,
        "max_tokens": 1800,
    }

    # A timeout keeps a stalled connection from hanging the UI callback forever;
    # transport errors are folded into the same "" result as a non-200 status.
    try:
        response = requests.post(OPENAI_ENDPOINT, headers=headers, json=body, timeout=60)
    except requests.RequestException:
        return ""

    if response.status_code != 200 :
        return ""

    try:
        content = response.json()['choices'][0]['message']['content']
    except (ValueError, KeyError, IndexError, TypeError):
        # Body was not JSON or lacked the expected choices/message structure.
        return ""

    # The API may return a null content; normalise to the documented str.
    return content or ""
    


In [None]:
def request_tts(text, voice="ko-KR-SeoHyeonNeural", file_name="response_audio.mp3") :
    """Synthesize ``text`` to an MP3 file through the speech endpoint.

    Args:
        text: Plain text to speak. It is XML-escaped before being embedded in
            the SSML document.
        voice: Voice name placed in the SSML ``<voice name=...>`` attribute.
        file_name: Path the returned audio bytes are written to.

    Returns:
        ``file_name`` on success. ``None`` when ``text`` is empty or blank,
        the request fails or times out, or the status is not 200.
    """
    # An empty utterance cannot produce audio; skip the round trip.
    if not text or not str(text).strip():
        return None

    content_type = "application/ssml+xml"
    output_format = "audio-16khz-64kbitrate-mono-mp3"

    # Escape markup characters so arbitrary text cannot break (or inject into)
    # the SSML document. The attribute is single-quoted, so quotes are escaped too.
    safe_voice = html.escape(voice, quote=True)
    safe_text = html.escape(str(text), quote=False)
    body_raw = f"""<speak version='1.0' xml:lang='ko-KR'><voice name='{safe_voice}'>{safe_text}</voice></speak>"""

    headers = {
        "Ocp-Apim-Subscription-Key" : SPEECH_API_KEY,
        "Content-Type" : content_type,
        "X-Microsoft-OutputFormat" : output_format
    }

    # Send explicit UTF-8 bytes: a str body may be encoded as latin-1 by the
    # HTTP stack, which fails on Korean text.
    try:
        response = requests.post(
            SPEECH_ENDPOINT,
            headers=headers,
            data=body_raw.encode("utf-8"),
            timeout=30,
        )
    except requests.RequestException:
        return None

    if response.status_code != 200 :
        return None

    with open(file_name, 'wb') as audio_file :
        audio_file.write(response.content)
    return file_name

In [None]:


def clean_text(text) :
    """Strip every character that is not Hangul, ASCII alphanumeric, whitespace, '!' or '?'.

    Args:
        text: Input string.

    Returns:
        ``text`` with all other characters removed.
    """
    disallowed = re.compile(r'[^가-힣a-zA-Z0-9\s!?]')
    return disallowed.sub('', text)

In [None]:
def change_tts(tts_text, voice_name) :
    """Convert text to speech and return the path of the generated audio file.

    The text is passed through ``clean_text`` first, and the audio is written
    to a file named after the current local time (second resolution).

    Args:
        tts_text: Text to speak.
        voice_name: Voice identifier forwarded to ``request_tts``.

    Returns:
        Whatever ``request_tts`` returns: the file name on success, else ``None``.
    """
    speakable = clean_text(tts_text)
    stamp = time.strftime("%Y%m%d_%H%M%S")
    return request_tts(
        speakable,
        voice=voice_name,
        file_name=f"response_audio_{stamp}.mp3",
    )

In [None]:
def change_chatbot(history, voice_name) :
    """Speak the latest assistant message of a chat history.

    Args:
        history: List of message dicts with ``"role"`` and ``"content"`` keys.
            May be empty or ``None``.
        voice_name: Voice identifier; ``None`` selects the default
            ``"ko-KR-SeoHyeonNeural"``.

    Returns:
        Path of the synthesized audio file, or ``None`` when there is nothing
        to speak (no history, last message is from the user, or the last
        message has no text) or synthesis failed.
    """
    # Covers None, [] and the legacy 0 sentinel; indexing an empty history
    # would otherwise raise.
    if not history :
        return None

    last_message = history[-1]
    if last_message['role'] == "user" :
        return None

    answer = last_message["content"]
    # Only plain, non-blank text can be sent to the speech service.
    if not isinstance(answer, str) or not answer.strip():
        return None

    if voice_name is None:
        voice_name = "ko-KR-SeoHyeonNeural"  # Default voice
    return change_tts(answer, voice_name)

In [None]:

# Custom Gradio theme: rose/red accents on a slate neutral palette, with
# Google-hosted font families tried in order before a generic CSS fallback.
# NOTE(review): two of the family names end with a space; they are kept
# exactly as written — confirm whether that is intentional.
_theme_font_families = ('Gowun Batang', 'IBM Plex Sans KR ', '42dot Sans ')

# Explicit pixel size for each step of the theme's text scale.
_theme_text_scale = gr.themes.Size(
    xxs="10px",
    xs="12px",
    sm="13px",
    md="15px",
    lg="17px",
    xl="24px",
    xxl="28px",
)

# Per-variable overrides applied on top of the base theme; values starting
# with '*' refer to other theme variables.
_theme_overrides = dict(
    body_background_fill='*background_fill_secondary',
    body_background_fill_dark='*neutral_800',
    body_text_color='*neutral_700',
    body_text_size='*text_md',
    embed_radius='*radius_md',
    block_radius='*radius_md',
    block_title_radius='*radius_md',
    block_title_text_size='*text_md',
    container_radius='*radius_md',
    input_text_size='*text_sm',
    button_large_text_size='*text_md',
    form_gap_width='0px',
)

theme = gr.themes.Origin(
    primary_hue="rose",
    secondary_hue="red",
    neutral_hue="slate",
    text_size=_theme_text_scale,
    radius_size="lg",
    font=[*(gr.themes.GoogleFont(family) for family in _theme_font_families), 'sans-serif'],
    font_mono=[*(gr.themes.GoogleFont(family) for family in _theme_font_families), 'monospace'],
).set(**_theme_overrides)

# Gradio UI: live webcam -> YOLO overlay -> captured still -> GPT description,
# with the latest assistant reply automatically read aloud.
with gr.Blocks(theme=theme) as demo :
    gr.Markdown("## Yolo와 OpenAI", height="40px")
    gr.Markdown("### 웹캠으로 화면을 촬영후 원하는 화면을 캡쳐하여 분석하세요!", height="30px")

    # Dropdown choices as (display label, voice id) pairs, grouped by gender.
    voice_list_female = [("서현","ko-KR-SeoHyeonNeural"), ("선희", "ko-KR-SunHiNeural"), ("순복","ko-KR-SoonBokNeural"), ("유진", "ko-KR-YuJinNeural"), ("지민","ko-KR-JiMinNeural")]
    voice_list_male = [("국민","ko-KR-GookMinNeural"), ("봉진","ko-KR-BongJinNeural"), ("인준", "ko-KR-InJoonNeural"), ("현수","ko-KR-HyunsuNeural")]

    # Top row: raw webcam feed, live detection overlay, and the captured still.
    with gr.Row() :
        webcam_input = gr.Image(label = "실시간 화면", sources="webcam", width=480, height=270, mirror_webcam=False, streaming=True)
        output_cam = gr.Image(label="검출 화면", type='pil', interactive=False)
        output_capture_image = gr.Image(label="캡처 화면", interactive=False)

    with gr.Row() :
        capture_btn = gr.Button("화면 캡쳐", size="sm")
        send_gpt_btn = gr.Button("GPT로 분석하기", size="sm")

    # Bottom area: chat transcript on the left, voice controls on the right.
    with gr.Column() :
        gr.Markdown("### GPT의 분석결과", height="30px")
        with gr.Row() :
            with gr.Column(scale=3) :
                gpt = gr.Chatbot(label="분석 결과", type="messages", height="800px")
            with gr.Column(scale=1) :
                gr.Markdown("### GPT목소리를 선택해 보세요", height="30px")
                select_voice_gender = gr.Radio(label="GPT 성별 선택", choices=["Female", "Male"], value="Female")
                select_voice_name = gr.Dropdown(label="GPT 이름", choices=voice_list_female, value="ko-KR-SeoHyeonNeural")
                chatbot_audio = gr.Audio(label="GPT Talk", interactive=False, autoplay=True)

    def stream_webcam(image) :
        """Annotate one streamed frame; pass through None when no frame is available."""
        if image is None:
            return None
        return detect_objects(image)

    def click_capture(image) :
        """Copy the current detection frame into the capture panel."""
        return image

    def click_send_gpt(image, history) :
        """Send the captured image to GPT and append the exchange to the chat.

        Returns the history unchanged when nothing has been captured yet.
        """
        # Work on a copy so the component's current value is not mutated in place.
        history = list(history or [])
        if image is None:
            return history

        gpt_content = chatbot_basic(image)
        history.append({"role": "user", "content": gr.Image(image)})
        history.append({"role": "assistant", "content": gpt_content})
        return history

    def gender_selects(select) :
        """Swap the voice dropdown to the chosen gender and select its first voice."""
        voices = voice_list_female if select == "Female" else voice_list_male
        # The dropdown value must be the voice id (second tuple element),
        # not the whole (label, id) pair.
        return gr.update(choices=voices, value=voices[0][1])

    webcam_input.stream(stream_webcam, inputs=[webcam_input], outputs=[output_cam])
    capture_btn.click(click_capture, inputs=[output_cam], outputs=[output_capture_image])
    send_gpt_btn.click(click_send_gpt, inputs=[output_capture_image, gpt], outputs=[gpt])
    # Automatically speak the chatbot's latest answer whenever the chat changes.
    gpt.change(change_chatbot, inputs=[gpt, select_voice_name], outputs=[chatbot_audio])
    select_voice_gender.change(gender_selects, inputs=select_voice_gender, outputs=select_voice_name)

demo.launch()