# Operation Polyglot: AI Language Companion
#### **Full Name:** Marinath Jeevanantham
#### **Link to SDS Profile:** https://community.superdatascience.com/u/860e4ecc

#### **Mission:** You are tasked with developing an AI Language Companion to revolutionize language learning. This intelligent assistant must harness the power of LLMs to guide users in mastering new languages by providing translations, interactive lessons, and pronunciation guidance.

#### **Core Objectives**
* Develop a language-learning AI assistant using open-source LLMs.
* Support text translation and interactive learning for selected languages.
* Enable multimodal capabilities (audio output) at advanced levels.
* Ensure a natural and engaging user experience.


In [None]:
# Install the packages this notebook imports; `-q` keeps pip's output quiet and
# `--upgrade` pulls the newest release of the packages it is applied to.
!pip install -q python-dotenv
!pip install -q --upgrade transformers accelerate
!pip install -q --upgrade bitsandbytes
!pip install -q torch
!pip install -q --upgrade gradio
!pip install -q gTTS
!pip install -q pydub

In [41]:
import os
from dotenv import load_dotenv
from google.colab import drive
from huggingface_hub import login
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
import gradio as gr
from gtts import gTTS
import re
from pydub import AudioSegment
from threading import Thread
import string

In [None]:
# Disabled setup cell: the statements below are wrapped in a string literal, so
# none of them execute. Remove the surrounding triple quotes to mount Google
# Drive, load the .env file stored there and log in to Hugging Face with the
# HUGGINGFACE_API_KEY value read from the environment.
'''
#Mount Google Drive
drive.mount('/content/drive')

#Define the correct path
dotenv_path = "/content/drive/MyDrive/Colab Notebooks/.env"

#Load the .env file
load_dotenv(dotenv_path)

#Login Huggingface
login(token=os.getenv("HUGGINGFACE_API_KEY"))
'''

# **LEVEL 1 - INITIATE**

In [None]:
# Text-generation pipeline that backs the Level 1 console chatbot.
# Another model id noted by the author as an option:
#   "lightblue/suzume-llama-3-8B-multilingual"
pipe = pipeline(
    "text-generation",
    model="microsoft/Phi-3.5-mini-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=False,
)

In [None]:
torch.cuda.empty_cache()  # Clears unused memory from CUDA

system_message = "You are a AI Language Companion that breaks traditional barriers and empowers users to become fluent in the languages of their dreams."
chat = [{"role": "system", "content": system_message}]

# Console chat loop: read a prompt, send the whole conversation to the
# pipeline, print the newest assistant reply, repeat until the user quits.
while True:
  torch.cuda.empty_cache()
  # Trim surrounding whitespace so inputs such as "exit " are still recognised
  prompt = input("Enter your prompt:").strip()

  # Nothing typed: ask again instead of sending an empty turn to the model
  if not prompt:
    continue

  # To break out of the chat loop
  if prompt.lower() in ["exit", "quit", "bye"]:
    print("Chatbot: Goodbye! Have a great day!")
    break

  # Append user prompt
  chat.append({"role": "user", "content": prompt})

  # Get response from the Chatbot
  response = pipe(chat, do_sample=True, max_new_tokens=100)
  # The pipeline returns the entire chat so far (including the new assistant
  # turn), so keep it as the history for the next round
  chat = response[0]['generated_text']
  print(chat[-1]['content'])

# **LEVEL 3 - OPERATIVE**

In [None]:
# Loading options shared by every model below: place it on the CUDA device and
# let the library choose the dtype.
_gpu_load_kwargs = {"device_map": "cuda", "torch_dtype": "auto"}

# Causal language model (and its tokenizer) that answers the user's text
text_model_id = "microsoft/Phi-3.5-mini-instruct"
text_model = AutoModelForCausalLM.from_pretrained(
    text_model_id,
    trust_remote_code=False,
    **_gpu_load_kwargs,
)
text_tokenizer = AutoTokenizer.from_pretrained(text_model_id)

# Speech-to-text pipeline for microphone input.
# Author's note: faster than, and very close in performance to, the full-size
# "openai/whisper-large-v3".
transcribe_pipe = pipeline(
    "automatic-speech-recognition",
    model="distil-whisper/distil-large-v3",
    **_gpu_load_kwargs,
)

# Text classifier used to detect the language of quoted phrases before they
# are turned into audio
lang_detect_pipe = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
    **_gpu_load_kwargs,
)

In [25]:
# Languages offered in the dropdown, mapped to the language code handed to the
# text-to-speech step. Insertion order is the order shown to the user.
supported_languages = {
    "Arabic": "ar",
    "Chinese": "zh",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Japanese": "ja",
    "Hindi": "hi",
    "Portuguese": "pt",
    "Russian": "ru",
    "Spanish": "es",
}

# System prompt template; the single `{}` placeholder is filled with the name
# of the currently selected language via str.format().
system_instructions = """
You are LingoMigo, an AI Language Companion dedicated solely to help users learn the latest selected language of {}.

Rules:
* Provide a concise response in under 5 sentences, ensuring it stays under 150 tokens total. If you need to end abruptly, do so gracefully with a summary.
* Enclose any phrases/sentences in the latest selected language in double quotes.
* The user only knows English, so provide your answers primarily in English. If the user explicitly asks you to converse completely in the latest selected language, confirm with the user if that is what they want.
* If a query is unrelated language learning, politely ask the user to keep their queries focused on language learning.
* Do not make up any information. If you are unsure, say so.

Your tone should be friendly, direct, and encouraging, always keep the answers concise and creative.
"""

# First assistant message shown in the chat window before the user types anything
welcome_message = "Welcome! I am LingoMigo, your AI Language Companion. Please select a language you would like to learn from the dropdown above to proceed."

In [None]:
def prompt_language_selection(message, history):
    """Log the user's message and answer with a reminder to choose a language.

    Generator that yields exactly once.

    Args:
        message: Text the user submitted.
        history: Chat history as a list of ``{"role", "content"}`` dicts; it is
            extended in place with the user turn and the reminder.

    Yields:
        ``(history, None, None, None)``, matching the four event outputs wired
        up in the UI: the chat history, ``None`` to clear the text/audio input,
        ``None`` for the audio output, and ``None`` for the language state.
    """
    reminder = "Please select a language to proceed."
    history.extend(
        [
            {"role": "user", "content": message},
            {"role": "assistant", "content": reminder},
        ]
    )
    yield history, None, None, None

def extract_selected_lang_segments(latest_response, lang_code):
    """Return the quoted phrases of a response that are not detected as English.

    Every substring enclosed in straight double quotes is cleaned (surrounding
    punctuation and whitespace removed, lower-cased), de-duplicated in order of
    first appearance, and classified with ``lang_detect_pipe``. A phrase is
    kept when the top predicted label is anything other than ``"en"``.

    Args:
        latest_response: Assistant reply to scan for quoted phrases.
        lang_code: Code of the selected language. Currently unused (phrases are
            only filtered as "not English"); kept so existing callers work.

    Returns:
        List of cleaned, unique, non-English phrases. Empty when the response
        has no usable quoted text.
    """
    quoted = re.findall(r'"(.*?)"', latest_response)

    # dict.fromkeys de-duplicates while preserving order, so each distinct
    # phrase is sent to the classifier only once
    strip_chars = string.punctuation + string.whitespace
    candidates = dict.fromkeys(seg.strip(strip_chars).lower() for seg in quoted)

    lang_segments = []
    for cleaned in candidates:
        # Empty quotes or punctuation-only spans carry nothing to pronounce
        # and would otherwise reach the TTS step as an empty string
        if not cleaned:
            continue
        result = lang_detect_pipe(cleaned, top_k=1, truncation=True)
        if result[0]['label'].lower() != "en":
            lang_segments.append(cleaned)
    return lang_segments

def generate_audio_from_segments(segments, lang_code, silence_duration_ms=1000):
    """Synthesise speech for each phrase and join the clips into one MP3.

    Each phrase is converted with gTTS and followed by a pause of
    ``silence_duration_ms`` milliseconds.

    Args:
        segments: Phrases (strings) to pronounce, in playback order.
        lang_code: Language code passed to gTTS as ``lang``.
        silence_duration_ms: Length of the pause after every phrase.

    Returns:
        Path of the combined MP3 file, or ``None`` when ``segments`` is empty
        so the caller can simply clear the audio widget.
    """
    import os
    import tempfile

    # Nothing to pronounce: do not create an audio file at all
    if not segments:
        return None

    silence = AudioSegment.silent(duration=silence_duration_ms)
    combined_audio = AudioSegment.empty()

    # Per-call scratch directory, so concurrent requests never overwrite each
    # other's intermediate clip; it is removed when the block exits
    with tempfile.TemporaryDirectory() as work_dir:
        temp_filename = os.path.join(work_dir, "segment.mp3")
        for seg in segments:  # Each seg is one quoted phrase (a string)
            gTTS(seg, lang=lang_code).save(temp_filename)
            # Append the spoken phrase followed by silence
            combined_audio += AudioSegment.from_mp3(temp_filename) + silence

    # Unique output path per call; the file is left on disk so the caller can
    # serve it (it is not cleaned up here)
    fd, audio_file = tempfile.mkstemp(prefix="lingomigo_", suffix=".mp3")
    os.close(fd)
    combined_audio.export(audio_file, format="mp3")
    return audio_file

def language_assistant(message, history, language_selected, current_language_state):
    """Answer a user message, streaming the text and then adding pronunciation audio.

    Generator used as a Gradio event handler. Every yielded tuple has the form
    ``(chat_history, input_value, audio_file, language_state)``.

    Args:
        message: The user's text (typed or transcribed).
        history: Chat history as a list of ``{"role", "content"}`` dicts;
            extended in place with the new turns.
        language_selected: Current dropdown value.
        current_language_state: Language used on the previous turn, or ``None``.

    Yields:
        * Blank ``message``: one tuple that leaves history and state unchanged.
        * ``language_selected`` not in ``supported_languages`` (placeholder
          entry or nothing selected): the reminder from
          ``prompt_language_selection``, with the stored language state kept.
        * Otherwise: one tuple per streamed chunk (audio ``None``), then a
          final tuple carrying the audio path returned by
          ``generate_audio_from_segments``.
    """
    # Ignore blank submissions instead of running a full generation on them
    if not message or not message.strip():
        yield history, None, None, current_language_state
        return

    # Membership test also covers an empty dropdown (None), which would
    # otherwise fail later when looking up the language code
    if language_selected not in supported_languages:
        for chat_history, cleared_input, audio, _ in prompt_language_selection(message, history):
            # Keep the stored language state instead of resetting it to None
            yield chat_history, cleared_input, audio, current_language_state
        return

    # Append the new user message
    history.append({"role": "user", "content": message})

    # Check if the language has changed compared to our stored state
    if current_language_state != language_selected:
        # Add a marker message to alert the model of the language change.
        history.append({
            "role": "assistant",
            "content": f"Latest selected language is {language_selected}."
        })
        # Update the current language state.
        current_language_state = language_selected

    # Drop any earlier system messages so only the instructions for the latest
    # selected language are sent to the model
    filtered_history = [msg for msg in history if msg["role"] != "system"]
    # Author's note: the "system" role is used for the instructions (rather
    # than "assistant") because it is hidden by default in the Gradio chatbot
    chat = [{"role": "system", "content": system_instructions.format(language_selected)}] + filtered_history

    # Render the chat into one prompt string, then tokenize it
    messages = text_tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    # Put the inputs on the model's own device rather than assuming 'cuda'
    model_inputs = text_tokenizer([messages], return_tensors="pt").to(text_model.device)
    streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=150,
        do_sample=True,
        temperature=0.5,  # Measures how daring the model gets: higher means more creative, lower means safer.
        top_k=50,         # Tells the model to pick its next word from the 50 most likely options.
        top_p=0.9         # Sets a probability cutoff: the model only picks from words that, combined, have a 90% chance.
    )
    # Generate on a worker thread so the streamer can be consumed here
    t = Thread(target=text_model.generate, kwargs=generate_kwargs)
    t.start()

    # Stream the response incrementally
    chat_response = []
    for new_text in streamer:
        chat_response.append(new_text)
        # Yield partial text output with no audio until the full response is ready.
        partial_message = {"role": "assistant", "content": "".join(chat_response)}
        yield history + [partial_message], None, None, current_language_state
    # The streamer is exhausted, so generation is done; reap the worker thread
    t.join()

    # Concatenate all the streamed strings into one final string
    latest_response = "".join(chat_response)
    # Add final assistant response to history
    history.append({"role": "assistant", "content": latest_response})

    # Get the language code based on the language selected
    lang_code = supported_languages[language_selected]
    # Extract the quoted segments in the selected language from the response
    selected_lang_segments = extract_selected_lang_segments(latest_response, lang_code)

    # Generate speech audio for the selected language segments
    audio_file = generate_audio_from_segments(selected_lang_segments, lang_code)

    yield history, None, audio_file, current_language_state


def transcribe_audio(audio, history, language_selected, current_language_state):
    """Transcribe a recording and pass the text to ``language_assistant``.

    Generator used as a Gradio event handler; it yields the same four-element
    tuples as ``language_assistant``.

    Args:
        audio: Recording handed over by the audio input, or ``None`` when
            there is no recording.
        history: Chat history as a list of ``{"role", "content"}`` dicts.
        language_selected: Current dropdown value.
        current_language_state: Language used on the previous turn, or ``None``.

    Yields:
        A single unchanged-state tuple when ``audio`` is ``None``; otherwise
        everything ``language_assistant`` yields for the transcribed text.
    """
    # No recording: nothing to transcribe, leave the chat and state untouched
    if audio is None:
        yield history, None, None, current_language_state
        return

    # Speech-to-text; surrounding whitespace is trimmed from the result
    transcription = transcribe_pipe(audio)
    text = transcription["text"].strip()

    # language_assistant performs the "has a language been selected" check
    # itself, so it is not repeated here
    yield from language_assistant(text, history, language_selected, current_language_state)

# Build the Gradio chat UI: language dropdown, chat window, text and
# microphone inputs, and an audio player for the pronunciation clips.
with gr.Blocks(fill_height=True,fill_width=True) as demo:
    # Add Title
    gr.Markdown("<h3 style='text-align: center;'> LingoMigo - Your Personal Language Companion</h3>")
    # Language selection. The placeholder is set explicitly as the initial
    # value so the handlers receive "Select a language" rather than None when
    # the user has not picked anything yet.
    language_dropdown = gr.Dropdown(
        choices=["Select a language"] + list(supported_languages.keys()),
        value="Select a language",
        label="You can switch languages anytime you like"
    )

    # Chatbot component to display conversation history
    chatbot = gr.Chatbot(
        label="CHAT ASSISTANT",
        value=[{"role": "assistant", "content": welcome_message}], # Pre-fill the chatbot with the welcome message before the user inputs anything
        type="messages"
        )

    with gr.Row(equal_height=True):
        # Text input for manual messages
        text_input = gr.Textbox(label="Type your Question")
        # Audio input from microphone; the handler receives a file path
        audio_input = gr.Audio(label="Record your Question", sources=["microphone"], type="filepath")

    # Audio output TTS
    audio_output = gr.Audio(label="PRONUNCIATION ASSISTANT", value=None, autoplay=True)

    # State to track current language selection
    current_language_state = gr.State(None)

    # Both handlers yield (chat history, input value, audio file, language
    # state), mapped onto the four outputs listed below.
    # For text input
    text_input.submit(language_assistant,
                      inputs=[text_input, chatbot, language_dropdown, current_language_state],
                      outputs=[chatbot, text_input, audio_output, current_language_state])
    # For audio input
    audio_input.input(transcribe_audio,
                      inputs=[audio_input, chatbot, language_dropdown, current_language_state],
                      outputs=[chatbot, audio_input, audio_output, current_language_state])

demo.launch()